In [1]:
import pandas as pd

In [2]:
# Load the dataset from the working directory and preview the first rows.
csv_path = 'Lung Cancer.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64.0,Male,Sweden,2016-04-05,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,2017-09-10,0
1,2,50.0,Female,Netherlands,2023-04-20,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,2024-06-17,1
2,3,65.0,Female,Hungary,2023-04-05,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,2024-04-09,0
3,4,51.0,Female,Belgium,2016-02-05,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,2017-04-23,0
4,5,37.0,Male,Luxembourg,2023-11-29,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,2025-01-08,0


In [3]:
# Remove the columns that are not carried forward into the feature set:
# id, family_history, other_cancer, end_treatment_date.
unused_columns = ['id', 'family_history', 'other_cancer', 'end_treatment_date']
df.drop(columns=unused_columns, inplace=True)

In [4]:
# Replace the diagnosis date with its calendar year only.
diagnosis_timestamps = pd.to_datetime(df['diagnosis_date'], format='mixed')
df['diagnosis_date'] = diagnosis_timestamps.dt.year

In [5]:
# Re-wrap the frame in a DataFrame.
# NOTE(review): `df` already comes from pd.read_csv, so this looks redundant — confirm and remove.
df = pd.DataFrame(data=df)

In [6]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelEncoder
# One-hot encode gender and write the indicator back into the same column.
# With drop='first' the preview output shows Male -> 1.0 and Female -> 0.0;
# this assumes gender has exactly two categories, so a single column remains.
encoder = OneHotEncoder(sparse_output=False, drop='first')
df['gender'] = encoder.fit_transform(df[['gender']])

In [7]:
# Ordinal encoding of the stage labels (Stage I..IV -> 1..4).
# Any label absent from the mapping is turned into NaN by Series.map.
stage_to_rank = {
    'Stage I': 1,
    'Stage II': 2,
    'Stage III': 3,
    'Stage IV': 4,
}
df['cancer_stage'] = df['cancer_stage'].map(stage_to_rank)

In [8]:
# Integer-code the smoking_status categories with a LabelEncoder.
le = LabelEncoder()
smoking_codes = le.fit_transform(df['smoking_status'])
df['smoking_status'] = smoking_codes

In [9]:
# Integer-code treatment_type with its own LabelEncoder.
# Re-fitting the shared `le` here would overwrite the classes it learned for
# smoking_status, leaving those integer codes impossible to decode later via
# inverse_transform. A dedicated encoder keeps both mappings recoverable.
treatment_le = LabelEncoder()
df['treatment_type'] = treatment_le.fit_transform(df['treatment_type'])

In [10]:
# Preview of the frame with `country` one-hot expanded.
# The result is only displayed; it is not assigned, so `df` is unchanged by this cell.
pd.get_dummies(data=df, columns=['country'])

Unnamed: 0,age,gender,diagnosis_date,cancer_stage,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,...,country_Luxembourg,country_Malta,country_Netherlands,country_Poland,country_Portugal,country_Romania,country_Slovakia,country_Slovenia,country_Spain,country_Sweden
0,64.0,1.0,2016,1,3,29.4,199,0,0,1,...,False,False,False,False,False,False,False,False,False,True
1,50.0,0.0,2023,3,3,41.2,280,1,1,0,...,False,False,True,False,False,False,False,False,False,False
2,65.0,0.0,2023,3,1,44.0,268,1,1,0,...,False,False,False,False,False,False,False,False,False,False
3,51.0,0.0,2016,1,3,43.0,241,1,1,0,...,False,False,False,False,False,False,False,False,False,False
4,37.0,1.0,2023,1,3,19.7,178,0,0,0,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
889995,40.0,1.0,2022,4,3,44.8,243,1,1,1,...,False,True,False,False,False,False,False,False,False,False
889996,62.0,0.0,2015,2,1,21.6,240,0,0,0,...,False,False,False,False,False,False,False,False,False,False
889997,48.0,0.0,2016,3,2,38.6,242,1,0,0,...,False,False,False,False,False,False,False,False,False,False
889998,67.0,0.0,2015,4,1,18.6,194,1,1,0,...,False,False,False,False,False,False,True,False,False,False


In [11]:
# Mean of `survived` per country, listed from lowest to highest.
survival_rate_by_country = df.groupby('country')['survived'].mean()
survival_rate_by_country.sort_values()


country
France            0.215760
Sweden            0.216067
Italy             0.216493
Slovakia          0.217362
Austria           0.217867
Lithuania         0.218986
Ireland           0.219024
Denmark           0.219416
Greece            0.219503
Belgium           0.219578
Hungary           0.219884
Croatia           0.219899
Bulgaria          0.220369
Cyprus            0.220392
Slovenia          0.220421
Spain             0.220901
Germany           0.221008
Poland            0.221039
Portugal          0.221995
Romania           0.222006
Luxembourg        0.222114
Finland           0.222453
Latvia            0.222509
Malta             0.222555
Netherlands       0.222609
Czech Republic    0.222928
Estonia           0.223117
Name: survived, dtype: float64

In [12]:
# Remove the raw country column from the frame.
# NOTE(review): presumably dropped because the per-country survival means shown
# above are nearly identical (~0.216 to ~0.223) — confirm this was the intent.
df.drop(columns=['country'], inplace=True)

In [13]:
# Cast the float gender indicator produced by the encoder to an integer column.
df = df.astype({'gender': int})

In [14]:
# Separate the target column from the remaining feature columns.
y = df['survived']
x = df.drop(columns=['survived'])

In [15]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [16]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is not passed to SMOTE.
# NOTE(review): gender, smoking_status and treatment_type are integer-coded
# categoricals at this point; confirm that SMOTE-generated rows for these
# columns are acceptable, or consider a categorical-aware sampler.
smote = SMOTE(
    sampling_strategy=0.5,
    random_state=42,
)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)


In [17]:
from sklearn.ensemble import RandomForestClassifier

# Base random-forest estimator handed to the hyper-parameter search.
model = RandomForestClassifier(
    class_weight='balanced',  # Optional: gives additional weight to minority class
    n_jobs=-1,
    random_state=42,
)

In [18]:
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV

In [19]:
# Candidate hyper-parameter values sampled by the randomized search.
# 'log_loss' is intentionally omitted from `criterion`: in scikit-learn it is
# the same Shannon information-gain criterion as 'entropy', so listing both
# only produces duplicate candidates and wastes search iterations.
param_grid_cv = {
    'n_estimators': [50, 100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

In [20]:
# Five shuffled, stratified folds with a fixed seed for the search.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [21]:
# Randomized hyper-parameter search: 10 sampled candidates scored by ROC AUC
# on the folds defined by `cv`.
randomised_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid_cv,
    n_iter=10,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

In [None]:
# Run the search on the SMOTE-resampled training data.
# NOTE(review): the CV folds are drawn from data that was oversampled before
# splitting, so synthetic rows can land in validation folds and the CV ROC AUC
# may be optimistic. Consider resampling inside each fold (e.g. an imblearn Pipeline).
randomised_search.fit(x_train_resampled, y=y_train_resampled)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score,accuracy_score

# Evaluate the tuned model on the untouched test split.
# RandomizedSearchCV fits clones of `model`, so `model` itself is never fitted
# and calling predict on it raises NotFittedError. The refitted winner of the
# search is exposed as `best_estimator_`.
best_model = randomised_search.best_estimator_

y_pred = best_model.predict(x_test)
# Column 1 of predict_proba is the probability of the positive class (survived == 1).
y_proba = best_model.predict_proba(x_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))


In [None]:
# Overall accuracy on the test split. Arguments follow the (y_true, y_pred)
# order of the scikit-learn signature, matching the other metric calls; the
# value is unchanged because accuracy is symmetric in its two arguments.
print(accuracy_score(y_test, y_pred))