# <b>1 <span style='color:#6497b1'>|</span> Importing Libraries and Loading dataset</b>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings

In [None]:
# Read the survey CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="../input/lung-cancer/survey lung cancer.csv")
df.head(n=5)

# <b>2 <span style='color:#6497b1'>|</span> Understanding Our Data</b>

In [None]:
#Check data shape (rows, columns)
print("Data shape:")
print(df.shape)
print("\n")
#Check summary info (column dtypes and non-null counts)
print("Data summary:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\n")

In [None]:
#Check detailed info: descriptive statistics from pandas' default describe()
df.describe()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
#Remove duplicates in the dataset
# Number of rows that are exact copies of an earlier row
print(df.duplicated().sum())
# Rebind rather than mutate in place, and renumber the index so row labels stay
# contiguous (0..n-1); gaps in the index break later label-based access.
df = df.drop_duplicates().reset_index(drop=True)
# Confirm how many rows remain after de-duplication
print(df.shape[0])

In [None]:
#Run again to check data after removing duplicates
#Check data shape (rows, columns)
print("Data shape:")
print(df.shape)
print("\n")
#Check summary info (column dtypes and non-null counts)
print("Data summary:")
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the extra "None" line that print(df.info()) produces.
df.info()
print("\n")

In [None]:
#Check detailed info again: descriptive statistics from pandas' default describe()
df.describe()

In [None]:
# Replace the values of the LUNG_CANCER and GENDER columns with integer class
# codes produced by LabelEncoder (one fit per column, in this order).
for label_column in ('LUNG_CANCER', 'GENDER'):
    encoder = LabelEncoder()
    df[label_column] = encoder.fit_transform(df[label_column])
df.head()

In [None]:
#Get average age of data
# `avg` is reused further down as the threshold for binarising AGE.
# Using pandas here avoids rebinding the builtin `sum` to a number, which would
# break any later call to sum() in the same kernel.
count = len(df['AGE'])
print(count)
avg = df['AGE'].mean()
print(avg)

# <b>3 <span style='color:#6497b1'>|</span> Exploratory Data Analysis</b>


In [None]:
# AGE is treated as the only continuous column; every other column of df
# (including the LUNG_CANCER target at this point) goes into cat_col.
con_col = ['AGE']
cat_col = [column for column in df.columns if column != 'AGE']

In [None]:
# Suppress library warnings for the remainder of the notebook
warnings.filterwarnings('ignore')
fig,ax = plt.subplots(1,3,figsize=(20,6))
# Panel 1: overall AGE distribution as a density histogram with a KDE overlay.
# histplot is used because seaborn has deprecated distplot.
sns.histplot(df['AGE'],kde=True,stat='density',ax=ax[0])
# Panel 2: AGE distribution split by the LUNG_CANCER label
sns.histplot(data =df,x='AGE',ax=ax[1],hue='LUNG_CANCER',kde=True)
# Panel 3: AGE spread per LUNG_CANCER class
sns.boxplot(x=df['LUNG_CANCER'],y=df['AGE'],ax=ax[2])
plt.suptitle("Visualizing AGE column",size=20)
plt.show()

In [None]:
# One row of plots per categorical column: plain counts on the left, counts
# split by LUNG_CANCER on the right. The grid is sized from cat_col rather than
# a fixed row count so it keeps working if the column list changes.
n_cat = len(cat_col)
# squeeze=False keeps `ax` two-dimensional even when there is a single row
fig,ax = plt.subplots(n_cat,2,figsize=(30,6*n_cat),squeeze=False)
for row,column in enumerate(cat_col):
    sns.countplot(data=df,x=column,ax=ax[row,0])
    sns.countplot(data=df,x=column,ax=ax[row,1],hue='LUNG_CANCER')
fig.tight_layout()
# Leave room at the top of the figure for the title
fig.subplots_adjust(top=0.95)
plt.suptitle("Visualizing Categorical Columns",fontsize=50)
plt.show()

In [None]:
# One row of plots per categorical column: AGE box plot, AGE box plot split by
# LUNG_CANCER, and AGE violin plot. The grid is sized from cat_col rather than a
# fixed row count so it keeps working if the column list changes.
n_cat = len(cat_col)
# squeeze=False keeps `ax` two-dimensional even when there is a single row
fig,ax = plt.subplots(n_cat,3,figsize=(30,6*n_cat),squeeze=False)
for row,column in enumerate(cat_col):
    sns.boxplot(x=df[column],y=df['AGE'],ax=ax[row,0])
    sns.boxplot(x=df[column],y=df['AGE'],ax=ax[row,1],hue=df['LUNG_CANCER'])
    sns.violinplot(x=df[column],y=df['AGE'],ax=ax[row,2])
fig.tight_layout()
# Leave room at the top of the figure for the title
fig.subplots_adjust(top=0.95)
plt.suptitle("Visualizing AGE vs Categorical Columns",fontsize=50)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df
plt.figure(figsize=(15,15))
# `linewidths` is the documented heatmap parameter for the cell-separator width
sns.heatmap(df.corr(),annot=True,linewidths=0.5,fmt='0.2f')
plt.show()

# <b>4 <span style='color:#6497b1'>|</span> Data Preprocessing</b>

In [None]:
# Separate the target from the features: y is the LUNG_CANCER column,
# X is every remaining column of df.
y = df['LUNG_CANCER']
X = df.drop(columns=['LUNG_CANCER'])

In [None]:
#Change data from (1,2) to (0,1)
# Every column from the third one onwards is shifted down by one in a single
# vectorised step (the first two columns are left untouched).
# NOTE(review): selection is positional — presumably the first two columns are
# GENDER and AGE; confirm against the CSV column order.
# Re-running this cell subtracts one again, so run it once per fresh X.
binary_cols = X.columns[2:]
X[binary_cols] = X[binary_cols] - 1
X.head()

In [None]:
#Balance the classes by random oversampling (rows are resampled, not shuffled)
from imblearn.over_sampling import RandomOverSampler
# A fixed seed makes the resampled dataset, and every score computed from it,
# reproducible across runs.
# NOTE(review): resampling happens before the train/test split, so copies of the
# same row can end up in both sets and inflate test scores — consider
# resampling only the training portion.
X_over,y_over=RandomOverSampler(random_state=42).fit_resample(X,y)

In [None]:
# Stratified train/test split of the oversampled data (seeded for repeatability)
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X_over, y_over, stratify=y_over, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print(f'Train shape : {X_train.shape}\nTest shape: {X_test.shape}')

In [None]:
#Encode age to 0 and 1 by average age
# 0 when the age is below the average, 1 otherwise. The comparison is done in
# one vectorised step: element-wise writes through X['AGE'][i] are chained
# assignment and treat the loop counter as an index label, which does not match
# row positions once the index has gaps.
# The threshold is applied to df['AGE'] (never modified) so that re-running the
# cell gives the same result; X shares its index with df, so rows align.
X['AGE'] = (df['AGE'] >= avg).astype(int)
# NOTE(review): X_over / X_train / X_test were built before this cell, so the
# models below still see the raw AGE values — move this step ahead of the
# resampling cell if the binary encoding is meant to be used for training.
X.head()

# <b>5 <span style='color:#6497b1'>|</span> Model Building</b>

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
import lightgbm as lgb
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Parallel lists filled by the model cells: display name and test accuracy (%)
model_list, result = [], []

In [None]:
#1. K-neighbors classifier
# Score each candidate k with 5-fold cross-validation on the training set
k_values = list(range(1,20))
knn_scores=[]
for k in k_values:
    knn=KNeighborsClassifier(n_neighbors=k)
    scores=cross_val_score(knn,X_train,y_train,cv=5)
    knn_scores.append(scores.mean())

# Ticks are taken from the same list as the plotted x values so they always match
plt.plot(k_values,knn_scores)
plt.xticks(ticks=k_values, labels=k_values)
plt.grid()

# Use the k with the highest mean CV score (ties resolve to the smallest k)
# instead of a fixed value, so the sweep above actually drives the choice.
best_k = k_values[int(np.argmax(knn_scores))]
print(f'Best k: {best_k}')
knn=KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train,y_train)

# Predict once and reuse the result for every metric below
y_pred_knn=knn.predict(X_test)
confusion_knn=confusion_matrix(y_test,y_pred_knn)
plt.figure(figsize=(8,8))
sns.heatmap(confusion_knn,annot=True)
plt.xlabel("Predicted")
plt.ylabel("Actual")

print(classification_report(y_test,y_pred_knn))
# Re-running this cell appends a second entry; reset model_list/result first
model_list.append("K-neighbor Classifier: ")
result.append(round(accuracy_score(y_test,y_pred_knn) *100, 2))

In [None]:
#2. Support Vector Machine
# Randomised search over C and gamma with 5-fold cross-validation
param_grid={'C':[0.001,0.01,0.1,1,10,100], 'gamma':[0.001,0.01,0.1,1,10,100]}
# Only a random subset of the grid is tried, so the search is seeded to make
# the selected parameters (and the reported accuracy) reproducible.
rcv=RandomizedSearchCV(SVC(),param_grid,cv=5,random_state=42)
rcv.fit(X_train,y_train)
# Predict once and reuse the result for every metric below
y_pred_svc=rcv.predict(X_test)
confusion_svc=confusion_matrix(y_test,y_pred_svc)
plt.figure(figsize=(8,8))
sns.heatmap(confusion_svc,annot=True)
plt.xlabel("Predicted")
plt.ylabel("Actual")
print(classification_report(y_test,y_pred_svc))
print(f'\nBest Parameters of SVC model is : {rcv.best_params_}\n')
# Re-running this cell appends a second entry; reset model_list/result first
model_list.append("Support Vector Machine: ")
result.append(round(accuracy_score(y_test,y_pred_svc) *100, 2))

In [None]:
#3. Logistic Regression
# Randomised search over the regularisation strength and iteration budget
param_grid={
    'C':[0.001,0.01,0.1,1,10,100], 
    'max_iter':[50,75,100,200,300,400,500,700]
}
# Only a random subset of the grid is tried, so the search is seeded to make
# the selected parameters (and the reported accuracy) reproducible.
log=RandomizedSearchCV(LogisticRegression(solver='lbfgs'),param_grid,cv=5,random_state=42)
log.fit(X_train,y_train)
# Predict once and reuse the result for every metric below
y_pred_log=log.predict(X_test)
confusion_log=confusion_matrix(y_test,y_pred_log)
plt.figure(figsize=(8,8))
sns.heatmap(confusion_log,annot=True)
plt.xlabel("Predicted")
plt.ylabel("Actual")
print(classification_report(y_test,y_pred_log))
# Re-running this cell appends a second entry; reset model_list/result first
model_list.append("Logistic Regression: ")
result.append(round(accuracy_score(y_test,y_pred_log) *100, 2))

In [None]:
#4. Random Forest Classifier
# Search over the number of trees with 5-fold cross-validation
param_grid = {'n_estimators': [50, 75,100, 150, 200,300],}
# The grid has fewer entries than RandomizedSearchCV's default n_iter, so n_iter
# is set to the grid size (every candidate is tried). The seed fixes the
# evaluation order so the search is reproducible.
rcv=RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    n_iter=len(param_grid['n_estimators']),
    cv=5,
    random_state=42,
)
rcv.fit(X_train,y_train)
# Predict once and reuse the result for every metric below
y_pred_rcv=rcv.predict(X_test)
confusion_rcv=confusion_matrix(y_test,y_pred_rcv)
plt.figure(figsize=(8,8))
sns.heatmap(confusion_rcv,annot=True)
plt.xlabel("Predicted")
plt.ylabel("Actual")
print(classification_report(y_test,y_pred_rcv))
print(f'\nBest Parameter: {rcv.best_params_}\n')
# Re-running this cell appends a second entry; reset model_list/result first
model_list.append("Random Forest Classifier: ")
result.append(round(accuracy_score(y_test,y_pred_rcv) *100, 2))

In [None]:
#5. Gradient Boosting Classifier
# Randomised search over learning rate and number of boosting stages
param_grid = {
    'learning_rate' : [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1],
    'n_estimators': [50, 75,100, 150, 200,300],
}

# The estimator is already seeded; the search is seeded as well because it only
# samples a random subset of the grid, which otherwise changes from run to run.
gbc=RandomizedSearchCV(GradientBoostingClassifier(random_state=42),param_grid,cv=5,random_state=42)
gbc.fit(X_train,y_train)
y_pred_gbc=gbc.predict(X_test)
confusion_gbc=confusion_matrix(y_test,y_pred_gbc)
plt.figure(figsize=(8,8))
sns.heatmap(confusion_gbc,annot=True)
plt.xlabel("Predicted")
plt.ylabel("Actual")
print(classification_report(y_test,y_pred_gbc))
print(f'\nBest Parameter: {gbc.best_params_}\n')
# Re-running this cell appends a second entry; reset model_list/result first
model_list.append("Gradient Boosting Classifier: ")
result.append(round(accuracy_score(y_test,y_pred_gbc) *100, 2))

In [None]:
#6. LGBM Classifier
# Train LightGBM with its default settings, then evaluate on the test set
model = lgb.LGBMClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Confusion matrix drawn as an annotated heatmap on a fresh 8x8 figure
confusion = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 8))
heat_ax = sns.heatmap(confusion, annot=True)
heat_ax.set_xlabel("Predicted")
heat_ax.set_ylabel("Actual")

print(classification_report(y_test, y_pred))

# Record the display name and the test accuracy as a percentage
lgbm_accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
model_list.append("LGBM Classifier: ")
result.append(lgbm_accuracy)

In [None]:
#7. Support Vector Classifier
# SVC with fixed hyper-parameters (gamma=10, C=100) instead of a search.
# This cell overwrites y_pred_svc and confusion_svc from the SVM search cell.
model = SVC(C=100, gamma=10)
model.fit(X_train, y_train)
y_pred_svc = model.predict(X_test)

# Confusion matrix drawn as an annotated heatmap on a fresh 8x8 figure
confusion_svc = confusion_matrix(y_test, y_pred_svc)
plt.figure(figsize=(8, 8))
heat_ax = sns.heatmap(confusion_svc, annot=True)
heat_ax.set_xlabel("Predicted")
heat_ax.set_ylabel("Actual")

print(classification_report(y_test, y_pred_svc))

# Record the display name and the test accuracy as a percentage
svc_accuracy = round(accuracy_score(y_test, y_pred_svc) * 100, 2)
model_list.append("Support Vector Classifier: ")
result.append(svc_accuracy)

# <b>6 <span style='color:#6497b1'>|</span> RESULT AND CONCLUSION</b>

In [None]:
# List every model's test accuracy, then report the best one.
for model_name, accuracy in zip(model_list, result):
    print(model_name, accuracy, "%")

if result:
    # np.argmax returns the first maximum, so ties go to the earliest model.
    # Plain local names are used so the builtins max() and dir() are not
    # overwritten for the rest of the kernel session.
    best_index = int(np.argmax(result))
    best_accuracy = result[best_index]
    model_final = model_list[best_index].rstrip(": ")
    print("\nConclusion:")
    print(f"Choose model {model_final}")
    # Report the winning model's score, not the score of the last model listed
    print(f"The best accuracy is {best_accuracy}%")
    print(f"The test set has {X_test.shape[0]} patients, choosen randomly from the dataset {X_over.shape[0]} patients")
else:
    print("\nNo models were evaluated.")