# Load packages

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import seaborn as sns
from scipy import stats
from statistics import stdev

# Load data

In [None]:
# Read the cancer dataset from an Excel workbook into `df`, the frame every
# later cell works on.
# NOTE(review): this is an absolute local path; the notebook will not run on
# another machine until it is pointed at that machine's copy of the file.
df = pd.read_excel("D:/BA/Contest/cancer_datasets.xlsx")
# Preview the first ten rows.
df.head(10)

# Check data

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals.
df.describe().round(2)

In [None]:
# Number of missing values per column.
df.isnull().sum()

# Data Exploration

In [None]:
# How many rows fall into each value of the target column `Level`.
df['Level'].value_counts()

In [None]:
# Bar chart of the number of rows per cancer level.
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title('Cancer Rate \nCountplot')
sns.countplot(data=df, x='Level', palette='pastel', ax=ax)

# EDA

Age, Gender and Cancer Level of patient

In [None]:
# Summary statistics for the `Age` and `Gender` columns, rounded to two decimals.
df[['Age', 'Gender']].describe().round(2)

In [None]:
def boxplot_create(x, y, z, width, color):
    """Draw a grouped box plot of the notebook-level frame `df`.

    Parameters
    ----------
    x : column name placed on the x axis.
    y : column name whose distribution is drawn on the y axis.
    z : column name used as `hue` to split each box.
    width : figure width; the figure height is fixed at 8.
    color : seaborn palette used for the hue levels.

    Outliers are not drawn (`showfliers=False`). Nothing is returned.
    """
    _fig, axes = plt.subplots(figsize=(width, 8))
    sns.boxplot(
        data=df,
        x=x,
        y=y,
        hue=z,
        palette=color,
        showfliers=False,
        ax=axes,
    )

In [None]:
# Age distribution per cancer level, split by gender (figure width 8, 'Set2' palette).
boxplot_create('Level', 'Age', 'Gender',  8, 'Set2')

Personal Risks (Occupational & Genetic) related to Cancer Level

In [None]:
# Occupational hazard score against genetic risk score, coloured by cancer level.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
    data=df,
    x='OccuPational Hazards',
    y='Genetic Risk',
    hue='Level',
    palette='tab10',
    ax=ax,
)

Background Diseases of patients and their impact on Cancer Level

In [None]:
# Subset of `df`: three background-condition columns plus the target `Level`.
background_diseases = df[['chronic Lung Disease', 'Obesity', 'Swallowing Difficulty', 'Level']]

In [None]:
# Pairwise 2-D histograms of the background-condition columns, coloured by level.
sns.pairplot(
    background_diseases,
    hue='Level',
    kind='hist',
    palette='YlOrBr',
    height=2,
)

Overview of Impact of Internal Factor(Smoking) and External Factor(Passive Smoker)

In [None]:
# Row counts per cancer level, one bar per value of `Smoking`.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Internal Factor(Smoking) impact')
sns.countplot(x='Level', hue='Smoking', data=df, palette='mako_r', ax=ax)

In [None]:
# Row counts per cancer level, one bar per value of `Passive Smoker`.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('External Factor(Passive Smoker) impact')
sns.countplot(x='Level', hue='Passive Smoker', data=df, palette='flare', ax=ax)

# Features correlation

In [None]:
# Habit-related columns and their pairwise correlation matrix.
# Note: `Level` is not among the selected columns, so `corr1` relates the
# habits to one another only.
habits = df[['Alcohol use', 'Balanced Diet', 'Smoking', 'Snoring']]
corr1 = habits.corr()

In [None]:
# Heatmap of the habit correlation matrix on a fixed [-1, 1] colour scale.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Impact of Routine Habits on Cancer Level \nCorrelation Plot')
sns.heatmap(
    corr1,
    vmin=-1,
    vmax=1,
    cmap='viridis',
    linewidths=.1,
    xticklabels=corr1.columns,
    yticklabels=corr1.columns,
    ax=ax,
)

In [None]:
# Respiratory-symptom columns and their pairwise correlation matrix.
# Note: `Level` is not among the selected columns.
r_issues = df[['Chest Pain', 'Shortness of Breath', 'Wheezing', 'Coughing of Blood',]]
corr2 = r_issues.corr()

In [None]:
# Heatmap of the respiratory-symptom correlation matrix on a fixed [-1, 1] scale.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Relation of Respiratory issues on Cancer Level \nCorrelation Plot')
sns.heatmap(
    corr2,
    vmin=-1,
    vmax=1,
    cmap='rocket',
    linewidths=.1,
    xticklabels=corr2.columns,
    yticklabels=corr2.columns,
    ax=ax,
)

In [None]:
# General-symptom columns and their pairwise correlation matrix.
# Note: `Level` is not among the selected columns.
symptoms = df[[ 'Fatigue', 'Weight Loss', 'Frequent Cold', 'Clubbing of Finger Nails']]
corr3 = symptoms.corr()

In [None]:
# Heatmap of the general-symptom correlation matrix on a fixed [-1, 1] scale.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Symptoms based on Cancer Level \nCorrelation Plot')
sns.heatmap(
    corr3,
    vmin=-1,
    vmax=1,
    cmap='mako',
    linewidths=.1,
    xticklabels=corr3.columns,
    yticklabels=corr3.columns,
    ax=ax,
)

# Feature Selection

Although there are many variables to look at, we can find the most important ones by using the SelectKBest algorithm with the ANOVA F-ratio statistic. This method computes an F-ratio score for every feature, from which we can decide which features to use for machine learning.

In [None]:
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import f_classif 

In [None]:
# Predictors are every column except the target and the patient identifier.
X = df.drop(columns=['Level', 'Patient Id'])
Y = df['Level']

# k='all' keeps every feature; the selector is used here only to obtain the
# ANOVA F-score of each column against the target.
bestfeatures = SelectKBest(score_func=f_classif, k='all')
fit = bestfeatures.fit(X, Y)

# One-column frames holding the feature names and their scores, in the same order.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)

In [None]:
# Put feature names and F-scores side by side, then label the two columns.
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Feature','Score'] 

In [None]:
# Horizontal bar chart of the F-score of every feature.
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(
    data=featureScores,
    x='Score',
    y='Feature',
    orient='h',
    palette='viridis',
    linewidth=0.5,
    saturation=2,
    ax=ax,
)


In [None]:
# Keep the features whose F-score is at least 200, plus the target column.
selection = list(featureScores.loc[featureScores['Score'] >= 200, 'Feature']) + ['Level']

# Modelling frame restricted to the selected columns.
cancer = df[selection]
cancer.head(10)

# Preprocessing data

In [None]:
# Encode the ordinal target with an explicit label -> code mapping.
# The previous call handed two *sets* to `replace`; sets have no defined
# order, so which label received which code was not guaranteed and could
# change from one kernel session to the next.
level_codes = {'Low': 0, 'Medium': 1, 'High': 2}
y_data = cancer['Level'].map(level_codes)

# `map` yields NaN for labels missing from the mapping; fail loudly rather
# than train on a partially encoded target.
if y_data.isna().any():
    unexpected = sorted(set(cancer['Level'].unique()) - set(level_codes), key=str)
    raise ValueError(f"Unexpected values in 'Level': {unexpected}")

# Predictors: every selected column except the target.
x_data = cancer.drop('Level', axis=1)

In [None]:
from sklearn import preprocessing
# Standardising scaler, created unfitted here and applied to the feature
# matrices after the train/test split.
scaler=preprocessing.StandardScaler()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Shuffle and split: 35% of the rows are held out for testing; the fixed
# random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x_data, 
                                                    y_data, 
                                                    test_size = 0.35,
                                                    random_state=0, 
                                                    shuffle=True)

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to the test split. Re-fitting on the test split (as
# was done before) scales the two splits with different statistics and leaks
# test-set information into preprocessing.
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Report the shape of each split.
for caption, split in (
    ("X_train shape :", x_train),
    ("Y_train shape :", y_train),
    ("X_test shape :", x_test),
    ("Y_test shape :", y_test),
):
    print(caption, split.shape)

# Machine Learning Model

The input data is relatively small, so we start with two simple machine learning models: a Decision Tree Classifier and a Random Forest Classifier. Decision trees are a good fit when the features have mixed data types and when easy interpretation is needed. A random forest combines many trees, so its performance does not depend on any single tree.

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

* Decision Tree Classifier

In [None]:
from sklearn import tree
# Seed the tree so the fitted model, and every metric derived from it, is
# reproducible on a fresh run; 0 matches the seed used for the train/test split.
dt = tree.DecisionTreeClassifier(random_state=0)
dt.fit(x_train,y_train)
# Predicted levels for the held-out rows.
y_dt = dt.predict(x_test)

In [None]:
# Accuracy of the decision tree on the held-out split.
score_dt = accuracy_score(y_test, y_dt)
score_dt

In [None]:
# Confusion matrix of the decision tree: true levels in rows, predictions in columns.
cfm_dt = pd.crosstab(y_test, y_dt, rownames=['Y_test'], colnames=['Y_predict'])

fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title('Confusion Matrix')
sns.heatmap(
    cfm_dt,
    annot=True,
    cmap='Blues',
    linewidths=.2,
    linecolor='Darkblue',
    ax=ax,
)

In [None]:
# Classification report of the decision tree on the held-out split.
print(classification_report(y_test, y_dt))

* Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the fitted model, and every metric derived from it, is
# reproducible on a fresh run; 0 matches the seed used for the train/test split.
rf = RandomForestClassifier(random_state=0)
rf.fit(x_train, y_train)
# Predicted levels for the held-out rows.
y_rf = rf.predict(x_test)

In [None]:
# Accuracy of the random forest on the held-out split.
score_rf = accuracy_score(y_test, y_rf)
score_rf

In [None]:
# Confusion matrix of the random forest: true levels in rows, predictions in columns.
cfm_rf = pd.crosstab(y_test, y_rf, rownames=['Y_test'], colnames=['Y_predict'])

fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title('Confusion Matrix')
sns.heatmap(
    cfm_rf,
    annot=True,
    cmap='Reds',
    linewidths=.2,
    linecolor='Darkred',
    ax=ax,
)

In [None]:
# Classification report of the random forest on the held-out split.
print(classification_report(y_test, y_rf))

As we can see, the accuracy scores of the two models above are suspiciously high, which suggests that these simple models are overfitting or that the dataset is not large enough. To get the best results without overfitting, we will go on to build two more advanced models. To avoid the same problem, we will search for the most appropriate kernel function and parameters, and evaluate them with cross-validation.

# Advanced Machine Learning Model

We will use support vector machine classifiers (SVC). SVCs can handle higher-dimensional data and generate hyperplanes for separation. We can evaluate multiple parameters at once using the grid search or randomized search functions. Grid search evaluates every combination of the supplied parameter values, while randomized search evaluates a fixed number of randomly sampled combinations and keeps the best one.

In [None]:
from sklearn.svm import SVC 
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV 
import itertools 

* Grid Search

In [None]:
# Hyper-parameter grid for the SVC grid search.
# gamma = 0 is deliberately excluded: some scikit-learn releases reject it
# outright, and where it is accepted it produces a degenerate rbf/poly kernel,
# so it only adds failing or useless fits to the search.
svm_grid= {
    "C": [.01, .1, 1, 5, 10, 100], 
    "gamma": [.01, .1, 1, 5, 10, 100],
    "kernel": ['linear', 'poly', 'rbf'],
    "random_state": [1]}

In [None]:
# Exhaustive search over `svm_grid` with 10-fold cross-validation on all
# available cores. scoring=None means the estimator's own `score` method is
# used; train-fold scores are recorded as well.
gs = GridSearchCV(
    estimator=SVC(), 
    param_grid=svm_grid, 
    scoring=None,
    n_jobs=-1, 
    cv=10, 
    verbose=0,
    return_train_score=True
    )

In [None]:
# Run the grid search on the training split.
gs.fit(x_train, y_train)

In [None]:
# Predict the held-out rows with the fitted grid-search object.
y_gs = gs.predict(x_test)

In [None]:
# Confusion matrix of the grid-searched SVC: true levels in rows, predictions in columns.
cfm_gs = pd.crosstab(y_test, y_gs, rownames=['Y_test'], colnames=['Y_predict'])

fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title('Confusion Matrix')
sns.heatmap(
    cfm_gs,
    annot=True,
    cmap='Greens',
    linewidths=.2,
    linecolor='Darkgreen',
    ax=ax,
)

In [None]:
# Summary of the grid search.
print("**Grid search results of SVC Grid Search**")
print("The best parameters are:",gs.best_params_)
# best_score_ is the mean cross-validated score of the best candidate, not a
# training-set accuracy, so it is labelled accordingly.
print("Best mean cross-validated accuracy:\t", gs.best_score_)
print('Classification Report:')
print(classification_report(y_test, y_gs))

* Randomized Search

In [None]:
# Candidate values sampled by the randomized SVC search.
# gamma runs over 0.01 ... 0.99; 0 is excluded because some scikit-learn
# releases reject it and it otherwise gives a degenerate rbf/poly kernel.
# Building the values by integer division avoids float-step drift in arange.
svm_rds = {
    "C": np.arange(0.01,2, 0.01),   
    "gamma": np.arange(1, 100) / 100,
    "kernel": ["rbf","linear","poly"],
    "random_state": [1]}

In [None]:
# Randomized search: 10 parameter settings drawn from `svm_rds`, each scored
# with 10-fold cross-validation on all available cores. random_state fixes
# which settings are drawn; train-fold scores are recorded as well.
rds = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=svm_rds,
    n_iter=10,
    n_jobs=-1,
    cv=10,
    verbose=0,
    random_state=1,
    return_train_score=True
)

In [None]:
# Run the randomized search on the training split.
rds.fit(x_train, y_train)

In [None]:
# Predict the held-out rows with the fitted randomized-search object.
y_rds = rds.predict(x_test)

In [None]:
# Confusion matrix of the randomized-search SVC: true levels in rows, predictions in columns.
cfm_rds = pd.crosstab(y_test, y_rds, rownames=['Y_test'], colnames=['Y_predict'])

fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title('Confusion Matrix')
sns.heatmap(
    cfm_rds,
    annot=True,
    cmap='Greys',
    linewidths=.2,
    linecolor='Grey',
    ax=ax,
)

In [None]:
# Summary of the randomized search (the header previously said "Grid search").
print("**Randomized search results of SVC Randomized Search**")
print("The best parameters are:",rds.best_params_)
# best_score_ is the mean cross-validated score of the best candidate, not a
# training-set accuracy, so it is labelled accordingly.
print("Best mean cross-validated accuracy:\t", rds.best_score_)
print('Classification Report:')
print(classification_report(y_test, y_rds))

# Conclusion

We investigated the data by checking correlations, visualizing the features, and examining the relationships between different features. Through EDA and modelling of the data, we observed that:

* Many features, taken alone, already show a prominent separation between the levels of cancer.
* Using SelectKBest, we were able to keep the features in the dataset that have the greatest effect on the result (cancer level).
* Both simple machine learning models reached the highest level of accuracy, which we attribute to the small size of the dataset in general and of the training data in particular.
* The 13 selected features gave 100% accuracy when a support vector machine classifier was tuned with either grid search or randomized search.