In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import model_selection
from datetime import datetime
from sklearn.impute import SimpleImputer

from sklearn import svm
from sklearn.tree import DecisionTreeClassifier 
from sklearn import ensemble
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [2]:
thoracic_data = np.loadtxt('ThoraricSurgery.arff',dtype = 'str',delimiter=',',comments='@')
thoracic_df = pd.DataFrame(data=thoracic_data[:,:])
thoracic_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,DGN2,2.88,2.16,PRZ1,F,F,F,T,T,OC14,F,F,F,T,F,60,F
1,DGN3,3.4,1.88,PRZ0,F,F,F,F,F,OC12,F,F,F,T,F,51,F
2,DGN3,2.76,2.08,PRZ1,F,F,F,T,F,OC11,F,F,F,T,F,59,F
3,DGN3,3.68,3.04,PRZ0,F,F,F,F,F,OC11,F,F,F,F,F,54,F
4,DGN3,2.44,0.96,PRZ2,F,T,F,T,T,OC11,F,F,F,T,F,73,T


In [3]:
col_names = ['Diagnosis','FVC','FEV1','Performance','Pain','Haemoptysis','Dyspnoea','Cough','Weakness','Tumor_Size',
             'Diabetes_Mellitus','MI_6mo','PAD','Smoking','Asthma','Age','Risk1YrDeath']
#col_names = ['DGN','PRE4','PRE5','PRE6','PRE7','PRE8','PRE9','PRE10','PRE11','PRE14','PRE17','PRE19','PRE25','PRE30','PRE32','Age','Risk1Yr']
thoracic_df.columns = col_names
thoracic_df.head()

Unnamed: 0,Diagnosis,FVC,FEV1,Performance,Pain,Haemoptysis,Dyspnoea,Cough,Weakness,Tumor_Size,Diabetes_Mellitus,MI_6mo,PAD,Smoking,Asthma,Age,Risk1YrDeath
0,DGN2,2.88,2.16,PRZ1,F,F,F,T,T,OC14,F,F,F,T,F,60,F
1,DGN3,3.4,1.88,PRZ0,F,F,F,F,F,OC12,F,F,F,T,F,51,F
2,DGN3,2.76,2.08,PRZ1,F,F,F,T,F,OC11,F,F,F,T,F,59,F
3,DGN3,3.68,3.04,PRZ0,F,F,F,F,F,OC11,F,F,F,F,F,54,F
4,DGN3,2.44,0.96,PRZ2,F,T,F,T,T,OC11,F,F,F,T,F,73,T


In [4]:
thoracic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 17 columns):
Diagnosis            470 non-null object
FVC                  470 non-null object
FEV1                 470 non-null object
Performance          470 non-null object
Pain                 470 non-null object
Haemoptysis          470 non-null object
Dyspnoea             470 non-null object
Cough                470 non-null object
Weakness             470 non-null object
Tumor_Size           470 non-null object
Diabetes_Mellitus    470 non-null object
MI_6mo               470 non-null object
PAD                  470 non-null object
Smoking              470 non-null object
Asthma               470 non-null object
Age                  470 non-null object
Risk1YrDeath         470 non-null object
dtypes: object(17)
memory usage: 62.5+ KB


In [5]:
#Checking any null values
thoracic_df.replace(r'^\s*$', np.nan, regex=True, inplace = True)
thoracic_df.replace('?', np.nan, inplace = True)
print(thoracic_df.isnull().sum())

Diagnosis            0
FVC                  0
FEV1                 0
Performance          0
Pain                 0
Haemoptysis          0
Dyspnoea             0
Cough                0
Weakness             0
Tumor_Size           0
Diabetes_Mellitus    0
MI_6mo               0
PAD                  0
Smoking              0
Asthma               0
Age                  0
Risk1YrDeath         0
dtype: int64


In [6]:
#Handling True/False data values by converting them to 1/0.
t_f_cols = ['Pain', 'Haemoptysis', 'Dyspnoea', 'Cough', 'Weakness', 'Diabetes_Mellitus', 'MI_6mo', 'PAD', 'Smoking', 'Asthma', 'Risk1YrDeath']
thoracic_df[t_f_cols] = (thoracic_df[t_f_cols] == 'T').astype(float)
thoracic_df.head()

Unnamed: 0,Diagnosis,FVC,FEV1,Performance,Pain,Haemoptysis,Dyspnoea,Cough,Weakness,Tumor_Size,Diabetes_Mellitus,MI_6mo,PAD,Smoking,Asthma,Age,Risk1YrDeath
0,DGN2,2.88,2.16,PRZ1,0.0,0.0,0.0,1.0,1.0,OC14,0.0,0.0,0.0,1.0,0.0,60,0.0
1,DGN3,3.4,1.88,PRZ0,0.0,0.0,0.0,0.0,0.0,OC12,0.0,0.0,0.0,1.0,0.0,51,0.0
2,DGN3,2.76,2.08,PRZ1,0.0,0.0,0.0,1.0,0.0,OC11,0.0,0.0,0.0,1.0,0.0,59,0.0
3,DGN3,3.68,3.04,PRZ0,0.0,0.0,0.0,0.0,0.0,OC11,0.0,0.0,0.0,0.0,0.0,54,0.0
4,DGN3,2.44,0.96,PRZ2,0.0,1.0,0.0,1.0,1.0,OC11,0.0,0.0,0.0,1.0,0.0,73,1.0


In [7]:
#Diagnosis,Performance,Tumor_Size has alphanumerical categorical data with consistent numeric part we can only extract the numeric part.
thoracic_df['Diagnosis'] = thoracic_df.Diagnosis.str[-1:].astype(float)
thoracic_df['Performance'] = thoracic_df.Performance.str[-1:].astype(float)
thoracic_df['Tumor_Size'] = thoracic_df.Tumor_Size.str[-1:].astype(float)

#Convertig other numeric types to float
thoracic_df['FVC'] = thoracic_df.FVC.str[-1:].astype(float)
thoracic_df['FEV1'] = thoracic_df.FEV1.str[-1:].astype(float)
thoracic_df['Age'] = thoracic_df.Age.str[-1:].astype(float)

thoracic_df.head()

Unnamed: 0,Diagnosis,FVC,FEV1,Performance,Pain,Haemoptysis,Dyspnoea,Cough,Weakness,Tumor_Size,Diabetes_Mellitus,MI_6mo,PAD,Smoking,Asthma,Age,Risk1YrDeath
0,2.0,8.0,6.0,1.0,0.0,0.0,0.0,1.0,1.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,3.0,4.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,3.0,6.0,8.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,9.0,0.0
3,3.0,8.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
4,3.0,4.0,6.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0


In [8]:
thoracic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 17 columns):
Diagnosis            470 non-null float64
FVC                  470 non-null float64
FEV1                 470 non-null float64
Performance          470 non-null float64
Pain                 470 non-null float64
Haemoptysis          470 non-null float64
Dyspnoea             470 non-null float64
Cough                470 non-null float64
Weakness             470 non-null float64
Tumor_Size           470 non-null float64
Diabetes_Mellitus    470 non-null float64
MI_6mo               470 non-null float64
PAD                  470 non-null float64
Smoking              470 non-null float64
Asthma               470 non-null float64
Age                  470 non-null float64
Risk1YrDeath         470 non-null float64
dtypes: float64(17)
memory usage: 62.5 KB


In [9]:
#Prepare Training and Testing Data
X_train, X_test, y_train, y_test = train_test_split(thoracic_df.iloc[:,:16],thoracic_df.iloc[:,16], test_size=0.33, random_state=0)

In [23]:
#k-Nearest neighbours classification
print("now ="+str(datetime.now()))
knn_model = KNeighborsClassifier(n_jobs=-1)
param_grid = {'n_neighbors':(np.arange(2,52,5))}
mdls = model_selection.GridSearchCV(knn_model, param_grid, verbose=1, cv=3, n_jobs=-1,iid=False).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

now =2019-11-12 10:52:31.524115
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=2, p=2,
                     weights='uniform')
0.8141025641025641
now =2019-11-12 10:52:32.813653


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.0s finished


In [21]:
#SVM classifier
print("now ="+str(datetime.now()))
svm_model = svm.SVC()
Kernels = ['linear', 'poly', 'rbf']
Gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'kernel':Kernels, 'gamma' : Gammas}
mdls = model_selection.GridSearchCV(svm_model, param_grid, verbose=1, cv=3, n_jobs=-1,iid=False).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

now =2019-11-12 10:51:42.320199
Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.8461538461538461
now =2019-11-12 10:51:46.077015


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    3.6s finished


In [12]:
#Decision tree classification
print("now ="+str(datetime.now()))
DTC_model = DecisionTreeClassifier(random_state=0)
Max_features = ['auto', 'sqrt', 'log2']
Min_samples_leafs = np.linspace(0.01, 0.05, 5, endpoint=True)
param_grid = {'max_features': Max_features, 'min_samples_leaf': Min_samples_leafs}
mdls = model_selection.GridSearchCV(DTC_model, param_grid, verbose=1,cv=3,n_jobs=-1,iid=False).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


now =2019-11-12 10:45:34.821910
Fitting 3 folds for each of 15 candidates, totalling 45 fits
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.05, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')
0.8461538461538461
now =2019-11-12 10:45:35.231641


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    0.3s finished


In [13]:
#Random forest classification
print("now ="+str(datetime.now()))
RFC_model = ensemble.RandomForestClassifier(random_state=0)
Estimators = np.arange(100,105,5)
Min_samples_leafs = np.linspace(0.01, 0.05, 5, endpoint=True)
Max_features = ['auto', 'sqrt', 'log2']
param_grid = {'n_estimators': Estimators,'max_features': Max_features, 'min_samples_leaf': Min_samples_leafs}
mdls = model_selection.GridSearchCV(RFC_model, param_grid, verbose=1,cv=3,n_jobs=-1,iid=False).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


now =2019-11-12 10:45:35.247358
Fitting 3 folds for each of 15 candidates, totalling 45 fits
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.01, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)
0.8461538461538461
now =2019-11-12 10:45:38.052775


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    2.6s finished


In [14]:
#AdaBoost classification
print("now ="+str(datetime.now()))
ABC_model = ensemble.AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=0),random_state=0)
Estimators = np.arange(50,110,10)
Learning_rates = [0.05,0.1,0.3,1]
param_grid = {'n_estimators': Estimators, 'learning_rate': Learning_rates}
mdls = model_selection.GridSearchCV(ABC_model, param_grid, verbose=1,cv=3,n_jobs=-1,iid=False).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


now =2019-11-12 10:45:38.079130
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.7s


AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=None,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=0,
                             

[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    4.6s finished


In [15]:
#Logistic regression
print("now ="+str(datetime.now()))
logistic_model = linear_model.LogisticRegression(n_jobs=-1,random_state=0)
param_grid = { "fit_intercept":[True], "solver":['newton-cg', 'lbfgs', 'saga'], 
             "max_iter":np.arange(100,400, 100)}
mdls = model_selection.GridSearchCV(logistic_model, param_grid, verbose=1,cv=3,n_jobs=-1,iid=False).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


now =2019-11-12 10:45:42.965637
Fitting 3 folds for each of 9 candidates, totalling 27 fits
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2', random_state=0,
                   solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)
0.8461538461538461
now =2019-11-12 10:45:44.265053


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    1.1s finished


In [16]:
#Gaussian naive Bayes classification
print("now ="+str(datetime.now()))
zero_prob = y_train[y_train == 0].shape[0]/y_train.shape[0]
one_prob = 1 - zero_prob
prob = np.array([zero_prob,one_prob])
GNB_model = GaussianNB(priors = prob)
GNB_model.fit(X_train, y_train)
# mdls = model_selection.GridSearchCV(GNB_model, param_grid, verbose=1,cv=5,, n_jobs=-1,iid=False).fit(X_train, y_train)
# print(mdls.best_estimator_)
y_pred = GNB_model.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

now =2019-11-12 10:45:44.286825
0.16025641025641027
now =2019-11-12 10:45:44.324521


In [19]:
#Neural network classification
print("now ="+str(datetime.now()))
NNC_model = MLPClassifier(max_iter=500)
Hidden_Layer_Sizes = [1, 5, 10, (5,5), (10,5)]
Learning_rates = ['constant','adaptive']
Learning_rates_init = [0.001, 0.01, 0.1]
Activations = ['logistic', 'tanh', 'relu']
Alphas = [0.0001,0.002]
param_grid = {'learning_rate': Learning_rates, 'learning_rate_init': Learning_rates_init, 'hidden_layer_sizes': Hidden_Layer_Sizes, 'activation': Activations, 'alpha': Alphas}
mdls = model_selection.GridSearchCV(NNC_model, param_grid, verbose=1,cv=3,n_jobs=-1,iid=False).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

now =2019-11-12 10:47:21.957083
Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 234 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:   26.4s finished


MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 5), learning_rate='adaptive',
              learning_rate_init=0.01, max_iter=500, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)
0.7692307692307693
now =2019-11-12 10:47:49.451072
