In [1]:

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report


from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier




In [2]:
df = pd.read_csv('data.csv')
list = ['Unnamed: 32','id']
df.drop(list,axis = 1, inplace = True)

#Encoding categorical data values
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df['diagnosis'] = labelencoder.fit_transform(df['diagnosis'])

from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
df['radius_mean'] = ss.fit_transform(df['radius_mean'].values.reshape(-1,1))
df['texture_mean'] = ss.fit_transform(df['texture_mean'].values.reshape(-1,1))
df['perimeter_mean'] = ss.fit_transform(df['perimeter_mean'].values.reshape(-1,1))
df['area_mean'] = ss.fit_transform(df['area_mean'].values.reshape(-1,1))
df['perimeter_se'] = ss.fit_transform(df['perimeter_se'].values.reshape(-1,1))
df['area_se'] = ss.fit_transform(df['area_se'].values.reshape(-1,1))
df['radius_worst'] = ss.fit_transform(df['radius_worst'].values.reshape(-1,1))
df['texture_worst'] = ss.fit_transform(df['texture_worst'].values.reshape(-1,1))
df['perimeter_worst'] = ss.fit_transform(df['perimeter_worst'].values.reshape(-1,1))
df['area_worst'] = ss.fit_transform(df['area_worst'].values.reshape(-1,1))

df_1 = pd.DataFrame(df['diagnosis'])
df.drop(['diagnosis'], axis =1, inplace = True)
df_2 = pd.DataFrame(df[['radius_mean', 'texture_mean', 'perimeter_mean', 
                        'smoothness_mean', 'area_mean', 'concavity_mean', 'compactness_mean', 
                        'texture_se', 'area_se', 'fractal_dimension_mean']])


from sklearn.decomposition import PCA
pca = PCA()
df_pca = pd.DataFrame(pca.fit_transform(df_2), columns=df_2.columns)
print(df_pca.head())

   radius_mean  texture_mean  perimeter_mean  smoothness_mean  area_mean  \
0     2.327415     -2.569512        1.486073         0.159885  -0.435486   
1     2.980117     -1.146063       -0.663482        -0.220002   0.243566   
2     2.988588     -0.332936       -0.294597         0.230955  -0.013079   
3    -1.135178      0.474079        0.131484         0.323124  -0.221959   
4     2.968990     -1.890360       -0.142690        -0.293323   0.052689   

   concavity_mean  compactness_mean  texture_se   area_se  \
0        0.034616          0.002777    0.003918  0.020190   
1       -0.035380          0.003260    0.016865  0.009204   
2        0.018910          0.016429    0.015743 -0.001196   
3        0.221830         -0.004916    0.056660  0.014218   
4        0.051094          0.004643   -0.026062 -0.010812   

   fractal_dimension_mean  
0                0.000661  
1                0.005190  
2               -0.002060  
3                0.004241  
4                0.001895  


In [3]:
X = df_pca
Y = df_1['diagnosis']

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, 
                                                    stratify = Y)



In [4]:
# Logistic Regression
lr_model = Pipeline([("model", LogisticRegression(class_weight="balanced", solver="liblinear", random_state=42))])

# Decision Tree
dt_model = Pipeline([("model", DecisionTreeClassifier(class_weight="balanced"))])

# Random Forest
rf_model = Pipeline([("model", RandomForestClassifier(class_weight="balanced", n_estimators=100, n_jobs=-1))])

# XGBoost
xgb_model = Pipeline([ # Add a scale_pos_weight to make it balanced
                      ("model", XGBClassifier(scale_pos_weight=(1 - Y.mean()), n_jobs=-1))])

Logistic Regression 

In [5]:
gs = GridSearchCV(lr_model, {"model__C": [1, 1.3, 1.5]}, n_jobs=-1, cv=5, scoring="accuracy")
gs.fit(X_train, Y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('model', LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'model__C': [1, 1.3, 1.5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='accuracy',
       verbose=0)

In [6]:
print(gs.best_params_)
print(gs.best_score_)

{'model__C': 1.3}
0.9142857142857143


In [7]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42, kind = 'borderline1')

X_resampled, Y_resampled = sm.fit_resample(X_train, Y_train)

In [8]:
gs = GridSearchCV(lr_model, {"model__C": [1, 1.3, 1.5]}, n_jobs=-1, cv=5, scoring="accuracy")
gs.fit(X_resampled, Y_resampled)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('model', LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'model__C': [1, 1.3, 1.5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='accuracy',
       verbose=0)

In [9]:
print(gs.best_params_)
print(gs.best_score_)

{'model__C': 1.3}
0.8807017543859649


In [10]:
lr_model.set_params(**gs.best_params_)

Pipeline(memory=None,
     steps=[('model', LogisticRegression(C=1.3, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [11]:
lr_model.get_params("model")

{'memory': None,
 'steps': [('model',
   LogisticRegression(C=1.3, class_weight='balanced', dual=False,
             fit_intercept=True, intercept_scaling=1, max_iter=100,
             multi_class='warn', n_jobs=None, penalty='l2', random_state=42,
             solver='liblinear', tol=0.0001, verbose=0, warm_start=False))],
 'model': LogisticRegression(C=1.3, class_weight='balanced', dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2', random_state=42,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
 'model__C': 1.3,
 'model__class_weight': 'balanced',
 'model__dual': False,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__max_iter': 100,
 'model__multi_class': 'warn',
 'model__n_jobs': None,
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__solver': 'liblinear',
 'model__tol': 0.0001,
 'model__verbose': 0,
 'model__warm_start': False}

In [12]:
lr_model.fit(X_resampled, Y_resampled)

Pipeline(memory=None,
     steps=[('model', LogisticRegression(C=1.3, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [13]:
y_pred = lr_model.predict(X_test)

In [14]:
accuracy_score(Y_test, y_pred)

0.9035087719298246

In [15]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.86      0.92        72
           1       0.80      0.98      0.88        42

   micro avg       0.90      0.90      0.90       114
   macro avg       0.89      0.92      0.90       114
weighted avg       0.92      0.90      0.90       114



In [16]:
import eli5
eli5.show_weights(lr_model.named_steps["model"])

Weight?,Feature
4.814,x5
2.625,x0
2.321,x3
1.107,x2
0.761,<BIAS>
0.571,x7
0.415,x6
0.277,x1
-0.049,x9
-0.312,x8


In [17]:
feature_names = ['radius_mean', 'texture_mean', 'perimeter_mean', 'smoothness_mean', 
                 'area_mean', 'concavity_mean', 'compactness_mean', 
                 'texture_se', 'area_se', 'fractal_dimension_mean']

eli5.show_weights(lr_model.named_steps["model"], feature_names=feature_names)

Weight?,Feature
4.814,concavity_mean
2.625,radius_mean
2.321,smoothness_mean
1.107,perimeter_mean
0.761,<BIAS>
0.571,texture_se
0.415,compactness_mean
0.277,texture_mean
-0.049,fractal_dimension_mean
-0.312,area_se


In [18]:
i = 2
X_test.iloc[[i]]

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,smoothness_mean,area_mean,concavity_mean,compactness_mean,texture_se,area_se,fractal_dimension_mean
256,3.548776,1.38265,-0.217227,0.451982,-0.090403,0.045625,-0.086096,-0.044711,-0.001226,-0.000537


In [19]:
i = 4
X_test.iloc[[i]]

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,smoothness_mean,area_mean,concavity_mean,compactness_mean,texture_se,area_se,fractal_dimension_mean
136,-1.226457,0.099682,0.998848,-0.976663,0.004055,-0.060597,0.003884,0.003109,-0.010959,-0.003062


In [20]:
Y_test.iloc[i]

0

In [21]:
eli5.show_prediction(lr_model.named_steps["model"], X_test.iloc[7],
                     feature_names=feature_names, show_feature_values=True)

Contribution?,Feature,Value
6.123,radius_mean,2.332
1.941,smoothness_mean,0.836
0.761,<BIAS>,1.0
0.523,perimeter_mean,0.472
0.415,area_mean,-0.093
0.353,texture_mean,1.274
0.009,texture_se,0.016
0.002,compactness_mean,0.005
0.0,area_se,-0.0
-0.0,fractal_dimension_mean,0.005


Decision Tree

In [22]:
gs = GridSearchCV(dt_model, {"model__max_depth": [3, 5, 7], 
                             "model__min_samples_split": [2, 5]}, 
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_train, Y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('model', DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'model__max_depth': [3, 5, 7], 'model__min_samples_split': [2, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [23]:
print(gs.best_params_)
print(gs.best_score_)

{'model__max_depth': 5, 'model__min_samples_split': 5}
0.9054945054945055


In [24]:
gs = GridSearchCV(dt_model, {"model__max_depth": [3, 5, 7], 
                             "model__min_samples_split": [2, 5]}, 
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_resampled, Y_resampled)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('model', DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'model__max_depth': [3, 5, 7], 'model__min_samples_split': [2, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [25]:
print(gs.best_params_)
print(gs.best_score_)

{'model__max_depth': 5, 'model__min_samples_split': 2}
0.9192982456140351


In [26]:
dt_model.set_params(**gs.best_params_)

Pipeline(memory=None,
     steps=[('model', DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [27]:
dt_model.fit(X_resampled, Y_resampled)
y_pred = dt_model.predict(X_test)

accuracy_score(Y_test, y_pred)

0.8771929824561403

In [28]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.92      0.90        72
           1       0.85      0.81      0.83        42

   micro avg       0.88      0.88      0.88       114
   macro avg       0.87      0.86      0.87       114
weighted avg       0.88      0.88      0.88       114



In [29]:
eli5.show_weights(dt_model.named_steps["model"], feature_names=feature_names)

Weight,Feature
0.6122,radius_mean
0.1492,area_mean
0.0929,concavity_mean
0.0378,smoothness_mean
0.0264,area_se
0.0261,perimeter_mean
0.0257,compactness_mean
0.0153,texture_mean
0.0143,fractal_dimension_mean
0.0,texture_se


In [30]:
eli5.show_prediction(dt_model.named_steps["model"], 
                     X_test.iloc[1],
                     feature_names=feature_names, show_feature_values=True)

Contribution?,Feature,Value
0.5,<BIAS>,1.0
0.356,radius_mean,-1.321
0.11,area_mean,0.048
0.012,perimeter_mean,0.033
0.008,area_se,0.004


In [31]:
eli5.show_prediction(dt_model.named_steps["model"], 
                     X_test.iloc[7],
                     feature_names=feature_names, show_feature_values=True)

Contribution?,Feature,Value
0.734,radius_mean,2.332
0.5,<BIAS>,1.0
0.071,smoothness_mean,0.836
-0.306,concavity_mean,-0.128


Random Forest

In [32]:
gs = GridSearchCV(rf_model, {"model__max_depth": [10, 15], 
                             "model__min_samples_split": [5, 10]}, 
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_train, Y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('model', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'model__max_depth': [10, 15], 'model__min_samples_split': [5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [33]:
print(gs.best_params_)
print(gs.best_score_)

{'model__max_depth': 10, 'model__min_samples_split': 5}
0.9296703296703297


In [34]:
gs = GridSearchCV(rf_model, {"model__max_depth": [10, 15], 
                             "model__min_samples_split": [5, 10]}, 
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_resampled, Y_resampled)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('model', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'model__max_depth': [10, 15], 'model__min_samples_split': [5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [35]:
print(gs.best_params_)
print(gs.best_score_)

{'model__max_depth': 10, 'model__min_samples_split': 5}
0.9385964912280702


In [36]:
rf_model.set_params(**gs.best_params_)

Pipeline(memory=None,
     steps=[('model', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False))])

In [37]:
rf_model.fit(X_resampled, Y_resampled)
y_pred = rf_model.predict(X_test)
accuracy_score(Y_test, y_pred)

0.9385964912280702

In [38]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95        72
           1       0.91      0.93      0.92        42

   micro avg       0.94      0.94      0.94       114
   macro avg       0.93      0.94      0.93       114
weighted avg       0.94      0.94      0.94       114



In [39]:
eli5.show_weights(rf_model.named_steps["model"], 
                  feature_names=feature_names)

Weight,Feature
0.4477  ± 0.3187,radius_mean
0.1886  ± 0.2602,area_mean
0.0866  ± 0.1131,concavity_mean
0.0701  ± 0.1307,perimeter_mean
0.0468  ± 0.0757,compactness_mean
0.0407  ± 0.0673,texture_se
0.0327  ± 0.0448,area_se
0.0319  ± 0.0442,texture_mean
0.0317  ± 0.0493,smoothness_mean
0.0233  ± 0.0414,fractal_dimension_mean


In [40]:
eli5.show_prediction(rf_model.named_steps["model"], 
                     X_test.iloc[1],
                     feature_names=feature_names, show_feature_values=True)

Contribution?,Feature,Value
0.498,<BIAS>,1.0
0.247,radius_mean,-1.321
0.123,area_mean,0.048
0.053,concavity_mean,-0.052
0.028,perimeter_mean,0.033
0.027,texture_se,-0.013
0.012,texture_mean,-0.395
0.008,area_se,0.004
0.006,compactness_mean,0.012
0.002,fractal_dimension_mean,0.0


In [41]:
eli5.show_prediction(rf_model.named_steps["model"], 
                     X_test.iloc[7],
                     feature_names=feature_names, show_feature_values=True)

Contribution?,Feature,Value
0.502,<BIAS>,1.0
0.315,radius_mean,2.332
0.17,area_mean,-0.093
0.031,texture_se,0.016
0.029,texture_mean,1.274
0.024,compactness_mean,0.005
0.019,smoothness_mean,0.836
0.009,area_se,-0.0
-0.0,perimeter_mean,0.472
-0.066,fractal_dimension_mean,0.005


Extreme Gradient Boosting

In [42]:
gs = GridSearchCV(xgb_model, {"model__max_depth": [5, 10],
                              "model__min_child_weight": [5, 10],
                              "model__n_estimators": [25]},
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_train, Y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('model', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=0.6274165202108963, seed=None, silent=True,
       subsample=1))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'model__max_depth': [5, 10], 'model__min_child_weight': [5, 10], 'model__n_estimators': [25]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [43]:
print(gs.best_params_)
print(gs.best_score_)

{'model__max_depth': 5, 'model__min_child_weight': 5, 'model__n_estimators': 25}
0.9274725274725275


In [44]:
gs = GridSearchCV(xgb_model, {"model__max_depth": [5, 10],
                              "model__min_child_weight": [5, 10],
                              "model__n_estimators": [25]},
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_resampled, Y_resampled)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('model', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=0.6274165202108963, seed=None, silent=True,
       subsample=1))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'model__max_depth': [5, 10], 'model__min_child_weight': [5, 10], 'model__n_estimators': [25]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [45]:
print(gs.best_params_)
print(gs.best_score_)

{'model__max_depth': 5, 'model__min_child_weight': 5, 'model__n_estimators': 25}
0.9403508771929825


In [46]:
xgb_model.set_params(**gs.best_params_)
xgb_model.fit(X_train, Y_train)

Pipeline(memory=None,
     steps=[('model', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=5, missing=None, n_estimators=25,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=0.6274165202108963, seed=None, silent=True,
       subsample=1))])

In [47]:
y_pred = xgb_model.predict(X_test)
xgb_model.score(X_test, Y_test)

0.9298245614035088

In [48]:
print(classification_report(Y_test, y_pred))


              precision    recall  f1-score   support

           0       0.91      0.99      0.95        72
           1       0.97      0.83      0.90        42

   micro avg       0.93      0.93      0.93       114
   macro avg       0.94      0.91      0.92       114
weighted avg       0.93      0.93      0.93       114

