In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV


from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv('data.csv')
list = ['Unnamed: 32','id']
df.drop(list,axis = 1, inplace = True)

#Encoding categorical data values
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df['diagnosis'] = labelencoder.fit_transform(df['diagnosis'])

from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
df['radius_mean'] = ss.fit_transform(df['radius_mean'].values.reshape(-1,1))
df['texture_mean'] = ss.fit_transform(df['texture_mean'].values.reshape(-1,1))
df['perimeter_mean'] = ss.fit_transform(df['perimeter_mean'].values.reshape(-1,1))
df['area_mean'] = ss.fit_transform(df['area_mean'].values.reshape(-1,1))
df['perimeter_se'] = ss.fit_transform(df['perimeter_se'].values.reshape(-1,1))
df['area_se'] = ss.fit_transform(df['area_se'].values.reshape(-1,1))
df['radius_worst'] = ss.fit_transform(df['radius_worst'].values.reshape(-1,1))
df['texture_worst'] = ss.fit_transform(df['texture_worst'].values.reshape(-1,1))
df['perimeter_worst'] = ss.fit_transform(df['perimeter_worst'].values.reshape(-1,1))
df['area_worst'] = ss.fit_transform(df['area_worst'].values.reshape(-1,1))

df_1 = pd.DataFrame(df['diagnosis'])
df.drop(['diagnosis'], axis =1, inplace = True)
df_2 = pd.DataFrame(df[['radius_mean', 'texture_mean', 'perimeter_mean', 
                        'smoothness_mean', 'area_mean', 'concavity_mean', 'compactness_mean', 
                        'texture_se', 'area_se', 'fractal_dimension_mean']])


from sklearn.decomposition import PCA
pca = PCA()
df_pca = pd.DataFrame(pca.fit_transform(df_2), columns=df_2.columns)
print(df_pca.head())


X = df_pca.values
Y = df_1['diagnosis'].values

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, 
                                                    stratify = Y)


sm = SMOTE(random_state=42, kind = 'borderline1')

X_resampled, Y_resampled = sm.fit_resample(X_train, Y_train)

# Logistic Regression
lr_model = Pipeline([("model", LogisticRegression(class_weight="balanced", solver="liblinear", random_state=42))])

# Decision Tree
dt_model = Pipeline([("model", DecisionTreeClassifier(class_weight="balanced"))])

# Random Forest
rf_model = Pipeline([("model", RandomForestClassifier(class_weight="balanced", n_estimators=100, n_jobs=-1))])

   radius_mean  texture_mean  perimeter_mean  smoothness_mean  area_mean  \
0     2.327415     -2.569512        1.486073         0.159885  -0.435486   
1     2.980117     -1.146063       -0.663482        -0.220002   0.243566   
2     2.988588     -0.332936       -0.294597         0.230955  -0.013079   
3    -1.135178      0.474079        0.131484         0.323124  -0.221959   
4     2.968990     -1.890360       -0.142690        -0.293323   0.052689   

   concavity_mean  compactness_mean  texture_se   area_se  \
0        0.034616          0.002777    0.003918  0.020190   
1       -0.035380          0.003260    0.016865  0.009204   
2        0.018910          0.016429    0.015743 -0.001196   
3        0.221830         -0.004916    0.056660  0.014218   
4        0.051094          0.004643   -0.026062 -0.010812   

   fractal_dimension_mean  
0                0.000661  
1                0.005190  
2               -0.002060  
3                0.004241  
4                0.001895  


Logistic Regression

In [2]:
gs = RandomizedSearchCV(lr_model, {"model__C": [1, 1.3, 1.5]}, n_jobs=-1, cv=5, scoring="f1")
gs.fit(X_resampled, Y_resampled)



RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('model', LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'model__C': [1, 1.3, 1.5]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='f1', verbose=0)

In [3]:
print(gs.best_params_)
print(gs.best_score_)

{'model__C': 1.3}
0.8940587197910107


In [4]:
lr_model.set_params(**gs.best_params_)

Pipeline(memory=None,
     steps=[('model', LogisticRegression(C=1.3, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [5]:
lr_model.get_params("model")

{'memory': None,
 'steps': [('model',
   LogisticRegression(C=1.3, class_weight='balanced', dual=False,
             fit_intercept=True, intercept_scaling=1, max_iter=100,
             multi_class='warn', n_jobs=None, penalty='l2', random_state=42,
             solver='liblinear', tol=0.0001, verbose=0, warm_start=False))],
 'model': LogisticRegression(C=1.3, class_weight='balanced', dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2', random_state=42,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
 'model__C': 1.3,
 'model__class_weight': 'balanced',
 'model__dual': False,
 'model__fit_intercept': True,
 'model__intercept_scaling': 1,
 'model__max_iter': 100,
 'model__multi_class': 'warn',
 'model__n_jobs': None,
 'model__penalty': 'l2',
 'model__random_state': 42,
 'model__solver': 'liblinear',
 'model__tol': 0.0001,
 'model__verbose': 0,
 'model__warm_start': False}

In [6]:
lr_model.fit(X_resampled, Y_resampled)

Pipeline(memory=None,
     steps=[('model', LogisticRegression(C=1.3, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [7]:
y_pred = lr_model.predict(X_test)

In [8]:
accuracy_score(Y_test, y_pred)

0.8771929824561403

In [9]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.88      0.90        72
           1       0.80      0.88      0.84        42

   micro avg       0.88      0.88      0.88       114
   macro avg       0.87      0.88      0.87       114
weighted avg       0.88      0.88      0.88       114



In [10]:
print(confusion_matrix(Y_test, y_pred))

[[63  9]
 [ 5 37]]


In [11]:
lr_model.score(X_test, Y_test)

0.8771929824561403

In [12]:
lr_model.score(X_resampled, Y_resampled)

0.9105263157894737

Decision Tree

In [13]:
gs = RandomizedSearchCV(dt_model, {"model__max_depth": [3, 5, 7], 
                             "model__min_samples_split": [2, 5]}, 
                  n_jobs=-1, cv=5, scoring="f1")

gs.fit(X_resampled, Y_resampled)



RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('model', DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'model__max_depth': [3, 5, 7], 'model__min_samples_split': [2, 5]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='f1', verbose=0)

In [14]:
print(gs.best_params_)
print(gs.best_score_)

{'model__min_samples_split': 2, 'model__max_depth': 5}
0.903429820448073


In [15]:
dt_model.set_params(**gs.best_params_)

Pipeline(memory=None,
     steps=[('model', DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [16]:
dt_model.get_params("model")

{'memory': None,
 'steps': [('model',
   DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=5,
               max_features=None, max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, presort=False, random_state=None,
               splitter='best'))],
 'model': DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=5,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'),
 'model__class_weight': 'balanced',
 'model__criterion': 'gini',
 'model__max_depth': 5,
 'model__max_features': None,
 'model__max_leaf_nodes': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_impurity_split': 

In [17]:
dt_model.fit(X_resampled, Y_resampled)
y_pred = dt_model.predict(X_test)

accuracy_score(Y_test, y_pred)

0.8771929824561403

In [18]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.88      0.90        72
           1       0.80      0.88      0.84        42

   micro avg       0.88      0.88      0.88       114
   macro avg       0.87      0.88      0.87       114
weighted avg       0.88      0.88      0.88       114



In [19]:
print(confusion_matrix(Y_test, y_pred))

[[63  9]
 [ 5 37]]


In [20]:
dt_model.score(X_test, Y_test)

0.8771929824561403

In [21]:
dt_model.score(X_resampled, Y_resampled)

0.9842105263157894

Random Forest

In [22]:
gs = RandomizedSearchCV(rf_model, {"model__max_depth": [10, 15], 
                             "model__min_samples_split": [5, 10]}, 
                  n_jobs=-1, cv=5, scoring="f1")

gs.fit(X_resampled, Y_resampled)



RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('model', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'model__max_depth': [10, 15], 'model__min_samples_split': [5, 10]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='f1', verbose=0)

In [23]:
print(gs.best_params_)
print(gs.best_score_)

{'model__min_samples_split': 10, 'model__max_depth': 10}
0.9438393515106853


In [24]:
rf_model.set_params(**gs.best_params_)

Pipeline(memory=None,
     steps=[('model', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False))])

In [25]:
rf_model.get_params("model")

{'memory': None,
 'steps': [('model',
   RandomForestClassifier(bootstrap=True, class_weight='balanced',
               criterion='gini', max_depth=10, max_features='auto',
               max_leaf_nodes=None, min_impurity_decrease=0.0,
               min_impurity_split=None, min_samples_leaf=1,
               min_samples_split=10, min_weight_fraction_leaf=0.0,
               n_estimators=100, n_jobs=-1, oob_score=False,
               random_state=None, verbose=0, warm_start=False))],
 'model': RandomForestClassifier(bootstrap=True, class_weight='balanced',
             criterion='gini', max_depth=10, max_features='auto',
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=10, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_jobs=-1, oob_score=False,
             random_state=None, verbose=0, warm_start=False),
 'model__bootstrap': True,
 'model__class_weight': 'balanced',
 '

In [26]:
rf_model.fit(X_resampled, Y_resampled)
y_pred = rf_model.predict(X_test)
accuracy_score(Y_test, y_pred)

0.9035087719298246

In [27]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.89      0.92        72
           1       0.83      0.93      0.88        42

   micro avg       0.90      0.90      0.90       114
   macro avg       0.89      0.91      0.90       114
weighted avg       0.91      0.90      0.90       114



In [28]:
print(confusion_matrix(Y_test, y_pred))

[[64  8]
 [ 3 39]]


In [29]:
rf_model.score(X_test, Y_test)

0.9035087719298246

In [30]:
rf_model.score(X_resampled, Y_resampled)

0.9964912280701754