In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE 

from functools import partial




df = pd.read_csv('data.csv')
list = ['Unnamed: 32','id']
df.drop(list,axis = 1, inplace = True)

#Encoding categorical data values
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df['diagnosis'] = labelencoder.fit_transform(df['diagnosis'])


from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
df['radius_mean'] = ss.fit_transform(df['radius_mean'].values.reshape(-1,1))
df['texture_mean'] = ss.fit_transform(df['texture_mean'].values.reshape(-1,1))
df['perimeter_mean'] = ss.fit_transform(df['perimeter_mean'].values.reshape(-1,1))
df['area_mean'] = ss.fit_transform(df['area_mean'].values.reshape(-1,1))
df['perimeter_se'] = ss.fit_transform(df['perimeter_se'].values.reshape(-1,1))
df['area_se'] = ss.fit_transform(df['area_se'].values.reshape(-1,1))
df['radius_worst'] = ss.fit_transform(df['radius_worst'].values.reshape(-1,1))
df['texture_worst'] = ss.fit_transform(df['texture_worst'].values.reshape(-1,1))
df['perimeter_worst'] = ss.fit_transform(df['perimeter_worst'].values.reshape(-1,1))
df['area_worst'] = ss.fit_transform(df['area_worst'].values.reshape(-1,1))

df_1 = pd.DataFrame(df['diagnosis'])
df.drop(['diagnosis'], axis =1, inplace = True)

from sklearn.decomposition import PCA
pca = PCA()
df_pca = pd.DataFrame(pca.fit_transform(df), columns=df.columns)


X = df_pca
Y = df_1['diagnosis']




from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, 
                                                    stratify = Y)

sm = SMOTE(random_state=42, kind = 'borderline1')

X_resampled, Y_resampled = sm.fit_resample(X_train, Y_train)




In [2]:
from sklearn.linear_model import LogisticRegression
from bayes_opt import BayesianOptimization

def get_model(C):
    model = LogisticRegression(class_weight="balanced", solver="liblinear", random_state=42, C=C)

    # Train the model with the train dataset.
    model.fit(X_resampled, Y_resampled)

    # Evaluate the model with the eval dataset.
    score = model.score(X_test,Y_test)
    
    # Return the accuracy.

    return score

verbose = 1
fit_with_partial = partial(get_model)

# Bounded region of parameter space
pbounds = {'C': (1, 1.5)}

optimizer = BayesianOptimization(
    f=fit_with_partial,
    pbounds=pbounds,
    verbose=2,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
    random_state=1,
)

optimizer.maximize(n_iter=10)


for i, res in enumerate(optimizer.res):
    print("Iteration {}: \n\t{}".format(i, res))

print(optimizer.max)



|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.9825  [0m | [0m 1.209   [0m |
| [0m 2       [0m | [0m 0.9825  [0m | [0m 1.36    [0m |
| [0m 3       [0m | [0m 0.9825  [0m | [0m 1.0     [0m |
| [0m 4       [0m | [0m 0.9825  [0m | [0m 1.151   [0m |
| [0m 5       [0m | [0m 0.9825  [0m | [0m 1.073   [0m |
| [0m 6       [0m | [0m 0.9825  [0m | [0m 1.384   [0m |
| [0m 7       [0m | [0m 0.9825  [0m | [0m 1.161   [0m |
| [0m 8       [0m | [0m 0.9825  [0m | [0m 1.008   [0m |
| [0m 9       [0m | [0m 0.9825  [0m | [0m 1.262   [0m |
| [0m 10      [0m | [0m 0.9825  [0m | [0m 1.457   [0m |
| [0m 11      [0m | [0m 0.9825  [0m | [0m 1.18    [0m |
| [0m 12      [0m | [0m 0.9825  [0m | [0m 1.03    [0m |
| [0m 13      [0m | [0m 0.9825  [0m | [0m 1.327   [0m |
| [0m 14      [0m | [0m 0.9825  [0m | [0m 1.045   [0m |
| [0m 15      [0m | [0m 0.9825  [0m | [0m 1.019   

In [3]:
lr_model = LogisticRegression(class_weight="balanced", solver="liblinear", random_state=42, C=1.20)

In [5]:
from sklearn.metrics import accuracy_score, classification_report
lr_model.fit(X_resampled, Y_resampled)
y_pred = lr_model.predict(X_test)
accuracy_score(Y_test, y_pred)

0.9824561403508771

In [6]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        72
           1       0.95      1.00      0.98        42

   micro avg       0.98      0.98      0.98       114
   macro avg       0.98      0.99      0.98       114
weighted avg       0.98      0.98      0.98       114



In [8]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(Y_test, y_pred))

[[70  2]
 [ 0 42]]


In [9]:
lr_model.score(X_test, Y_test)

0.9824561403508771

In [10]:
lr_model.score(X_resampled, Y_resampled)

0.9649122807017544

In [11]:
from sklearn.tree import DecisionTreeClassifier

def get_model1(max_depth,min_samples_split):
    model = DecisionTreeClassifier(class_weight="balanced", max_depth = max_depth,
                                   min_samples_split = min_samples_split)

    # Train the model with the train dataset.
    model.fit(X_resampled, Y_resampled)

    # Evaluate the model with the eval dataset.
    score = model.score(X_test,Y_test)
    
    # Return the accuracy.

    return score

verbose = 1
fit_with_partial = partial(get_model1)

# Bounded region of parameter space
pbounds = {"max_depth": (3, 7), "min_samples_split": (0.1, 0.9)}

optimizer1 = BayesianOptimization(
    f=fit_with_partial,
    pbounds=pbounds,
    verbose=2,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
    random_state=1,
)

optimizer1.maximize(n_iter=10)


for i, res in enumerate(optimizer1.res):
    print("Iteration {}: \n\t{}".format(i, res))

print(optimizer1.max)




|   iter    |  target   | max_depth | min_sa... |
-------------------------------------------------
| [0m 1       [0m | [0m 0.9123  [0m | [0m 4.668   [0m | [0m 0.6763  [0m |
| [0m 2       [0m | [0m 0.886   [0m | [0m 3.0     [0m | [0m 0.3419  [0m |
| [95m 3       [0m | [95m 0.9474  [0m | [95m 3.587   [0m | [95m 0.1739  [0m |
| [0m 4       [0m | [0m 0.886   [0m | [0m 3.745   [0m | [0m 0.3764  [0m |
| [0m 5       [0m | [0m 0.9123  [0m | [0m 4.587   [0m | [0m 0.5311  [0m |
| [0m 6       [0m | [0m 0.9298  [0m | [0m 7.0     [0m | [0m 0.1     [0m |
| [0m 7       [0m | [0m 0.9123  [0m | [0m 7.0     [0m | [0m 0.9     [0m |
| [0m 8       [0m | [0m 0.9298  [0m | [0m 5.86    [0m | [0m 0.1     [0m |
| [0m 9       [0m | [0m 0.9123  [0m | [0m 5.864   [0m | [0m 0.9     [0m |
| [0m 10      [0m | [0m 0.9474  [0m | [0m 3.0     [0m | [0m 0.1     [0m |
| [0m 11      [0m | [0m 0.9298  [0m | [0m 4.998   [0m | [0m 0.1     

In [13]:
dt_model = DecisionTreeClassifier(class_weight="balanced", max_depth = 3.5,
                                   min_samples_split = 0.17)



In [14]:
from sklearn.metrics import accuracy_score, classification_report
dt_model.fit(X_resampled, Y_resampled)
y_pred = dt_model.predict(X_test)
accuracy_score(Y_test, y_pred)

0.9473684210526315

In [15]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.93      0.96        72
           1       0.89      0.98      0.93        42

   micro avg       0.95      0.95      0.95       114
   macro avg       0.94      0.95      0.94       114
weighted avg       0.95      0.95      0.95       114



In [16]:
print(confusion_matrix(Y_test, y_pred))

[[67  5]
 [ 1 41]]


In [17]:
dt_model.score(X_test, Y_test)

0.9473684210526315

In [18]:
dt_model.score(X_resampled, Y_resampled)

0.9491228070175438

In [19]:
from sklearn.ensemble import RandomForestClassifier



def get_model2(max_depth,min_samples_split):
    model = RandomForestClassifier(class_weight="balanced", n_estimators=100, n_jobs=-1,max_depth = max_depth,
                                   min_samples_split = min_samples_split )

    # Train the model with the train dataset.
    model.fit(X_resampled, Y_resampled)

    # Evaluate the model with the eval dataset.
    score = model.score(X_test,Y_test)
    
    # Return the accuracy.

    return score

verbose = 1
fit_with_partial = partial(get_model2)

# Bounded region of parameter space
pbounds = {"max_depth": (10, 15), "min_samples_split": (0.1, 0.9)}

optimizer2 = BayesianOptimization(
    f=fit_with_partial,
    pbounds=pbounds,
    verbose=2,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
    random_state=1,
)

optimizer2.maximize(n_iter=10)


for i, res in enumerate(optimizer2.res):
    print("Iteration {}: \n\t{}".format(i, res))

print(optimizer1.max)



|   iter    |  target   | max_depth | min_sa... |
-------------------------------------------------
| [0m 1       [0m | [0m 0.6316  [0m | [0m 12.09   [0m | [0m 0.6763  [0m |
| [95m 2       [0m | [95m 0.9561  [0m | [95m 10.0    [0m | [95m 0.3419  [0m |
| [0m 3       [0m | [0m 0.9386  [0m | [0m 10.73   [0m | [0m 0.1739  [0m |
| [0m 4       [0m | [0m 0.9211  [0m | [0m 10.93   [0m | [0m 0.3764  [0m |
| [0m 5       [0m | [0m 0.886   [0m | [0m 11.98   [0m | [0m 0.5311  [0m |
| [0m 6       [0m | [0m 0.9386  [0m | [0m 15.0    [0m | [0m 0.1     [0m |
| [0m 7       [0m | [0m 0.9561  [0m | [0m 13.49   [0m | [0m 0.1     [0m |
| [0m 8       [0m | [0m 0.6316  [0m | [0m 15.0    [0m | [0m 0.9     [0m |
| [95m 9       [0m | [95m 0.9649  [0m | [95m 12.39   [0m | [95m 0.1     [0m |
| [0m 10      [0m | [0m 0.9474  [0m | [0m 11.56   [0m | [0m 0.1     [0m |
| [0m 11      [0m | [0m 0.6316  [0m | [0m 10.0    [0m | [0m 0.9  

In [20]:
rf_model = RandomForestClassifier(class_weight="balanced", n_estimators=100, n_jobs=-1,max_depth = 3.5,
                                   min_samples_split = 0.17)


In [21]:
rf_model.fit(X_resampled, Y_resampled)
y_pred = rf_model.predict(X_test)
accuracy_score(Y_test, y_pred)

0.9385964912280702

In [22]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.92      0.95        72
           1       0.87      0.98      0.92        42

   micro avg       0.94      0.94      0.94       114
   macro avg       0.93      0.95      0.94       114
weighted avg       0.94      0.94      0.94       114



In [23]:
print(confusion_matrix(Y_test, y_pred))

[[66  6]
 [ 1 41]]


In [24]:
rf_model.score(X_test, Y_test)

0.9385964912280702

In [25]:
rf_model.score(X_resampled, Y_resampled)

0.956140350877193