# Import libraries and load dataset

In [71]:
# Step 1: Import libraries and load data
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('wdbc-data.csv', header=None)

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Assign a title to each column of the datasets

In [4]:
# Step 2: Assign a title to each column of the dataset
column_names = ['id', 'diagnosis', 'mean_radius', 'mean_texture', 'mean_perimeter',
                'mean_area', 'mean_smoothness', 'mean_compactness', 'mean_concavity',
                'mean_concave_points', 'mean_symmetry', 'mean_fractal_dimension',
                'se_radius', 'se_texture', 'se_perimeter', 'se_area', 'se_smoothness',
                'se_compactness', 'se_concavity', 'se_concave_points', 'se_symmetry',
                'se_fractal_dimension', 'worst_radius', 'worst_texture', 'worst_perimeter',
                'worst_area', 'worst_smoothness', 'worst_compactness', 'worst_concavity',
                'worst_concave_points', 'worst_symmetry', 'worst_fractal_dimension']
df.columns = column_names

In [8]:
df.head()

Unnamed: 0,id,diagnosis,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [18]:
# Step 3: Data preprocessing (Min-max Normalization and Feature selection using selectkbest)
X = df.drop(['id', 'diagnosis'], axis=1)
y = df['diagnosis'].map({'B': 0, 'M': 1})  # Convert B to 0 and M to 1

In [19]:
# Min-max normalization
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

In [110]:
# Feature selection using SelectKBest with chi2 scoring function
k_best_features = 20
feature_selector = SelectKBest(chi2, k=k_best_features)
X_selected = feature_selector.fit_transform(X_normalized, y)

In [111]:
# Get the selected feature indices
selected_feature_indices = feature_selector.get_support(indices=True)

# Get the names of the selected features
selected_feature_names = X.columns[selected_feature_indices]

# Print the names of the selected features
print("Selected Features:")
print(selected_feature_names)

Selected Features:
Index(['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area',
       'mean_compactness', 'mean_concavity', 'mean_concave_points',
       'se_radius', 'se_perimeter', 'se_area', 'se_concave_points',
       'worst_radius', 'worst_texture', 'worst_perimeter', 'worst_area',
       'worst_smoothness', 'worst_compactness', 'worst_concavity',
       'worst_concave_points', 'worst_symmetry'],
      dtype='object')


In [112]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [113]:
# Step 5: Training using Logistic Regression with L2 regularization to prevent overfitting
logreg_model = LogisticRegression(penalty='l2')
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)

In [114]:
# Step 6: Training using Random Forest with max_depth parameter to prevent overfitting
rf_model = RandomForestClassifier(max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [115]:
# Step 6: Training using Naïve Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

In [116]:
# Step 8: Model evaluation based on prediction
# Calculate accuracy scores and classification reports
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_nb = accuracy_score(y_test, y_pred_nb)

classification_report_logreg = classification_report(y_test, y_pred_logreg)
classification_report_rf = classification_report(y_test, y_pred_rf)
classification_report_nb = classification_report(y_test, y_pred_nb)

print("Logistic Regression Accuracy:", accuracy_logreg)
print("Logistic Regression Classification Report:")
print(classification_report_logreg)

print("Random Forest Accuracy:", accuracy_rf)
print("Random Forest Classification Report:")
print(classification_report_rf)

print("Naïve Bayes Accuracy:", accuracy_nb)
print("Naïve Bayes Classification Report:")
print(classification_report_nb)

Logistic Regression Accuracy: 0.9736842105263158
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        71
           1       1.00      0.93      0.96        43

    accuracy                           0.97       114
   macro avg       0.98      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

Random Forest Accuracy: 0.9649122807017544
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97        71
           1       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

Naïve Bayes Accuracy: 0.9649122807017544
Naïve Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97

In [117]:
# Step 10: Apply k-fold Cross-Validation to check generalization performance
cv_scores_logreg = cross_val_score(logreg_model, X_normalized, y, cv=5)
cv_scores_rf = cross_val_score(rf_model, X_normalized, y, cv=5)
cv_scores_nb = cross_val_score(nb_model, X_normalized, y, cv=5)

print("Cross-Validation Scores (Logistic Regression):", cv_scores_logreg)
print("Cross-Validation Scores (Random Forest):", cv_scores_rf)
print("Cross-Validation Scores (Naïve Bayes):", cv_scores_nb)
print("Mean Cross-Validation Score (Logistic Regression):", np.mean(cv_scores_logreg))
print("Mean Cross-Validation Score (Random Forest):", np.mean(cv_scores_rf))
print("Mean Cross-Validation Score (Naïve Bayes):", np.mean(cv_scores_nb))

Cross-Validation Scores (Logistic Regression): [0.95614035 0.96491228 0.97368421 0.95614035 0.96460177]
Cross-Validation Scores (Random Forest): [0.92105263 0.93859649 0.98245614 0.96491228 0.96460177]
Cross-Validation Scores (Naïve Bayes): [0.90350877 0.9122807  0.95614035 0.94736842 0.92035398]
Mean Cross-Validation Score (Logistic Regression): 0.9630957925787922
Mean Cross-Validation Score (Random Forest): 0.9543238627542306
Mean Cross-Validation Score (Naïve Bayes): 0.927930445582984


In [86]:
# Data Augmentation: Add random noise to feature values
num_augmented_samples = 1000  # Number of synthetic samples to generate
augmented_data = []
for _ in range(num_augmented_samples):
    augmented_sample = X_normalized + np.random.normal(loc=0, scale=0.01, size=X_normalized.shape)
    augmented_data.append(augmented_sample)

X_augmented = np.vstack(augmented_data)
y_augmented = np.tile(y, num_augmented_samples)

# Combine original data with augmented data
X_combined = np.vstack((X_normalized, X_augmented))
y_combined = np.concatenate((y, y_augmented))

# Step 4: Training using Logistic Regression
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)


logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)

# Step 6: Training using Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Step 7: Training using Naïve Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

In [87]:
# Step 8: Model evaluation based on prediction
# Calculate accuracy scores and classification reports
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_nb = accuracy_score(y_test, y_pred_nb)

classification_report_logreg = classification_report(y_test, y_pred_logreg)
classification_report_rf = classification_report(y_test, y_pred_rf)
classification_report_nb = classification_report(y_test, y_pred_nb)

print("Logistic Regression Accuracy:", accuracy_logreg)
print("Logistic Regression Classification Report:")
print(classification_report_logreg)

print("Random Forest Accuracy:", accuracy_rf)
print("Random Forest Classification Report:")
print(classification_report_rf)

print("Naïve Bayes Accuracy:", accuracy_nb)
print("Naïve Bayes Classification Report:")
print(classification_report_nb)

Logistic Regression Accuracy: 0.9866566005934301
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     71597
           1       0.99      0.98      0.98     42317

    accuracy                           0.99    113914
   macro avg       0.99      0.98      0.99    113914
weighted avg       0.99      0.99      0.99    113914

Random Forest Accuracy: 1.0
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71597
           1       1.00      1.00      1.00     42317

    accuracy                           1.00    113914
   macro avg       1.00      1.00      1.00    113914
weighted avg       1.00      1.00      1.00    113914

Naïve Bayes Accuracy: 0.938049756834103
Naïve Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     71597
     

In [101]:
from imblearn.over_sampling import SMOTE
# Step 4: Handling class imbalance using SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_combined, y_combined)

# Step 5: Training using Logistic Regression
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)

# Step 6: Training using Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Step 7: Training using Naïve Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

In [102]:
# Step 8: Model evaluation based on prediction
# Calculate accuracy scores and classification reports
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_nb = accuracy_score(y_test, y_pred_nb)

classification_report_logreg = classification_report(y_test, y_pred_logreg)
classification_report_rf = classification_report(y_test, y_pred_rf)
classification_report_nb = classification_report(y_test, y_pred_nb)

print("Logistic Regression Accuracy:", accuracy_logreg)
print("Logistic Regression Classification Report:")
print(classification_report_logreg)

print("Random Forest Accuracy:", accuracy_rf)
print("Random Forest Classification Report:")
print(classification_report_rf)

print("Naïve Bayes Accuracy:", accuracy_nb)
print("Naïve Bayes Classification Report:")
print(classification_report_nb)

Logistic Regression Accuracy: 0.9827973387993816
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     71473
           1       0.99      0.98      0.98     71470

    accuracy                           0.98    142943
   macro avg       0.98      0.98      0.98    142943
weighted avg       0.98      0.98      0.98    142943

Random Forest Accuracy: 1.0
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71473
           1       1.00      1.00      1.00     71470

    accuracy                           1.00    142943
   macro avg       1.00      1.00      1.00    142943
weighted avg       1.00      1.00      1.00    142943

Naïve Bayes Accuracy: 0.9300000699579553
Naïve Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.96      0.93     71473
    

In [92]:
y_balanced

0      1
1      1
2      1
3      1
4      1
      ..
709    1
710    1
711    1
712    1
713    1
Name: diagnosis, Length: 714, dtype: int64

In [98]:
print (y_balanced.values)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 1 1
 0 1 0 1 1 0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0
 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 1
 0 1 0 0 0 1 0 0 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 0 0 1 0 1 1 1 1 0 0 1 1 0 0
 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 1 1 0 0
 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1
 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 0 0
 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 1 0 1 0 0 1 

In [95]:
y_pred_logreg

array([0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1], dtype=int64)