In [1]:
# Step 1: Import libraries and load data
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('wdbc-data.csv', header=None)

In [2]:
# Step 2: Assign a title to each column of the dataset
column_names = ['id', 'diagnosis', 'mean_radius', 'mean_texture', 'mean_perimeter',
                'mean_area', 'mean_smoothness', 'mean_compactness', 'mean_concavity',
                'mean_concave_points', 'mean_symmetry', 'mean_fractal_dimension',
                'se_radius', 'se_texture', 'se_perimeter', 'se_area', 'se_smoothness',
                'se_compactness', 'se_concavity', 'se_concave_points', 'se_symmetry',
                'se_fractal_dimension', 'worst_radius', 'worst_texture', 'worst_perimeter',
                'worst_area', 'worst_smoothness', 'worst_compactness', 'worst_concavity',
                'worst_concave_points', 'worst_symmetry', 'worst_fractal_dimension']
df.columns = column_names

In [3]:
# Step 3: Data preprocessing (Min-max Normalization and Feature selection using selectkbest)
X = df.drop(['id', 'diagnosis'], axis=1)
y = df['diagnosis'].map({'B': 0, 'M': 1})  # Convert B to 0 and M to 1

In [4]:
from sklearn.preprocessing import MinMaxScaler
# Min-max normalization
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

In [6]:
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)

# Step 6: Training using Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Step 7: Training using Naïve Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

In [7]:
# Step 8: Model evaluation based on prediction
# Calculate accuracy scores and classification reports
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_nb = accuracy_score(y_test, y_pred_nb)

classification_report_logreg = classification_report(y_test, y_pred_logreg)
classification_report_rf = classification_report(y_test, y_pred_rf)
classification_report_nb = classification_report(y_test, y_pred_nb)

print("Logistic Regression Accuracy:", accuracy_logreg)
print("Logistic Regression Classification Report:")
print(classification_report_logreg)

print("Random Forest Accuracy:", accuracy_rf)
print("Random Forest Classification Report:")
print(classification_report_rf)

print("Naïve Bayes Accuracy:", accuracy_nb)
print("Naïve Bayes Classification Report:")
print(classification_report_nb)

Logistic Regression Accuracy: 0.9736842105263158
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

Random Forest Accuracy: 0.9649122807017544
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97        71
           1       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

Naïve Bayes Accuracy: 0.9649122807017544
Naïve Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97

In [8]:
# Min-max normalization
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

In [10]:
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)

# Step 6: Training using Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Step 7: Training using Naïve Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

In [11]:
# Step 8: Model evaluation based on prediction
# Calculate accuracy scores and classification reports
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_nb = accuracy_score(y_test, y_pred_nb)

classification_report_logreg = classification_report(y_test, y_pred_logreg)
classification_report_rf = classification_report(y_test, y_pred_rf)
classification_report_nb = classification_report(y_test, y_pred_nb)

print("Logistic Regression Accuracy:", accuracy_logreg)
print("Logistic Regression Classification Report:")
print(classification_report_logreg)

print("Random Forest Accuracy:", accuracy_rf)
print("Random Forest Classification Report:")
print(classification_report_rf)

print("Naïve Bayes Accuracy:", accuracy_nb)
print("Naïve Bayes Classification Report:")
print(classification_report_nb)

Logistic Regression Accuracy: 0.9824561403508771
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        71
           1       1.00      0.95      0.98        43

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

Random Forest Accuracy: 0.956140350877193
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97        71
           1       0.95      0.93      0.94        43

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114

Naïve Bayes Accuracy: 0.9649122807017544
Naïve Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97 

In [None]:
from sklearn.model_selection import GridSearchCV
# Define hyperparameter grid for Logistic Regression
logreg_param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'lbfgs', 'saga']
}

logreg_model = LogisticRegression()
logreg_grid_search = GridSearchCV(logreg_model, logreg_param_grid, cv=5)
logreg_grid_search.fit(X_train, y_train)
best_logreg_model = logreg_grid_search.best_estimator_
y_pred_logreg = best_logreg_model.predict(X_test)

# Step 5: Training using Random Forest with Hyperparameter Tuning
rf_model = RandomForestClassifier()

# Define hyperparameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_grid_search = GridSearchCV(rf_model, rf_param_grid, cv=5)
rf_grid_search.fit(X_train, y_train)
best_rf_model = rf_grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)

# Step 6: Training using Naïve Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

In [None]:
# Step 8: Model evaluation based on prediction
# Calculate accuracy scores and classification reports
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_nb = accuracy_score(y_test, y_pred_nb)

classification_report_logreg = classification_report(y_test, y_pred_logreg)
classification_report_rf = classification_report(y_test, y_pred_rf)
classification_report_nb = classification_report(y_test, y_pred_nb)

print("Logistic Regression Accuracy:", accuracy_logreg)
print("Logistic Regression Classification Report:")
print(classification_report_logreg)

print("Random Forest Accuracy:", accuracy_rf)
print("Random Forest Classification Report:")
print(classification_report_rf)

print("Naïve Bayes Accuracy:", accuracy_nb)
print("Naïve Bayes Classification Report:")
print(classification_report_nb)