In [3]:
import pandas as pd
import numpy as np
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

## Data Preprocessing

In [4]:


# Location of the raw CSV, relative to the notebook's working directory
file_path = 'data.csv'

# Read the whole CSV into a DataFrame
data = pd.read_csv(file_path)

# Preview the first five rows as plain text
print(data.head(5))


         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_worst  smoothness

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,0.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,


In [6]:

# Remove two columns that are not used as model inputs:
#   'Unnamed: 32' - holds no values at all (count is 0 in the describe() output)
#   'id'          - presumably a record identifier rather than a measurement
data = data.drop(['Unnamed: 32', 'id'], axis=1)

In [7]:
# Convert 'diagnosis' column to numerical values (0 for Benign, 1 for Malignant)
diagnosis_codes = {'B': 0, 'M': 1}

# Only map while the column still holds the raw 'B'/'M' labels. Mapping an
# already-encoded column would turn every value into NaN, so this guard keeps
# the cell safe to re-run.
if not data['diagnosis'].isin(list(diagnosis_codes.values())).all():
    data['diagnosis'] = data['diagnosis'].map(diagnosis_codes)

# Series.map yields NaN for any label missing from the mapping; fail loudly
# here instead of handing NaN targets to the later cells.
if data['diagnosis'].isna().any():
    raise ValueError("Unexpected values in 'diagnosis'; expected only 'B' or 'M'.")

In [8]:
# Target vector: the encoded diagnosis column
y = data['diagnosis']

# Feature matrix: every remaining column
X = data.drop('diagnosis', axis=1)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:

from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split only
scaler = StandardScaler().fit(X_train)

# Apply those training statistics to both splits
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Feature Selection

In [11]:
# Univariate feature selection with SelectKBest
from sklearn.feature_selection import SelectKBest, f_classif

# Score each feature against the target with f_classif and keep the 10 best
kbest = SelectKBest(f_classif, k=10)
kbest.fit(X_train, y_train)

# Reduce both splits to the columns chosen on the training data
X_train_selected = kbest.transform(X_train)
X_test_selected = kbest.transform(X_test)

# Translate the boolean support mask back into column names
selected_features = X.columns[kbest.get_support()]
print("Selected Features:", selected_features)


Selected Features: Index(['radius_mean', 'perimeter_mean', 'area_mean', 'concavity_mean',
       'concave points_mean', 'radius_worst', 'perimeter_worst', 'area_worst',
       'concavity_worst', 'concave points_worst'],
      dtype='object')


In [12]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Apply Recursive Feature Elimination (RFE) feature selection.
# random_state pins the forest's bootstrap sampling and split choices; without
# it the feature importances - and therefore the 10 features RFE keeps - can
# change from one run to the next.
rfe_model = RandomForestClassifier(random_state=42)
rfe = RFE(rfe_model, n_features_to_select=10)
X_train_rfe_selected = rfe.fit_transform(X_train, y_train)
X_test_rfe_selected = rfe.transform(X_test)

# Get the selected feature names
rfe_selected_features = X.columns[rfe.get_support()]
print("RFE Selected Features:", rfe_selected_features)


RFE Selected Features: Index(['perimeter_mean', 'concavity_mean', 'concave points_mean', 'area_se',
       'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
       'concavity_worst', 'concave points_worst'],
      dtype='object')


## Grid Search CV for Model Tuning


In [13]:
# Hyper-parameter search for an MLP classifier on the SelectKBest features
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
import pickle  # not used in this cell; the later cell that pickles `kbest` relies on this import

# Define the MLPClassifier model.
# random_state pins weight initialisation and mini-batch shuffling so the
# reported best parameters and score are repeatable between runs.
mlp = MLPClassifier(max_iter=200, random_state=42)

# Create the pipeline
pipeline = Pipeline([
    ('mlp', mlp)
])

# Define the parameter grid (4 * 2 * 2 * 2 * 2 = 64 candidates)
param_grid = {
    'mlp__hidden_layer_sizes': [(10,), (50,), (100,), (50, 50)],
    'mlp__activation': ['tanh', 'relu'],
    'mlp__solver': ['sgd', 'adam'],
    'mlp__alpha': [0.0001, 0.05],
    'mlp__learning_rate': ['constant', 'adaptive'],
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1)  # Adjust verbose as needed

# Fit GridSearchCV.
# NOTE: X_train_selected was scaled and feature-selected using the whole
# training split before this point, so the cross-validation folds are not
# independent of those preprocessing steps.
grid_search.fit(X_train_selected, y_train)

# Get the best model (refit on the full training split by GridSearchCV)
best_model = grid_search.best_estimator_

# Predict on test dataset to get confusion matrix
y_pred = best_model.predict(X_test_selected)
cm = confusion_matrix(y_test, y_pred)


print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
print("Confusion Matrix:\n", cm)


Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best Parameters: {'mlp__activation': 'relu', 'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (50, 50), 'mlp__learning_rate': 'constant', 'mlp__solver': 'adam'}
Best Score: 0.9582417582417582
Confusion Matrix:
 [[69  2]
 [ 1 42]]


## ANN Model Creation


In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation (and later shuffling during fit) is repeatable across runs.
tf.keras.utils.set_random_seed(42)

# ANN Model Creation: two hidden ReLU layers of 50 units each and a single
# sigmoid output unit for the binary target.
ann_model = Sequential()
# Declare the input width with an explicit Input object; passing `input_dim`
# to the first Dense layer is deprecated in current Keras releases.
ann_model.add(tf.keras.Input(shape=(X_train_selected.shape[1],)))
ann_model.add(Dense(50, activation='relu'))
ann_model.add(Dense(50, activation='relu'))
ann_model.add(Dense(1, activation='sigmoid'))

# Compile the model
ann_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])


In [15]:
## ANN model with dropout
from tensorflow.keras.layers import Dropout, Input
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation and dropout masks are repeatable across runs.
set_random_seed(42)

# Define the input dimension and dropout rate
input_dim = X_train_selected.shape[1]
dropout_rate = 0.5  # fraction of units dropped after each hidden layer, to curb overfitting

# ANN Model Creation: hidden ReLU layers of 64, 64 and 32 units, each followed
# by dropout, and a single sigmoid output unit for the binary target.
ann_model_with_dropout = Sequential()
# Declare the input width with an explicit Input object; passing `input_dim`
# to the first Dense layer is deprecated in current Keras releases.
ann_model_with_dropout.add(Input(shape=(input_dim,)))
ann_model_with_dropout.add(Dense(64, activation='relu'))
ann_model_with_dropout.add(Dropout(dropout_rate))
ann_model_with_dropout.add(Dense(64, activation='relu'))
ann_model_with_dropout.add(Dropout(dropout_rate))
ann_model_with_dropout.add(Dense(32, activation='relu'))
ann_model_with_dropout.add(Dropout(dropout_rate))
ann_model_with_dropout.add(Dense(1, activation='sigmoid'))

# Compile the model
ann_model_with_dropout.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])



## Training and Evaluating Artificial Neural Network Models

In [16]:
# Fit the plain ANN for 50 epochs in mini-batches of 10.
# The test split is passed as validation data, so the val_* figures in the
# training log are computed on the same rows used for the final evaluation.
ann_model.fit(
    x=X_train_selected,
    y=y_train,
    batch_size=10,
    epochs=50,
    validation_data=(X_test_selected, y_test),
)


Epoch 1/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7700 - loss: 0.5239 - val_accuracy: 0.9825 - val_loss: 0.1924
Epoch 2/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9393 - loss: 0.2138 - val_accuracy: 0.9737 - val_loss: 0.1016
Epoch 3/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9398 - loss: 0.1660 - val_accuracy: 0.9737 - val_loss: 0.0822
Epoch 4/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9520 - loss: 0.1495 - val_accuracy: 0.9737 - val_loss: 0.0788
Epoch 5/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9460 - loss: 0.1576 - val_accuracy: 0.9737 - val_loss: 0.0739
Epoch 6/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9493 - loss: 0.1409 - val_accuracy: 0.9737 - val_loss: 0.0712
Epoch 7/50
[1m46/46[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x14189a57490>

In [17]:
# Score the plain ANN on the test split; evaluate() yields the loss followed
# by the accuracy metric the model was compiled with.
ann_scores = ann_model.evaluate(X_test_selected, y_test)
_, ann_accuracy = ann_scores
print(f'ANN Model Accuracy: {ann_accuracy * 100:.2f}%')

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels, then
# tabulate them against the true labels
ann_probabilities = ann_model.predict(X_test_selected)
y_pred_ann = (ann_probabilities > 0.5).astype("int32")
cm_ann = confusion_matrix(y_test, y_pred_ann)
print("ANN Confusion Matrix:\n", cm_ann)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0s/step - accuracy: 0.9794 - loss: 0.0998  


ANN Model Accuracy: 98.25%
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
ANN Confusion Matrix:
 [[70  1]
 [ 1 42]]


In [18]:

# Fit the dropout ANN for 50 epochs in mini-batches of 10 and keep the
# returned History object. The test split doubles as validation data here.
history = ann_model_with_dropout.fit(
    x=X_train_selected,
    y=y_train,
    batch_size=10,
    epochs=50,
    validation_data=(X_test_selected, y_test),
)



Epoch 1/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5242 - loss: 0.7000 - val_accuracy: 0.9561 - val_loss: 0.3833
Epoch 2/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8162 - loss: 0.4177 - val_accuracy: 0.9737 - val_loss: 0.1731
Epoch 3/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9096 - loss: 0.2578 - val_accuracy: 0.9649 - val_loss: 0.1067
Epoch 4/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8930 - loss: 0.2263 - val_accuracy: 0.9649 - val_loss: 0.0893
Epoch 5/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9356 - loss: 0.1892 - val_accuracy: 0.9649 - val_loss: 0.0799
Epoch 6/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9407 - loss: 0.1681 - val_accuracy: 0.9649 - val_loss: 0.0713
Epoch 7/50
[1m46/46[0m [32m━━━━━━━━━━

In [19]:
# Score the dropout ANN on the test split; evaluate() yields the loss followed
# by the accuracy metric the model was compiled with.
dropout_scores = ann_model_with_dropout.evaluate(X_test_selected, y_test)
_, ann_accuracy_with_dropout = dropout_scores
print(f'ANN Model with Dropout Accuracy: {ann_accuracy_with_dropout * 100:.2f}%')

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels, then
# tabulate them against the true labels
dropout_probabilities = ann_model_with_dropout.predict(X_test_selected)
y_pred_ann_with_dropout = (dropout_probabilities > 0.5).astype("int32")
cm_ann_with_dropout = confusion_matrix(y_test, y_pred_ann_with_dropout)
print("ANN Model with Dropout Confusion Matrix:\n", cm_ann_with_dropout)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0s/step - accuracy: 0.9794 - loss: 0.0906  
ANN Model with Dropout Accuracy: 98.25%
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
ANN Model with Dropout Confusion Matrix:
 [[70  1]
 [ 1 42]]


In [20]:
# Persist the trained dropout network; the .h5 extension selects Keras' HDF5 file format
ann_model_with_dropout.save(filepath='ann_model_with_dropout.h5')
print("Model saved to ann_model_with_dropout.h5")




Model saved to ann_model_with_dropout.h5


In [22]:
# Serialize the fitted SelectKBest selector to kbest.pkl.
# `pickle` is already in scope from the grid-search cell's imports.
with open('kbest.pkl', mode='wb') as kbest_file:
    pickle.dump(kbest, kbest_file)

In [21]:
import pickle

# Serialize the fitted StandardScaler to scaler.pkl
with open('scaler.pkl', mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Scaler saved to scaler.pkl")

Scaler saved to scaler.pkl
