## Data Exploration and Preprocessing

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

ImportError: cannot import name 'VisibleDeprecationWarning' from 'numpy' (unknown location)

In [None]:
# # Import all libraries from the '_imports.ipynb'
# %run _imports.ipynb

In [None]:
# Load the CSV once; `dataset` stays untouched and every later cell works on the `df` copy.
file_path = '../data/breast-cancer-wisconsin-data_work.csv'

dataset = pd.read_csv(filepath_or_buffer=file_path)
df = dataset.copy(deep=True)

# Preview the first rows
df.head(n=5)

### Statistical Summary - Exploration

In [None]:
# Print column names, non-null counts and dtypes for the whole frame.
df.info()

# NOTE(review): the original note here read "no null values", yet a later cell inspects and
# drops "Unnamed: 32" as a null-containing column — presumably the note refers to the
# remaining columns; confirm against the output above.

In [None]:
# Data type of every column
df.dtypes

In [None]:
# Names of the numerical columns (float / int dtypes)
df.select_dtypes(include=["float", "int"]).columns

In [None]:
# Number of numerical (float / int) columns
df.select_dtypes(include=["float", "int"]).shape[1]

In [None]:
# Names of the categorical columns (object dtype)
df.select_dtypes(include="object").columns

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns

df.describe()

### Handling Missing Values

In [None]:
# True if at least one cell anywhere in the frame is null
df.isna().to_numpy().any()

In [None]:
# List the columns that contain at least one null value
has_nulls = df.isna().any(axis=0)
df.columns[has_nulls]


In [None]:
# Number of non-null entries in the "Unnamed: 32" column
df["Unnamed: 32"].count()

In [None]:
# Drop the "Unnamed: 32" column.
# errors="ignore" keeps this cell idempotent: re-running it after the column is already
# gone is a no-op instead of raising KeyError.
df = df.drop(columns="Unnamed: 32", errors="ignore")
df.head()

In [None]:
# Re-check: this index is empty when no remaining column holds a null value
df.columns[df.isna().any(axis=0)]

### Handling Categorical Variables

In [None]:
# Preview the categorical (object dtype) columns
df.select_dtypes(include=["object"]).head(n=5)

In [None]:
# Distinct values present in the "diagnosis" column
df["diagnosis"].unique()

In [None]:
# One-hot encode the categorical columns as 0/1 integers.
# drop_first=True drops the first level of each encoded column, so a two-valued column
# becomes a single indicator (later cells rely on the resulting `diagnosis_M`).
df_encoded = pd.get_dummies(df, drop_first=True, dtype=int)

df_encoded.head(n=5)

In [None]:
# Class balance of the encoded target, drawn on an explicit Axes
fig, ax = plt.subplots()
sns.countplot(x=df_encoded["diagnosis_M"], label="Count", ax=ax)

# Title and axis labels
ax.set_title("Count of diagnosis_M")
ax.set_xlabel("Class")
ax.set_ylabel("Frequency")

# Write the count on top of each bar
# Code Source - "https://stackoverflow.com/questions/55104819/display-count-on-top-of-seaborn-barplot"
bars = ax.containers[0]
ax.bar_label(bars)

plt.show()


### Feature Correlation Analysis

In [None]:
# Separate the target from the features; "id" is excluded from the feature matrix.
y = df_encoded["diagnosis_M"]  # target
X = df_encoded.drop(["id", "diagnosis_M"], axis=1)  # features

In [None]:
# Correlation of every feature column with the target
X_corr_y = X.corrwith(y, axis=0)

X_corr_y.head(n=5)

In [None]:
# Sort the feature/target correlations from highest to lowest
# (rebinding the same name is safe to re-run: sorting sorted values changes nothing)
X_corr_y = X_corr_y.sort_values(ascending=False)
X_corr_y

In [None]:
# Bar chart of the feature/target correlations
fig, ax = plt.subplots(figsize=(20, 10))

# Draw on the Axes created above explicitly instead of relying on the
# implicit "current axes" of the pyplot state machine.
X_corr_y.plot(
    kind='bar',
    color='skyblue',
    rot=90,
    grid=True,
    ax=ax,
)

# Title and axis labels (y-label spelling corrected)
ax.set(
    title="Correlation between Features and Target",
    xlabel="Features",
    ylabel="Correlation Coefficient",
)

plt.show()

In [None]:
# Pairwise Pearson correlation between the features
X_corr_X = X.corr(method="pearson")

X_corr_X.head(n=5)

In [None]:
# Annotated heatmap of the feature/feature correlation matrix
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(X_corr_X, annot=True, cmap="YlGnBu", ax=ax)
plt.show()

### Train/Test Splitting

In [None]:
# Preview the encoded frame before splitting it
df_encoded.head()

In [None]:
# Features / target split (same definition as in the correlation section)
y = df_encoded["diagnosis_M"]
X = df_encoded.drop(["id", "diagnosis_M"], axis="columns")

In [None]:
# Preview the feature matrix
X.head()

In [None]:
# (rows, columns) of the feature matrix
X.shape

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the training and test feature matrices
print(X_train.shape, X_test.shape)

### Feature Scaling

In [None]:
# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits so nothing is fitted on the test data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Preview only the first rows of the scaled splits instead of dumping both full arrays
X_train[:5], X_test[:5]

## Building the Model

I'm going to apply three classification models and then compare their outcomes:
* Logistic Regression
* Random Forest
* Support Vector Classifier (`SVC`)

### Logistic Regression

In [None]:
# Logistic regression with default hyperparameters, trained on the scaled training split
# (`fit` returns the estimator itself, so construction and training can be chained)
clf_logReg = LogisticRegression().fit(X_train, y_train)

# Class predictions for the held-out test split
y_pred = clf_logReg.predict(X_test)

In [None]:
# Sanity check: predictions and true labels should have the same shape
y_pred.shape, y_test.shape

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
# Plot the confusion matrix, labelling the axes with the classes the model was trained on
disp = ConfusionMatrixDisplay(cm, display_labels=clf_logReg.classes_)
disp.plot()
plt.show()

In [None]:
# Text classification report for the test split
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# 10-fold cross-validation scores on the training split
cv_scores = cross_val_score(
    estimator=clf_logReg,
    X=X_train,
    y=y_train,
    cv=10,
)
print(cv_scores)

In [None]:
# Summarise the cross-validation scores (output label spelling corrected)
print(f"Accuracy is {cv_scores.mean()*100:.2f}%") # mean score across folds
print(f"Standard deviation is {cv_scores.std()*100:.2f}") # spread across folds, scaled by 100

### Random Forest Classifier

In [None]:
# Random forest with default hyperparameters.
# The forest is built with randomness, so a fixed seed (the same one used for the
# train/test split) makes the results repeatable under Restart & Run All.
clf_rf = RandomForestClassifier(random_state=42)

# Train the model on the data
clf_rf.fit(X_train, y_train)

# Predict values of y
y_pred = clf_rf.predict(X_test)

In [None]:
# Sanity check: predictions and true labels should have the same shape
y_pred.shape, y_test.shape

In [None]:
# Model evaluation: confusion matrix of the random-forest predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Plot it with the forest's class labels
disp = ConfusionMatrixDisplay(cm, display_labels=clf_rf.classes_)
disp.plot()
plt.show()

In [None]:
# Text classification report for the random-forest predictions
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# 10-fold cross-validation scores of the random forest on the training split
cv_scores = cross_val_score(clf_rf, X_train, y_train, cv=10)
print(cv_scores)

# Summary of the folds (output label spelling corrected)
print(f"Accuracy is {cv_scores.mean()*100:.2f}%") # mean score across folds
print(f"Standard deviation is {cv_scores.std()*100:.2f}") # spread across folds, scaled by 100

### Support Vector Machine - Classifier

In [None]:
# Support vector classifier with default hyperparameters, trained on the scaled training split
# (`fit` returns the estimator itself, so construction and training can be chained)
svc = SVC().fit(X_train, y_train)

# Class predictions for the held-out test split
y_pred = svc.predict(X_test)

In [None]:
# Sanity check: predictions and true labels should have the same shape
y_pred.shape, y_test.shape

In [None]:
# Model evaluation: confusion matrix of the SVC predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Plot it with the classifier's class labels
disp = ConfusionMatrixDisplay(cm, display_labels=svc.classes_)
disp.plot()
plt.show()

In [None]:
# Text classification report for the SVC predictions
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Same classification report, but returned as a dictionary (output_dict=True)
# so that individual metrics can be looked up by key

report = classification_report(y_test, y_pred, output_dict=True)
report

In [None]:
# Macro-averaged F1 score taken from the report dictionary
report["macro avg"]["f1-score"]

In [None]:
# 10-fold cross-validation scores of the SVC on the training split
cv_scores = cross_val_score(svc, X_train, y_train, cv=10)
print(cv_scores)

# Summary of the folds (output label spelling corrected)
print(f"Accuracy is {cv_scores.mean()*100:.2f}%") # mean score across folds
print(f"Standard deviation is {cv_scores.std()*100:.2f}") # spread across folds, scaled by 100

### Function for Model Training

A function will simplify my work in training and evaluating the models.

In [None]:
import sys
import os

# Make the sibling 'scripts' directory importable.
# The membership check keeps the cell idempotent: re-running it does not append
# the same entry to sys.path again.
scripts_dir = os.path.abspath('../scripts')
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

# Import the evaluation helpers from functions.py
from functions import evaluate_classification_models, evaluate_classification_models_2

In [None]:
import pandas as pd

# Models to be trained: display name -> estimator class.
# Note the values are the classes themselves (no parentheses), not fitted instances.
models = {
    "Logistic Regression": LogisticRegression,
    "RandomForest Classifier": RandomForestClassifier,
    "Support Vector Machine": SVC
}

# Model Training and Evaluation via the project helper from functions.py.
# NOTE(review): the second argument (10) is presumably the number of CV folds, and the
# result is later used as a DataFrame with "Model" and "CV Accuracy" columns — confirm
# against the helper's definition.
metrics_df = evaluate_classification_models_2(models, 10, X_train, y_train, X_test, y_test)
metrics_df

### Hyperparameter Tuning

Using `RandomizedSearchCV`

In [None]:
# List the hyperparameters of LogisticRegression together with their default values
LogisticRegression().get_params()

In [None]:
# Hyperparameters to adjust and the candidate values to sample from.
# NOTE(review): not every solver/penalty pair in this grid is accepted by LogisticRegression,
# and the string 'none' is only recognised by some scikit-learn releases. Candidates that
# fail to fit are scored as NaN through `error_score` below rather than aborting the search
# (the search still fails if every sampled candidate is invalid) — consider restricting the
# grid to valid combinations.
param_grid = {
    'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['lbfgs','newton-cg', 'liblinear', 'sag', 'saga']
}

# Instantiate and set up the randomized search
rs_clf = RandomizedSearchCV(
    estimator=clf_logReg,
    param_distributions=param_grid,
    n_iter=5,  # number of parameter settings to sample
    cv=5,
    verbose=3,
    n_jobs=-1,
    scoring="roc_auc",
    random_state=42,  # fixed seed: the same candidates are sampled on every run
    error_score=np.nan  # score failing candidates as NaN and continue
)


# Fit/train the model
rs_clf.fit(X_train, y_train)

In [None]:
# Parameter setting that achieved the best score in the randomized search
best_params = rs_clf.best_params_
best_params

### Final Model

In [None]:
# Train the final logistic regression with the tuned solver / penalty / C values
tuned_kwargs = {key: best_params[key] for key in ("solver", "penalty", "C")}
final_model = LogisticRegression(**tuned_kwargs)
final_model.fit(X_train, y_train)

# Class predictions of the tuned model for the test split
y_preds = final_model.predict(X_test)

In [None]:
# Predicted labels of the final model on the test split
y_preds

In [None]:
# Classification report of the final model, as a dictionary
report = classification_report(y_test, y_preds, output_dict=True)

# Single quotes inside the f-string: reusing double quotes is a SyntaxError before Python 3.12
accuracy = f"{report['accuracy']*100:.2f}%"
precision = report["macro avg"]["precision"]
recall = report["macro avg"]["recall"]
f1_score = report["macro avg"]["f1-score"]

# 10-fold cross-validation of the final model on the training split.
# The summary must come from `cv_scores_final`; `cv_scores` still holds the folds
# of the last model evaluated in an earlier cell.
cv_scores_final = cross_val_score(final_model, X_train, y_train, cv=10)
cv_accuracy = f"{cv_scores_final.mean()*100:.2f}%"

In [None]:
# One-row frame with the final model's metrics, using the same columns as `metrics_df`
final_row = ["LogisticRegression Final", accuracy, precision, recall, f1_score, cv_accuracy]
model_metrics = pd.DataFrame([final_row], columns=metrics_df.columns)
model_metrics

In [None]:
# Append the final model's row to the metrics dataframe.
# Any "LogisticRegression Final" row left by a previous run of this cell is removed first,
# so re-running the cell replaces the row instead of adding a duplicate.
previous_rows = metrics_df[metrics_df["Model"] != "LogisticRegression Final"]
metrics_df = pd.concat([previous_rows, model_metrics], ignore_index=True)
metrics_df

In [None]:
def _as_percent(value):
    """Return a metric such as '97.36%' (or an already numeric value) as a float."""
    return float(str(value).strip().rstrip("%"))


# CV Accuracy of the baseline "Logistic Regression" and the tuned "LogisticRegression Final" rows
log_reg_cv_accuracy = metrics_df.loc[metrics_df["Model"] == "Logistic Regression", "CV Accuracy"].values[0]
log_reg_cv_accuracy_final = metrics_df.loc[metrics_df["Model"] == "LogisticRegression Final", "CV Accuracy"].values[0]

# Compare numerically: formatted strings compare character by character,
# which ranks "100.00%" below "97.36%".
# NOTE(review): assumes both rows store CV Accuracy on the same scale — confirm against
# the helper that builds `metrics_df`.
baseline_score = _as_percent(log_reg_cv_accuracy)
tuned_score = _as_percent(log_reg_cv_accuracy_final)

if baseline_score > tuned_score:
    print(f"Accuracy decreased from {log_reg_cv_accuracy} to {log_reg_cv_accuracy_final}, hence maintain the model prior to hyperparameter tuning")
elif baseline_score == tuned_score:
    # A tie is not an improvement, so it must not be reported as an increase
    print(f"Accuracy unchanged at {log_reg_cv_accuracy}, hence maintain the model prior to hyperparameter tuning")
else:
    print(f"Accuracy increased from {log_reg_cv_accuracy} to {log_reg_cv_accuracy_final}, hence the new model should be used")
