Define Problem Statement

Our objective is twofold: to identify which features are most helpful in distinguishing malignant from benign breast tumors, and to build a classifier that predicts whether a given tumor is benign or malignant.



Data Cleaning and Manipulation

In [None]:
# Importing the necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Importing dataset
df = pd.read_csv("data.csv")

# Select the features and the target by column NAME rather than by position.
# The previous positional slices (iloc[:, 1:31] / iloc[:, 31]) only work for one
# specific column order; with 'diagnosis' as the second column they would put
# the label inside X and use a numeric feature as Y.
# NOTE(review): assumes the CSV has the 'id', 'diagnosis' and 'Unnamed: 32'
# columns that the preprocessing cell further down also relies on.
X = df.drop(columns=['id', 'diagnosis', 'Unnamed: 32']).values
Y = df['diagnosis'].values

In [None]:
# Preview the first five rows to get a feel for the columns and values
df.head(n=5)

In [None]:
# Report the (rows, columns) dimensions of the loaded frame
print(f"Cancer data set dimensions : {df.shape}")

In [None]:
# Tally how many samples carry each diagnosis label
diagnosis_counts = df['diagnosis'].value_counts()

# Look up the totals for the 'M' and 'B' labels explicitly
count_M, count_B = diagnosis_counts['M'], diagnosis_counts['B']

print('Count of M:', count_M)
print('Count of B:', count_B)

In [None]:
# Visualize distribution of classes

plt.figure(figsize=(8, 4))
sns.countplot(data=df, x='diagnosis', palette='RdBu')

# Count the number of observations in each class.
# Look the counts up by label ('B' / 'M') instead of unpacking value_counts():
# value_counts() orders by frequency, so positional unpacking would silently
# swap the two names if malignant samples ever outnumbered benign ones, and
# would fail outright unless exactly two classes are present.
class_counts = df['diagnosis'].value_counts()
benign = class_counts.get('B', 0)
malignant = class_counts.get('M', 0)

print('Number of cells labeled Benign: ', benign)
print('Number of cells labeled Malignant : ', malignant)
print('')
print('% of cells labeled Benign', round(benign / len(df) * 100, 2), '%')
print('% of cells labeled Malignant', round(malignant / len(df) * 100, 2), '%')

Let's quickly scan for any interesting patterns between our 10 "mean" columns and the response variable by generating a scatter plot matrix as shown below:

In [None]:
# Generate a Scatter Plot matrix with the "mean" columns.
# The column list is the target followed by the ten '<measurement>_mean' features.
cols = ['diagnosis'] + [
    f'{measurement}_mean'
    for measurement in (
        'radius',
        'texture',
        'perimeter',
        'area',
        'smoothness',
        'compactness',
        'concavity',
        'concave points',
        'symmetry',
        'fractal_dimension',
    )
]

# Color each point by diagnosis so class separation is visible per feature pair
sns.pairplot(df[cols], hue='diagnosis', palette='RdBu')

In [None]:
# Number of missing values per column.
# (isnull() is an alias of isna(), and only the last expression of a cell is
# displayed, so a single call is sufficient.)
df.isna().sum()

In [None]:
# Per-column count of missing entries (stored for inspection; not displayed here)
missing_values = df.isna().sum()

# Fill gaps with the mean of the corresponding column. Only numeric columns
# contribute a mean, so non-numeric columns are left untouched.
column_means = df.mean(numeric_only=True)
df.fillna(value=column_means, inplace=True)

In [None]:
# Fit a label encoder on the target values, then replace Y with its integer codes
labelencoder_Y = LabelEncoder().fit(Y)
Y = labelencoder_Y.transform(Y)

In [None]:
# Hold out 25% of the samples as the test set; the fixed random_state keeps
# the split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Target: integer-encode the 'diagnosis' column
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(df['diagnosis'])

# Features: start from the original frame without the 'id' and 'Unnamed: 32'
# columns (df itself is left unmodified), then remove the target column too.
non_numeric_cols = ['id', 'Unnamed: 32']
X_encoded = df.drop(columns=non_numeric_cols)
X = X_encoded.drop(columns=['diagnosis'])

# 75/25 train/test split with a fixed seed for repeatability
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=0
)

# Feature scaling: the scaler is fitted on the training split only and then
# applied unchanged to the test split, so no test statistics leak into training.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


The Model

In [None]:
# Logistic Regression Model

# Train on the training split, then predict the held-out test split
logistic_model = LogisticRegression(random_state=0)
logistic_model.fit(X_train, Y_train)
Y_pred_logistic = logistic_model.predict(X_test)

# Confusion matrix, flattened row by row into TN, FP, FN, TP
confusion_logistic = confusion_matrix(Y_test, Y_pred_logistic)
TN_logistic, FP_logistic, FN_logistic, TP_logistic = confusion_logistic.ravel()

# Accuracy in percent: correct predictions over all test samples
accuracy_logistic = (TP_logistic + TN_logistic) / confusion_logistic.sum() * 100

# Text classification report for the test split
report_logistic = classification_report(Y_test, Y_pred_logistic)

# Summary output
print("Model: Logistic Regression")
print("Classification Report:\n", report_logistic)
print("Confusion Matrix:\n", confusion_logistic)
print(f"True Negative: {TN_logistic}")
print(f"False Positive: {FP_logistic}")
print(f"False Negative: {FN_logistic}")
print(f"True Positive: {TP_logistic}")
print(f"Correct Predictions: {accuracy_logistic:.2f}%")


In [None]:
# KNN Model

# 5 nearest neighbours with the Minkowski metric at p=2
knn_model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn_model.fit(X_train, Y_train)
Y_pred_knn = knn_model.predict(X_test)

# Confusion matrix, flattened row by row into TN, FP, FN, TP
confusion_knn = confusion_matrix(Y_test, Y_pred_knn)
TN_knn, FP_knn, FN_knn, TP_knn = confusion_knn.ravel()

# Accuracy in percent: correct predictions over all test samples
accuracy_knn = (TP_knn + TN_knn) / confusion_knn.sum() * 100

# Text classification report for the test split
report_knn = classification_report(Y_test, Y_pred_knn)

# Summary output
print("Model: K-Nearest Neighbors")
print("Classification Report:\n", report_knn)
print("Confusion Matrix:\n", confusion_knn)
print(f"True Negative: {TN_knn}")
print(f"False Positive: {FP_knn}")
print(f"False Negative: {FN_knn}")
print(f"True Positive: {TP_knn}")
print(f"Correct Predictions: {accuracy_knn:.2f}%")


In [None]:
# SVM Model

# Support vector classifier with a linear kernel
svm_model = SVC(kernel='linear', random_state=0)
svm_model.fit(X_train, Y_train)
Y_pred_svm = svm_model.predict(X_test)

# Confusion matrix, flattened row by row into TN, FP, FN, TP
confusion_svm = confusion_matrix(Y_test, Y_pred_svm)
TN_svm, FP_svm, FN_svm, TP_svm = confusion_svm.ravel()

# Accuracy in percent: correct predictions over all test samples
accuracy_svm = (TP_svm + TN_svm) / confusion_svm.sum() * 100

# Text classification report for the test split
report_svm = classification_report(Y_test, Y_pred_svm)

# Summary output
print("Model: Support Vector Machine")
print("Classification Report:\n", report_svm)
print("Confusion Matrix:\n", confusion_svm)
print(f"True Negative: {TN_svm}")
print(f"False Positive: {FP_svm}")
print(f"False Negative: {FN_svm}")
print(f"True Positive: {TP_svm}")
print(f"Correct Predictions: {accuracy_svm:.2f}%")


In [None]:
# Naive Bayes Model

# Gaussian Naïve Bayes with default settings
nb_model = GaussianNB()
nb_model.fit(X_train, Y_train)
Y_pred_nb = nb_model.predict(X_test)

# Confusion matrix, flattened row by row into TN, FP, FN, TP
confusion_nb = confusion_matrix(Y_test, Y_pred_nb)
TN_nb, FP_nb, FN_nb, TP_nb = confusion_nb.ravel()

# Accuracy in percent: correct predictions over all test samples
accuracy_nb = (TP_nb + TN_nb) / confusion_nb.sum() * 100

# Text classification report for the test split
report_nb = classification_report(Y_test, Y_pred_nb)

# Summary output
print("Model: Naïve Bayes")
print("Classification Report:\n", report_nb)
print("Confusion Matrix:\n", confusion_nb)
print(f"True Negative: {TN_nb}")
print(f"False Positive: {FP_nb}")
print(f"False Negative: {FN_nb}")
print(f"True Positive: {TP_nb}")
print(f"Correct Predictions: {accuracy_nb:.2f}%")


In [None]:
# Decision Tree Model

# Single decision tree using the entropy split criterion; seeded for repeatability
dt_model = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt_model.fit(X_train, Y_train)
Y_pred_dt = dt_model.predict(X_test)

# Confusion matrix, flattened row by row into TN, FP, FN, TP
confusion_dt = confusion_matrix(Y_test, Y_pred_dt)
TN_dt, FP_dt, FN_dt, TP_dt = confusion_dt.ravel()

# Accuracy in percent: correct predictions over all test samples
accuracy_dt = (TP_dt + TN_dt) / confusion_dt.sum() * 100

# Text classification report for the test split
report_dt = classification_report(Y_test, Y_pred_dt)

# Summary output
print("Model: Decision Tree")
print("Classification Report:\n", report_dt)
print("Confusion Matrix:\n", confusion_dt)
print(f"True Negative: {TN_dt}")
print(f"False Positive: {FP_dt}")
print(f"False Negative: {FN_dt}")
print(f"True Positive: {TP_dt}")
print(f"Correct Predictions: {accuracy_dt:.2f}%")


In [None]:
# Random Forest Model

# Forest of 10 entropy-criterion trees; seeded for repeatability
rf_model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
rf_model.fit(X_train, Y_train)
Y_pred_rf = rf_model.predict(X_test)

# Confusion matrix, flattened row by row into TN, FP, FN, TP
confusion_rf = confusion_matrix(Y_test, Y_pred_rf)
TN_rf, FP_rf, FN_rf, TP_rf = confusion_rf.ravel()

# Accuracy in percent: correct predictions over all test samples
accuracy_rf = (TP_rf + TN_rf) / confusion_rf.sum() * 100

# Text classification report for the test split
report_rf = classification_report(Y_test, Y_pred_rf)

# Summary output
print("Model: Random Forest")
print("Classification Report:\n", report_rf)
print("Confusion Matrix:\n", confusion_rf)
print(f"True Negative: {TN_rf}")
print(f"False Positive: {FP_rf}")
print(f"False Negative: {FN_rf}")
print(f"True Positive: {TP_rf}")
print(f"Correct Predictions: {accuracy_rf:.2f}%")


Model Evaluation

In [None]:
# Comparing All Models

# Candidate classifiers keyed by display name; each is fitted from scratch below
models = {
    "Logistic Regression": LogisticRegression(random_state=0),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    "Support Vector Machine (Linear)": SVC(kernel='linear', random_state=0),
    "Support Vector Machine (RBF)": SVC(kernel='rbf', random_state=0),
    "Naïve Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(criterion='entropy', random_state=0),
    "Random Forest": RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0),
}

# Test-set accuracy (in percent) per model name
results = {}

for model_name, model in models.items():
    # Fit on the training split and predict the test split
    Y_pred = model.fit(X_train, Y_train).predict(X_test)

    # Confusion matrix, flattened row by row into TN, FP, FN, TP
    confusion = confusion_matrix(Y_test, Y_pred)
    TN, FP, FN, TP = confusion.ravel()

    accuracy = (TP + TN) / confusion.sum() * 100
    results[model_name] = accuracy

# One row per model, indexed by name, best accuracy first
results_df = (
    pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])
    .set_index('Model')
    .sort_values(by='Accuracy', ascending=False)
)

# Display the comparison table
print(results_df)
