In [1]:
# Fixed dependencies - do not remove or change.
import pytest
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/gdrive/')
# Import your dependencies
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap
import seaborn as sns
from sklearn.model_selection import GridSearchCV

Mounted at /content/gdrive/


In [2]:
# Import data
def import_local_data(file_path):

    raw_df = pd.read_excel(file_path)

    return raw_df

In [3]:
#local_file_path = 'https://github.com/Tom12325/Module4c/blob/main/breast-cancer.xls?raw=true'
local_file_path = 'https://github.com/jhall1996/Machine-Learning/blob/main/breast-cancer.xls?raw=true'

In [4]:
# Dont change
raw_data = import_local_data(local_file_path)

Conduct exploratory data analysis and explain your key findings - Examine the data, explain its key features and what they look like. Highlight any fields that are anomalous.

In [None]:
# Explain your key findings
# 286 entries were identified within the dataset
# Within the age column, there are age ranges. These will need to be transformed to record the median of each range.
# Tumour size column includes invalid data entries (dates rather than numeric values). These will be imputed with the mean value.
# Node cap and breast quad columns includes invalid data entries (question marks). As these entries are small in numnber the rows including them will be removed from the dataset.
# There is a higher bias towards no-recurrence-events within the target variable, therefore we will use a larger test size (0.3, usually it would be 0.2)
# Update, see end results for the justification for reversing this decision and using a test size of 0.2


Create any data pre-processing that you will conduct on seen and unseen data. Regardless of the model you use, this dataframe must contain only numeric features and have a strategy for any expected missing values. Any objects can that are needed to handle the test data that are dependent on the training data can be stored in the model class. You are recommended to use sklearn Pipelines or similar functionality to ensure reproducibility.

In [45]:
X = raw_data.iloc[:, :-1].values
y = raw_data.iloc[:, -1].values
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [46]:
y_train_split

array(['no-recurrence-events', 'recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'recurrence-events', 'recurrence-events', 'no-recurrence-events',
       'recurrence-events', 'recurrence-events', 'recurrence-events',
       'no-recurrence-events', 'recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'recurrence-events', 'recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'recurrence-events', 'recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
       'no-recurrence-events', 'no-recurrence-events',
     

In [47]:
class Module4_Model:

    def __init__(self):
        self.model = None
        self.label_encoder = LabelEncoder()

    def preprocess_training_data(self, training_df, train_labels):

        # Convert NumPy array to a DataFrame
        df = pd.DataFrame(training_df, columns=['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig',
                                                 'breast', 'breast-quad', 'irradiat'])

        # Concatenate features and labels into a single DataFrame
        df = pd.concat([df, pd.DataFrame(train_labels, columns=['Class'])], axis=1)

        # Create a LabelEncoder object
        le = LabelEncoder()

        # Convert 'age', 'tumor-size', and 'inv-nodes' columns to string representation
        df['age'] = df['age'].astype(str)
        df['tumor-size'] = df['tumor-size'].astype(str)
        df['inv-nodes'] = df['inv-nodes'].astype(str)

        # Split 'age' column into two columns and convert them to numeric values
        age_ranges = df['age'].str.split('-', expand=True).apply(pd.to_numeric, errors='coerce')
        df['age'] = age_ranges.mean(axis=1, skipna=True)
        df['age'] = df['age'].fillna(df['age'].mean())

        # Encode 'age' column as integer values
        df['age'] = le.fit_transform(df['age'])

        # Encode 'menopause' column as integer values
        df['menopause'] = le.fit_transform(df['menopause'])

        # Split the 'tumor-size' column into two columns and convert them to numeric values
        tumor_ranges = df['tumor-size'].str.split('-', expand=True).apply(pd.to_numeric, errors='coerce')
        df['tumor-size'] = tumor_ranges.mean(axis=1, skipna=True)
        df['tumor-size'] = df['tumor-size'].fillna(df['tumor-size'].mean())

        # Split the 'inv-nodes' column into two columns and convert them to numeric values
        inv_ranges = df['inv-nodes'].str.split('-', expand=True).apply(pd.to_numeric, errors='coerce')
        df['inv-nodes'] = inv_ranges.mean(axis=1, skipna=True)
        df['inv-nodes'] = df['inv-nodes'].fillna(df['inv-nodes'].mean())

        # Remove rows with a "?" in the "node-caps" column
        df = df[df["node-caps"] != "?"]
        # Encode "node-caps" column as integer values
        df["node-caps"] = le.fit_transform(df["node-caps"])

        # Encode "breast" column as integer values
        df["breast"] = le.fit_transform(df["breast"])

        # Remove any rows with a "?" in the "breast-quad" column
        df = df[df["breast-quad"] != "?"]
        # Encode "breast-quad" column as integer values
        df["breast-quad"] = le.fit_transform(df["breast-quad"])

        # Encode "irradiat" column as integer values
        df["irradiat"] = le.fit_transform(df["irradiat"])

        # Split combined DataFrame back into features and labels
        X_train_processed = df.drop('Class', axis=1)
        y_train_processed = df['Class']

        # Fit label encoder to training labels
        self.label_encoder.fit(y_train_processed)
        processed_train_labels = self.label_encoder.transform(y_train_processed)

        return X_train_processed, processed_train_labels

    def preprocess_test_data(self, test_df, test_labels):

        # Convert NumPy array to a DataFrame
        df = pd.DataFrame(test_df, columns=['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig',
                                                 'breast', 'breast-quad', 'irradiat'])

        # Concatenate the features and labels into a single DataFrame
        df = pd.concat([df, pd.DataFrame(test_labels, columns=['Class'])], axis=1)

        # Create a LabelEncoder object
        le = LabelEncoder()

        # Convert 'age', 'tumor-size', and 'inv-nodes' columns to string representation
        df['age'] = df['age'].astype(str)
        df['tumor-size'] = df['tumor-size'].astype(str)
        df['inv-nodes'] = df['inv-nodes'].astype(str)

        # Split 'age' column into two columns and convert them to numeric values
        age_ranges = df['age'].str.split('-', expand=True).apply(pd.to_numeric, errors='coerce')
        df['age'] = age_ranges.mean(axis=1, skipna=True)
        df['age'] = df['age'].fillna(df['age'].mean())

        # Encode 'age' column as integer values
        df['age'] = le.fit_transform(df['age'])

        # Encode'menopause' column as integer values
        df['menopause'] = le.fit_transform(df['menopause'])

        # Split 'tumor-size' column into two columns and convert them to numeric values
        tumor_ranges = df['tumor-size'].str.split('-', expand=True).apply(pd.to_numeric, errors='coerce')
        df['tumor-size'] = tumor_ranges.mean(axis=1, skipna=True)
        df['tumor-size'] = df['tumor-size'].fillna(df['tumor-size'].mean())

        # Split 'inv-nodes' column into two columns and convert them to numeric values
        inv_ranges = df['inv-nodes'].str.split('-', expand=True).apply(pd.to_numeric, errors='coerce')
        df['inv-nodes'] = inv_ranges.mean(axis=1, skipna=True)
        df['inv-nodes'] = df['inv-nodes'].fillna(df['inv-nodes'].mean())

        # Remove rows with a "?" in "node-caps"
        df = df[df["node-caps"] != "?"]
        # Encode "node-caps" column as integer values
        df["node-caps"] = le.fit_transform(df["node-caps"])

        # Encode "breast" column as integer values
        df["breast"] = le.fit_transform(df["breast"])

        # Remove any rows with a "?" in the "breast-quad" column
        df = df[df["breast-quad"] != "?"]
        # Encode  "breast-quad" column as integer values
        df["breast-quad"] = le.fit_transform(df["breast-quad"])

        # Encode "irradiat" column as integer values
        df["irradiat"] = le.fit_transform(df["irradiat"])

        # Split combined dataframe back into features and labels
        X_test_processed = df.drop('Class', axis=1)
        y_test_processed = df['Class']

        # Fit label encoder to test labels
        self.label_encoder.fit(y_test_processed)
        processed_train_labels = self.label_encoder.transform(y_test_processed)

        return X_test_processed, processed_train_labels


In [48]:
# Dont change
my_model = Module4_Model()

In [49]:
X_train, y_train = my_model.preprocess_training_data(X_train_split, y_train_split)

In [50]:
X_train

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,4,0,1012.0,1.0,0,2,1,2,1
1,4,0,27.0,1.0,0,3,0,3,1
2,2,2,22.0,1.0,0,2,1,2,0
3,2,2,22.0,1.0,0,2,0,0,0
4,3,0,37.0,1.0,0,2,0,2,0
...,...,...,...,...,...,...,...,...,...
223,3,0,1012.0,1.0,0,2,0,1,0
224,4,0,37.0,1013.5,1,3,0,1,0
225,4,0,1012.0,1.0,0,1,0,2,0
226,2,2,37.0,1.0,1,3,1,2,1


In [51]:
# Create model, starting with Logistic Regression
from sklearn.preprocessing import StandardScaler
# Create a StandardScaler object
scaler = StandardScaler()
# Fit the scaler to the data and transform the data
X_train_scaled = scaler.fit_transform(X_train)
#Initialise model
classifier = LogisticRegression(max_iter=1000)

In [52]:
#Train model
classifier.fit(X_train_scaled, y_train)
#Define new variable values to predict, this can be modified to test all models on unseen data.
age_pred = 34.5
menopause_pred = 1
tumor_pred = 17
inv_nodes_pred = 2
node_caps_pred = 0
deg_malig_pred = 1
breast_pred = 4
breast_quad_pred = 0
irradiat_pred = 0

In [53]:
X_test, y_test = my_model.preprocess_test_data(X_test_split, y_test_split)

Use your model to make a prediction on unseen data

In [64]:
X_test_scaled = scaler.fit_transform(X_test)

# Predict a new result with logistic regression,0 for no-recurrence-event and 1 for recurrence-event
print(classifier.predict(scaler.transform([[
  age_pred,
  menopause_pred,
  tumor_pred,
  inv_nodes_pred,
  node_caps_pred,
  deg_malig_pred,
  breast_pred,
  breast_quad_pred,
  irradiat_pred
  ]])))

# Predict test set result, 0 for no-recurrence-event and 1 for recurrence-event
y_pred = classifier.predict(X_test_scaled)

# Print Confusion Matrix and Accuracy Score for model performance on test data
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = accuracy_score(y_test,y_pred)
print(f'Logistic Regression Accuracy: {accuracy:.2f}')

[0]
[[36  0]
 [13  7]]
Logistic Regression Accuracy: 0.77




## **Alternative Models**

K Nearest Neighbour

In [65]:
# Fit KNN model to the training data
knn = KNeighborsClassifier(n_neighbors=7, p=2, weights='uniform')
knn.fit(X_train, y_train)

# Predict a new result with KNN, 0 for no-recurrence-event and 1 for recurrence-event
print(knn.predict(scaler.transform([[
  age_pred,
  menopause_pred,
  tumor_pred,
  inv_nodes_pred,
  node_caps_pred,
  deg_malig_pred,
  breast_pred,
  breast_quad_pred,
  irradiat_pred
  ]])))

# Predict test set result
y_pred = knn.predict(X_test)
# Print Confusion Matrix and Accuracy Score for model performance on test data
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = accuracy_score(y_test,y_pred)
print(f'KNN Accuracy: {accuracy:.2f}')

[0]
[[34  2]
 [15  5]]
KNN Accuracy: 0.70




SVM

In [66]:
# Fit SVM model to the training data
sv = SVC(kernel = 'linear', random_state = 0)
sv.fit(X_train, y_train)

# Predict a new result with SVM, 0 for no-recurrence-event and 1 for recurrence-event
print(sv.predict(scaler.transform([[
  age_pred,
  menopause_pred,
  tumor_pred,
  inv_nodes_pred,
  node_caps_pred,
  deg_malig_pred,
  breast_pred,
  breast_quad_pred,
  irradiat_pred
  ]])))

# Predict test set results
y_pred = sv.predict(X_test)

# Print Confusion Matrix and Accuracy Score for model performance on test data
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = accuracy_score(y_test,y_pred)
print(f'SVM Accuracy: {accuracy:.2f}')

[0]
[[33  3]
 [12  8]]
SVM Accuracy: 0.73




Kernel SVM

In [67]:
# Fit Kernel SVM model to the training data
ksv = SVC(kernel = 'poly',degree = 3,random_state = 0)
ksv.fit(X_train, y_train)

# Predict a new result with Kernel SVM,0 for no-recurrence-event and 1 for recurrence-event
print(ksv.predict(scaler.transform([[
  age_pred,
  menopause_pred,
  tumor_pred,
  inv_nodes_pred,
  node_caps_pred,
  deg_malig_pred,
  breast_pred,
  breast_quad_pred,
  irradiat_pred
  ]])))

# Predict test set results
y_pred = ksv.predict(X_test)

# Print Confusion Matrix and Accuracy Score for model performance on test data
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = accuracy_score(y_test,y_pred)
print(f'Kernel SVM Accuracy: {accuracy:.2f}')

[0]
[[33  3]
 [12  8]]
Kernel SVM Accuracy: 0.73




Naive Bayes

In [68]:
# Fit Naive model to the training data
gauss = GaussianNB()
gauss.fit(X_train, y_train)

# Predict a new result with Naive Bayes, 0 for no-recurrence-event and 1 for recurrence-event
print(gauss.predict(scaler.transform([[
  age_pred,
  menopause_pred,
  tumor_pred,
  inv_nodes_pred,
  node_caps_pred,
  deg_malig_pred,
  breast_pred,
  breast_quad_pred,
  irradiat_pred
  ]])))

# Predict test set results
y_pred = gauss.predict(X_test)

# Print Confusion Matrix and Accuracy Score for model performance on test data
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = accuracy_score(y_test,y_pred)
print(f'Naive Bayes Accuracy: {accuracy:.2f}')

[0]
[[16 20]
 [ 5 15]]
Naive Bayes Accuracy: 0.55




Decision Tree

In [69]:
# Create a DecisionTreeClassifier object
dtc = DecisionTreeClassifier()

# Define the parameter grid for the grid search
param_grid = {'criterion': ['gini','entropy'],
              'max_depth': [2, 4, 6, 8, 10],
              'min_samples_split': [2, 4, 6, 8, 10],
              'min_samples_leaf': [1, 2, 3, 4, 5]}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Use the best hyperparameters to create a new decision tree classifier
dtc = DecisionTreeClassifier(**grid_search.best_params_)

# Fit the new classifier to the training data
dtc.fit(X_train, y_train)

# Predict a new result with Decision Tree Classifier, 0 for no-recurrence-event and 1 for recurrence-event
print(dtc.predict(scaler.transform([[
  age_pred,
  menopause_pred,
  tumor_pred,
  inv_nodes_pred,
  node_caps_pred,
  deg_malig_pred,
  breast_pred,
  breast_quad_pred,
  irradiat_pred
  ]])))

# Predict test set results
y_pred = dtc.predict(X_test)

# Print Confusion Matrix and Accuracy Score for model performance on test data
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = accuracy_score(y_test, y_pred)
print(f'Decision Tree Classifier Accuracy: {accuracy:.2f}')

[0]
[[36  0]
 [15  5]]
Decision Tree Classifier Accuracy: 0.73




Random Forest

In [70]:
# Fit Random Forest Classifier model to the training data
rfc = RandomForestClassifier(n_estimators = 10, criterion = 'entropy',random_state=1)
rfc.fit(X_train, y_train)

# Predict a new result with Random Forest Classifier, 0 for no-recurrence-event and 1 for recurrence-event
print(rfc.predict(scaler.transform([[
  age_pred,
  menopause_pred,
  tumor_pred,
  inv_nodes_pred,
  node_caps_pred,
  deg_malig_pred,
  breast_pred,
  breast_quad_pred,
  irradiat_pred
  ]])))

# Predict test set results
y_pred = rfc.predict(X_test)

# Print Confusion Matrix and Accuracy Score for model performance on test data
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = accuracy_score(y_test,y_pred)
print(f'Random Forest Classifier Accuracy: {accuracy:.2f}')

[0]
[[34  2]
 [13  7]]
Random Forest Classifier Accuracy: 0.73




In [None]:
# Asssess the accuracy of your model and explain your key findings
#The best performing model was linear regression at 0.77 accuracy score, possibly because it was dealing with a small dataset.
#The worst performing model was Naive Bayes at 0.55 accuracy score, possibly due to its assumption that all features are independant, which is highly unlikely to be true in a medical context where there are often strong relationships between the variables.
#Most other moedels performed between 0.7 to 0.73
#In the original code I used a test size of 0.3 rather than 0.2. However, I compared this against a size of 0.2 and found that 0.3 achieved a lower accuracy score
#The dataset is relatively small. This has likely resulted in overfitting, therefore despite the high bias towards no-recurrence-events.


Unit tests:

Checking training and test data for null values. This will work for both pd dataframes and np arrays, and ensures no null values exist.

In [35]:
def test_no_nulls(data):
    """ Assert no null values within pd dataframe or np array """

    # if data is numpy array, handle accordingly
    if isinstance(data, (np.ndarray)):
        assert not np.isnan(np.min(data))

    # if not np array, assume data is pandas dataframe
    else:
        assert data.isna().sum().sum() == 0


In [36]:
# run null data unit test on both training and test data
test_no_nulls(X_train)
test_no_nulls(X_test)