<a href="https://colab.research.google.com/github/haimanhamzah/ML-Classification/blob/main/Classification_Development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, make_pipeline

# Storing the training set in a dataframe
all_df = pd.read_excel("TrainDataset2023.xls", index_col=False)

# Dropping the ID column as it is irrelevant to training
all_df = all_df.drop(['ID'], axis=1)
all_df.head()

Unnamed: 0,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,1,144.0,41.0,0,0,0,1,3,3,1,...,0.517172,0.375126,3.325332,0.002314,3880771.5,473.464852,0.000768,0.182615,0.030508,0.000758
1,0,142.0,39.0,1,1,0,0,3,3,1,...,0.444391,0.444391,3.032144,0.005612,2372009.744,59.45971,0.004383,0.032012,0.001006,0.003685
2,1,135.0,31.0,0,0,0,1,2,1,1,...,0.534549,0.534549,2.485848,0.006752,1540027.421,33.935384,0.007584,0.024062,0.000529,0.006447
3,0,12.0,35.0,0,0,0,1,3,3,1,...,0.506185,0.506185,2.606255,0.003755,6936740.794,46.859265,0.005424,0.013707,0.000178,0.004543
4,0,109.0,61.0,1,0,0,0,2,1,1,...,0.462282,0.462282,2.809279,0.006521,1265399.054,39.621023,0.006585,0.034148,0.001083,0.005626


In [None]:
# Data inspection
# We can see below that the missing values are messing with the max values of certain features
# It's also less obvious but there are outliers
all_df.info()
all_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Columns: 119 entries, pCR (outcome) to original_ngtdm_Strength
dtypes: float64(108), int64(11)
memory usage: 372.0 KB


Unnamed: 0,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,...,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,12.6975,56.000208,51.804674,0.5475,2.9025,2.7975,2.83,9.875,6.5625,8.6325,...,0.3957637,0.3911005,2.722189,0.003347,56797170.0,178.311246,32500.03262,0.056935,0.005965,0.029322
std,111.107417,27.137584,10.948522,0.498362,49.932114,49.937068,49.935558,86.092911,70.444284,86.20034,...,0.1666319,0.1615922,0.7648849,0.002419,706384600.0,1045.453432,177545.921568,0.047179,0.008379,0.115915
min,0.0,0.0,23.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,7.05e-11,7.05e-11,-3.2e-16,8e-06,0.0,0.0,0.000248,0.0,0.0,0.0
25%,0.0,38.0,44.516769,0.0,0.0,0.0,0.0,2.0,1.0,1.0,...,0.3199017,0.3184398,2.340783,0.001389,1030473.0,18.76057,0.001826,0.018628,0.00031,0.001464
50%,0.0,55.0,51.019507,1.0,0.0,0.0,0.0,2.0,1.0,1.0,...,0.4095627,0.4054695,2.814884,0.002944,3277334.0,67.929659,0.004383,0.04774,0.00233,0.003276
75%,0.0,73.0,60.0,1.0,1.0,1.0,1.0,3.0,2.0,1.0,...,0.5000049,0.495692,3.304411,0.004798,9079686.0,157.370294,0.013769,0.085321,0.007962,0.009479
max,999.0,144.0,79.603012,1.0,999.0,999.0,999.0,999.0,999.0,999.0,...,0.8773779,0.8571429,4.947427,0.011301,13900010000.0,20764.69379,1000000.0,0.2851,0.060742,1.145601


In [None]:
# DATA PRE-PROCESSING
# dropping rows that have missing pCR values
# since pCR is a classification target, a missing value means its effectively useless for our purposes

j = 0
all_df_new = all_df
for i in all_df_new['pCR (outcome)'] :
  if i == 999 :
    all_df_new = all_df_new.drop(axis=0, index=j)
  j += 1

In [None]:
# Undersampling funciton
# Removes a number of entries (roughly) equal to the number of zeroes minus the number of ones
# This balances the dataset
def undersampler(df):
  import random
  import math

  # The seed for which random zeroes are removed
  seed = 10
  num_instances = len(df.index)

  # Resets the in-built pandas index column to prevent removal errors
  df.index = [x for x in range(num_instances)]

  # initialising variables
  df_target = df['pCR (outcome)']

  zeroes_index_list = []
  return_index_list = []
  row_list = []
  num_ones = 0;

  # Setting seed
  random.seed(seed)

  # Counting the number of ones and zeroes in the dataset
  # Storing the indexes of the zeroes in a list for later use
  j = 0
  for val in df_target :
    if(val == 1) :
      num_ones = num_ones + 1
    elif(val == 0) :
      zeroes_index_list.append(j)
    j = j + 1

  # Modifier - the multiplier augments the number of zeroes passed into the data for slightly varied balancing
  num_ones = math.floor(num_ones*1.03)

  num_zeroes = num_instances - num_ones
  num_zeroes_to_remove = num_zeroes - num_ones

  # loop that removes "num_zeroes_to_remove" amount of zeroes
  for i in range(num_zeroes_to_remove) :
    potential_row = 0
    # Inner loop to prevent the same index getting selected for removal twice
    while True :

      index_matched = False
      potential_row = random.randint(0, num_zeroes-1)

      # Checking if the random index selected has been selected before
      for j in row_list:
        if (j == potential_row) :
          index_matched = True

      # If the random index has been selected before
      # Then you rerun the inner while loop
      # If not
      # Add the index to the index removal list and continue the for loop
      if (index_matched == False) :
        row_list.append(potential_row)
        return_index_list.append(zeroes_index_list[potential_row])
        break
  # Returns the balanced dataframe with the randomly selected zero indexes dropped
  return df.drop(index=return_index_list)

In [None]:
# Running the undersampler function above
all_df_new = undersampler(all_df_new)
all_df_new.describe()

Unnamed: 0,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
count,172.0,172.0,172.0,172.0,172.0,172.0,172.0,172.0,172.0,172.0,...,172.0,172.0,172.0,172.0,172.0,172.0,172.0,172.0,172.0,172.0
mean,0.488372,56.935078,51.931869,0.482558,6.174419,6.19186,6.145349,14.0,7.436047,12.755814,...,0.3983954,0.3937721,2.700786,0.00328,18156560.0,263.853946,34883.744128,0.058276,0.006188,0.020821
std,0.501324,26.518665,10.920595,0.501155,76.146472,76.145163,76.148645,107.151366,76.052394,107.286133,...,0.1699934,0.1643529,0.8222805,0.002427,57221850.0,1584.175457,184020.997341,0.0482,0.008611,0.063563
min,0.0,9.0,23.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,8.65e-09,8.65e-09,-3.2e-16,5.9e-05,0.0,0.0,0.000248,0.0,0.0,0.0
25%,0.0,38.0,44.179329,0.0,0.0,0.0,0.0,2.0,1.0,1.0,...,0.3275055,0.3275053,2.251629,0.001413,1030473.0,18.488987,0.001632,0.019493,0.000341,0.001294
50%,0.0,55.0,51.516633,0.0,0.0,0.0,0.0,2.0,1.0,1.0,...,0.4077443,0.4042952,2.778916,0.002851,3873320.0,75.54666,0.003883,0.049721,0.003079,0.002825
75%,1.0,73.8125,60.972792,1.0,1.0,1.0,1.0,3.0,2.0,1.0,...,0.5092063,0.5032208,3.32659,0.004601,9430736.0,170.971846,0.013578,0.086826,0.008499,0.009583
max,1.0,144.0,78.001369,1.0,999.0,999.0,999.0,999.0,999.0,999.0,...,0.8652006,0.8571429,4.947427,0.011301,548818800.0,20764.69379,1000000.0,0.2851,0.060742,0.518837


In [None]:
all_df_new.info()

# Finds the mode of all of the values in the dataset while excluding missing values
def findModeExcludingMissing(df):
  mode = {}
  # Only looks at the first 12 columns as these are the only ones with missing values
  for col in df.columns[:12]:
    # Find mode excluding 999
    mode[col] = df[df[col] != 999][col].mode().values[0]
  return mode

# Fill in missing values with mode for that column
def replaceMissing(df, mode):
  df_temp = df
  for col in df_temp.columns[:12]:
    # Replace 999 with mode
    df_temp[col] = df_temp[col].replace(999, mode[col])
  return df_temp

# Running above functions and altering dataframe
mode = findModeExcludingMissing(all_df_new)
df_imputed = replaceMissing(all_df_new, mode)

df_imputed.describe()

temp_X = df_imputed.drop(['pCR (outcome)'], axis=1)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 172 entries, 0 to 394
Columns: 119 entries, pCR (outcome) to original_ngtdm_Strength
dtypes: float64(108), int64(11)
memory usage: 161.2 KB


In [None]:
# Function to identify outliers using IQR
def outliers(df):
  index_list = []
  for col in df.columns[1:]:
    if df[col].dtype in [int, float]:

      # Retrieving values for upper and lower quartile and calculating Inter-Quartile Range
      Q1 = df[col].quantile(0.25)
      Q3 = df[col].quantile(0.75)
      IQR = Q3 - Q1

      # Calculating lower and upper bounds for outliers
      lower = Q1 - 1.5 * IQR
      upper = Q3 + 1.5 * IQR

      # Adding instance indexes that are above or below the bounds defined above to a list
      ls = df.index[(df[col] < lower) | (df[col] > upper)]
      index_list.extend(ls)

  # Returns a list of indexes to remove
  return ls

In [None]:
index_list = outliers(df_imputed)

In [None]:
# Function to remove rows with outliers
def remove(df, ls):
  ls = sorted(set(ls))
  df = df.drop(ls)
  return df

In [None]:
df_cleaned = remove(df_imputed, index_list)

In [None]:
df_cleaned.describe()

Unnamed: 0,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
count,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,...,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0
mean,0.489796,57.102041,52.446059,0.469388,0.353741,0.37415,0.340136,2.435374,1.687075,1.156463,...,0.3810371,0.3756277,2.845024,0.003397,14432210.0,307.917803,40816.331082,0.065918,0.007118,0.00403
std,0.501605,27.623782,10.801982,0.500768,0.479765,0.485557,0.475374,0.511083,0.817637,0.364535,...,0.1625453,0.15498,0.7790131,0.002354,51869860.0,1710.522704,198540.944782,0.047162,0.008952,0.004361
min,0.0,9.0,29.670089,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,8.65e-09,8.65e-09,-3.2e-16,5.9e-05,0.0,0.0,0.000248,0.0,0.0,0.0
25%,0.0,38.0,44.651609,0.0,0.0,0.0,0.0,2.0,1.0,1.0,...,0.3221133,0.3221076,2.428997,0.001665,1140288.0,39.475236,0.001424,0.033156,0.001242,0.001175
50%,0.0,53.0,52.76386,0.0,0.0,0.0,0.0,2.0,1.0,1.0,...,0.3993227,0.3976008,3.009937,0.002952,3962034.0,102.144179,0.002827,0.058387,0.003913,0.002333
75%,1.0,74.0,61.065024,1.0,1.0,1.0,1.0,3.0,2.0,1.0,...,0.4797157,0.4709722,3.386186,0.00466,8969372.0,188.010709,0.00705,0.095009,0.010815,0.005846
max,1.0,144.0,78.001369,1.0,1.0,1.0,1.0,3.0,3.0,2.0,...,0.8652006,0.6433012,4.947427,0.011301,548818800.0,20764.69379,1000000.0,0.2851,0.060742,0.021448


In [None]:
# Resetting indexes
num_instances = len(df_cleaned.index)

df_cleaned.index = [x for x in range(num_instances)]

# Defining our test features
X = df_cleaned.drop(['pCR (outcome)', 'RelapseFreeSurvival (outcome)'], axis=1)
X.info()

# Assuming 'pCR (outcome)' is target variable
y = df_cleaned['pCR (outcome)']
ys = pd.DataFrame(y)
ys.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 0 to 146
Columns: 117 entries, Age to original_ngtdm_Strength
dtypes: float64(107), int64(10)
memory usage: 135.5 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 0 to 146
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   pCR (outcome)  147 non-null    int64
dtypes: int64(1)
memory usage: 2.3 KB


In [None]:
#Using standard scaler as a form of data normalisation
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

Xs = scaler.fit_transform(X)

In [None]:
from sklearn.decomposition import PCA

#Use PCA for dimensionality reduction on the normalized data
#This is to showcase and describe the data

# Number of principal components to calculate and reduce dimensionality down to
num_components = 40

# Defining, then Performing PCA
pca = PCA(n_components=num_components)

PCA_Xs = pca.fit_transform(Xs)

PCA_Xs_df = pd.DataFrame(PCA_Xs)
PCA_Xs_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
count,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,...,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0
mean,-1.93345e-16,0.0,-2.4168120000000003e-17,0.0,4.8336240000000005e-17,-7.250436000000001e-17,2.416812e-16,-6.04203e-18,2.416812e-16,-2.1147110000000002e-17,...,-1.5105080000000002e-17,-3.9273200000000005e-17,1.178196e-16,-1.5105080000000002e-17,4.909149e-17,3.021015e-18,-2.1902360000000002e-17,-1.9636600000000002e-17,3.3042350000000005e-17,7.099385e-17
std,5.987535,4.264759,3.777409,3.607293,2.333046,2.082313,1.858338,1.69023,1.523851,1.457192,...,0.4386183,0.4039627,0.3980417,0.3574611,0.3305196,0.2978918,0.2861877,0.2697972,0.2645567,0.253132
min,-13.56563,-12.463665,-5.682804,-10.62777,-3.821287,-7.715582,-4.447819,-6.042677,-3.66439,-4.846696,...,-1.445804,-1.041202,-0.960234,-0.831452,-0.7531869,-0.7238074,-0.8486838,-0.7038179,-0.67543,-0.6383741
25%,-4.440495,-1.851094,-2.588788,-1.97611,-1.550373,-0.9453591,-1.071961,-1.097169,-0.9339072,-0.8835844,...,-0.2475129,-0.2492284,-0.2698923,-0.2568616,-0.2179955,-0.2235073,-0.1624744,-0.1758995,-0.1651839,-0.1890718
50%,-0.6104024,0.075978,-0.5589934,-0.029345,-0.4776996,0.3733739,-0.2278449,0.003710259,-0.1074215,-0.1303295,...,-0.07071992,-0.01045017,-0.0004903137,-0.003280206,-0.01930073,0.0007383751,-0.0003661574,-0.008251205,-0.02601838,0.009651937
75%,3.791971,1.690302,1.447136,1.913455,1.080463,1.226472,0.7516968,1.147191,1.06713,0.7502611,...,0.2596007,0.2817303,0.2502663,0.215702,0.195238,0.1932939,0.1709747,0.1599959,0.1314943,0.1560464
max,23.22714,16.796041,15.49775,14.968212,13.68399,7.769233,8.400241,6.452778,5.225334,7.748973,...,1.534299,1.115943,1.235587,1.519225,0.8528906,0.8951035,0.937534,0.8253841,0.7456923,1.192393


In [None]:
# Implementing test split with 60% of the variables being used for training
X_train, X_test, y_train, y_test = train_test_split(PCA_Xs_df, y, test_size=0.4, random_state=42)

In [None]:
# Class for filling in missing values
# Created to be used in the pipeline
class valueFiller():
  def __init__(self, fitter, transformer):
    self.mode = {}
    self.fitter = fitter
    self.transformer = transformer

  # Saves the mode of the training data to ensure that most accurate modal value
  # is compiled with the test data
  def fit(self, X, y):
    self.mode = self.fitter(X)
    return self

  # Replaces all missing data with mode saved in the fit function
  def transform(self, X):
    return self.transformer(X, self.mode)

In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [None]:
# Feature Selection using Wrapper Methods
# ANOVA
fs = SelectKBest(score_func=f_classif, k='all')
X_train_model = fs.fit_transform(X_train, y_train)
X_test_model = fs.transform(X_test)


# Train a SV classifier
svcC = 0.8
# Polynomial kernel
svcKernel = 'poly'
# Polynomial to the 9th degree
svcDegree = 9

clf = SVC(C=svcC, kernel=svcKernel,
          degree=svcDegree)
clf.fit(X_train_model, y_train)


classifier_score = clf.score(X_test_model, y_test)
print('The classification accuracy  is {:05.4f}'.format(classifier_score))

scores = cross_val_score(clf, X_test_model, y_test, cv=5)

print("Cross-validation scores: ",format(scores))
print("Mean accuracy: ", format(scores.mean()))
print("Standard deviation: ", format(scores.std()))

# Make predictions on the test data
y_pred = clf.predict(X_test_model)

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

The classification accuracy  is 0.6271
Cross-validation scores:  [0.58333333 0.5        0.58333333 0.5        0.63636364]
Mean accuracy:  0.5606060606060607
Standard deviation:  0.05313841792954601
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.94      0.75        35
           1       0.67      0.17      0.27        24

    accuracy                           0.63        59
   macro avg       0.64      0.55      0.51        59
weighted avg       0.64      0.63      0.55        59



In [None]:
# Defining valuefiller object before pipeline creation
po_valueFiller = valueFiller(fitter=findModeExcludingMissing, transformer=replaceMissing)

In [None]:
# Train/Test split to do brief test of pipeline
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [None]:
# Making the pipeline
pipe = make_pipeline(
    po_valueFiller,
    StandardScaler(),
    PCA(n_components=num_components),
    SelectKBest(score_func=f_classif, k='all'),
    clf
)

pipe.fit(X, y)

In [None]:
# briefly testing pipeline
y_pred = pipe.predict(X_test)

scores = cross_val_score(pipe, X_test, y_test, cv=4)

print("Cross-validation scores: ",format(scores))
print("Mean accuracy: ", format(scores.mean()))
print("Standard deviation: ", format(scores.std()))

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Cross-validation scores:  [0.6        0.6        0.53333333 0.64285714]
Mean accuracy:  0.594047619047619
Standard deviation:  0.03917733969501033
Classification Report:
               precision    recall  f1-score   support

           0       0.67      1.00      0.80        35
           1       1.00      0.29      0.45        24

    accuracy                           0.71        59
   macro avg       0.84      0.65      0.63        59
weighted avg       0.81      0.71      0.66        59



In [None]:
import pickle

# Pickling pipeline for use on test set
pickle.dump(pipe, open('classification_model.pkl','wb'))