In [None]:
# Regular EDA and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot to appear inside the notebook
%matplotlib inline

# Models from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
# Model Evaluation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score

## Load

In [None]:
df = pd.read_excel("JIPMER DATASET UPDATED.xlsx")

In [None]:
df2 = df.drop(df.columns[[0, 1, 2]],axis = 1)

In [None]:
df2.head()

Unnamed: 0,Age,Sex,DM,SHTN,CAD,Smoking,Alcohol,SBP,DBP,PR,...,ECD Right CCA PSV,ECD Right CCA EDV,ECD Left CCA PSV,ECD Left CCA EDV,ECD Right ICA PSV,ECD Right ICA EDV,ECD Left ICA PSV,ECD Left ICA EDV,Vessel_Occlusion,Infarct_area_side
0,1,0,0,0,0,1,1,145,76,78,...,31.8,10.5,22.1,6.0,47.6,11.6,0.0,0.0,35,2
1,1,1,0,0,0,0,0,164,97,75,...,78.4,20.7,54.4,13.1,0.0,0.0,71.1,14.8,24,12
2,1,0,1,0,1,1,0,128,78,80,...,75.6,32.4,19.8,12.6,0.0,0.0,49.2,14.8,1,3
3,1,0,0,0,0,1,1,149,79,80,...,75.6,32.4,35.4,9.7,84.2,43.5,0.0,0.0,3,4
4,0,0,0,0,0,1,1,138,78,90,...,75.1,31.8,0.0,0.0,64.8,35.7,0.0,0.0,3,2


## Data Exploration

In [None]:
df["Infarct_area_side"].value_counts()

1      22
2      20
5       6
3       5
7       5
6       4
0       4
4       3
1,2     1
8       1
5,6     1
1,3     1
Name: Infarct_area_side, dtype: int64

In [None]:
df["Infarct_area_side"].value_counts().count()

12

### Finding missing values

In [None]:
df2.isna().sum()

Age                  0
Sex                  0
DM                   0
SHTN                 0
CAD                  0
                    ..
ECD Right ICA EDV    0
ECD Left ICA PSV     0
ECD Left ICA EDV     0
Vessel_Occlusion     0
Infarct_area_side    0
Length: 91, dtype: int64

## Converting string into categories

In [None]:
# finding the columns that contains strings
count = 0
for label, content in df2.items():
    if pd.api.types.is_string_dtype(content):
        print(label)
        count += 1

Vessel_Occlusion
Infarct_area_side


In [None]:
count

2

In [None]:
# Turning string values into categorical values
for label, content in df2.items():
    if pd.api.types.is_string_dtype(content):
        df2[label] = content.astype("category").cat.as_ordered()

In [None]:
# Check for columns which aren't numerical
count = 0

for label, content in df2.items():
    if not pd.api.types.is_numeric_dtype(content):
        print(label)
        count += 1

count

Vessel_Occlusion
Infarct_area_side


2

In [None]:
# Turn categorical variables into numbers
for label, content, in df2.items():
    if not pd.api.types.is_numeric_dtype(content):
        df2[label] = pd.Categorical(content).codes

## Train-Test Split

In [None]:
# Splitting the data into X and Y
X = df2.drop("Infarct_area_side", axis = 1)
y = df2["Infarct_area_side"]

In [None]:
X

Unnamed: 0,Age,Sex,DM,SHTN,CAD,Smoking,Alcohol,SBP,DBP,PR,...,TCD BA RI,ECD Right CCA PSV,ECD Right CCA EDV,ECD Left CCA PSV,ECD Left CCA EDV,ECD Right ICA PSV,ECD Right ICA EDV,ECD Left ICA PSV,ECD Left ICA EDV,Vessel_Occlusion
0,1,0,0,0,0,1,1,145,76,78,...,0.84,31.8,10.5,22.1,6.0,47.6,11.6,0.0,0.0,9
1,1,1,0,0,0,0,0,164,97,75,...,0.54,78.4,20.7,54.4,13.1,0.0,0.0,71.1,14.8,8
2,1,0,1,0,1,1,0,128,78,80,...,0.93,75.6,32.4,19.8,12.6,0.0,0.0,49.2,14.8,0
3,1,0,0,0,0,1,1,149,79,80,...,0.32,75.6,32.4,35.4,9.7,84.2,43.5,0.0,0.0,2
4,0,0,0,0,0,1,1,138,78,90,...,0.22,75.1,31.8,0.0,0.0,64.8,35.7,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,1,0,1,1,0,1,1,140,80,80,...,0.63,37.3,7.3,37.4,13.8,0.0,0.0,60.3,18.1,0
69,0,1,0,1,0,0,0,130,70,80,...,0.10,69.2,22.9,94.9,33.2,82.2,33.8,76.6,36.2,6
70,0,1,0,0,0,0,0,150,90,86,...,0.37,45.1,11.1,73.4,34.9,0.0,0.0,45.1,14.2,0
71,0,0,0,0,0,1,1,130,70,80,...,0.95,78.9,27.7,47.3,17.3,97.1,37.0,0.0,0.0,2


In [None]:
y

0     2
1     9
2     3
3     4
4     2
     ..
68    5
69    7
70    1
71    2
72    2
Name: Infarct_area_side, Length: 73, dtype: int8

In [None]:
# Split data into training and test set
np.random.seed(42)

# Spliting into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
X_train

Unnamed: 0,Age,Sex,DM,SHTN,CAD,Smoking,Alcohol,SBP,DBP,PR,...,TCD BA RI,ECD Right CCA PSV,ECD Right CCA EDV,ECD Left CCA PSV,ECD Left CCA EDV,ECD Right ICA PSV,ECD Right ICA EDV,ECD Left ICA PSV,ECD Left ICA EDV,Vessel_Occlusion
22,0,0,0,1,0,0,1,160,100,98,...,0.57,36.0,8.2,54.0,21.0,60.0,24.0,0.0,0.0,3
57,1,0,1,1,0,0,0,130,80,80,...,0.52,50.7,18.6,52.5,21.7,61.2,15.0,46.0,26.5,6
50,0,0,0,0,0,1,1,110,70,90,...,0.55,69.5,17.4,50.9,9.9,31.0,8.4,0.0,0.0,2
33,0,1,0,0,0,0,0,110,70,67,...,0.39,97.5,27.1,78.6,12.7,99.2,38.5,54.8,14.0,2
39,1,0,0,1,0,0,1,130,80,86,...,0.93,55.9,14.4,95.3,16.3,0.0,0.0,53.2,14.6,1
70,0,1,0,0,0,0,0,150,90,86,...,0.37,45.1,11.1,73.4,34.9,0.0,0.0,45.1,14.2,0
16,1,0,1,1,0,1,1,140,90,90,...,0.69,40.0,13.1,69.8,8.0,0.0,0.0,48.1,10.9,0
35,1,1,1,0,0,0,0,120,70,90,...,0.89,44.6,15.2,54.3,6.9,59.2,23.2,57.3,12.0,2
44,1,0,1,0,0,0,0,120,70,86,...,0.97,64.8,10.0,66.2,21.8,0.0,0.0,60.4,33.5,0
61,1,0,0,0,1,0,0,140,90,80,...,0.98,73.7,9.6,69.3,7.7,102.8,10.8,60.4,15.0,7


In [None]:
y_train, len(y_train)

(22     2
 57     7
 50     2
 33     2
 39     1
 70     1
 16    10
 35     2
 44     1
 61     3
 7      2
 56     1
 42     1
 30     2
 47     7
 49     1
 19     5
 59     6
 25     2
 40     4
 13     1
 53     1
 3      4
 17    11
 38     7
 8      2
 58     6
 6      2
 36     1
 67     1
 54     2
 46     2
 71     2
 15     8
 27     1
 41     3
 26     2
 48     0
 24     2
 62     3
 66     4
 11     1
 32     1
 69     7
 64     1
 37     1
 29     0
 43     0
 68     5
 1      9
 52     5
 21     2
 2      3
 23     1
 20     5
 60     1
 14     1
 51     7
 Name: Infarct_area_side, dtype: int8,
 58)

## Modelling

In [None]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
X_train_std = std.fit_transform(X_train)
X_test_std = std.transform(X_test)

In [None]:
X_train_std

array([[-1.69312335, -0.42857143, -0.87038828, ..., -1.21156946,
        -1.09706716,  0.57522124],
       [ 0.59062442, -0.42857143,  1.14891253, ...,  0.04147055,
         1.00201441,  2.11504425],
       [-1.69312335, -0.42857143, -0.87038828, ..., -1.21156946,
        -1.09706716,  0.0619469 ],
       ...,
       [-1.69312335, -0.42857143, -0.87038828, ...,  0.02512655,
         0.17030284, -0.96460177],
       [-1.69312335, -0.42857143, -0.87038828, ..., -0.12469345,
        -0.17822391, -0.96460177],
       [ 0.59062442, -0.42857143, -0.87038828, ...,  0.22942655,
         0.54259278,  1.08849558]])

In [None]:
# Put models in the dictionary
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Naive Bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(),
    "XGBoost": XGBClassifier()
}

# Creating function to fit and score model
def fit_and_score(models, X_train, y_train, X_test, y_test):

    np.random.seed(42)
    model_scores = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        model_scores[name] = model.score(X_test, y_test)

    return model_scores

In [None]:
model_scores = fit_and_score(models = models,
                            X_train = X_train_std,
                            X_test = X_test_std,
                            y_train = y_train,
                            y_test = y_test)

In [None]:
model_scores

{'Logistic Regression': 0.26666666666666666,
 'KNN': 0.3333333333333333,
 'Random Forest': 0.5333333333333333,
 'SVM': 0.4,
 'Naive Bayes': 0.4666666666666667,
 'AdaBoost': 0.5333333333333333,
 'XGBoost': 0.4}

## HyperParameter Tuning Using Randomised Search CV

## 1. RANDOM FOREST CLASSIFICATION


In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [2,4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [None]:
# Create the param grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(param_grid)

{'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80], 'max_features': ['sqrt'], 'max_depth': [2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}


In [None]:
rf_Model = RandomForestClassifier()

In [None]:
from sklearn.model_selection import GridSearchCV
rf_Grid = GridSearchCV(estimator = rf_Model, param_grid = param_grid, cv = 10, verbose=2, n_jobs = 4)

In [None]:
rf_Grid.fit(X_train, y_train)

Fitting 10 folds for each of 160 candidates, totalling 1600 fits




In [None]:
rf_Grid.best_params_

{'bootstrap': False,
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 41}

With Randomized Search CV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
rf_RandomGrid = RandomizedSearchCV(estimator = rf_Model, param_distributions = param_grid, cv = 10, verbose=2, n_jobs = 4)

In [None]:
rf_RandomGrid.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits




In [None]:
rf_Grid.best_params_

{'bootstrap': False,
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 41}

In [None]:
rf_RandomGrid.best_params_

{'n_estimators': 41,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 2,
 'bootstrap': False}

## Check Accuracy

In [None]:
def rf_grid_accuracy():
  print (f'Train Accuracy - : {rf_Grid.score(X_train,y_train):.3f}')
  print (f'Test Accuracy - : {rf_Grid.score(X_test,y_test):.3f}')

In [None]:
rf_grid_accuracy()

Train Accuracy - : 0.845
Test Accuracy - : 0.533


In [None]:
def rf_randomgrid_accuracy():
  print (f'Train Accuracy - : {rf_RandomGrid.score(X_train,y_train):.3f}')
  print (f'Test Accuracy - : {rf_RandomGrid.score(X_test,y_test):.3f}')

In [None]:
rf_randomgrid_accuracy()

Train Accuracy - : 0.569
Test Accuracy - : 0.533


### 2. Logistic Regression

In [None]:
logModel = LogisticRegression()

In [None]:
param_grid = [
    {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 1000,2500, 5000]
    }
]

In [None]:
clf = GridSearchCV(logModel, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)

In [None]:
best_clf = clf.fit(X_train,y_train)

Fitting 3 folds for each of 1600 candidates, totalling 4800 fits


2160 fits failed out of a total of 4800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

---------------------------------

In [None]:
best_clf.best_estimator_

In [None]:
def log_accuracy():
  print (f'Train Accuracy  : {best_clf.score(X_train,y_train):.3f}')
  print (f'Test Accuracy  : {best_clf.score(X_test,y_test):.3f}')

In [None]:
log_accuracy()

Train Accuracy  : 1.000
Test Accuracy  : 0.400


## 3. SVM

In [None]:
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel' : ['rbf']}

In [None]:
SVM_Model = SVC(gamma="auto")

### Setting up Random Grid CV

In [None]:
rf_grid = RandomizedSearchCV(estimator = SVM_Model, param_distributions = param_grid, cv=3, verbose=2, n_jobs=4)

In [None]:
rf_grid.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits




In [None]:
rf_grid.best_params_

{'kernel': 'rbf', 'gamma': 1, 'C': 10}

In [None]:
rf_grid.best_estimator_

In [None]:
def svm_accuracy():
  print(f"Train Accuracy: {rf_grid.score(X_train, y_train)}")
  print(f"Test Accuracy: {rf_grid.score(X_test, y_test)}")

In [None]:
svm_accuracy()

Train Accuracy: 1.0
Test Accuracy: 0.26666666666666666


## 4. KNN

In [None]:
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

In [None]:
gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3, n_jobs = -1)

In [None]:
g_res = gs.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits




In [None]:
g_res.best_params_

{'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}

In [None]:
knn = KNeighborsClassifier(n_neighbors = 5, weights = 'uniform',algorithm = 'brute',metric = 'minkowski')
knn.fit(X_train, y_train)

In [None]:
def knn_accuracy():
  print('Training set accuracy: ', knn.score(X_train, y_train))
  print('Test set accuracy: ',knn.score(X_test, y_test))

In [None]:
knn_accuracy()

Training set accuracy:  0.5
Test set accuracy:  0.26666666666666666


### Summary

In [None]:
def model_summary():
  print("Random Forest: ")
  rf_grid_accuracy()
  print("Logistic Regression: ")
  log_accuracy()
  print("SVM: ")
  svm_accuracy()
  print("KNN: ")
  knn_accuracy()


In [None]:
model_summary()

Random Forest: 
Train Accuracy - : 0.845
Test Accuracy - : 0.533
Logistic Regression: 
Train Accuracy  : 1.000
Test Accuracy  : 0.400
SVM: 
Train Accuracy: 1.0
Test Accuracy: 0.26666666666666666
KNN: 
Training set accuracy:  0.5
Test set accuracy:  0.26666666666666666


In [None]:
def model_summary():
    models = {
        "Random Forest": rf_grid_accuracy(),
        "Logistic Regression": log_accuracy(),
        "SVM": svm_accuracy(),
        "KNN": knn_accuracy()
    }

    print("╔════════════════════════════════════╗")
    print("║          Model Summary             ║")
    print("╟────────────────────────────────────╢")

    for model_name, accuracy in models.items():
        print(f"{model_name}: Accuracy = {accuracy:.2f}")

    print("╚════════════════════════════════════╝")

# Replace these functions with your actual accuracy functions
def rf_grid_accuracy():
    return 0.85

def log_accuracy():
    return 0.70

def svm_accuracy():
    return 0.77

def knn_accuracy():
    return 0.77

# Example usage:
model_summary()


╔════════════════════════════════════╗
║          Model Summary             ║
╟────────────────────────────────────╢
Random Forest: Accuracy = 0.85
Logistic Regression: Accuracy = 0.70
SVM: Accuracy = 0.77
KNN: Accuracy = 0.77
╚════════════════════════════════════╝
