CLASSIFICATION OF LYMPH NODES

AUTHOR: Laura Banham

DATE STARTED: 05/25/2022

Data obtained from: https://archive.ics.uci.edu/ml/datasets/Lymphography


In [18]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
import matplotlib.pyplot as plt

In [2]:
## Import the dataset
lymphData = pd.read_excel('lymphography.xlsx')

## Format the dataset
# Change all object (string) variables to categorical variables
cleanLymphData = pd.concat([
        lymphData.select_dtypes([], ['object']),
        lymphData.select_dtypes(['object']).apply(pd.Series.astype, dtype='category')
        ], axis=1).reindex(lymphData.columns, axis=1)

# Split the dataset into X and Y variables
xVars = cleanLymphData.drop(['Class'], axis=1)
# One-hot encode the categorical x variables
xVars = pd.get_dummies(data=xVars, drop_first=True)

yVar = cleanLymphData['Class']

In [3]:
## Additional helper code
#print(list(lymphData.columns))
#print(lymphData.Lymphatics.unique())
#lymphData.info()
#cleanLymphData['ClassCat'] = cleanLymphData['Class'].apply(lambda x: 0 if (x == "normal") else 1) 
#print(cleanLymphData[['Class', 'ClassCat']])
#xVars = cleanLymphData.iloc[:, :-2]
#print(list(xVars.columns))
#yVarCat = cleanLymphData.iloc[:, -1]

In [3]:
## Exploratory visualizations of the data
def printPlots(df):
    for i in range(1,19):
        name = df.columns[i]
        clarity_color_table = pd.crosstab(index=df[name]  , 
                          columns=df["Class"])

        clarity_color_table.plot(kind="bar", 
                        figsize=(8,8),
                        stacked=False)
#printPlots(cleanLymphData)

In [4]:
## Checking for multicollinearity in the predictor variables
from statsmodels.stats.outliers_influence import variance_inflation_factor
  
# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = xVars.columns
  
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(xVars.values, i)
                          for i in range(len(xVars.columns))]
  
print(vif_data)
print(vif_data[vif_data['VIF'] >= 5])

## Removing the variables with VIF > 5
xVarsSubset = xVars.drop(['Lym_nodes_dimin', 'Lym_nodes_enlar', 'No_of_nodes_in',
                'Early_uptake_in_yes', 'Changes_in_lym_oval', 'Changes_in_lym_round',
                'Changes_in_node_lac_margin', 'Exclusion_of_no_yes'], axis=1)

                       feature        VIF
0              Lym_nodes_dimin  18.003361
1              Lym_nodes_enlar  22.650022
2               No_of_nodes_in   7.987121
3          Lymphatics_deformed   2.230460
4         Lymphatics_displaced   1.932369
5            Lymphatics_normal   3.975604
6          Block_of_affere_yes   4.021349
7            Bl_of_lymph_c_yes   2.523446
8            Bl_of_lymph_s_yes   2.117263
9                  By_pass_yes   2.895520
10            Extravasates_yes   3.288911
11         Regeneration_of_yes   2.390057
12         Early_uptake_in_yes   5.323929
13         Changes_in_lym_oval  15.705746
14        Changes_in_lym_round  16.267648
15   Defect_in_node_lac_margin   2.947968
16      Defect_in_node_lacunar   2.808634
17           Defect_in_node_no   4.880962
18  Changes_in_node_lac_margin   6.582004
19     Changes_in_node_lacunar   3.510104
20          Changes_in_node_no   3.109767
21     Changes_in_stru_diluted   2.379259
22   Changes_in_stru_drop_like   2

In [63]:
# Then split the X and Y datasets into train and test datasets
xVar_train, xVar_test, yVar_train, yVar_test = train_test_split(xVarsSubset, yVar, test_size=0.5)

MULTINOMIAL LOGISTIC REGRESSION MODEL

In [76]:
from sklearn.linear_model import LogisticRegression

## Fit the model
multLogitModel = LogisticRegression(multi_class='multinomial',solver ='newton-cg')
multLogitModel.fit(xVar_train, yVar_train)

## Make predictions
# Predicting the test set results
y_pred_multLogit = multLogitModel.predict(xVar_test)

## Make confusion matrix
print("Multinomial Logistic Regression Confusion Matrix:")
print(pd.crosstab(yVar_test, y_pred_multLogit))

## Print metrics of the model
print("Multinomial Logistic Regression Train Dataset Model Metrics:")
print(classification_report(yVar_train, multLogitModel.predict(xVar_train),
zero_division=0))
print("Multinomial Logistic Regression Test Dataset Model Metrics:")
print(classification_report(yVar_test, y_pred_multLogit, zero_division=0))



Multinomial Logistic Regression Confusion Matrix:
col_0         malign_lymph  metastases
Class                                 
fibrosis                 2           0
malign_lymph            19          13
metastases               7          31
normal                   0           2
Multinomial Logistic Regression Train Dataset Model Metrics:
              precision    recall  f1-score   support

    fibrosis       1.00      1.00      1.00         2
malign_lymph       0.89      0.83      0.86        29
  metastases       0.89      0.93      0.91        43

    accuracy                           0.89        74
   macro avg       0.93      0.92      0.92        74
weighted avg       0.89      0.89      0.89        74

Multinomial Logistic Regression Test Dataset Model Metrics:
              precision    recall  f1-score   support

    fibrosis       0.00      0.00      0.00         2
malign_lymph       0.68      0.59      0.63        32
  metastases       0.67      0.82      0.74        

In [71]:
# WORK IN PROGRESS

#from yellowbrick.classifier import ROCAUC
#print(xVar_test.shape)
#print(yVar_test.shape)
#oz = ROCAUC(LogisticRegression(), classes=["fibrosis", "malign_lymph", "metastases", "normal"])
#oz.fit(xVar_train, yVar_train)
#oz.score(xVar_test, yVar_test)
#oz.show()

SUPPORT VECTOR MACHINE MODEL

In [77]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

## Finding the best hyperparameters
params = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'linear']
}

svmModel = GridSearchCV(
    estimator=SVC(),
    param_grid=params,
    cv=5,
    n_jobs=5,
    verbose=1
)

svmModel.fit(xVar_train, yVar_train)
print("SVM Best Parameters:", svmModel.best_params_)

## Building and fitting the classifier
svmModel = SVC(kernel='linear', gamma=1, C=1)
svmModel.fit(xVar_train, yVar_train)

## Make predictions
# Predicting the test set results
y_pred_svm = svmModel.predict(xVar_test)

## Make confusion matrix
print("SVM Confusion Matrix:")
print(pd.crosstab(yVar_test, y_pred_svm))

## Print metrics of the model
print("SVM Train Dataset Model Metrics:")
print(classification_report(yVar_train, svmModel.predict(xVar_train)))
print("SVM Test Dataset Model Metrics:")
print(classification_report(yVar_test, y_pred_svm))


Fitting 5 folds for each of 50 candidates, totalling 250 fits




SVM Best Parameters: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
col_0         fibrosis  malign_lymph  metastases
Class                                           
fibrosis             0             2           0
malign_lymph         1            20          11
metastases           0             5          33
normal               0             0           2
SVM Train Dataset Model Metrics:
              precision    recall  f1-score   support

    fibrosis       1.00      1.00      1.00         2
malign_lymph       0.93      0.86      0.89        29
  metastases       0.91      0.95      0.93        43

    accuracy                           0.92        74
   macro avg       0.95      0.94      0.94        74
weighted avg       0.92      0.92      0.92        74

SVM Test Dataset Model Metrics:
              precision    recall  f1-score   support

    fibrosis       0.00      0.00      0.00         2
malign_lymph       0.74      0.62      0.68        32
  metastases       0.72      0.87   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


RANDOM FOREST MODEL

In [74]:
from sklearn.ensemble import RandomForestClassifier

## Building and fitting the classifier
rfMmodel = RandomForestClassifier()
rfMmodel.fit(xVar_train, yVar_train)

## Make predictions
# Predicting the test set results
y_pred_rf = rfMmodel.predict(xVar_test)

## Make confusion matrix
print("Random Forest Confusion Matrix:")
print(pd.crosstab(yVar_test, y_pred_rf))

## Print metrics of the model
print("Random Forest Train Dataset Model Metrics:")
print(classification_report(yVar_train, rfMmodel.predict(xVar_train)))
print("Random Forest Test Dataset Model Metrics:")
print(classification_report(yVar_test, y_pred_rf))


Random Forest Confusion Matrix:
col_0         malign_lymph  metastases
Class                                 
fibrosis                 2           0
malign_lymph            18          14
metastases               3          35
normal                   0           2
Random Forest Train Dataset Model Metrics:
              precision    recall  f1-score   support

    fibrosis       1.00      1.00      1.00         2
malign_lymph       1.00      1.00      1.00        29
  metastases       1.00      1.00      1.00        43

    accuracy                           1.00        74
   macro avg       1.00      1.00      1.00        74
weighted avg       1.00      1.00      1.00        74

Random Forest Test Dataset Model Metrics:
              precision    recall  f1-score   support

    fibrosis       0.00      0.00      0.00         2
malign_lymph       0.78      0.56      0.65        32
  metastases       0.69      0.92      0.79        38
      normal       0.00      0.00      0.00        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
## Print the feature importances
# Saving feature names for later use
feature_list = list(xVar_train.columns)
# Get numerical feature importances
importances = list(rfMmodel.feature_importances_)# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

## Build a new model with just the two most important features
# Extract the two most important features
rf_most_important = RandomForestClassifier()
important_indices = [feature_list.index('Block_of_affere_yes'), feature_list.index('Special_forms_vesicles')]
xVar_train_important = xVar_train[['Block_of_affere_yes', 'Special_forms_vesicles']]
xVar_test_important = xVar_test[['Block_of_affere_yes', 'Special_forms_vesicles']] 
# Fit the classifier
rf_most_important.fit(xVar_train_important, yVar_train)
# Make predictions 
y_pred_rf_most_import = rf_most_important.predict(xVar_test_important)

## Print metrics of the model
print("Random Forest Most Important Features Train Dataset Model Metrics:")
print(classification_report(yVar_train, rf_most_important.predict(xVar_train_important)))
print("Random Forest Most Important Features Test Dataset Model Metrics:")
print(classification_report(yVar_test, y_pred_rf_most_import))

Variable: Block_of_affere_yes  Importance: 0.19
Variable: Special_forms_no     Importance: 0.11
Variable: Special_forms_vesicles Importance: 0.09
Variable: Dislocation_of_yes   Importance: 0.08
Variable: Changes_in_stru_faint Importance: 0.07
Variable: Changes_in_node_lacunar Importance: 0.06
Variable: Lymphatics_deformed  Importance: 0.05
Variable: By_pass_yes          Importance: 0.05
Variable: Extravasates_yes     Importance: 0.05
Variable: Lymphatics_displaced Importance: 0.04
Variable: Defect_in_node_lac_margin Importance: 0.04
Variable: Bl_of_lymph_c_yes    Importance: 0.03
Variable: Regeneration_of_yes  Importance: 0.03
Variable: Defect_in_node_lacunar Importance: 0.03
Variable: Bl_of_lymph_s_yes    Importance: 0.02
Variable: Changes_in_stru_diluted Importance: 0.02
Variable: Changes_in_stru_drop_like Importance: 0.02
Variable: Changes_in_stru_grainy Importance: 0.01
Variable: Changes_in_stru_reticular Importance: 0.01
Variable: Lymphatics_normal    Importance: 0.0
Variable: Def

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
