CLASSIFICATION OF LYMPH NODES

AUTHOR: Laura Banham

DATE STARTED: 05/25/2022

Data obtained from: https://archive.ics.uci.edu/ml/datasets/Lymphography


In [None]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
## Import the dataset
lymphData = pd.read_excel('lymphography.xlsx')

## Format the dataset
# Re-type every object (string) column as categorical. Columns of any other
# dtype, and the original column order, are left exactly as loaded.
stringColumns = lymphData.select_dtypes(include=['object']).columns
cleanLymphData = lymphData.astype({column: 'category' for column in stringColumns})

# Predictors: everything except the target column, with the categorical
# columns one-hot encoded (the first level of each is dropped).
xVars = pd.get_dummies(cleanLymphData.drop(columns=['Class']), drop_first=True)

# Target: the class label column
yVar = cleanLymphData['Class']



In [None]:
## Additional helper code
#print(list(lymphData.columns))
#print(lymphData.Lymphatics.unique())
#lymphData.info()
#cleanLymphData['ClassCat'] = cleanLymphData['Class'].apply(lambda x: 0 if (x == "normal") else 1) 
#print(cleanLymphData[['Class', 'ClassCat']])
#xVars = cleanLymphData.iloc[:, :-2]
#print(list(xVars.columns))
#yVarCat = cleanLymphData.iloc[:, -1]

In [None]:
## Exploratory visualizations of the data
def printPlots(df, target="Class"):
    """Draw one grouped bar chart per predictor column, split by class label.

    For every column of ``df`` other than ``target`` a cross-tabulation of
    that column against ``target`` is built and plotted as an unstacked bar
    chart on its own 8x8 figure.

    The columns are discovered from ``df`` itself rather than from a fixed
    positional range, so the function works for any number of columns and
    for any position of the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the predictor columns and the ``target`` column.
    target : str, default "Class"
        Name of the column whose levels become the bar groups.

    Returns
    -------
    None
        The charts are produced as a side effect.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    for name in df.columns:
        # The target itself is never plotted against itself.
        if name == target:
            continue

        counts_by_class = pd.crosstab(index=df[name],
                                      columns=df[target])

        counts_by_class.plot(kind="bar",
                             figsize=(8, 8),
                             stacked=False,
                             title="{} by {}".format(name, target))
#printPlots(cleanLymphData)

In [None]:
## Checking for multicollinearity in the predictor variables
from statsmodels.stats.outliers_influence import variance_inflation_factor
  
# Build the design matrix once, as plain floats.
# pd.get_dummies can emit boolean indicator columns (the default in recent
# pandas releases); mixed with integer columns, `.values` then yields an
# object-dtype array, which the regressions behind the VIF are not meant to
# receive. An explicit float cast avoids that and also avoids rebuilding the
# array for every feature.
xVarsMatrix = xVars.to_numpy(dtype=float)

# VIF dataframe: one row per predictor with its variance inflation factor
vif_data = pd.DataFrame({
    "feature": xVars.columns,
    "VIF": [variance_inflation_factor(xVarsMatrix, i)
            for i in range(xVarsMatrix.shape[1])],
})

#print(vif_data)
#print(vif_data[vif_data['VIF'] >= 5])

## Removing the variables with VIF > 5
# NOTE(review): this list is hard-coded, presumably from an earlier look at
# vif_data; re-check it against the table above if the data changes.
xVarsSubset = xVars.drop(columns=['Lym_nodes_dimin', 'Lym_nodes_enlar', 'No_of_nodes_in',
                'Early_uptake_in_yes', 'Changes_in_lym_oval', 'Changes_in_lym_round',
                'Changes_in_node_lac_margin', 'Exclusion_of_no_yes'])
#print(cleanLymphData.columns)

In [None]:
# Then split the X and Y datasets into train and test datasets.
# A fixed random_state makes the split repeatable, so the accuracies and
# confusion matrices computed from it are the same after a kernel restart.
xVar_train, xVar_test, yVar_train, yVar_test = train_test_split(
    xVarsSubset, yVar, test_size=0.2, random_state=42)
#xVar_train.info()

MULTINOMIAL LOGISTIC REGRESSION MODEL

In [None]:
from sklearn.linear_model import LogisticRegression

## Fit the model
# The `multi_class` keyword is deprecated in recent scikit-learn releases and
# scheduled for removal. With a non-liblinear solver such as newton-cg the
# default behaviour already fits a multinomial model whenever the target has
# more than two classes (assumes Class has more than two levels - confirm).
multLogitModel = LogisticRegression(solver='newton-cg')
multLogitModel.fit(xVar_train, yVar_train)

## Make predictions
# Predicting the test set results (reused below for the confusion matrix
# and the test accuracy, so the model is only queried once)
y_pred_multLogit = multLogitModel.predict(xVar_test)

## Make confusion matrix
# Rows are the true labels, columns the predicted labels
print(pd.crosstab(yVar_test, y_pred_multLogit,
                  rownames=['Actual'], colnames=['Predicted']))

## Print the accuracy of the model
from sklearn.metrics import accuracy_score
print("Multinomial Logistic Regression Train Dataset Accuracy:", accuracy_score(yVar_train, multLogitModel.predict(xVar_train)))
print("Multinomial Logistic Regression Test Dataset Accuracy:", accuracy_score(yVar_test, y_pred_multLogit))

SUPPORT VECTOR MACHINE MODEL

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

## Finding the best hyperparameters
# gamma is a coefficient of the rbf kernel only, so it is searched only
# there; pairing it with the linear kernel would refit identical models.
params = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100, 1000]
    }
]

# Kept under its own name so the search object and the final classifier are
# not the same variable at different points of the cell.
svmSearch = GridSearchCV(
    estimator=SVC(),
    param_grid=params,
    cv=5,
    n_jobs=5,
    verbose=1
)

svmSearch.fit(xVar_train, yVar_train)
print("SVM Best Parameters:", svmSearch.best_params_)

## Building and fitting the classifier
# Use the winner of the search instead of hand-copied parameter values.
# GridSearchCV refits the best configuration on the full training set by
# default, so best_estimator_ is already fitted.
svmModel = svmSearch.best_estimator_

## Make predictions
# Predicting the test set results (reused below for the confusion matrix
# and the test accuracy)
y_pred_svm = svmModel.predict(xVar_test)

## Make confusion matrix
# Rows are the true labels, columns the predicted labels
print(pd.crosstab(yVar_test, y_pred_svm,
                  rownames=['Actual'], colnames=['Predicted']))

## Print the accuracy of the model
print("SVM Train Dataset Accuracy:", accuracy_score(yVar_train, svmModel.predict(xVar_train)))
print("SVM Test Dataset Accuracy:", accuracy_score(yVar_test, y_pred_svm))


RANDOM FOREST MODEL

In [None]:
from sklearn.ensemble import RandomForestClassifier

## Building and fitting the classifier
# A fixed random_state makes the forest (and therefore its accuracy and
# feature importances) repeatable across runs.
rfMmodel = RandomForestClassifier(random_state=42)
rfMmodel.fit(xVar_train, yVar_train)

## Make predictions
# Predicting the test set results (reused below for the confusion matrix
# and the test accuracy)
y_pred_rf = rfMmodel.predict(xVar_test)

## Make confusion matrix
# Rows are the true labels, columns the predicted labels
print(pd.crosstab(yVar_test, y_pred_rf,
                  rownames=['Actual'], colnames=['Predicted']))

## Print the accuracy of the model
print("Random Forest Train Dataset Accuracy:", accuracy_score(yVar_train, rfMmodel.predict(xVar_train)))
print("Random Forest Test Dataset Accuracy:", accuracy_score(yVar_test, y_pred_rf))


In [None]:
## Print the feature importances
# Saving feature names for later use
feature_list = list(xVar_train.columns)
# Get numerical feature importances
importances = list(rfMmodel.feature_importances_)
# Rank on the raw importances (most important first) so that rounding cannot
# reorder near-ties; the values are rounded for display only.
ranked_features = sorted(zip(feature_list, importances), key=lambda pair: pair[1], reverse=True)
# List of (variable, rounded importance) tuples, most important first
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked_features]
# Print out the feature and importances
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

## Build a new model with just the two most important features
# Take the two top-ranked features from the ranking above rather than from
# hand-copied column names, which go stale whenever the fitted forest changes.
top_features = [feature for feature, _ in feature_importances[:2]]
print("Most important features:", top_features)
# Positions of those features in feature_list (not used further in this cell)
important_indices = [feature_list.index(feature) for feature in top_features]
xVar_train_important = xVar_train[top_features]
xVar_test_important = xVar_test[top_features]
# Fit the classifier (seeded so the reduced model is repeatable)
rf_most_important = RandomForestClassifier(random_state=42)
rf_most_important.fit(xVar_train_important, yVar_train)
# Make predictions on the test set (reused for the test accuracy below)
predictions = rf_most_important.predict(xVar_test_important)

## Print the accuracy of the model
print("Random Forest Most Important Train Dataset Accuracy:", accuracy_score(yVar_train, rf_most_important.predict(xVar_train_important)))
print("Random Forest Most Important Test Dataset Accuracy:", accuracy_score(yVar_test, predictions))