# Import Required Packages

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing import sequence

2024-07-14 14:52:00.303631: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



# Load the dataset.

For this evalution, I am using the IMDB dataset, a collection of 50,000 movie reviews appropriate for training and testing binary sentiment classification models. Because it comes included as part of the sklearn library, it is loaded as an import rather than read from a text file.


In [4]:
# Load the IMDB dataset
num_words = 5000 
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=num_words)

# Pad sequences to a fixed length
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

# Define a function to compute classification metrics

The function will take a confusion matrix as its argument and return a dictionary with all of the Module 8 metrics, which will be in a useful form to be aggregated for tabular presentation after all models have been trained and tested.

In [6]:
def get_classifier_metrics(conf_matrix):
    tp = conf_matrix[1][1]
    tn = conf_matrix[0][0]
    fp = conf_matrix[0][1]
    fn = conf_matrix[1][0]

    p = tp + fn # actual positives
    n = tn + fp # actual negatives

    # Metrics
    tpr = tp / p
    fnr = fn / p
    tnr = tn / n
    fpr = fp / n
    precision = tp / (tp + fp)
    fdr = fp / (tp + fp)
    npv = tn / (tn + fn)
    f1_score = 2 * (precision * tpr) / (precision + tpr)
    accuracy = (tp + tn) / (p + n)
    bacc = (tpr + tnr) / 2
    tss = tpr - fpr
    hss = 2 * ((tp * tn - fp * fn) / ((p * n) + (tp + fp) * (tn + fn)))

    # Dictionary for all of the metrics
    metrics_dict = {
        'Sensitivity': tpr,
        'Specificity': tnr,
        'Precision': precision,
        'Negative Predictive Value': npv,
        'False Positive Rate': fpr,
        'False Discovery Rate': fdr,
        'False Negative Rate': fnr,
        'Accuracy': accuracy,
        'F1 Score': f1_score,
        'Balanced Accuracy': bacc,
        'True Skill Statistic': tss,
        'Heidke Skill Score': hss
    }
    
    return metrics_dict

# Train and evaluate the three models

Implementation for LSTM used reference to https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/


In [8]:
# Initialize KFold
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Dictionary to store metrics for each classifier
metrics_dict = {
    'random_forest': [],
    'knn': [],
    'lstm': []
}

# Perform 10-fold cross-validation for each classifier
fold = 1
for train_index, test_index in kf.split(X_train):
    print(f'Fold {fold}/10')
    X_train_fold, X_val_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[test_index]

    # Random Forest
    rf_clf = RandomForestClassifier()
    rf_clf.fit(X_train_fold, y_train_fold)
    y_pred_rf = rf_clf.predict(X_val_fold)
    conf_matrix_rf = confusion_matrix(y_val_fold, y_pred_rf)
    metrics_rf = get_classifier_metrics(conf_matrix_rf)
    metrics_dict['random_forest'].append(metrics_rf)
    print('Random Forest completed')

    # K-Nearest Neighbors
    knn_clf = KNeighborsClassifier()
    knn_clf.fit(X_train_fold, y_train_fold)
    y_pred_knn = knn_clf.predict(X_val_fold)
    conf_matrix_knn = confusion_matrix(y_val_fold, y_pred_knn)
    metrics_knn = get_classifier_metrics(conf_matrix_knn)
    metrics_dict['knn'].append(metrics_knn)
    print('K-Nearest Neighbors completed')
    
    # LSTM - Implementation for this used reference to https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
    lstm_clf = Sequential()
    lstm_clf.add(Embedding(num_words, 32))
    lstm_clf.add(LSTM(100))
    lstm_clf.add(Dense(1, activation='sigmoid'))
    lstm_clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    lstm_clf.fit(X_train_fold, y_train_fold, epochs=2, batch_size=64, verbose=0)
    y_pred_lstm = np.round(lstm_clf.predict(X_val_fold)).astype(int)
    conf_matrix_lstm = confusion_matrix(y_val_fold, y_pred_lstm)
    metrics_lstm = get_classifier_metrics(conf_matrix_lstm)
    metrics_dict['lstm'].append(metrics_lstm)
    print('LSTM completed')

    fold += 1

Fold 1/10
Random Forest completed
K-Nearest Neighbors completed
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 124ms/step
LSTM completed
Fold 2/10
Random Forest completed
K-Nearest Neighbors completed
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 112ms/step
LSTM completed
Fold 3/10
Random Forest completed
K-Nearest Neighbors completed
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 131ms/step
LSTM completed
Fold 4/10
Random Forest completed
K-Nearest Neighbors completed
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 142ms/step
LSTM completed
Fold 5/10
Random Forest completed
K-Nearest Neighbors completed
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 122ms/step
LSTM completed
Fold 6/10
Random Forest completed
K-Nearest Neighbors completed
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 125ms/step
LSTM completed
Fold 7/10
Random Forest completed
K-Nearest Neighbors completed
[1m79/

# Define a function to present the metrics data in tabular form

In [10]:
def create_metrics_dataframe(metrics_dict, classifier_name):
    metrics_df = pd.DataFrame(metrics_dict[classifier_name])
    average_metrics = metrics_df.mean().to_frame().T
    metrics_df = pd.concat([metrics_df, average_metrics], ignore_index=True)
    metrics_df.index = [f'Fold {i+1}' for i in range(metrics_df.shape[0] - 1)] + ['Average']
    
    return metrics_df

# Random Forest Metrics

In [13]:
metrics_df_rf = create_metrics_dataframe(metrics_dict, 'random_forest')
metrics_df_rf

Unnamed: 0,Sensitivity,Specificity,Precision,Negative Predictive Value,False Positive Rate,False Discovery Rate,False Negative Rate,Accuracy,F1 Score,Balanced Accuracy,True Skill Statistic,Heidke Skill Score
Fold 1,0.485194,0.596788,0.572581,0.510116,0.403212,0.427419,0.514806,0.538,0.525277,0.540991,0.081981,0.082337
Fold 2,0.499197,0.551834,0.525338,0.525836,0.448166,0.474662,0.500803,0.5256,0.511934,0.525516,0.051032,0.051103
Fold 3,0.495153,0.549128,0.518613,0.525797,0.450872,0.481387,0.504847,0.5224,0.506612,0.522141,0.044282,0.044345
Fold 4,0.513776,0.56793,0.536833,0.54511,0.43207,0.463167,0.486224,0.5412,0.525052,0.540853,0.081707,0.081825
Fold 5,0.501608,0.57086,0.536543,0.536275,0.42914,0.463457,0.498392,0.5364,0.518488,0.536234,0.072468,0.072643
Fold 6,0.50315,0.591057,0.559545,0.535346,0.408943,0.440455,0.49685,0.5464,0.529851,0.547103,0.094207,0.094547
Fold 7,0.502811,0.56255,0.532766,0.53283,0.43745,0.467234,0.497189,0.5328,0.517355,0.532681,0.065361,0.065478
Fold 8,0.473473,0.589968,0.533514,0.530802,0.410032,0.466486,0.526527,0.532,0.501704,0.53172,0.063441,0.063876
Fold 9,0.497517,0.575077,0.522609,0.55037,0.424923,0.477391,0.502483,0.5376,0.509754,0.536297,0.072594,0.072786
Fold 10,0.526316,0.5626,0.547718,0.541313,0.4374,0.452282,0.473684,0.5444,0.536804,0.544458,0.088916,0.088973


# K-Nearest Neighbors Metrics

In [16]:
metrics_df_knn = create_metrics_dataframe(metrics_dict, 'knn')
metrics_df_knn

Unnamed: 0,Sensitivity,Specificity,Precision,Negative Predictive Value,False Positive Rate,False Discovery Rate,False Negative Rate,Accuracy,F1 Score,Balanced Accuracy,True Skill Statistic,Heidke Skill Score
Fold 1,0.428246,0.575655,0.529081,0.474895,0.424345,0.470919,0.571754,0.498,0.473353,0.501951,0.003901,0.003938
Fold 2,0.432584,0.57177,0.500929,0.503511,0.42823,0.499071,0.567416,0.5024,0.464255,0.502177,0.004355,0.004397
Fold 3,0.42811,0.570523,0.494403,0.504202,0.429477,0.505597,0.57189,0.5,0.458874,0.499316,-0.001367,-0.001381
Fold 4,0.448947,0.56872,0.503636,0.514286,0.43128,0.496364,0.551053,0.5096,0.474722,0.508833,0.017667,0.017794
Fold 5,0.436495,0.596338,0.517143,0.516552,0.403662,0.482857,0.563505,0.5168,0.473409,0.516416,0.032833,0.033258
Fold 6,0.43937,0.546341,0.5,0.485549,0.453659,0.5,0.56063,0.492,0.467728,0.492856,-0.014288,-0.014369
Fold 7,0.460241,0.548207,0.502632,0.505882,0.451793,0.497368,0.539759,0.5044,0.480503,0.504224,0.008448,0.008481
Fold 8,0.431672,0.574841,0.501401,0.505248,0.425159,0.498599,0.568328,0.5036,0.463931,0.503256,0.006513,0.00658
Fold 9,0.44702,0.540248,0.47619,0.510981,0.459752,0.52381,0.55298,0.4952,0.461144,0.493634,-0.012732,-0.01278
Fold 10,0.452951,0.550562,0.503546,0.5,0.449438,0.496454,0.547049,0.5016,0.47691,0.501756,0.003512,0.003529


# LSTM Metrics

In [19]:
metrics_df_gru = create_metrics_dataframe(metrics_dict, 'lstm')
metrics_df_gru

Unnamed: 0,Sensitivity,Specificity,Precision,Negative Predictive Value,False Positive Rate,False Discovery Rate,False Negative Rate,Accuracy,F1 Score,Balanced Accuracy,True Skill Statistic,Heidke Skill Score
Fold 1,0.889901,0.84869,0.867506,0.873803,0.15131,0.132494,0.110099,0.8704,0.878561,0.869296,0.738591,0.739947
Fold 2,0.479936,0.8437,0.753149,0.620164,0.1563,0.246851,0.520064,0.6624,0.586275,0.661818,0.323636,0.346704
Fold 3,0.851373,0.829635,0.830575,0.850528,0.170365,0.169425,0.148627,0.8404,0.840846,0.840504,0.681009,0.681056
Fold 4,0.923825,0.809637,0.825489,0.915996,0.190363,0.174511,0.076175,0.866,0.871893,0.866731,0.733462,0.737452
Fold 5,0.552251,0.846338,0.780682,0.656173,0.153662,0.219318,0.447749,0.7,0.646893,0.699294,0.398588,0.416845
Fold 6,0.912598,0.843902,0.857883,0.903394,0.156098,0.142117,0.087402,0.8788,0.884395,0.87825,0.756501,0.758882
Fold 7,0.781526,0.943426,0.931992,0.813187,0.056574,0.068008,0.218474,0.8628,0.850153,0.862476,0.724952,0.734927
Fold 8,0.79582,0.869427,0.857886,0.811293,0.130573,0.142114,0.20418,0.8328,0.825688,0.832623,0.665247,0.667207
Fold 9,0.48096,0.835913,0.732661,0.632689,0.164087,0.267339,0.51904,0.6644,0.58071,0.658437,0.316874,0.339389
Fold 10,0.868421,0.85313,0.856132,0.865635,0.14687,0.143868,0.131579,0.8608,0.862233,0.860776,0.721551,0.721659


# Show the comparison of the averages for all three models

In [22]:
average_data = {'Random Forest': metrics_df_rf.loc['Average']
               ,'KNN': metrics_df_knn.loc['Average']
               ,'LSTM': metrics_df_gru.loc['Average']
               }
average_metrics_df = pd.DataFrame(average_data)
average_metrics_df

Unnamed: 0,Random Forest,KNN,LSTM
Sensitivity,0.499819,0.440564,0.753661
Specificity,0.571779,0.564321,0.85238
Precision,0.538606,0.502896,0.829395
Negative Predictive Value,0.533379,0.502111,0.794286
False Positive Rate,0.428221,0.435679,0.14762
False Discovery Rate,0.461394,0.497104,0.170605
False Negative Rate,0.500181,0.559436,0.246339
Accuracy,0.53568,0.50236,0.80388
F1 Score,0.518283,0.469483,0.782765
Balanced Accuracy,0.535799,0.502442,0.803021
