# Data Preprocessing

Min-max scaling of our features

In [37]:
import pandas as pd
import numpy as np  
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

In [38]:
# For reproducibility of results
np.random.seed(0)

# Load the dataset if selected_features.csv does not exist
if os.path.exists("selected_features.csv"):
    selected_features = pd.read_csv("selected_features.csv")
    print(selected_features.head())
else:
    file_path = "../DataProcessing/rtfDataSet.csv"
    original_data = pd.read_csv(file_path)
    # Get only our features of interest and save them to a local file "selecte_features.csv"
    # 2 7 8 11 14 15 17 18 21 22 23 25 30 31 33 36 40 43 44 50 53 54 56 57 58 59 60 64 66 68 and label
    selected_features = original_data.iloc[:, [2, 7, 8, 11, 14, 15, 17, 18, 21, 22, 23, 25, 30, 31, 33, 36, 40, 43, 44, 50, 53, 54, 56, 57, 58, 59, 60, 64, 66, 68,70]]    
    # Save the selected features to a local file
    selected_features.to_csv("selected_features.csv", index=False)
    print(original_data.head())
    print(selected_features.head())

       2      7      8     11     14     15     17     18     21     22  ...  \
0 -0.684 -0.113  0.401 -0.251 -0.178  0.321  0.016 -0.003 -0.275 -0.162  ...   
1 -0.529 -0.066 -0.168 -0.205  0.020  0.031 -0.165 -0.026 -0.130  0.176  ...   
2 -0.240  0.031 -0.077 -0.036 -0.005  0.170  0.212  0.138 -0.300  0.689  ...   
3 -0.062 -0.143  0.068 -0.295 -0.188 -0.317 -0.113  0.018 -0.272 -0.531  ...   
4 -0.203 -0.168  0.008  0.166 -0.048 -0.291 -0.091  0.062  0.026 -0.036  ...   

      54     56     57     58     59     60     64     66     68  label  
0 -0.263 -0.075 -0.327 -0.198 -0.257 -0.205 -0.223 -0.136 -0.132      0  
1 -0.064  0.004 -0.305 -0.018 -0.230 -0.150 -0.151 -0.031 -0.097      0  
2 -0.149  0.102 -0.189 -0.112 -0.121  0.121 -0.172  0.033 -0.321      0  
3 -0.245 -0.153 -0.405 -0.102 -0.120 -0.141 -0.292 -0.078 -0.151      0  
4  0.043 -0.099 -0.188  0.042 -0.124 -0.097 -0.165 -0.112 -0.138      1  

[5 rows x 31 columns]


In [39]:
labels = selected_features.iloc[:, -1]
features = selected_features.iloc[:, :-1]

features, labels

(            2         7         8        11        14        15        17  \
 0   -0.684000 -0.113000  0.401000 -0.251000 -0.178000  0.321000  0.016000   
 1   -0.529000 -0.066000 -0.168000 -0.205000  0.020000  0.031000 -0.165000   
 2   -0.240000  0.031000 -0.077000 -0.036000 -0.005000  0.170000  0.212000   
 3   -0.062000 -0.143000  0.068000 -0.295000 -0.188000 -0.317000 -0.113000   
 4   -0.203000 -0.168000  0.008000  0.166000 -0.048000 -0.291000 -0.091000   
 ..        ...       ...       ...       ...       ...       ...       ...   
 391 -0.240811  0.160049 -0.172790  0.431347  0.430861 -0.305451 -0.119770   
 392 -0.179418 -0.000023 -0.260852  0.237564 -0.080075 -0.354038 -0.004051   
 393 -0.553610 -0.005744 -0.068708 -0.253044  0.017528 -0.070628 -0.172178   
 394  0.203276 -0.036268 -0.253699  0.382949  0.295444  0.049701 -0.245894   
 395 -0.221108  0.450663 -0.293762  0.042793  0.153844 -0.268618 -0.083691   
 
            18        21        22  ...        53        54   

## Min-Max

In [40]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
dataset_normalized = scaler.fit_transform(features)
dataset_normalized

array([[0.18304732, 0.49900892, 0.89076305, ..., 0.33932274, 0.34881517,
        0.52682455],
       [0.26365055, 0.52229931, 0.43373494, ..., 0.38908086, 0.44834123,
        0.54374094],
       [0.41393656, 0.5703667 , 0.50682731, ..., 0.37456807, 0.50900474,
        0.43547608],
       ...,
       [0.25085283, 0.55215857, 0.51348755, ..., 0.31353283, 0.41787867,
        0.28746834],
       [0.6444493 , 0.53703271, 0.3649004 , ..., 0.57430339, 0.55802938,
        0.67151474],
       [0.42376079, 0.77832656, 0.33272129, ..., 0.38708569, 0.42213365,
        0.63412131]])

## Naive Bayes

In [41]:
from sklearn.naive_bayes import GaussianNB

def naive_bayes_classifier(X_train, y_train, X_test, y_test):
    # Initialize and fit the Naive Bayes model
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    
    # Make predictions and calculate metrics
    y_pred = nb.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    
    return [acc, pre, rec, auc]

## k-Nearest Neighbors Classifier

In [42]:
from sklearn.neighbors import KNeighborsClassifier

def knn_classifier(X_train, y_train, X_test, y_test, k_values, distance_metrics):
    best_metrics = {}
    
    for metric in distance_metrics:
        best_k = None
        best_acc = 0
        
        for k in k_values:
            knn = KNeighborsClassifier(n_neighbors=k, metric=metric)
            knn.fit(X_train, y_train)
            y_pred = knn.predict(X_test)
            
            acc = accuracy_score(y_test, y_pred)
            
            if acc > best_acc:
                best_acc = acc
                best_k = k
                
        # Use the best k to calculate metrics
        knn = KNeighborsClassifier(n_neighbors=best_k, metric=metric)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        
        acc = accuracy_score(y_test, y_pred)
        pre = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_pred)
        
        best_metrics[metric] = [acc, pre, rec, auc]
        
    return best_metrics


## Stratified 10-Fold Cross-Validation

In [43]:
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Initialize empty lists to store metrics
nb_metrics = []
knn_metrics = []

# Initialize lists to store metrics for averaging and standard deviation
nb_acc_list, nb_pre_list, nb_rec_list, nb_auc_list = [], [], [], []
knn_acc_list, knn_pre_list, knn_rec_list, knn_auc_list = [], [], [], []

# Initialize StratifiedKFold and other variables
skf = StratifiedKFold(n_splits=10)
X = dataset_normalized
y = labels
k_values = list(range(1, 16, 2))
distance_metrics = ['euclidean', 'manhattan']

# Stratified 10-Fold CV
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Naive Bayes metrics
    nb_metrics_current = naive_bayes_classifier(X_train, y_train, X_test, y_test)
    nb_acc, nb_pre, nb_rec, nb_auc = nb_metrics_current
    
    # Append to lists for Naive Bayes
    nb_acc_list.append(nb_acc)
    nb_pre_list.append(nb_pre)
    nb_rec_list.append(nb_rec)
    nb_auc_list.append(nb_auc)

    # k-Nearest Neighbors metrics
    knn_result = knn_classifier(X_train, y_train, X_test, y_test, k_values, distance_metrics)
    for metric, metric_values in knn_result.items():
        knn_acc, knn_pre, knn_rec, knn_auc = metric_values
        knn_metrics.append(metric_values)
        
        # Append to lists for kNN
        knn_acc_list.append(knn_acc)
        knn_pre_list.append(knn_pre)
        knn_rec_list.append(knn_rec)
        knn_auc_list.append(knn_auc)

In [44]:

# After the loop ends, calculate average and standard deviation for each metric
# For Naive Bayes
avg_nb_acc, std_nb_acc = np.mean(nb_acc_list), np.std(nb_acc_list)
avg_nb_pre, std_nb_pre = np.mean(nb_pre_list), np.std(nb_pre_list)
avg_nb_rec, std_nb_rec = np.mean(nb_rec_list), np.std(nb_rec_list)
avg_nb_auc, std_nb_auc = np.mean(nb_auc_list), np.std(nb_auc_list)

# For kNN
avg_knn_acc, std_knn_acc = np.mean(knn_acc_list), np.std(knn_acc_list)
avg_knn_pre, std_knn_pre = np.mean(knn_pre_list), np.std(knn_pre_list)
avg_knn_rec, std_knn_rec = np.mean(knn_rec_list), np.std(knn_rec_list)
avg_knn_auc, std_knn_auc = np.mean(knn_auc_list), np.std(knn_auc_list)

# Print or store these values for interpretation
print(f"Naive Bayes Average Accuracy: {avg_nb_acc}, Std: {std_nb_acc}")
print(f"Naive Bayes Average Precision: {avg_nb_pre}, Std: {std_nb_pre}")
print(f"Naive Bayes Average Recall: {avg_nb_rec}, Std: {std_nb_rec}")
print(f"Naive Bayes Average AUC: {avg_nb_auc}, Std: {std_nb_auc}")

print(f"kNN Average Accuracy: {avg_knn_acc}, Std: {std_knn_acc}")
print(f"kNN Average Precision: {avg_knn_pre}, Std: {std_knn_pre}")
print(f"kNN Average Recall: {avg_knn_rec}, Std: {std_knn_rec}")
print(f"kNN Average AUC: {avg_knn_auc}, Std: {std_knn_auc}")

Naive Bayes Average Accuracy: 0.6820512820512821, Std: 0.07965552891865686
Naive Bayes Average Precision: 0.6940705071502284, Std: 0.0902119565080107
Naive Bayes Average Recall: 0.6830952380952382, Std: 0.10870527464062577
Naive Bayes Average AUC: 0.6820739348370928, Std: 0.07974708555709994
kNN Average Accuracy: 0.7817948717948718, Std: 0.056898527854926585
kNN Average Precision: 0.7085877926618243, Std: 0.059667736418086696
kNN Average Recall: 0.985, Std: 0.03201562118716424
kNN Average AUC: 0.7780263157894737, Std: 0.05595138496252241


In [45]:
# Find and print the best metrics for Naive Bayes
best_nb_acc = max(nb_acc_list)
best_nb_pre = max(nb_pre_list)
best_nb_rec = max(nb_rec_list)
best_nb_auc = max(nb_auc_list)

print(f"Best Naive Bayes Accuracy: {best_nb_acc}")
print(f"Best Naive Bayes Precision: {best_nb_pre}")
print(f"Best Naive Bayes Recall: {best_nb_rec}")
print(f"Best Naive Bayes AUC: {best_nb_auc}")

# Find and print the best metrics for kNN
best_knn_acc = max(knn_acc_list)
best_knn_pre = max(knn_pre_list)
best_knn_rec = max(knn_rec_list)
best_knn_auc = max(knn_auc_list)

print(f"Best kNN Accuracy: {best_knn_acc}")
print(f"Best kNN Precision: {best_knn_pre}")
print(f"Best kNN Recall: {best_knn_rec}")
print(f"Best kNN AUC: {best_knn_auc}")

Best Naive Bayes Accuracy: 0.8205128205128205
Best Naive Bayes Precision: 0.8823529411764706
Best Naive Bayes Recall: 0.85
Best Naive Bayes AUC: 0.8223684210526316
Best kNN Accuracy: 0.8974358974358975
Best kNN Precision: 0.8333333333333334
Best kNN Recall: 1.0
Best kNN AUC: 0.8947368421052632
