# Data Preprocessing

Min-max scaling of our features

In [1]:
import pandas as pd
import numpy as np  
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

In [4]:
# Seed NumPy's global RNG so that repeated runs give the same results
np.random.seed(0)

# Reuse the cached feature subset when "selected_features.csv" already exists;
# otherwise build it from the full dataset and cache it for later runs.
if not os.path.exists("selected_features.csv"):
    file_path = "../DataProcessing/rtfDataSet.csv"
    original_data = pd.read_csv(file_path)
    # Column positions of the 30 features of interest, followed by the
    # label column (position 70, the last of the 71 columns).
    columns_of_interest = [
        2, 7, 8, 11, 14, 15, 17, 18, 21, 22,
        23, 25, 30, 31, 33, 36, 40, 43, 44, 50,
        53, 54, 56, 57, 58, 59, 60, 64, 66, 68,
        70,
    ]
    selected_features = original_data.iloc[:, columns_of_interest]
    # Cache the subset locally so the next run can skip the full dataset
    selected_features.to_csv("selected_features.csv", index=False)
    print(original_data.head())
    print(selected_features.head())
else:
    selected_features = pd.read_csv("selected_features.csv")
    print(selected_features.head())

       0      1      2      3      4      5      6      7      8      9  ...  \
0 -0.536  0.292 -0.684  0.123 -0.118  0.346 -0.308 -0.113  0.401 -0.399  ...   
1 -0.496 -0.298 -0.529  0.073 -0.072  0.107 -0.316 -0.066 -0.168 -0.099  ...   
2  0.008 -0.031 -0.240  0.178 -0.120  0.317 -0.481  0.031 -0.077  0.063  ...   
3 -0.188 -0.180 -0.062 -0.104 -0.136 -0.061 -0.216 -0.143  0.068 -0.189  ...   
4  0.038  0.155 -0.203 -0.088 -0.084 -0.164 -0.145 -0.168  0.008  0.143  ...   

      61     62     63     64     65     66     67     68     69  label  
0 -0.135 -0.063 -0.410 -0.223 -0.599 -0.136 -0.329 -0.132 -0.266      0  
1  0.189  0.007 -0.362 -0.151 -0.338 -0.031 -0.159 -0.097 -0.131      0  
2 -0.035  0.101 -0.098 -0.172 -0.290  0.033 -0.342 -0.321 -0.238      0  
3  0.146 -0.145 -0.527 -0.292 -0.663 -0.078 -0.194 -0.151 -0.268      0  
4  0.019 -0.084  0.164 -0.165 -0.126 -0.112  0.029 -0.138  0.148      1  

[5 rows x 71 columns]
       2      7      8     11     14     15     17  

In [5]:
# The last column is the class label; every column before it is a feature.
features = selected_features.iloc[:, :-1]
labels = selected_features.iloc[:, -1]

features, labels

(            2         7         8        11        14        15        17  \
 0   -0.684000 -0.113000  0.401000 -0.251000 -0.178000  0.321000  0.016000   
 1   -0.529000 -0.066000 -0.168000 -0.205000  0.020000  0.031000 -0.165000   
 2   -0.240000  0.031000 -0.077000 -0.036000 -0.005000  0.170000  0.212000   
 3   -0.062000 -0.143000  0.068000 -0.295000 -0.188000 -0.317000 -0.113000   
 4   -0.203000 -0.168000  0.008000  0.166000 -0.048000 -0.291000 -0.091000   
 ..        ...       ...       ...       ...       ...       ...       ...   
 391 -0.240811  0.160049 -0.172790  0.431347  0.430861 -0.305451 -0.119770   
 392 -0.179418 -0.000023 -0.260852  0.237564 -0.080075 -0.354038 -0.004051   
 393 -0.553610 -0.005744 -0.068708 -0.253044  0.017528 -0.070628 -0.172178   
 394  0.203276 -0.036268 -0.253699  0.382949  0.295444  0.049701 -0.245894   
 395 -0.221108  0.450663 -0.293762  0.042793  0.153844 -0.268618 -0.083691   
 
            18        21        22  ...        53        54   

## Min-Max

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with the scaler's default settings.
# Note that the per-column minimum and maximum are learned from all rows,
# i.e. before any train/test split takes place.
scaler = MinMaxScaler()
scaler.fit(features)
dataset_normalized = scaler.transform(features)
dataset_normalized

array([[0.18304732, 0.49900892, 0.89076305, ..., 0.33932274, 0.34881517,
        0.52682455],
       [0.26365055, 0.52229931, 0.43373494, ..., 0.38908086, 0.44834123,
        0.54374094],
       [0.41393656, 0.5703667 , 0.50682731, ..., 0.37456807, 0.50900474,
        0.43547608],
       ...,
       [0.25085283, 0.55215857, 0.51348755, ..., 0.31353283, 0.41787867,
        0.28746834],
       [0.6444493 , 0.53703271, 0.3649004 , ..., 0.57430339, 0.55802938,
        0.67151474],
       [0.42376079, 0.77832656, 0.33272129, ..., 0.38708569, 0.42213365,
        0.63412131]])

## Stratified 10-Fold Cross-Validation

In [7]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Evaluate Gaussian Naive Bayes and kNN with stratified 10-fold cross-validation.
#
# NOTE(review): X was min-max scaled on the full dataset in an earlier cell,
# so each test fold's min/max took part in the scaling. Fitting the scaler
# inside each fold would remove that (mild) leakage.
X = dataset_normalized
y = labels

# Outer folds: used only to measure performance
skf = StratifiedKFold(n_splits=10)

# Per-fold metrics, each row is [accuracy, precision, recall, roc_auc].
# nb_metrics gets one row per fold; knn_metrics gets one row per fold and
# distance metric, in the order of `distance_metrics`.
nb_metrics = []
knn_metrics = []

# Candidate hyper-parameters for kNN (odd k avoids voting ties for two classes)
k_values = list(range(1, 16, 2))
distance_metrics = ['euclidean', 'manhattan']

# Number of inner folds used to pick k from the training data only
inner_splits = 5

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    # The fold indices are positions, so index the label Series positionally;
    # plain y[...] would look the values up by index label instead.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # ## Naive Bayes
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_pred_nb = nb.predict(X_test)
    # ROC AUC ranks samples, so it needs a continuous score (probability of
    # the positive class) rather than the thresholded 0/1 predictions.
    y_score_nb = nb.predict_proba(X_test)[:, 1]
    nb_acc = accuracy_score(y_test, y_pred_nb)
    nb_pre = precision_score(y_test, y_pred_nb)
    nb_rec = recall_score(y_test, y_pred_nb)
    nb_auc = roc_auc_score(y_test, y_score_nb)
    nb_metrics.append([nb_acc, nb_pre, nb_rec, nb_auc])

    # ## k-Nearest Neighbors Classifier
    # Choose k for each distance metric with an inner cross-validation on the
    # training fold. Picking k by its accuracy on the test fold would make the
    # metrics reported below optimistically biased.
    knn_results = {}
    for metric in distance_metrics:
        best_k = None
        # Start below any reachable accuracy so a k is always selected,
        # even when every candidate scores 0.
        best_acc = float("-inf")
        for k in k_values:
            candidate = KNeighborsClassifier(n_neighbors=k, metric=metric)
            acc = cross_val_score(
                candidate, X_train, y_train, cv=inner_splits, scoring="accuracy"
            ).mean()
            # Strict ">" keeps the smallest k when several candidates tie
            if acc > best_acc:
                best_acc = acc
                best_k = k
        knn_results[metric] = best_k

    # Refit with the selected k on the whole training fold and score the
    # untouched test fold.
    for metric, best_k in knn_results.items():
        knn = KNeighborsClassifier(n_neighbors=best_k, metric=metric)
        knn.fit(X_train, y_train)
        y_pred_knn = knn.predict(X_test)
        y_score_knn = knn.predict_proba(X_test)[:, 1]
        knn_acc = accuracy_score(y_test, y_pred_knn)
        knn_pre = precision_score(y_test, y_pred_knn)
        knn_rec = recall_score(y_test, y_pred_knn)
        knn_auc = roc_auc_score(y_test, y_score_knn)
        knn_metrics.append([knn_acc, knn_pre, knn_rec, knn_auc])


## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

def naive_bayes_classifier(X_train, y_train, X_test, y_test):
    """Fit Gaussian Naive Bayes on the training split and score the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``GaussianNB.fit``.
    X_test, y_test
        Held-out features and their true labels.

    Returns
    -------
    list
        ``[accuracy, precision, recall, roc_auc]`` measured on the test split.

    Notes
    -----
    Precision and recall use scikit-learn's default binary averaging, and the
    ROC AUC uses the second probability column, so two classes are assumed.
    """
    # Initialize and fit the Naive Bayes model
    nb = GaussianNB()
    nb.fit(X_train, y_train)

    # Hard predictions for the threshold-based metrics
    y_pred = nb.predict(X_test)
    # ROC AUC ranks samples, so it needs a continuous score (probability of
    # the positive class) rather than the thresholded predictions.
    y_score = nb.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_score)

    return [acc, pre, rec, auc]

## k-Nearest Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def knn_classifier(X_train, y_train, X_test, y_test, k_values, distance_metrics, cv=5):
    """Tune k for each distance metric and score the tuned kNN on the test split.

    For every entry of ``distance_metrics`` the number of neighbours is chosen
    by cross-validated accuracy on the training split only. The winning model
    is then refit on the whole training split and evaluated once on the test
    split, so the test data never influences the choice of k.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and their true labels.
    k_values
        Iterable of candidate ``n_neighbors`` values. Ties go to the
        candidate that comes first.
    distance_metrics
        Iterable of metric names accepted by ``KNeighborsClassifier``.
    cv
        Cross-validation setting forwarded to ``cross_val_score`` for the
        inner model selection (default 5).

    Returns
    -------
    dict
        Maps each distance metric to ``[accuracy, precision, recall, roc_auc]``
        measured on the test split.

    Raises
    ------
    ValueError
        If no value of k could be selected (for example, ``k_values`` is empty).

    Notes
    -----
    Precision and recall use scikit-learn's default binary averaging, and the
    ROC AUC uses the second probability column, so two classes are assumed.
    """
    # Local import keeps this cell runnable on its own
    from sklearn.model_selection import cross_val_score

    best_metrics = {}

    for metric in distance_metrics:
        best_k = None
        # Start below any reachable accuracy so that a candidate is selected
        # even when every one of them scores 0.
        best_acc = float("-inf")

        for k in k_values:
            candidate = KNeighborsClassifier(n_neighbors=k, metric=metric)
            acc = cross_val_score(
                candidate, X_train, y_train, cv=cv, scoring="accuracy"
            ).mean()

            # Strict ">" keeps the earliest candidate on ties
            if acc > best_acc:
                best_acc = acc
                best_k = k

        if best_k is None:
            raise ValueError(
                f"could not select k for metric {metric!r}; "
                "k_values is empty or every candidate failed"
            )

        # Refit the selected configuration on the full training split
        knn = KNeighborsClassifier(n_neighbors=best_k, metric=metric)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        # ROC AUC needs a continuous score, not the thresholded predictions
        y_score = knn.predict_proba(X_test)[:, 1]

        acc = accuracy_score(y_test, y_pred)
        pre = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_score)

        best_metrics[metric] = [acc, pre, rec, auc]

    return best_metrics
