g## 0 Imports

In [None]:
oos

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.metrics import confusion_matrix, recall_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [2]:
data = pd.read_csv("../dat/dips/DIPS_Data_cleaned.csv", sep = ",", low_memory = False)
data_change = pd.read_csv("../dat/dips/DIPS_Data_cleaned_change.csv", sep = ",", low_memory = False)
data_pred = pd.read_csv("../dat/dips/DIPS_Data_cleaned_pred.csv", sep = ",", low_memory = False)
data_pred_y = pd.read_csv("../dat/dips/DIPS_Data_cleaned_pred_y.csv", sep = ",", low_memory = False)

## 1 K-Nearest Neighbors Model

In [3]:
# Predictors
X = data_change
X = X.drop(["hpi"], axis = 1)

# Target
y = data_change["hpi"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

print("KNN \n",
      "X_train shape: ", X_train.shape, round(X_train.shape[0]/len(X), 2), "\n",
      "X_test shape: ", X_test.shape, round(X_test.shape[0]/len(X), 2),  "\n",
      "y_train shape: ", y_train.shape, round(y_train.shape[0]/len(y), 2), "\n",
      "y_test shape: ", y_test.shape, round(y_test.shape[0]/len(y), 2), "\n")

KNN 
 X_train shape:  (983, 22) 0.8 
 X_test shape:  (246, 22) 0.2 
 y_train shape:  (983,) 0.8 
 y_test shape:  (246,) 0.2 



In [4]:
def sklearn_knn(x_test_knn, x_train_knn, y_train_knn, y_test_knn):

    # 1: GRID SEARCH
    knn_model = KNeighborsClassifier()

    param_grid = {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    }

    grid_search = GridSearchCV(estimator = knn_model, param_grid = param_grid, cv = 10)
    grid_search.fit(x_train_knn, y_train_knn)
    best_model = grid_search.best_estimator_
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # 2: FITTING THE MODEL
    model = KNeighborsClassifier(n_neighbors = best_model.n_neighbors,
                                 weights = best_model.weights,
                                 metric = best_model.metric)

    model.fit(x_train_knn, y_train_knn)

    # 3: ESTIMATING WEIGHTS
    # Note: KNN does not provide weights in the same way as linear models.
    # Instead, we can use the model to make predictions and evaluate performance.

    # 4: COMPUTE TEST SET PREDICTIONS
    y_pred = model.predict(x_test_knn)
    y_pred_proba = model.predict_proba(x_test_knn)

    # 5: COMPUTE METRICS
    accuracy_knn = model.score(x_test_knn, y_test_knn)
    macro_f1_knn = recall_score(y_test_knn, y_pred, average = 'macro')
    mirco_f1_knn = recall_score(y_test_knn, y_pred, average = 'micro')
    mcc_knn = matthews_corrcoef(y_test_knn, y_pred)

    cm_knn = confusion_matrix(y_test_knn, y_pred)
    precision_knn = cm_knn[1, 1] / (cm_knn[1, 1] + cm_knn[0, 1])
    recall_knn = cm_knn[1, 1] / (cm_knn[1, 1] + cm_knn[1, 0])

    # store metrics in a dictionary
    metrics = {
        "accuracy": accuracy_knn,
        "macro_f1": macro_f1_knn,
        "micro_f1": mirco_f1_knn,
        "mcc": mcc_knn,
        "precision": precision_knn,
        "recall": recall_knn,
        "confusion_matrix": cm_knn
    }

    return y_pred, y_pred_proba, metrics

In [5]:
start = time.time()

y_pred_knn, y_pred_proba_knn, metrics_knn = sklearn_knn(X_test, X_train, y_train, y_test)

end = time.time()
print("Execution time: ", {end - start}, "seconds")
time_knn = end - start
time_knn_df = pd.DataFrame({"time": [time_knn]})
time_knn_df.to_csv("../exp/times_ML/time_knn.csv", sep = ",", index = False)

# save predictions
y_pred_knn = pd.DataFrame(y_pred_knn, columns = ["y_pred"])
y_pred_knn.to_csv("../exp/predictions/y_pred_knn.csv", sep = ",", index = False)

Best parameters:  {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
Best score:  0.7202123273551846
Execution time:  {1.6480400562286377} seconds


In [6]:
metrics_knn

{'accuracy': 0.7154471544715447,
 'macro_f1': 0.5281257676246622,
 'micro_f1': 0.7154471544715447,
 'mcc': np.float64(0.10560729990877482),
 'precision': np.float64(0.4666666666666667),
 'recall': np.float64(0.10144927536231885),
 'confusion_matrix': array([[169,   8],
        [ 62,   7]])}