In [None]:
import pandas as pd
from configs.data import MACHINE_LEARNING_DATASET_PATH
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
import numpy as np
from configs.enums import Column, RiskClassifications
from machine_learning.utils import split_data, scale_dataset
from sklearn.model_selection import cross_val_score
import shap

In [None]:
# Load the modelling dataset from the Excel file whose path is configured in configs.data.
df = pd.read_excel(MACHINE_LEARNING_DATASET_PATH)

# Partition the frame into three splits with the project helper.
# NOTE(review): split proportions and any shuffling are decided inside split_data,
# which is not visible here - confirm before relying on reproducible splits.
train, valid, test = split_data(df)

# scale_dataset returns three values per split; the 2nd and 3rd are used by later cells
# as the feature matrix and the labels (presumably the 1st is the combined processed
# data - TODO confirm). Each split name is rebound to that first return value.
# Only the training split requests oversampling; validation and test do not.
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

## KNN

In [None]:
def print_results(model, datasets=None) -> None:
    """Print a classification report for each evaluation split.

    Args:
        model: Fitted estimator exposing ``predict(features)``.
        datasets: Optional iterable of ``(title, features, labels)`` triples.
            When omitted, the notebook-level training, validation and test
            splits (``X_train``/``y_train``, ``X_valid``/``y_valid``,
            ``X_test``/``y_test``) are reported, in that order.

    Returns:
        None. The reports are written to standard output.
    """
    if datasets is None:
        # Looked up at call time so the report always reflects the splits that
        # are currently bound in the notebook namespace.
        datasets = (
            ("Training", X_train, y_train),
            ("Validation", X_valid, y_valid),
            ("Test", X_test, y_test),
        )

    for title, features, labels in datasets:
        predictions = np.asarray(model.predict(features))
        # Estimators that return hard labels give a 1-D array, which has no
        # axis 1; only 2-D per-class score matrices are reduced to a class index.
        if predictions.ndim > 1:
            predictions = predictions.argmax(axis=1)
        print(f"\n###### {title} ######")
        print(classification_report(labels, predictions))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline on the training split. fit() returns the estimator
# itself, so construction and training are chained into one expression.
knn_model = KNeighborsClassifier(n_neighbors=200).fit(X_train, y_train)

# Print the per-split classification reports for the fitted model.
print_results(knn_model)

In [None]:
from sklearn.pipeline import make_pipeline

# Every column except the last is treated as a feature; the last column is the label.
X = df[df.columns[:-1]]
y = df[df.columns[-1]]

# KNN is distance-based, so the raw columns are standardised before the neighbour
# search. Wrapping the scaler in a pipeline makes cross_val_score refit it on the
# training part of every fold, so held-out rows never influence the scaling.
# cross_val_score clones the estimator it is given, so knn_model itself is not refitted.
knn_cv_model = make_pipeline(StandardScaler(), knn_model)
score_knn = cross_val_score(knn_cv_model, X, y, cv=10)
print(score_knn)
print("Avg: ", np.average(score_knn))

### SHAP

In [None]:
def shapify(data: pd.DataFrame, model):
    """Build a SHAP kernel explainer for ``model`` and explain the test split of ``data``.

    ``data`` is split and processed with the project helpers ``split_data`` and
    ``scale_dataset``. The processed training features serve as the explainer's
    background data and the processed test features are the rows being explained.

    Args:
        data: Frame to split into training, validation and test parts.
        model: Fitted estimator; its ``predict`` method is the function explained.

    Returns:
        A tuple ``(explainer, shap_values, x_test)`` holding the kernel explainer,
        the SHAP values computed for the test features, and those test features.
    """
    train_part, valid_part, test_part = split_data(data)

    # Only the feature matrices of the training and test parts are consumed below.
    _, background, _ = scale_dataset(train_part, oversample=True)
    # The validation result is not used; the call is retained so the sequence of
    # helper invocations stays exactly as before.
    scale_dataset(valid_part, oversample=False)
    _, x_eval, _ = scale_dataset(test_part, oversample=False)

    explainer = shap.KernelExplainer(model.predict, background)
    return explainer, explainer.shap_values(x_eval, nsamples=100), x_eval

In [None]:
# The SHAP explanation runs on a leading slice of the data and a smaller KNN
# (presumably to keep the kernel explainer's runtime manageable - TODO confirm).
SHAP_SAMPLE_ROWS = 300
SHAP_KNN_NEIGHBORS = 50

shap_df = df.iloc[:SHAP_SAMPLE_ROWS]

# Use SHAP-specific names so the full-dataset splits (X_train, y_train, ...) and
# knn_model that other cells rely on are not overwritten by this subsample.
shap_train = split_data(shap_df)[0]
shap_X_train, shap_y_train = scale_dataset(shap_train, oversample=True)[1:]

shap_knn_model = KNeighborsClassifier(n_neighbors=SHAP_KNN_NEIGHBORS)
shap_knn_model.fit(shap_X_train, shap_y_train)

explainer, shap_values, shap_x_test = shapify(shap_df, shap_knn_model)

# summary_plot needs one name per feature column, not a single column label.
# Assumes the features are every column but the last, as in the cross-validation
# cells - TODO confirm against scale_dataset.
shap_feature_names = list(df.columns[:-1])
shap.summary_plot(shap_values, shap_x_test, feature_names=shap_feature_names,
                  class_names=RiskClassifications.get_names())

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults, trained on the training split.
# fit() returns the estimator, so the fitted model is bound in one step.
lg_model = LogisticRegression().fit(X_train, y_train)

# Print the per-split classification reports for the fitted model.
print_results(lg_model)

## SVM

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with library defaults, trained on the training split.
# fit() returns the estimator, so the fitted model is bound in one step.
svm_model = SVC().fit(X_train, y_train)

# Print the per-split classification reports for the fitted model.
print_results(svm_model)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees with a fixed seed, trained on the training split.
# fit() returns the estimator, so the fitted model is bound in one step.
rf_model = RandomForestClassifier(n_estimators=500, random_state=42).fit(X_train, y_train)

# Print the per-split classification reports for the fitted model.
print_results(rf_model)

In [None]:
# Features are every column except the last; the last column holds the label.
X, y = df[df.columns[:-1]], df[df.columns[-1]]

# 10-fold cross-validation of the random forest on the full frame, followed by
# the per-fold scores and their mean.
score_rf = cross_val_score(rf_model, X, y, cv=10)
print(score_rf)
print("Avg: ", np.mean(score_rf))