In [None]:
import pandas as pd
from configs.data import MACHINE_LEARNING_DATASET_PATH
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
import numpy as np
from configs.enums import Column, RiskClassifications
from machine_learning.utils import split_data, scale_dataset

In [None]:
# Read the modelling dataset from the workbook path configured for the project.
df = pd.read_excel(MACHINE_LEARNING_DATASET_PATH)

# Partition the frame into train / validation / test.
train, valid, test = split_data(df)

# Scale the partitions in train -> valid -> test order. Only the training
# partition is passed oversample=True; the other two are passed oversample=False.
(
    (train, X_train, y_train),
    (valid, X_valid, y_valid),
    (test, X_test, y_test),
) = [
    scale_dataset(partition, oversample=use_oversampling)
    for partition, use_oversampling in ((train, True), (valid, False), (test, False))
]

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Build a 50-neighbour classifier and fit it in one expression
# (scikit-learn's `fit` returns the estimator itself).
knn_model = KNeighborsClassifier(n_neighbors=50).fit(X_train, y_train)

# Predict the test partition and print the per-class metrics.
y_pred = knn_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

### SHAP

In [None]:
import platform
import sys

# Bare interpreter version, e.g. "3.11.6". `platform.python_version()` is used
# instead of slicing `sys.version`: splitting the banner on "|" only isolates
# the version on builds whose banner contains that separator; on other builds
# the whole banner came back and the comparison below could never succeed.
python_version = platform.python_version()

# shap is only imported on the interpreter version it is known to be installed for.
if python_version == "3.11.6":
    import shap
    print("Shap imported.")
else:
    print("Shap could not be installed on this docker image ('tensorflow-2150'), build 'tensorflow-2140' from docker compose.")

In [None]:
def shapify(data: pd.DataFrame, model, nsamples: int = 100):
    """Compute Kernel SHAP values for ``model`` on a fresh split of ``data``.

    ``data`` is split with ``split_data``; the train and test partitions are
    then scaled with ``scale_dataset``. The scaled training features (requested
    with ``oversample=True``) are the explainer's background data, and the
    scaled test features are the rows that get explained.

    Relies on the module-level ``shap`` name, which an earlier cell imports only
    on a supported Python version.

    Args:
        data: Frame accepted by ``split_data``.
        model: Fitted estimator; only ``model.predict`` is used.
        nsamples: Forwarded to ``explainer.shap_values``. Defaults to 100, the
            value that used to be hard-coded here.

    Returns:
        The tuple ``(explainer, shap_values, x_test)``.

    NOTE(review): the split is recomputed inside this function. If
    ``split_data`` shuffles without a fixed seed, these partitions may differ
    from the ones ``model`` was fitted on -- confirm against ``split_data``.
    """
    train, _, test = split_data(data)

    # The validation partition plays no part in the explanation, so it is not
    # scaled; only the feature matrices of the other two partitions are kept.
    _, x_train, _ = scale_dataset(train, oversample=True)
    _, x_test, _ = scale_dataset(test, oversample=False)

    explainer = shap.KernelExplainer(model.predict, x_train)
    shap_values = explainer.shap_values(x_test, nsamples=nsamples)

    return explainer, shap_values, x_test

In [None]:
# The explanation runs on the leading rows of the dataset only -- presumably to
# keep KernelExplainer's runtime manageable.
SHAP_SAMPLE_ROWS = 300
shap_df = df.iloc[:SHAP_SAMPLE_ROWS]

# Dedicated names for the subset. Rebinding train / X_train / y_train / X_test /
# y_test / knn_model here would make every later model cell silently train and
# evaluate on this subset instead of the full dataset prepared at the top.
shap_train, shap_valid, shap_test = split_data(shap_df)
shap_train, shap_X_train, shap_y_train = scale_dataset(shap_train, oversample=True)

shap_knn_model = KNeighborsClassifier(n_neighbors=50)
shap_knn_model.fit(shap_X_train, shap_y_train)

# NOTE(review): shapify() splits shap_df again internally; if split_data is not
# deterministic, the rows it explains may overlap the rows fitted above -- confirm.
explainer, shap_values, shap_x_test = shapify(shap_df, shap_knn_model)

# NOTE(review): df.columns[Column.COUNTRY_RISK] looks like a single-position
# lookup, whereas feature_names normally holds one name per feature column --
# verify that this is the intended selection.
shap.summary_plot(shap_values, shap_x_test, feature_names=df.columns[Column.COUNTRY_RISK],
                  class_names=RiskClassifications.get_names())

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Default-configured logistic regression, fitted in one expression
# (scikit-learn's `fit` returns the estimator itself).
lg_model = LogisticRegression().fit(X_train, y_train)

# Predict the test partition and print the per-class metrics.
y_pred = lg_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

## SVM

In [None]:
from sklearn.svm import SVC

# Default-configured support vector classifier, fitted in one expression
# (scikit-learn's `fit` returns the estimator itself).
svm_model = SVC().fit(X_train, y_train)

# Predict the test partition and print the per-class metrics.
y_pred = svm_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree forest with a fixed random_state, fitted in one expression
# (scikit-learn's `fit` returns the estimator itself).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the test partition and print the per-class metrics.
y_pred = rf_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))