In [None]:
from typing import Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

from configs.data import MACHINE_LEARNING_DATASET_PATH, MERGED_DATASET_PATH, OUT_PATH, VERSION
from configs.enums import Column, RISKCLASSIFICATIONS
from machine_learning.utils import scale_dataset
from machine_learning.utils import plot_distribution, get_distribution

## Load and split dataset

In [None]:
def split_data(
    dataframe: pd.DataFrame,
    train_fraction: float = 0.7,
    random_state: int = 0,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits the data into a train and test dataset, class by class, so that each risk class is
    divided between the two datasets in the same proportion.

    The rows of every class listed by ``RISKCLASSIFICATIONS.get_values()`` are shuffled with a
    fixed seed; the first ``train_fraction`` of them (rounded down) go to the train dataset and
    the remainder to the test dataset. Original index labels are preserved.
    :param dataframe: pandas.DataFrame, The dataframe to split. Must contain a "country_risk" column.
    :param train_fraction: float, Share of each class assigned to the train dataset (default 0.7).
    :param random_state: int, Seed used when shuffling each class (default 0).
    :return: Tuple[pandas.DataFrame, pandas.DataFrame], A tuple containing the train and test
    dataset respectively.
    :raises ValueError: If ``train_fraction`` is not within [0, 1].
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f"train_fraction must be within [0, 1], got {train_fraction}")

    train_parts = []
    test_parts = []
    for risk_value in RISKCLASSIFICATIONS.get_values():
        class_rows = dataframe[dataframe["country_risk"] == risk_value]
        shuffled = class_rows.sample(frac=1, random_state=random_state)

        # Positional slicing instead of np.split: np.split on a DataFrame goes through the
        # deprecated DataFrame.swapaxes and does not reliably return DataFrames.
        cut = int(train_fraction * len(shuffled))
        train_parts.append(shuffled.iloc[:cut])
        test_parts.append(shuffled.iloc[cut:])

    return pd.concat(train_parts), pd.concat(test_parts)

In [None]:
# Load the machine-learning dataset from the configured Excel file.
df = pd.read_excel(MACHINE_LEARNING_DATASET_PATH)

# Split every risk class separately into a train and a test part (see split_data).
train_df, test_df = split_data(df)

# Oversampling is requested for the training split only (oversample=True vs. False).
# NOTE(review): scale_dataset is assumed to return (full data, features, labels) — confirm
# in machine_learning.utils.
train, x_train, train_labels = scale_dataset(train_df, oversample=True)
test, x_test, test_labels = scale_dataset(test_df, oversample=False)

## Utility function definitions

In [None]:
def print_results(model) -> Tuple:
    """
    Prints a classification report for the train data and for the test data, in that order.

    Reads the notebook-level ``x_train``/``train_labels`` and ``x_test``/``test_labels``.
    :param model: A model whose ``predict`` method will produce the predictions.
    :return: Tuple, containing the train predictions and the test predictions respectively.
    """
    evaluation_sets = (
        ("\n###### Training ######", x_train, train_labels),
        ("\n###### Test ######", x_test, test_labels),
    )

    collected = []
    for banner, features, expected in evaluation_sets:
        predicted = model.predict(features)
        print(banner)
        print(classification_report(expected, predicted))
        collected.append(predicted)

    return tuple(collected)

In [None]:
# Every column of the loaded dataset except the target column is a model feature.
feature_names = df.columns.tolist()
del feature_names[feature_names.index(Column.COUNTRY_RISK)]

In [None]:
def k_cross_validation(model, k: int=10):
    """
    Performs k-fold cross validation for the given model and prints the fold scores and their average.

    The data comes from the notebook-level ``df`` (the ``feature_names`` columns as inputs and
    ``Column.COUNTRY_RISK`` as target), not from ``x_train``/``x_test``.
    :param model: The estimator handed to ``cross_val_score``.
    :param k: int, Passed to ``cross_val_score`` as its ``cv`` argument.
    :return: Tuple, containing the per-fold scores and their average respectively.
    """
    inputs = df[feature_names]
    target = df[Column.COUNTRY_RISK]

    fold_scores = cross_val_score(model, inputs, target, cv=k)
    mean_score = np.average(fold_scores)

    print("Scores:", fold_scores)
    print("Avg:", mean_score)

    return fold_scores, mean_score

### SHAP

In [None]:
def calculate_shap_values(
    model,
    nsamples: int = 100,
    background_samples: Optional[int] = None,
) -> Tuple:
    """
    Calculates the SHAP values of the test data for the given model.

    A ``shap.KernelExplainer`` is built around ``model.predict`` with the notebook-level
    ``x_train`` as background data and is then evaluated on ``x_test``.
    :param model: A fitted model; its ``predict`` method is the function being explained.
    :param nsamples: int, Passed to ``explainer.shap_values`` as ``nsamples`` (default 100).
    :param background_samples: Optional[int], When given, the background data is reduced to this
    many rows with ``shap.sample`` before building the explainer. ``None`` (default) keeps the
    complete ``x_train``.
    :return: Tuple, containing the explainer and the SHAP values respectively.
    """
    if background_samples is None:
        background = x_train
    else:
        # Subsample the background set; the cap on its size is the caller's choice.
        background = shap.sample(x_train, background_samples)

    explainer = shap.KernelExplainer(model.predict, background)
    shap_values = explainer.shap_values(x_test, nsamples=nsamples)
    return explainer, shap_values

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier using 50 neighbours, fitted on the training split.
knn_model = KNeighborsClassifier(n_neighbors=50).fit(x_train, train_labels)

# Print both reports; only the test-set predictions are kept for the following cells.
_, y_pred = print_results(knn_model)

In [None]:
# Cross-validate the KNN configuration using k_cross_validation's default k (10).
# k_cross_validation scores on the full `df`, not on x_train/x_test.
knn_scores, knn_avg_scores = k_cross_validation(knn_model)

In [None]:
from machine_learning.utils import output_incorrectly_predicted_xlsx
# Pass the test rows and the latest `y_pred` (the KNN test predictions when the notebook is run
# top to bottom) to the project helper, tagged "knn".
# NOTE(review): presumably writes the misclassified rows to an .xlsx file — confirm in
# machine_learning.utils.
output_incorrectly_predicted_xlsx(test_df, y_pred, "knn")

In [None]:
# Compute SHAP values for the KNN model on the test split and draw the summary plot.
# NOTE(review): calculate_shap_values explains `model.predict`; confirm that the resulting
# shap_values have the per-class structure that `class_names` expects, and that the column
# order of x_test matches `feature_names`.
_, shap_values = calculate_shap_values(knn_model)
shap.summary_plot(shap_values, x_test, feature_names=feature_names,
                  class_names=RISKCLASSIFICATIONS.get_names())

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings, fitted on the training split.
lr_model = LogisticRegression().fit(x_train, train_labels)

# Print both reports; only the test-set predictions are kept.
_, y_pred = print_results(lr_model)

In [None]:
# Cross-validate the logistic-regression configuration using k_cross_validation's default k (10).
# k_cross_validation scores on the full `df`, not on x_train/x_test.
lr_scores, lr_avg_scores = k_cross_validation(lr_model)

## SVM

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with scikit-learn's default settings, fitted on the training split.
svm_model = SVC().fit(x_train, train_labels)

# Print both reports; only the test-set predictions are kept.
_, y_pred = print_results(svm_model)

In [None]:
# Cross-validate the SVM configuration using k_cross_validation's default k (10).
# k_cross_validation scores on the full `df`, not on x_train/x_test.
svm_scores, svm_avg_scores = k_cross_validation(svm_model)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees with a fixed seed, fitted on the training split.
# An alternative configuration (n_estimators=3000, random_state=4098) is noted here for reference.
rf_model = RandomForestClassifier(n_estimators=500, random_state=42).fit(x_train, train_labels)

# Print both reports; the test-set predictions are reused by the cells below.
_, y_pred = print_results(rf_model)

In [None]:
# This import repeats the one in the KNN section; it keeps the cell runnable on its own.
from machine_learning.utils import output_incorrectly_predicted_xlsx
# Pass the test rows and the latest `y_pred` (the random-forest test predictions when the
# notebook is run top to bottom) to the project helper, tagged "rf".
# NOTE(review): presumably writes the misclassified rows to an .xlsx file — confirm in
# machine_learning.utils.
output_incorrectly_predicted_xlsx(test_df, y_pred, "rf")

In [None]:
# Cross-validate the random-forest configuration using k_cross_validation's default k (10).
# k_cross_validation scores on the full `df`, not on x_train/x_test.
rf_scores, rf_avg_scores = k_cross_validation(rf_model)

In [None]:
# `y_pred` is whatever the most recently executed model cell assigned; when the notebook is run
# top to bottom these are the random-forest test predictions.
# NOTE(review): get_distribution/plot_distribution are project helpers — presumably they compare
# the predictions with the true labels in test_df and plot the result; confirm in
# machine_learning.utils.
distribution = get_distribution(test_df, y_pred)
plot_distribution(distribution)

In [None]:
# https://www.kaggle.com/code/ahmedabdulhamid/best-n-estimators-for-randomforest
def plot_best_n_estimators(max_n_estimators: int, random_state: int=42) -> None:
    """
    Plots the test accuracy of a random forest as a function of the number of trees used.

    A single forest with ``max_n_estimators`` trees is fitted on the notebook-level
    ``x_train``/``train_labels``. For every prefix of 1, 2, ..., ``max_n_estimators`` trees the
    class probabilities predicted for ``x_test`` are averaged, the most probable class is taken
    as the prediction, and its accuracy against ``test_labels`` is plotted.
    :param max_n_estimators: int, The number of trees to fit, i.e. the right end of the plot.
    :param random_state: int, Seed passed to the random forest (default 42).
    :return: None
    """
    # Imported locally because neither name is part of the notebook's top import cell.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score

    forest = RandomForestClassifier(n_estimators=max_n_estimators, random_state=random_state)
    forest.fit(x_train, train_labels)

    # Stacked per-tree probabilities: (n_trees, n_test_samples, n_classes).
    tree_probabilities = np.stack([tree.predict_proba(x_test) for tree in forest.estimators_])

    # Running mean over the first 1..n trees, computed in one pass with a cumulative sum.
    tree_counts = np.arange(1, tree_probabilities.shape[0] + 1)
    running_mean = np.cumsum(tree_probabilities, axis=0) / tree_counts[:, None, None]

    # argmax yields column positions, not labels; map them through classes_ so the comparison
    # with test_labels is also correct when the labels are not exactly 0..n_classes-1.
    scores = [
        accuracy_score(test_labels, forest.classes_[np.argmax(mean_probabilities, axis=1)])
        for mean_probabilities in running_mean
    ]

    plt.figure(figsize=(15, 8))
    # Plot against the real tree count (1-based) so the x axis matches its label.
    plt.plot(tree_counts, scores, linewidth=3)
    plt.title('Random forest test accuracy by number of trees')
    plt.xlabel('num_trees')
    plt.ylabel('accuracy')

In [None]:
# Fits a fresh 5000-tree forest (with the function's default random_state) and plots accuracy
# per tree count; expect this cell to be slow.
plot_best_n_estimators(5000)