In [1]:
import pandas as pd
from configs.data import MACHINE_LEARNING_DATASET_PATH, MERGED_DATASET_PATH, OUT_PATH, VERSION
from sklearn.metrics import classification_report
import numpy as np
from configs.enums import Column, RISKCLASSIFICATIONS
from machine_learning.utils import scale_dataset
from sklearn.model_selection import cross_val_score
import shap
from typing import Tuple
import matplotlib.pyplot as plt
from machine_learning.utils import plot_distribution, get_distribution

## Load and split dataset

In [2]:
def split_data(
    dataframe: pd.DataFrame,
    train_frac: float = 0.7,
    random_state: int = 0,
    risk_values=None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``dataframe`` into train and test sets, stratified by ``country_risk``.

    Rows are grouped by their ``country_risk`` value; each group is shuffled
    with a fixed seed and cut at ``train_frac``, so every risk class
    contributes the same proportion of rows to both splits.

    The cut uses positional ``.iloc`` slicing rather than ``np.split``:
    calling ``np.split`` on a DataFrame goes through ``DataFrame.swapaxes``,
    which is deprecated and emits a warning on every call.

    Args:
        dataframe: Source data; must contain a ``country_risk`` column.
        train_frac: Fraction of each risk group assigned to the train split
            (default 0.7, i.e. a 70/30 split).
        random_state: Seed used when shuffling each risk group.
        risk_values: Iterable of ``country_risk`` values to split on.
            Defaults to ``RISKCLASSIFICATIONS.get_values()``.

    Returns:
        ``(train, test)`` DataFrames, each the concatenation of the per-class
        partitions in the order of ``risk_values``.

    Raises:
        ValueError: If ``train_frac`` is outside ``[0, 1]``.
    """
    if not 0 <= train_frac <= 1:
        raise ValueError(f"train_frac must be within [0, 1], got {train_frac}")
    if risk_values is None:
        risk_values = RISKCLASSIFICATIONS.get_values()

    train_parts = []
    test_parts = []
    for risk_value in risk_values:
        risk_rows = dataframe[dataframe["country_risk"] == risk_value]
        # frac=1 returns every row of the group in shuffled order.
        shuffled = risk_rows.sample(frac=1, random_state=random_state)
        cut = int(train_frac * len(shuffled))
        train_parts.append(shuffled.iloc[:cut])
        test_parts.append(shuffled.iloc[cut:])

    train = pd.concat(train_parts)
    test = pd.concat(test_parts)

    return train, test

In [3]:
# Load the machine-learning dataset from the configured Excel file.
df = pd.read_excel(MACHINE_LEARNING_DATASET_PATH)

# Per-risk-class shuffled train/test split (see split_data).
train_df, test_df = split_data(df)

# Only the training split is passed with oversample=True; the test split is
# passed with oversample=False.
# NOTE(review): scale_dataset lives in machine_learning.utils; presumably it
# returns (combined data, feature matrix, labels) - confirm there.
train, x_train, train_labels = scale_dataset(train_df, oversample=True)
test, x_test, test_labels = scale_dataset(test_df, oversample=False)  

'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.


## Utility function definitions

In [4]:
def print_results(model) -> Tuple:
    """Print classification reports for ``model`` on the train and test splits.

    Predicts on the notebook-level ``x_train`` and ``x_test`` matrices and
    prints one ``classification_report`` per split against ``train_labels``
    and ``test_labels`` respectively.

    Args:
        model: A fitted estimator exposing ``predict``.

    Returns:
        ``(train_predictions, test_predictions)`` as returned by
        ``model.predict``.
    """
    stages = (
        ("\n###### Training ######", x_train, train_labels),
        ("\n###### Test ######", x_test, test_labels),
    )

    predictions = []
    for header, features, labels in stages:
        predicted = model.predict(features)
        print(header)
        print(classification_report(labels, predicted))
        predictions.append(predicted)

    return predictions[0], predictions[1]

In [5]:
# Feature columns: every column of df except the target column
# (Column.COUNTRY_RISK). list.remove raises ValueError if the target is absent.
feature_names = df.columns.tolist()
feature_names.remove(Column.COUNTRY_RISK)

In [6]:
def k_cross_validation(model, k=10):
    """Run k-fold cross-validation of ``model`` and print the fold scores.

    The features and target are taken from the notebook-level ``df`` (columns
    ``feature_names`` and ``Column.COUNTRY_RISK``), i.e. the full dataset as
    loaded, not the ``x_train`` / ``x_test`` matrices used elsewhere.

    Args:
        model: Estimator passed to ``cross_val_score``.
        k: Value forwarded as ``cv`` (default 10).

    Returns:
        ``(scores, avg_score)``: the per-fold scores and their average.
    """
    features = df[feature_names]
    target = df[Column.COUNTRY_RISK]

    fold_scores = cross_val_score(model, features, target, cv=k)
    mean_score = np.average(fold_scores)

    print("Scores:", fold_scores)
    print("Avg:", mean_score)

    return fold_scores, mean_score

### Shap

In [7]:
def calculate_shap_values(model, background=None, nsamples=100) -> Tuple:
    """Compute Kernel SHAP values for ``model`` on the notebook-level ``x_test``.

    Args:
        model: Fitted estimator; its ``predict`` method is the function being
            explained.
        background: Background dataset handed to ``shap.KernelExplainer``.
            Defaults to the full notebook-level ``x_train``. KernelExplainer
            cost grows with the number of background rows, so a summarised
            set such as ``shap.sample(x_train, 100)`` or
            ``shap.kmeans(x_train, 10)`` can be passed to shorten the run.
        nsamples: Number of model re-evaluations per explained row,
            forwarded to ``explainer.shap_values`` (default 100).

    Returns:
        ``(explainer, shap_values)``.
    """
    if background is None:
        # Preserve the original behaviour: use the whole training matrix.
        background = x_train

    explainer = shap.KernelExplainer(model.predict, background)
    shap_values = explainer.shap_values(x_test, nsamples=nsamples)
    return explainer, shap_values

## KNN

In [8]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with 50 neighbours, fitted on the training split.
knn_model = KNeighborsClassifier(n_neighbors=50)
knn_model.fit(x_train, train_labels)

# Keep only the test-set predictions; the training predictions are discarded.
# Note: y_pred is reassigned by every later model cell.
_, y_pred = print_results(knn_model)


###### Training ######
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1404
           1       0.94      0.92      0.93      1404
           2       0.87      0.87      0.87      1404
           3       0.83      0.79      0.81      1404
           4       0.77      0.66      0.71      1404
           5       0.70      0.67      0.68      1404
           6       0.70      0.73      0.72      1404
           7       0.80      0.90      0.85      1404
           8       0.95      1.00      0.97      1404

    accuracy                           0.84     12636
   macro avg       0.84      0.84      0.84     12636
weighted avg       0.84      0.84      0.84     12636


###### Test ######
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        26
           1       0.88      0.88      0.88       139
           2       0.83      0.87      0.85       285
           3       0.84      0.81 

In [None]:
# Cross-validation with the default k=10; k_cross_validation scores on the full df,
# not on x_train / x_test.
knn_scores, knn_avg_scores = k_cross_validation(knn_model)

In [9]:
from machine_learning.utils import output_incorrectly_predicted_xlsx
# y_pred is whatever the most recently executed model cell assigned; run this
# directly after the KNN cell so the "knn" tag matches the predictions.
# NOTE(review): presumably writes the mispredicted test rows to an xlsx file -
# confirm in machine_learning.utils.
output_incorrectly_predicted_xlsx(test_df, y_pred, "knn")  

Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value 'low_0' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,year,country,polity2,durable,fragment,gov_instability,gdp_rppp,gdp_rppp_pc,igov_rppp,ipriv_rppp,ippp_rppp,country_risk,predicted_country_risk,norm_risk
5861,1977,Saudi Arabia,-10,51,0,0,742.987793,84862.273916,52.070137,41.002678,0.000000,low_1,low_2,0.206145
7011,1998,United Kingdom,10,118,0,0,2108.601562,36052.395902,38.371407,341.133820,6.571032,low_1,low_0,0.118406
363,2009,Austria,10,63,0,0,424.346100,50860.562375,14.336362,80.998306,0.289296,low_1,low_2,0.216857
1096,2008,Canada,10,120,0,0,1522.818359,45803.018456,63.168842,317.800995,0.000000,low_1,low_0,0.115244
1097,2009,Canada,10,121,0,0,1478.261475,43958.074585,69.042236,268.824738,0.000000,low_1,low_0,0.117408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4440,1968,Mali,-7,8,0,1,1.465057,245.248622,0.020099,0.145065,0.000000,high_0,medium_2,0.677843
2183,2003,Ethiopia,1,8,0,8,51.947659,709.969708,1.967131,1.589661,0.000000,high_0,high_1,0.699851
938,1994,Burundi,0,0,0,4,5.814970,1040.913950,0.030692,0.094637,0.000000,high_0,high_1,0.775914
3836,1993,Liberia,0,0,0,3,0.789138,370.005978,0.008488,0.043327,0.000000,high_1,high_0,0.846401


In [None]:
# Kernel SHAP values for the KNN model on x_test; the explainer itself is discarded.
_, shap_values = calculate_shap_values(knn_model)
# Summary plot labelled with the feature columns and the risk-class names.
shap.summary_plot(shap_values, x_test, feature_names=feature_names,
                  class_names=RISKCLASSIFICATIONS.get_names())

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with library-default hyperparameters, fitted on the training split.
lr_model = LogisticRegression()
lr_model.fit(x_train, train_labels)

# Overwrites y_pred with the logistic-regression test predictions.
_, y_pred = print_results(lr_model)

In [None]:
# Cross-validation with the default k=10; k_cross_validation scores on the full df,
# not on x_train / x_test.
lr_scores, lr_avg_scores = k_cross_validation(lr_model)

## SVM

In [None]:
from sklearn.svm import SVC
# Support-vector classifier with library-default hyperparameters, fitted on the training split.
svm_model = SVC()
svm_model.fit(x_train, train_labels)

# Overwrites y_pred with the SVM test predictions.
_, y_pred = print_results(svm_model)

In [None]:
# Cross-validation with the default k=10; k_cross_validation scores on the full df,
# not on x_train / x_test.
svm_scores, svm_avg_scores = k_cross_validation(svm_model)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Earlier configuration, kept for reference (3000 trees, seed 4098):
# rf_model = RandomForestClassifier(n_estimators=3000, random_state=4098)
# Current configuration: 500 trees with a fixed seed, fitted on the training split.
rf_model = RandomForestClassifier(n_estimators=500, random_state=42) 
rf_model.fit(x_train, train_labels)

# Overwrites y_pred with the random-forest test predictions.
_, y_pred = print_results(rf_model)

In [None]:
# Same helper as imported in the KNN section; re-imported so this cell runs on its own.
from machine_learning.utils import output_incorrectly_predicted_xlsx
# y_pred must still hold the random-forest predictions when this cell runs,
# so that the "rf" tag matches them.
output_incorrectly_predicted_xlsx(test_df, y_pred, "rf")  

In [None]:
# Cross-validation with the default k=10; k_cross_validation scores on the full df,
# not on x_train / x_test.
rf_scores, rf_avg_scores = k_cross_validation(rf_model)

In [None]:
# y_pred is the value left by the most recently executed model cell; when the
# notebook is run top to bottom that is the random forest.
# NOTE(review): get_distribution / plot_distribution come from
# machine_learning.utils; their exact output is not visible here.
distribution = get_distribution(test_df, y_pred)
plot_distribution(distribution)

In [None]:
# https://www.kaggle.com/code/ahmedabdulhamid/best-n-estimators-for-randomforest
def plot_best_n_estimators(max_n_estimators, random_state=42):
    """Plot test accuracy of a random forest as a function of its number of trees.

    A single forest with ``max_n_estimators`` trees is fitted on the
    notebook-level ``x_train`` / ``train_labels``. The per-tree class
    probabilities on ``x_test`` are then averaged cumulatively, so entry
    ``i`` of the curve is the accuracy (against ``test_labels``) of the
    ensemble made of the first ``i + 1`` trees.

    Args:
        max_n_estimators: Number of trees to fit, i.e. the right end of the
            plotted curve.
        random_state: Seed for the forest (default 42).

    Returns:
        None. Draws the curve on a new matplotlib figure.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score

    forest = RandomForestClassifier(n_estimators=max_n_estimators, random_state=random_state)
    forest.fit(x_train, train_labels)

    # Stack per-tree probabilities along a new leading "tree" axis.
    per_tree_proba = np.stack([tree.predict_proba(x_test) for tree in forest.estimators_])

    # Running mean over the tree axis: slice i averages trees 0..i.
    tree_counts = np.arange(1, per_tree_proba.shape[0] + 1)
    running_mean = np.cumsum(per_tree_proba, axis=0) / tree_counts[:, None, None]

    # argmax yields a column index; map it through classes_ so the comparison
    # with test_labels stays correct even if labels are not exactly 0..n-1.
    scores = [
        accuracy_score(test_labels, forest.classes_[np.argmax(mean_proba, axis=1)])
        for mean_proba in running_mean
    ]

    plt.figure(figsize=(15, 8))
    # Plot against the real tree count (1..n) instead of the 0-based index.
    plt.plot(tree_counts, scores, linewidth=3)
    plt.xlabel('num_trees')
    plt.ylabel('accuracy')
    plt.title('Random forest test accuracy vs. number of trees')
# Fits a 5000-tree forest and plots accuracy against the number of trees used.
plot_best_n_estimators(5000)