In [238]:
# Import Dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.metrics import accuracy_score
from imblearn.combine import SMOTEENN

In [182]:
# Load the data
file_path = Path('resources/military_expenditure.csv')
df = pd.read_csv(file_path)
df.head

<bound method NDFrame.head of            country iso3c iso2c  year  Military expenditure (current USD)  \
0      Afghanistan   AFG    AF  1970                        2.939586e+06   
1      Afghanistan   AFG    AF  1971                                 NaN   
2      Afghanistan   AFG    AF  1972                                 NaN   
3      Afghanistan   AFG    AF  1973                        3.341272e+06   
4      Afghanistan   AFG    AF  1974                        3.581366e+06   
...            ...   ...   ...   ...                                 ...   
13561     Zimbabwe   ZWE    ZW  2016                        3.580650e+08   
13562     Zimbabwe   ZWE    ZW  2017                        3.405220e+08   
13563     Zimbabwe   ZWE    ZW  2018                        4.203640e+08   
13564     Zimbabwe   ZWE    ZW  2019                        5.469390e+08   
13565     Zimbabwe   ZWE    ZW  2020                                 NaN   

       Military expenditure (% of general government expe

In [183]:
df

Unnamed: 0,country,iso3c,iso2c,year,Military expenditure (current USD),Military expenditure (% of general government expenditure),Military expenditure (% of GDP),adminregion,incomeLevel
0,Afghanistan,AFG,AF,1970,2.939586e+06,,1.629606,South Asia,Low income
1,Afghanistan,AFG,AF,1971,,,,South Asia,Low income
2,Afghanistan,AFG,AF,1972,,,,South Asia,Low income
3,Afghanistan,AFG,AF,1973,3.341272e+06,,1.868910,South Asia,Low income
4,Afghanistan,AFG,AF,1974,3.581366e+06,,1.610825,South Asia,Low income
...,...,...,...,...,...,...,...,...,...
13561,Zimbabwe,ZWE,ZW,2016,3.580650e+08,7.363048,1.742494,Sub-Saharan Africa (excluding high income),Lower middle income
13562,Zimbabwe,ZWE,ZW,2017,3.405220e+08,5.519890,1.544948,Sub-Saharan Africa (excluding high income),Lower middle income
13563,Zimbabwe,ZWE,ZW,2018,4.203640e+08,5.607096,1.222795,Sub-Saharan Africa (excluding high income),Lower middle income
13564,Zimbabwe,ZWE,ZW,2019,5.469390e+08,2.099816,0.698601,Sub-Saharan Africa (excluding high income),Lower middle income


In [184]:
df.count()

country                                                       13566
iso3c                                                         13464
iso2c                                                         13413
year                                                          13566
Military expenditure (current USD)                             8769
Military expenditure (% of general government expenditure)     5282
Military expenditure (% of GDP)                                8777
adminregion                                                    6987
incomeLevel                                                   13464
dtype: int64

In [185]:
df_dropna = df.dropna()
df_dropna.count()

country                                                       2724
iso3c                                                         2724
iso2c                                                         2724
year                                                          2724
Military expenditure (current USD)                            2724
Military expenditure (% of general government expenditure)    2724
Military expenditure (% of GDP)                               2724
adminregion                                                   2724
incomeLevel                                                   2724
dtype: int64

In [186]:
mapping = {'Low income': 1, 'Lower middle income': 2, 'Upper middle income': 3}
df_y = df_dropna.replace({'incomeLevel': mapping})
df_X = df_dropna.drop(columns=["incomeLevel"])
df_y.head()

Unnamed: 0,country,iso3c,iso2c,year,Military expenditure (current USD),Military expenditure (% of general government expenditure),Military expenditure (% of GDP),adminregion,incomeLevel
34,Afghanistan,AFG,AF,2004,125111600.0,16.134336,2.431258,South Asia,1
35,Afghanistan,AFG,AF,2005,122727200.0,12.689758,1.992066,South Asia,1
36,Afghanistan,AFG,AF,2006,131346200.0,10.360382,1.896234,South Asia,1
37,Afghanistan,AFG,AF,2007,219580200.0,11.948405,2.566267,South Asia,1
38,Afghanistan,AFG,AF,2008,240532600.0,11.165409,2.335546,South Asia,1


In [187]:
# Convert string values into numerical values
df_binary_encoded = pd.get_dummies(df_X, dtype=float)
df_binary_encoded

Unnamed: 0,year,Military expenditure (current USD),Military expenditure (% of general government expenditure),Military expenditure (% of GDP),country_Afghanistan,country_Albania,country_Algeria,country_Angola,country_Argentina,country_Armenia,...,iso2c_YE,iso2c_ZA,iso2c_ZM,iso2c_ZW,adminregion_East Asia & Pacific (excluding high income),adminregion_Europe & Central Asia (excluding high income),adminregion_Latin America & Caribbean (excluding high income),adminregion_Middle East & North Africa (excluding high income),adminregion_South Asia,adminregion_Sub-Saharan Africa (excluding high income)
34,2004,1.251116e+08,16.134336,2.431258,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
35,2005,1.227272e+08,12.689758,1.992066,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
36,2006,1.313462e+08,10.360382,1.896234,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
37,2007,2.195802e+08,11.948405,2.566267,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
38,2008,2.405326e+08,11.165409,2.335546,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13560,2015,3.766770e+08,9.193971,1.886876,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
13561,2016,3.580650e+08,7.363048,1.742494,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
13562,2017,3.405220e+08,5.519890,1.544948,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
13563,2018,4.203640e+08,5.607096,1.222795,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [188]:
y = df_y['incomeLevel']
X = df_binary_encoded
y.count()

2724

In [209]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1
                                                    )
X_train.shape

(2043, 352)

In [139]:
classifier = LogisticRegression(solver='lbfgs',
                                random_state=2)

In [140]:
classifier.fit(X_train, y_train)

LogisticRegression(random_state=2)

In [144]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(50)

Unnamed: 0,Prediction,Actual
0,3,1
1,3,3
2,3,2
3,3,1
4,3,1
5,3,2
6,3,3
7,3,1
8,3,3
9,3,3


In [142]:
print(accuracy_score(y_test, y_pred))

0.3641703377386197


In [133]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.00      0.00      1.00      0.00      0.00      0.00       144
          2       0.00      0.00      0.97      0.00      0.00      0.00       279
          3       0.37      0.95      0.00      0.53      0.00      0.00       258

avg / total       0.14      0.36      0.61      0.20      0.00      0.00       681



  _warn_prf(average, modifier, msg_start, len(result))


In [146]:
df.to_csv('resources/Basic_ML_Model1.csv', index=False) 

In [218]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [219]:
classifier.fit(X_train_scaled, y_train)

LogisticRegression(random_state=2)

In [249]:
y_pred = classifier.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,3,3
2,3,3
3,3,3
4,2,2


In [248]:
print(accuracy_score(y_test, y_pred))

0.3406754772393539


Testing out Random Forest Classifier

In [230]:
rf_model = RandomForestClassifier(n_estimators=300, random_state=3) 

In [231]:
rf_model = rf_model.fit(X_train_scaled, y_train)

In [232]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [233]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 1", "Actual 2", "Actual 3"], columns=["Predicted 1", "Predicted 2", "Predicted 3"])

cm_df

Unnamed: 0,Predicted 1,Predicted 2,Predicted 3
Actual 1,154,0,0
Actual 2,1,279,0
Actual 3,0,0,247


In [234]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [235]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 1,Predicted 2,Predicted 3
Actual 1,154,0,0
Actual 2,1,279,0
Actual 3,0,0,247


Accuracy Score : 0.9985315712187959
Classification Report
              precision    recall  f1-score   support

           1       0.99      1.00      1.00       154
           2       1.00      1.00      1.00       280
           3       1.00      1.00      1.00       247

    accuracy                           1.00       681
   macro avg       1.00      1.00      1.00       681
weighted avg       1.00      1.00      1.00       681



In [247]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_

In [237]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.06240837118754394,
  'adminregion_Sub-Saharan Africa (excluding high income)'),
 (0.047233507534209435,
  'adminregion_Europe & Central Asia (excluding high income)'),
 (0.04631215582318395, 'Military expenditure (current USD)'),
 (0.03303678165813541,
  'Military expenditure (% of general government expenditure)'),
 (0.029831060558512593,
  'adminregion_Latin America & Caribbean (excluding high income)'),
 (0.029352586970993806, 'Military expenditure (% of GDP)'),
 (0.02076244543939539, 'adminregion_South Asia'),
 (0.010183161796541808,
  'adminregion_East Asia & Pacific (excluding high income)'),
 (0.008243836034585861, 'country_Botswana'),
 (0.007759579264032677, 'year'),
 (0.006861086666664921, 'iso3c_BWA'),
 (0.006522056708564357, 'iso2c_BW'),
 (0.0062578297305310665, 'iso2c_UA'),
 (0.006069940986476291, 'iso3c_FJI'),
 (0.00587100339573261, 'iso3c_BOL'),
 (0.005721094263435988, 'iso2c_KG'),
 (0.00542381558309604, 'country_Ukraine'),
 (0.005408774060764811,
  'adminregion_Middl

In [239]:
# Using the SMOTEENN model
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [240]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [241]:
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  0,   0, 154],
       [  0,   0, 280],
       [ 15,   0, 232]], dtype=int64)

In [242]:
balanced_accuracy_score(y_test, y_pred)

0.31309041835357626

In [243]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.00      0.00      0.97      0.00      0.00      0.00       154
          2       0.00      0.00      1.00      0.00      0.00      0.00       280
          3       0.35      0.94      0.00      0.51      0.00      0.00       247

avg / total       0.13      0.34      0.63      0.18      0.00      0.00       681



  _warn_prf(average, modifier, msg_start, len(result))


Testing new dataset

In [253]:
# Load the data
file_path = Path('resources/clean_data.csv')
df_new = pd.read_csv(file_path)
df_new

Unnamed: 0,ID,Gender,Own_car,Own_property,Work_phone,Phone,Email,Unemployed,Num_children,Num_family,Account_length,Total_income,Age,Years_employed,Income_type,Education_type,Family_status,Housing_type,Occupation_type,Target
0,5008804,1,1,1,1,0,0,0,0,2,15,427500.0,32.868574,12.435574,Working,Higher education,Civil marriage,Rented apartment,Other,1
1,5008806,1,1,1,0,0,0,0,0,2,29,112500.0,58.793815,3.104787,Working,Secondary / secondary special,Married,House / apartment,Security staff,0
2,5008808,0,0,1,0,1,1,0,0,1,4,270000.0,52.321403,8.353354,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,Sales staff,0
3,5008812,0,0,1,0,0,0,1,0,1,20,283500.0,61.504343,0.000000,Pensioner,Higher education,Separated,House / apartment,Other,0
4,5008815,1,1,1,1,1,1,0,0,2,5,270000.0,46.193967,2.105450,Working,Higher education,Married,House / apartment,Accountants,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9704,5148694,0,0,0,0,0,0,0,0,2,20,180000.0,56.400884,0.542106,Pensioner,Secondary / secondary special,Civil marriage,Municipal apartment,Laborers,1
9705,5149055,0,0,1,1,1,0,0,0,2,19,112500.0,43.360233,7.375921,Commercial associate,Secondary / secondary special,Married,House / apartment,Other,1
9706,5149729,1,1,1,0,0,0,0,0,2,21,90000.0,52.296762,4.711938,Working,Secondary / secondary special,Married,House / apartment,Other,1
9707,5149838,0,0,1,0,1,1,0,0,2,32,157500.0,33.914454,3.627727,Pensioner,Higher education,Married,House / apartment,Medicine staff,1


In [252]:
df_new.count()

ID                 9709
Gender             9709
Own_car            9709
Own_property       9709
Work_phone         9709
Phone              9709
Email              9709
Unemployed         9709
Num_children       9709
Num_family         9709
Account_length     9709
Total_income       9709
Age                9709
Years_employed     9709
Income_type        9709
Education_type     9709
Family_status      9709
Housing_type       9709
Occupation_type    9709
Target             9709
dtype: int64

In [254]:
# Convert string values into numerical values
df_binary_encoded = pd.get_dummies(df_new, dtype=float)
df_binary_encoded

Unnamed: 0,ID,Gender,Own_car,Own_property,Work_phone,Phone,Email,Unemployed,Num_children,Num_family,...,Occupation_type_Low-skill Laborers,Occupation_type_Managers,Occupation_type_Medicine staff,Occupation_type_Other,Occupation_type_Private service staff,Occupation_type_Realty agents,Occupation_type_Sales staff,Occupation_type_Secretaries,Occupation_type_Security staff,Occupation_type_Waiters/barmen staff
0,5008804,1,1,1,1,0,0,0,0,2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5008806,1,1,1,0,0,0,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,5008808,0,0,1,0,1,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,5008812,0,0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5008815,1,1,1,1,1,1,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9704,5148694,0,0,0,0,0,0,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9705,5149055,0,0,1,1,1,0,0,0,2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9706,5149729,1,1,1,0,0,0,0,0,2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9707,5149838,0,0,1,0,1,1,0,0,2,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [255]:
y = df_binary_encoded['Target']
X = df_binary_encoded
y.count()

9709

In [269]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1
                                                    )


In [270]:
classifier = LogisticRegression(solver='lbfgs',
                                random_state=2)

classifier.fit(X_train, y_train)

LogisticRegression(random_state=2)

In [271]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [272]:
#Creating the accuracy score for the model
balanced_accuracy_score(y_test, y_pred)

0.5

High risk vs Low risk is very skewed toward low risk. 87% of the Target group is low risk, so .5 accuracy isn't good. Will try oversampling/undersampling/SMOTEENN