In [125]:
# Import Dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.metrics import accuracy_score

In [76]:
# Load the data
file_path = Path('resources/military_expenditure.csv')
df = pd.read_csv(file_path)
df.head

<bound method NDFrame.head of            country iso3c iso2c  year  Military expenditure (current USD)  \
0      Afghanistan   AFG    AF  1970                        2.939586e+06   
1      Afghanistan   AFG    AF  1971                                 NaN   
2      Afghanistan   AFG    AF  1972                                 NaN   
3      Afghanistan   AFG    AF  1973                        3.341272e+06   
4      Afghanistan   AFG    AF  1974                        3.581366e+06   
...            ...   ...   ...   ...                                 ...   
13561     Zimbabwe   ZWE    ZW  2016                        3.580650e+08   
13562     Zimbabwe   ZWE    ZW  2017                        3.405220e+08   
13563     Zimbabwe   ZWE    ZW  2018                        4.203640e+08   
13564     Zimbabwe   ZWE    ZW  2019                        5.469390e+08   
13565     Zimbabwe   ZWE    ZW  2020                                 NaN   

       Military expenditure (% of general government expe

In [104]:
df

Unnamed: 0,country,iso3c,iso2c,year,Military expenditure (current USD),Military expenditure (% of general government expenditure),Military expenditure (% of GDP),adminregion,incomeLevel
0,Afghanistan,AFG,AF,1970,2.939586e+06,,1.629606,South Asia,Low income
1,Afghanistan,AFG,AF,1971,,,,South Asia,Low income
2,Afghanistan,AFG,AF,1972,,,,South Asia,Low income
3,Afghanistan,AFG,AF,1973,3.341272e+06,,1.868910,South Asia,Low income
4,Afghanistan,AFG,AF,1974,3.581366e+06,,1.610825,South Asia,Low income
...,...,...,...,...,...,...,...,...,...
13561,Zimbabwe,ZWE,ZW,2016,3.580650e+08,7.363048,1.742494,Sub-Saharan Africa (excluding high income),Lower middle income
13562,Zimbabwe,ZWE,ZW,2017,3.405220e+08,5.519890,1.544948,Sub-Saharan Africa (excluding high income),Lower middle income
13563,Zimbabwe,ZWE,ZW,2018,4.203640e+08,5.607096,1.222795,Sub-Saharan Africa (excluding high income),Lower middle income
13564,Zimbabwe,ZWE,ZW,2019,5.469390e+08,2.099816,0.698601,Sub-Saharan Africa (excluding high income),Lower middle income


In [80]:
df.count()

country                                                       13566
iso3c                                                         13464
iso2c                                                         13413
year                                                          13566
Military expenditure (current USD)                             8769
Military expenditure (% of general government expenditure)     5282
Military expenditure (% of GDP)                                8777
adminregion                                                    6987
incomeLevel                                                   13464
dtype: int64

In [79]:
df_dropna = df.dropna()
df_dropna.count()

country                                                       2724
iso3c                                                         2724
iso2c                                                         2724
year                                                          2724
Military expenditure (current USD)                            2724
Military expenditure (% of general government expenditure)    2724
Military expenditure (% of GDP)                               2724
adminregion                                                   2724
incomeLevel                                                   2724
dtype: int64

In [114]:
mapping = {'Low income': 1, 'Lower middle income': 2, 'Upper middle income': 3}
df_y = df_dropna.replace({'incomeLevel': mapping})
df_X = df_dropna.drop(columns=["incomeLevel"])
df_y.head()

Unnamed: 0,country,iso3c,iso2c,year,Military expenditure (current USD),Military expenditure (% of general government expenditure),Military expenditure (% of GDP),adminregion,incomeLevel
34,Afghanistan,AFG,AF,2004,125111600.0,16.134336,2.431258,South Asia,1
35,Afghanistan,AFG,AF,2005,122727200.0,12.689758,1.992066,South Asia,1
36,Afghanistan,AFG,AF,2006,131346200.0,10.360382,1.896234,South Asia,1
37,Afghanistan,AFG,AF,2007,219580200.0,11.948405,2.566267,South Asia,1
38,Afghanistan,AFG,AF,2008,240532600.0,11.165409,2.335546,South Asia,1


In [119]:
# Convert string values into numerical values
df_binary_encoded = pd.get_dummies(df_X, dtype=float)
df_binary_encoded

Unnamed: 0,year,Military expenditure (current USD),Military expenditure (% of general government expenditure),Military expenditure (% of GDP),country_Afghanistan,country_Albania,country_Algeria,country_Angola,country_Argentina,country_Armenia,...,iso2c_YE,iso2c_ZA,iso2c_ZM,iso2c_ZW,adminregion_East Asia & Pacific (excluding high income),adminregion_Europe & Central Asia (excluding high income),adminregion_Latin America & Caribbean (excluding high income),adminregion_Middle East & North Africa (excluding high income),adminregion_South Asia,adminregion_Sub-Saharan Africa (excluding high income)
34,2004,1.251116e+08,16.134336,2.431258,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
35,2005,1.227272e+08,12.689758,1.992066,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
36,2006,1.313462e+08,10.360382,1.896234,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
37,2007,2.195802e+08,11.948405,2.566267,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
38,2008,2.405326e+08,11.165409,2.335546,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13560,2015,3.766770e+08,9.193971,1.886876,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
13561,2016,3.580650e+08,7.363048,1.742494,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
13562,2017,3.405220e+08,5.519890,1.544948,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
13563,2018,4.203640e+08,5.607096,1.222795,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [120]:
y = df_y['incomeLevel']
X = df_binary_encoded
y.count()

2724

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=2,
                                                    stratify=y
                                                    )
X_train.shape

(2043, 352)

In [139]:
classifier = LogisticRegression(solver='lbfgs',
                                random_state=2)

In [140]:
classifier.fit(X_train, y_train)

LogisticRegression(random_state=2)

In [144]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(50)

Unnamed: 0,Prediction,Actual
0,3,1
1,3,3
2,3,2
3,3,1
4,3,1
5,3,2
6,3,3
7,3,1
8,3,3
9,3,3


In [142]:
print(accuracy_score(y_test, y_pred))

0.3641703377386197


In [133]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.00      0.00      1.00      0.00      0.00      0.00       144
          2       0.00      0.00      0.97      0.00      0.00      0.00       279
          3       0.37      0.95      0.00      0.53      0.00      0.00       258

avg / total       0.14      0.36      0.61      0.20      0.00      0.00       681



  _warn_prf(average, modifier, msg_start, len(result))


In [146]:
df.to_csv('resources/Basic_ML_Model1.csv', index=False) 