In [1]:
# Libs
import json
import csv
import pandas as pd
import numpy as np
import time
import sys
import random
import matplotlib.pyplot as plt
from joblib import dump, load
from glob import glob

from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split

In [2]:
# Load the african country dictionary
dict_path = "data/output/african_countries.json"

with open(dict_path) as json_file:
    ccDict = json.load(json_file)

In [3]:
# Load all features
with open('data/output/mil_exp.json') as json_file:
    mil_exp = json.load(json_file)
    
with open('data/output/population.json') as json_file:
    population = json.load(json_file)
    
with open('data/output/arms_imports.json') as json_file:
    arms_imports = json.load(json_file)
    
with open('data/output/conflicts.json') as json_file:
    conflicts = json.load(json_file)
    
with open('data/output/mil_pers.json') as json_file:
    mil_pers = json.load(json_file)
    

In [65]:
# Get the time windows we will work with
minYear = 1962
maxYear = 2012

# Go through all datasets
temp_df = []
for i, key in enumerate(ccDict):
    
    # Get the country name
    country_name = ccDict[key]['name']
    
    # Go through years
    for year in range(minYear, maxYear+1):
    
        # datum
        datum = {
            'COW_key': int(key),
            'Year': int(year),
            'Mil_Exp':float(mil_exp[key][str(year)]),
            'Population':int(population[key][str(year)]),
            'Mil_Pers':float(mil_pers[key][str(year)]),
            'Arms_Imports':int(arms_imports[key][str(year)]),
            'Conflict':conflicts[key][str(year)]
        }
        
        # Append to temp df
        temp_df.append(datum)

# Convert temp df to pandas
df = pd.DataFrame(temp_df) 
    

# Print nbr of rows
print("Nbr of rows : " + str(len(df.index)))

# Preview df
df.head(10)

Nbr of rows : 2754


Unnamed: 0,Arms_Imports,COW_key,Conflict,Mil_Exp,Mil_Pers,Population,Year
0,0,615,0,3.238663,0.412088,11619828,1962
1,0,615,1,2.937209,0.624721,11912803,1963
2,0,615,0,3.420652,0.556745,12221675,1964
3,0,615,0,3.168904,0.503229,12550885,1965
4,3915602,615,0,3.272904,0.59237,12902627,1966
5,0,615,0,2.948131,0.573482,13275026,1967
6,1958746,615,0,2.583317,0.555761,13663583,1968
7,665860,615,0,2.340749,0.575043,14061722,1969
8,1170484,615,0,2.044164,0.581987,14464985,1970
9,1849112,615,0,1.952654,0.564294,14872250,1971


## Missing Data

In [66]:
df.mean(axis = 0)

Arms_Imports    1.388874e+06
COW_key         5.115000e+02
Conflict        1.706609e-01
Mil_Exp         2.266421e+00
Mil_Pers        3.018577e-01
Population      1.142336e+07
Year            1.987000e+03
dtype: float64

In [67]:
mean_milexp = df.mean(axis = 0)['Mil_Exp']
mean_milpers = df.mean(axis = 0)['Mil_Pers']


for index, row in df.iterrows():
    
    if(row['Mil_Exp'] == 0):
        df.at[index,'Mil_Exp'] = mean_milexp
    
    if(row['Mil_Pers'] == 0):
        df.at[index,'Mil_Pers'] = mean_milpers

## Balance Dataset

In [68]:
# Shuffle Rows
df = df.sample(frac=1).reset_index(drop=True)

# count excess
imbalance = df['Conflict'].value_counts()

excessLabel = 0
if(imbalance[0] > imbalance[1]):
    excessLabel = 0
else:
    excessLabel = 1

# Nbr of excess
diff = abs(imbalance[0] - imbalance[1])

In [69]:
balanced_df = df.copy()

nbr_dropped = 0
for index, row in balanced_df.iterrows():
    
    if(nbr_dropped >= diff):
        break
    
    if(row['Conflict'] == excessLabel):
        balanced_df.drop(index, inplace=True)
        nbr_dropped += 1

In [70]:
balanced_df['Conflict'].value_counts()

1    470
0    470
Name: Conflict, dtype: int64

## Split Dataset

In [71]:
# Split features/label
features = ['Arms_Imports', 'Mil_Exp', 'Mil_Pers', 'Population', 'Year']
label = ['Conflict']
X = balanced_df[features]
y = balanced_df[label]

In [72]:
# Split the data
train_X, valid_X, train_Y, valid_Y = train_test_split(X, y, test_size=0.2, random_state=12, shuffle=True, stratify=y)

# cast to np
valid_Y = np.array(valid_Y)
valid_X = np.array(valid_X)

print("Length of training set : ", len(train_X))
print("Length of validation set : ", len(valid_X))

Length of training set :  752
Length of validation set :  188


## Random Forest

In [85]:
from sklearn.ensemble import RandomForestClassifier

rdf_classifier = RandomForestClassifier(n_estimators=5000, random_state=0)
rdf_classifier.fit(train_X,train_Y)

  after removing the cwd from sys.path.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [86]:
rdf_predictions = rdf_classifier.predict(valid_X)

In [87]:
success = 0
for i, pred in enumerate(rdf_predictions):
    if(pred == valid_Y[i]):
        success += 1
        
print("Validation Accuracy = " + str(success/len(train_Y)))

Validation Accuracy = 0.21010638297872342


## Logistic Regression

In [47]:
from sklearn.preprocessing import MinMaxScaler

# Normalize data
scaler = MinMaxScaler()
train_X_t_n = scaler.fit_transform(train_X)
valid_X_t_n = scaler.transform(valid_X)

In [48]:
from sklearn.linear_model import LogisticRegression

log_classifier = LogisticRegression()
log_classifier.fit(train_X_t_n, train_Y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [49]:
log_predictions = log_classifier.predict(valid_X_t_n)

In [50]:
success = 0
for i, pred in enumerate(log_predictions):
    if(pred == valid_Y[i]):
        success += 1
        
print("Validation Accuracy = " + str(success/len(train_Y)))

Validation Accuracy = 0.17420212765957446


## SVM Algorithms

In [51]:
from sklearn import svm

svm_classifier = svm.SVC(gamma='auto',probability=True)
svm_classifier.fit(train_X_t_n, train_Y)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [52]:
svm_predictions = svm_classifier.predict(valid_X_t_n)

In [53]:
success = 0
for i, pred in enumerate(svm_predictions):
    if(pred == valid_Y[i]):
        success += 1
        
print("Validation Accuracy = " + str(success/len(train_Y)))

Validation Accuracy = 0.16888297872340424


## Dataviz Output

In [26]:
print(features)

['Arms_Imports', 'Mil_Exp', 'Mil_Pers', 'Population', 'Year']


In [55]:
# Init a dict that will contain the total value of arms import per year per country
predict_dict = {}

nbrOfKey = len(ccDict.keys())

for i, key in tqdm(enumerate(ccDict), total=nbrOfKey):
    predict_dict[key] = {}
    
    for year in range(minYear, maxYear+1):
        
        # create datum in SAME order
        datum = [arms_imports[key][str(year)], float(mil_exp[key][str(year)]), float(mil_pers[key][str(year)]),int(population[key][str(year)]),year]
        
        # predict with probabibility
        predict_dict[key][str(year)] = str(rdf_classifier.predict_proba([datum])[0][1])

HBox(children=(IntProgress(value=0, max=54), HTML(value='')))




In [None]:
# Save the dict to a json file
with open('data/output/predictions.json', 'w') as fp:
    json.dump(predict_dict, fp)