In [None]:
# Libs
import json
import csv
import pandas as pd
import numpy as np
import time
import sys
import random
import matplotlib.pyplot as plt
from glob import glob

from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split

from joblib import dump, load
from keras.models import load_model

In [None]:
# Load the african country dictionary.
# ccDict is keyed by country code: the keys are later cast with int() to fill
# the COW_key column, and every entry exposes a 'name' field.
dict_path = "data/output/african_countries.json"

with open(dict_path) as json_file:
    ccDict = json.load(json_file)

In [None]:
# Load all features
def _read_json(path):
    """Return the parsed content of the JSON file stored at `path`."""
    with open(path) as json_file:
        return json.load(json_file)


# Input features, one JSON file each
mil_exp = _read_json('data/output/mil_exp.json')
population = _read_json('data/output/population.json')
arms_imports = _read_json('data/output/arms_imports.json')
mil_pers = _read_json('data/output/mil_pers.json')
water = _read_json('data/output/water.json')
surface = _read_json('data/output/surface.json')

# Target label
conflicts = _read_json('data/output/conflicts.json')

In [None]:
# Inclusive bounds of the time window we will work with
minYear = 1962
maxYear = 2012

# Collect one record per (country, year) pair out of all datasets
temp_df = []
for i, key in enumerate(ccDict):

    # Name of the current country
    country_name = ccDict[key]['name']

    for year in range(minYear, maxYear+1):

        # Every dataset is addressed with the year as a string
        year_key = str(year)

        # Record for this country and year, cast to its numeric type
        datum = {
            'COW_key': int(key),
            'Year': int(year),
            'Mil_Exp': float(mil_exp[key][year_key]),
            'Population': int(population[key][year_key]),
            'Mil_Pers': float(mil_pers[key][year_key]),
            'Arms_Imports': int(arms_imports[key][year_key]),
            'Water': float(water[key][year_key]),
            'Surface': float(surface[key][year_key]),
            'Conflict': conflicts[key][year_key]
        }
        temp_df.append(datum)

# Turn the list of records into a pandas DataFrame
df = pd.DataFrame(temp_df)

# Print nbr of rows
print("Nbr of rows : " + str(len(df.index)))

# Preview df
df.head(10)

## Missing Data

In [None]:
# Column-wise mean of every column, computed before any value is imputed
df.mean(axis = 0)

In [None]:
# Column means are taken once, before any value is replaced, so every imputed
# cell receives the mean of the original column.
# NOTE(review): the zeros that are about to be replaced still take part in
# these means, which biases them — confirm whether that is intended.
column_means = df.mean(axis = 0)
mean_milexp = column_means['Mil_Exp']
mean_milpers = column_means['Mil_Pers']
mean_water = column_means['Water']

# A value of exactly 0 is treated as missing and replaced by the column mean.
# Boolean-mask assignment updates all matching rows of a column in one step.
for column, fill_value in (('Mil_Exp', mean_milexp),
                           ('Mil_Pers', mean_milpers),
                           ('Water', mean_water)):
    df.loc[df[column] == 0, column] = fill_value

## Balance Dataset

In [None]:
# Shuffle Rows
df = df.sample(frac=1).reset_index(drop=True)

# count excess
imbalance = df['Conflict'].value_counts()

excessLabel = 0
if(imbalance[0] > imbalance[1]):
    excessLabel = 0
else:
    excessLabel = 1

# Nbr of excess
diff = abs(imbalance[0] - imbalance[1])

In [None]:
# Index labels of the first `diff` rows (in current row order) that carry the
# over-represented label.
excess_index = df.index[df['Conflict'] == excessLabel][:diff]

# Dropping them in a single call avoids rebuilding the frame once per removed
# row. `df` itself is left untouched; balanced_df is a new frame.
balanced_df = df.drop(excess_index)
nbr_dropped = len(excess_index)

In [None]:
# Rows per Conflict label once the excess rows have been removed
balanced_df['Conflict'].value_counts()

## Split Dataset

In [None]:
# Split features/label
features = ['Arms_Imports', 'Mil_Exp', 'Mil_Pers', 'Population', 'Water', 'Surface']
label = ['Conflict']
X = balanced_df[features]
y = balanced_df[label]

In [None]:
# Split the data
train_X, valid_X, train_Y, valid_Y = train_test_split(X, y, test_size=0.01, random_state=42, shuffle=True, stratify=y)

# cast to np
valid_Y = np.array(valid_Y)
valid_X = np.array(valid_X)

print("Length of training set : ", len(train_X))
print("Length of validation set : ", len(valid_X))

## Normalize the Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Normalize data
# The scaler is fitted on the training split only and then applied, unchanged,
# to the validation split.
scaler = StandardScaler()
train_X_n = scaler.fit_transform(train_X)
valid_X_n = scaler.transform(valid_X)

In [None]:
# Save Scaler
scaler_filename = "data/model/scaler.joblib"
dump(scaler, scaler_filename) 

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# The forest is trained on the unscaled features.
rdf_classifier = RandomForestClassifier(n_estimators=30, random_state=0)

# train_Y is a single-column table; scikit-learn expects a 1-D target and
# emits a DataConversionWarning for a column vector, so it is flattened first.
rdf_classifier.fit(train_X, np.ravel(train_Y))

In [None]:
# Predicted labels for the validation rows (unscaled features, as used for fitting)
rdf_predictions = rdf_classifier.predict(valid_X)

In [None]:
# Number of validation rows whose predicted label equals the true label
success = sum(
    1
    for idx in range(len(rdf_predictions))
    if rdf_predictions[idx] == valid_Y[idx]
)

print("Validation Accuracy = " + str(success/len(valid_X)))

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression is trained on the standardized features.
log_classifier = LogisticRegression()

# train_Y is a single-column table; scikit-learn expects a 1-D target and
# emits a DataConversionWarning for a column vector, so it is flattened first.
log_classifier.fit(train_X_n, np.ravel(train_Y))

In [None]:
# Predicted labels for the standardized validation rows
log_predictions = log_classifier.predict(valid_X_n)

In [None]:
# Number of validation rows whose predicted label equals the true label
success = sum(
    1
    for idx in range(len(log_predictions))
    if log_predictions[idx] == valid_Y[idx]
)

print("Validation Accuracy = " + str(success/len(valid_X_n)))

## SVM Algorithms

In [None]:
from sklearn import svm

# probability=True enables predict_proba on the fitted classifier.
svm_classifier = svm.SVC(gamma='auto',probability=True)

# train_Y is a single-column table; scikit-learn expects a 1-D target and
# emits a DataConversionWarning for a column vector, so it is flattened first.
svm_classifier.fit(train_X_n, np.ravel(train_Y))

In [None]:
# Predicted labels for the standardized validation rows
svm_predictions = svm_classifier.predict(valid_X_n)

In [None]:
# Number of validation rows whose predicted label equals the true label
success = sum(
    1
    for idx in range(len(svm_predictions))
    if svm_predictions[idx] == valid_Y[idx]
)

print("Validation Accuracy = " + str(success/len(valid_X_n)))

## Save Model

In [None]:
# Only the random forest is persisted under the generic "model" file name;
# the logistic-regression and SVM classifiers are not saved.
dump(rdf_classifier, 'data/model/model.joblib') 

## CNN (fully connected neural network)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [None]:
# Define model
model = Sequential()
model.add(Dense(100, input_dim=len(features), activation= "relu"))
model.add(Dense(60, activation= "relu"))
model.add(Dropout(rate=0.3))
model.add(Dense(60, activation= "relu"))
model.add(Dense(30, activation= "relu"))
model.add(Dense(1, activation='sigmoid'))
model.summary() #Print model Summary

In [None]:
# Compile model
model.compile(loss="binary_crossentropy" , optimizer="adam", metrics=["accuracy"])

In [None]:
# Fit Model
history = model.fit(train_X_n, train_Y, epochs=125, verbose=0)

In [None]:
# evaluate() returns the loss followed by the compiled metrics, so score[0] is
# the loss and score[1] the accuracy. The messages say "Test", but the data
# passed in is the validation split.
score = model.evaluate(valid_X_n, valid_Y)

print('Test Score: {}'.format(score[0]))
print('Test Accuracy: {}'.format(score[1]))

In [None]:
# Older Keras releases record the accuracy metric under 'acc', newer ones
# under 'accuracy'; use whichever key this training run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# Training loss and training accuracy per epoch, on a shared axis
fig = plt.figure()
plt.plot(history.history['loss'])
plt.plot(history.history[acc_key])
plt.title('Training')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['loss','accuracy'], loc='upper left')
plt.show()

In [None]:
# Sigmoid outputs for the standardized validation rows, one single-element row
# per sample (the loop that follows reads pred[0]).
cnn_predictions = model.predict(valid_X_n)

In [None]:
# Number of validation rows whose rounded network output equals the true label
success = sum(
    1
    for idx in range(len(cnn_predictions))
    if round(cnn_predictions[idx][0]) == valid_Y[idx]
)

print("Validation Accuracy = " + str(success/len(valid_X_n)))

In [None]:
# Persist the trained network to an HDF5 file
model.save('data/model/model_cnn.h5')

## Bagging

In [None]:
# Load Models
try:
    rdf_classifier = load('data/model/model.joblib')
    print("RDF classifier loaded!")
except:
    print("ERROR: RDF not loaded")
    
try:
    cnn_classifier = load_model('data/model/model_cnn.h5')
    print("CNN classifier loaded!")
except:
    print("ERROR: CNN not loaded")
    
# Load Scaler
try:
    scaler = load('data/model/scaler.joblib')
    print("Scaler loaded!")
except:
    print("ERROR: Scaler not loaded")

In [None]:
def bagged_predict(x, weight_rdf, weight_cnn):
    """Blend the random-forest and neural-network scores of every row of `x`.

    Parameters
    ----------
    x : array-like
        One row of unscaled feature values per sample. It is passed as-is to
        the random forest and, after `scaler.transform`, to the network.
    weight_rdf : float
        Factor applied to the random-forest score.
    weight_cnn : float
        Factor applied to the neural-network score.

    Returns
    -------
    list
        One value per row of `x`:
        ``rdf_score * weight_rdf + cnn_score * weight_cnn``.

    Notes
    -----
    Relies on the module-level `scaler`, `rdf_classifier` and
    `cnn_classifier` objects.
    """
    # Scale
    x_n = scaler.transform(x)

    # get prediction from the models
    rdf_preds = rdf_classifier.predict_proba(x)
    # Use the network reloaded from disk rather than the notebook-level
    # `model` name, which is rebound to a different object later on.
    cnn_preds = cnn_classifier.predict(x_n)

    # init array
    predictions = []

    # go through
    for i in range(0, len(x)):

        # get preds: a probability pair from the forest, one value from the network
        rdf_pair = rdf_preds[i]
        cnn_pred = cnn_preds[i][0]

        # Reduce the pair to the score of the second class
        if rdf_pair[0] > rdf_pair[1]:
            rdf_pred = 1.0 - rdf_pair[0]
        elif rdf_pair[1] > rdf_pair[0]:
            rdf_pred = rdf_pair[1]
        else:
            # both classes equally likely
            rdf_pred = 0.5

        # get weighted average and add it to the final array
        predictions.append(rdf_pred*weight_rdf + cnn_pred*weight_cnn)

    return predictions

In [None]:
# Best random-forest weight found so far and the accuracy it reached
bestRDF_weight = 0
bestRDF_maxAcc = 0

# Sweep the random-forest weight from 0 to 1 in steps of 0.05; the network
# always receives the complementary weight.
for j in range(-10,11):

    weight_rdf = 0.5 + j/20
    weight_cnn = 0.5 - j/20

    # Get bagged prediction
    preds = bagged_predict(valid_X, weight_rdf, weight_cnn)

    # count successes: rounded blended scores that equal the true label
    success = sum(
        1
        for idx in range(len(preds))
        if round(preds[idx]) == valid_Y[idx]
    )

    # Compute Score
    score = success/len(valid_X_n)

    # Only a strictly better score replaces the current best, so the first
    # weight that reaches the maximum is the one that is kept
    if score > bestRDF_maxAcc:
        bestRDF_maxAcc, bestRDF_weight = score, weight_rdf

print("Validation Accuracy = " + str(bestRDF_maxAcc))
print("Best RDF Weight = " + str(bestRDF_weight))
print("\n")

In [None]:
# Get bagged prediction
preds = bagged_predict(valid_X, weight_rdf, weight_cnn)
success = 0
for i, pred in enumerate(preds):

    # count successes
    if(round(pred) == valid_Y[i]):
        success += 1
        

print("Validation Accuracy = " + str(success/len(valid_Y)))

# Predict

## Dataviz Output

In [None]:
# Show the feature order; prediction inputs must list their values in this order
print(features)

In [None]:
from model import Model

In [None]:
# NOTE(review): this rebinds `model`, which until here referred to the Keras
# network; from this point on `model` is the project's `Model` object.
# Presumably 0.9 / 0.1 are the RDF / CNN blending weights — confirm against
# model.py.
model = Model(0.9,0.1)

In [None]:
# Init a dict that will contain the total value of arms import per year per country
predict_dict = {}

nbrOfKey = len(ccDict.keys())

for i, key in tqdm(enumerate(ccDict), total=nbrOfKey):
    predict_dict[key] = {}
    
    for year in range(minYear, maxYear+1):
        
        # create datum in SAME order
        datum = [
             arms_imports[key][str(year)], 
             float(mil_exp[key][str(year)]), 
             float(mil_pers[key][str(year)]),
             int(population[key][str(year)]), 
             float(water[key][str(year)]),
             float(surface[key][str(year)]),
        ]
        
        # predict with probabibility
        prediction = model.predict([datum])[0]
        
        # set value
        predict_dict[key][str(year)] = str(prediction)

In [None]:
# Save the dict to a json file
with open('data/output/predictions.json', 'w') as fp:
    json.dump(predict_dict, fp)