In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import stats
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import seaborn as sns


In [4]:
def load_peace_sys_data():
    df = pd.DataFrame(pd.read_csv('peace_sys.csv', index_col=0, na_values=['(NA)']))
    return df

In [5]:
peace_sys = load_peace_sys_data()

peace_sys.head()

Unnamed: 0_level_0,SCCS,Coder,ID1.1Over,ID1.2Ethno,Int2.1Mar,Int2.2Econ,Int2.3Pol,Int2.4Hist,Dep3.1Sec,Dep3.2Ecol,...,CM8.5Peace,CM8.6War,Lead9.1P,Lead9.2War,Cult10.1Com,Cult10.2Diff,Comp10.3,InComp10.4,PSys,PSysRec
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Gilbertese,107,KA,9.0,2.0,3,9.0,9,9,9,9,...,9,9,9,9,9,9,9,9,2,0
Marshallese,108,EC,1.0,9.0,9,1.0,9,9,9,9,...,9,2,9,4,9,9,9,9,2,0
E. Pomo,135,"KA, EC",9.0,9.0,9,9.0,9,9,1,1,...,9,9,2,9,9,9,9,9,2,0
Popoluca,154,KA,9.0,3.0,9,3.0,9,9,9,3,...,9,9,9,9,9,9,9,9,2,0
Konso,35,KA,9.0,2.0,3,4.0,9,4,9,2,...,9,9,9,9,3,9,3,3,2,0


In [6]:
peace_sys = peace_sys.drop(['SCCS','Coder'], axis=1)

In [13]:
WAR_VARS = ['WNorm5.2', 'Lead9.2War', 'SymWar6', 'RitWar6', 'WVal4.2', 'PSys']

In [14]:
WAR_VARS

['WNorm5.2', 'Lead9.2War', 'SymWar6', 'RitWar6', 'WVal4.2', 'PSys']

In [15]:
war_dataset = peace_sys[WAR_VARS]

war_dataset

Unnamed: 0_level_0,WNorm5.2,Lead9.2War,SymWar6,RitWar6,WVal4.2,PSys
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Gilbertese,9,9,9.0,9,9.0,2
Marshallese,9,4,2.0,9,3.0,2
E. Pomo,2,9,2.5,4,3.0,2
Popoluca,9,9,9.0,9,9.0,2
Konso,4,9,3.0,9,3.0,2
Bribri,3,9,2.0,2,3.0,2
Tallensi,9,9,9.0,9,2.0,2
Russians,9,9,9.0,9,9.0,2
Trukese,3,9,9.0,9,9.0,2
Toraja,3,9,9.0,9,3.0,2


In [17]:
y = war_dataset.PSys
X = war_dataset.drop('PSys', axis=1)

In [44]:
# generating train_set and test_set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.4 , stratify=y, random_state=42)

# Random Forest

In [45]:
rfc = RandomForestClassifier(n_estimators=1000).fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)

In [46]:
print('Accuracy: ', accuracy_score(y_test, rfc_pred))
print('F1 Score: ', f1_score(y_test, rfc_pred, average='weighted'))
print('Recall Score: ', recall_score(y_test, rfc_pred, average='weighted'))

Accuracy:  0.7368421052631579
F1 Score:  0.7398932112890922
Recall Score:  0.7368421052631579


In [53]:
rfc.feature_importances_

array([0.29115337, 0.11796548, 0.13865597, 0.14276094, 0.30946424])

# Logistic Regression

In [47]:
logreg = LogisticRegression(random_state=42, solver='lbfgs', multi_class='multinomial')

In [48]:
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [49]:
logreg_pred = logreg.predict(X_test)

In [50]:
print('Accuracy: ', accuracy_score(y_test, logreg_pred))
print('F1 Score: ', f1_score(y_test, logreg_pred, average='weighted'))
print('Recall Score: ', recall_score(y_test, logreg_pred, average='weighted'))

Accuracy:  0.8421052631578947
F1 Score:  0.8439359267734553
Recall Score:  0.8421052631578947


In [51]:
logreg.coef_

array([[ 0.14477549,  0.0165722 , -0.0379717 ,  0.00101074,  0.12846792]])

# Dividing to Peaceful and Non Peaceful

In [63]:
peaceful = war_dataset[war_dataset.PSys == 1]
peaceful.drop('PSys', axis=1, inplace=True)

non_peaceful = war_dataset[war_dataset.PSys == 2]
non_peaceful.drop('PSys', axis=1, inplace=True)

# T-Test

In [72]:
def compute_ttest(peaceful, non_peaceful):
    variables = peaceful.columns
    res_df = pd.DataFrame(columns=['feature', 't_statistic', 'p_value'])
    
    for var in variables:
        x = peaceful[var]
        y = non_peaceful[var]
        ttest = stats.ttest_ind(x, y)
        temp = pd.DataFrame({'feature' : [var] , 
                             't_statistic' : ttest.statistic, 
                             'p_value' : [ttest.pvalue]})
        res_df = res_df.append(temp)
        
    res_df.set_index('feature', inplace=True)
    return res_df

In [74]:
ttest = compute_ttest(peaceful, non_peaceful)

In [75]:
ttest

Unnamed: 0_level_0,t_statistic,p_value
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
WNorm5.2,-4.181508,0.000136
Lead9.2War,-1.492516,0.1427
SymWar6,-1.054722,0.29731
RitWar6,-1.557952,0.126409
WVal4.2,-3.403321,0.001429


# Mann-Whitney U-Test

In [76]:
def compute_mannwhitneyu(peaceful, non_peaceful):
    variables = peaceful.columns
    res_df = pd.DataFrame(columns=['feature', 'statistic', 'p_value'])
    
    for var in variables:
        x = peaceful[var]
        y = non_peaceful[var]
        utest = stats.mannwhitneyu(x, y)
        temp = pd.DataFrame({'feature' : [var] , 
                             'statistic' : utest.statistic, 
                             'p_value' : [utest.pvalue]})
        res_df = res_df.append(temp)
        
    res_df.set_index('feature', inplace=True)
    return res_df

In [77]:
mannwhitneyu = compute_mannwhitneyu(peaceful, non_peaceful)

In [78]:
mannwhitneyu

Unnamed: 0_level_0,statistic,p_value
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
WNorm5.2,66.0,2e-05
Lead9.2War,186.0,0.072436
SymWar6,189.0,0.098272
RitWar6,166.0,0.032931
WVal4.2,84.0,0.000104
