In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import stats
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import seaborn as sns


In [2]:
def load_peace_sys_data(path='peace_sys.csv'):
    """Read the peace-systems CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'peace_sys.csv'
        Location of the CSV file. The default keeps existing no-argument
        calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the file's first column, with the literal string
        '(NA)' parsed as a missing value.
    """
    # read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap.
    return pd.read_csv(path, index_col=0, na_values=['(NA)'])

In [3]:
peace_sys = load_peace_sys_data()

peace_sys.head()

Unnamed: 0_level_0,SCCS,Coder,ID1.1Over,ID1.2Ethno,Int2.1Mar,Int2.2Econ,Int2.3Pol,Int2.4Hist,Dep3.1Sec,Dep3.2Ecol,...,CM8.5Peace,CM8.6War,Lead9.1P,Lead9.2War,Cult10.1Com,Cult10.2Diff,Comp10.3,InComp10.4,PSys,PSysRec
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Gilbertese,107,KA,9.0,2.0,3,9.0,9,9,9,9,...,9,9,9,9,9,9,9,9,2,0
Marshallese,108,EC,1.0,9.0,9,1.0,9,9,9,9,...,9,2,9,4,9,9,9,9,2,0
E. Pomo,135,"KA, EC",9.0,9.0,9,9.0,9,9,1,1,...,9,9,2,9,9,9,9,9,2,0
Popoluca,154,KA,9.0,3.0,9,3.0,9,9,9,3,...,9,9,9,9,9,9,9,9,2,0
Konso,35,KA,9.0,2.0,3,4.0,9,4,9,2,...,9,9,9,9,3,9,3,3,2,0


In [4]:
# Drop the SCCS and Coder columns, which are not used as features below.
# errors='ignore' keeps this cell re-runnable: because it reassigns `peace_sys`
# from itself, a second run would otherwise raise KeyError on the missing columns.
peace_sys = peace_sys.drop(columns=['SCCS', 'Coder'], errors='ignore')

In [9]:
# Columns selected for this analysis; the last entry, 'PSys', is the label that
# later cells split off as the prediction target and grouping variable.
NON_WAR_VARS = ['SymP6', 'NWNorm5.1', 'RitP6', 'Dep3.3Econ', 'Int2.4Hist', 'ID1.1Over', 
                'NWVal4.1', 'Int2.2Econ', 'Dep3.2Ecol', 'CM8.5Peace', 'PSys']

In [10]:
NON_WAR_VARS

['SymP6',
 'NWNorm5.1',
 'RitP6',
 'Dep3.3Econ',
 'Int2.4Hist',
 'ID1.1Over',
 'NWVal4.1',
 'Int2.2Econ',
 'Dep3.2Ecol',
 'CM8.5Peace',
 'PSys']

In [11]:
non_war_dataset = peace_sys[NON_WAR_VARS]

In [12]:
non_war_dataset

Unnamed: 0_level_0,SymP6,NWNorm5.1,RitP6,Dep3.3Econ,Int2.4Hist,ID1.1Over,NWVal4.1,Int2.2Econ,Dep3.2Ecol,CM8.5Peace,PSys
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Gilbertese,9,9,9,9.0,9,9.0,9.0,9.0,9,9,2
Marshallese,2,9,2,3.0,9,1.0,4.0,1.0,9,9,2
E. Pomo,9,9,9,1.0,9,9.0,2.5,9.0,1,9,2
Popoluca,9,9,9,3.0,9,9.0,3.0,3.0,3,9,2
Konso,3,9,9,3.0,4,9.0,3.0,4.0,2,9,2
Bribri,9,2,9,9.0,9,9.0,2.0,9.0,9,9,2
Tallensi,9,3,9,2.0,3,1.0,4.0,9.0,9,2,2
Russians,9,9,9,9.0,9,4.0,9.0,9.0,9,2,2
Trukese,9,3,2,3.0,9,3.0,3.0,4.0,9,9,2
Toraja,9,1,1,1.0,9,1.0,9.0,1.0,9,9,2


In [13]:
# Features are every selected column except the PSys label; PSys is the target.
X = non_war_dataset.drop(columns='PSys')
y = non_war_dataset['PSys']

In [14]:
# generating train_set and test_set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.4 , stratify=y, random_state=42)

# Random Forest

In [15]:
# Seed the forest so the metrics and feature importances reported below are
# reproducible on Restart & Run All (same seed as the train/test split).
rfc = RandomForestClassifier(n_estimators=1000, random_state=42).fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)

In [16]:
# Test-set metrics for the random forest predictions.
# With average='weighted', recall is the support-weighted mean of the per-class
# recalls, which works out to the overall accuracy, so the first and third
# lines print the same number.
print('Accuracy: ', accuracy_score(y_test, rfc_pred))
print('F1 Score: ', f1_score(y_test, rfc_pred, average='weighted'))
print('Recall Score: ', recall_score(y_test, rfc_pred, average='weighted'))

Accuracy:  0.8421052631578947
F1 Score:  0.8293460925039873
Recall Score:  0.8421052631578947


In [17]:
rfc.feature_importances_

array([0.04988861, 0.15395782, 0.06775046, 0.09114194, 0.15067707,
       0.08746488, 0.16049362, 0.07127116, 0.13720735, 0.03014708])

# Logistic Regression

In [18]:
logreg = LogisticRegression(random_state=42, solver='lbfgs', multi_class='multinomial')

In [19]:
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
logreg_pred = logreg.predict(X_test)

In [21]:
# Test-set metrics for the logistic regression predictions.
# With average='weighted', recall is the support-weighted mean of the per-class
# recalls, which works out to the overall accuracy, so the first and third
# lines print the same number.
print('Accuracy: ', accuracy_score(y_test, logreg_pred))
print('F1 Score: ', f1_score(y_test, logreg_pred, average='weighted'))
print('Recall Score: ', recall_score(y_test, logreg_pred, average='weighted'))

Accuracy:  0.7368421052631579
F1 Score:  0.7319838056680162
Recall Score:  0.7368421052631579


In [22]:
logreg.coef_

array([[ 0.05798119,  0.07517052, -0.09894947, -0.02397734,  0.0818319 ,
         0.0101113 ,  0.15161118,  0.24638588,  0.08965452,  0.23910548]])

# Dividing into Peaceful and Non-Peaceful

In [25]:
# Split the rows by the PSys label (1 vs 2) and drop the label column.
# The drop is chained rather than done with inplace=True: mutating a
# boolean-filtered frame in place triggers pandas' SettingWithCopyWarning,
# whereas the chained form returns a fresh frame and is safe to re-run.
peaceful = non_war_dataset[non_war_dataset.PSys == 1].drop(columns='PSys')

non_peaceful = non_war_dataset[non_war_dataset.PSys == 2].drop(columns='PSys')

# T-Test

In [26]:
def compute_ttest(peaceful, non_peaceful):
    """Run an independent two-sample t-test on every column of `peaceful`.

    Parameters
    ----------
    peaceful, non_peaceful : pandas.DataFrame
        The two groups to compare. Every column of `peaceful` must also be
        present in `non_peaceful`.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'feature' (the column name), with columns 't_statistic'
        and 'p_value' taken from `scipy.stats.ttest_ind`.

    Raises
    ------
    KeyError
        If a column of `peaceful` is missing from `non_peaceful`.
    """
    # Collect one record per feature and build the frame once: DataFrame.append
    # was removed in pandas 2.0, and growing a frame inside a loop is quadratic.
    records = []
    for var in peaceful.columns:
        result = stats.ttest_ind(peaceful[var], non_peaceful[var])
        records.append({'feature': var,
                        't_statistic': result.statistic,
                        'p_value': result.pvalue})

    # Passing `columns` keeps the expected layout even when there are no features.
    res_df = pd.DataFrame(records, columns=['feature', 't_statistic', 'p_value'])
    return res_df.set_index('feature')

In [27]:
ttest = compute_ttest(peaceful, non_peaceful)

In [28]:
ttest

Unnamed: 0_level_0,t_statistic,p_value
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
SymP6,-1.1637,0.250817
NWNorm5.1,-1.803568,0.078149
RitP6,-1.763857,0.084699
Dep3.3Econ,-1.50951,0.138317
Int2.4Hist,-2.522966,0.015326
ID1.1Over,-1.876233,0.067266
NWVal4.1,-1.689528,0.098193
Int2.2Econ,-1.667172,0.102584
Dep3.2Ecol,-2.265596,0.02845
CM8.5Peace,-2.101315,0.041375


# Mann-Whitney U-Test

In [29]:
def compute_mannwhitneyu(peaceful, non_peaceful):
    """Run a Mann-Whitney U test on every column of `peaceful`.

    Parameters
    ----------
    peaceful, non_peaceful : pandas.DataFrame
        The two groups to compare. Every column of `peaceful` must also be
        present in `non_peaceful`.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'feature' (the column name), with columns 'statistic'
        and 'p_value' taken from `scipy.stats.mannwhitneyu`.

    Raises
    ------
    KeyError
        If a column of `peaceful` is missing from `non_peaceful`.
    """
    # NOTE(review): `alternative` is left at SciPy's default, which has changed
    # across SciPy releases; confirm the installed default matches the
    # intended (one- vs two-sided) test before comparing p-values.

    # Collect one record per feature and build the frame once: DataFrame.append
    # was removed in pandas 2.0, and growing a frame inside a loop is quadratic.
    records = []
    for var in peaceful.columns:
        result = stats.mannwhitneyu(peaceful[var], non_peaceful[var])
        records.append({'feature': var,
                        'statistic': result.statistic,
                        'p_value': result.pvalue})

    # Passing `columns` keeps the expected layout even when there are no features.
    res_df = pd.DataFrame(records, columns=['feature', 'statistic', 'p_value'])
    return res_df.set_index('feature')

In [30]:
mannwhitneyu = compute_mannwhitneyu(peaceful, non_peaceful)

In [31]:
mannwhitneyu

Unnamed: 0_level_0,statistic,p_value
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
SymP6,200.0,0.125066
NWNorm5.1,192.5,0.122128
RitP6,197.5,0.148734
Dep3.3Econ,219.0,0.31199
Int2.4Hist,163.0,0.028627
ID1.1Over,214.5,0.27669
NWVal4.1,221.5,0.332944
Int2.2Econ,210.5,0.244872
Dep3.2Ecol,170.5,0.045005
CM8.5Peace,168.5,0.011475
