In [169]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import stats
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import seaborn as sns
import statsmodels.api as sm


In [170]:
def load_peace_sys_data():
    df = pd.DataFrame(pd.read_csv('peace_sys.csv', index_col=0, na_values=['(NA)']))
    return df

In [171]:
def get_odds_ratios(coefs):
    return np.exp(coefs[0])

In [172]:
peace_sys = load_peace_sys_data()

peace_sys

Unnamed: 0_level_0,SCCS,Coder,ID1.1Over,ID1.2Ethno,Int2.1Mar,Int2.2Econ,Int2.3Pol,Int2.4Hist,Dep3.1Sec,Dep3.2Ecol,...,CM8.5Peace,CM8.6War,Lead9.1P,Lead9.2War,Cult10.1Com,Cult10.2Diff,Comp10.3,InComp10.4,PSys,PSysRec
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Gilbertese,107,KA,9.0,2.0,3,9.0,9,9,9,9,...,9,9,9,9,9,9,9,9,2,0
Marshallese,108,EC,1.0,9.0,9,1.0,9,9,9,9,...,9,2,9,4,9,9,9,9,2,0
E. Pomo,135,"KA, EC",9.0,9.0,9,9.0,9,9,1,1,...,9,9,2,9,9,9,9,9,2,0
Popoluca,154,KA,9.0,3.0,9,3.0,9,9,9,3,...,9,9,9,9,9,9,9,9,2,0
Konso,35,KA,9.0,2.0,3,4.0,9,4,9,2,...,9,9,9,9,3,9,3,3,2,0
Bribri,157,DG,9.0,2.0,2,9.0,9,9,9,9,...,9,9,9,9,9,9,9,9,2,0
Tallensi,23,DG,1.0,2.0,4,9.0,1,3,3,9,...,2,9,9,9,4,1,3,9,2,0
Russians,54,DG,4.0,9.0,1,9.0,9,9,9,9,...,2,1,1,9,2,9,9,9,2,0
Trukese,109,DG,3.0,9.0,4,4.0,3,9,3,9,...,9,9,9,9,9,9,9,9,2,0
Toraja,87,DG,1.0,9.0,1,1.0,9,9,1,9,...,9,9,9,9,9,9,9,9,2,0


In [173]:
peace_sys = peace_sys.replace(9, np.nan)

In [203]:
# mean of each column
peace_sys.mean()

ID1.1Over       2.532258
ID1.2Ethno      2.600000
Int2.1Mar       2.428571
Int2.2Econ      2.984375
Int2.3Pol       2.391304
Int2.4Hist      2.913043
Dep3.1Sec       2.500000
Dep3.2Ecol      2.500000
Dep3.3Econ      2.689655
NWVal4.1        3.171875
WVal4.2         2.763889
NWNorm5.1       3.083333
WNorm5.2        2.451613
MythsP6         2.523810
MythsWar6       2.218750
RitP6           2.565217
RitWar6         2.545455
SymP6           2.923077
SymWar6         2.325000
SuperOrd7       2.625000
CM8.1Neg        2.842105
CM8.2Med        2.875000
CM8.3Arb        2.142857
CM8.4Adj        2.178571
CM8.5Peace      3.100000
CM8.6War        1.500000
Lead9.1P        2.764706
Lead9.2War      2.937500
Cult10.1Com     3.050000
Cult10.2Diff    1.411765
Comp10.3        2.714286
InComp10.4      1.818182
PSys            1.652174
PSysRec         0.347826
dtype: float64

In [174]:
peace_sys = peace_sys.fillna(peace_sys.mean())

In [175]:
peace_sys = peace_sys.drop(['SCCS','Coder'], axis=1)

In [176]:
NON_WAR_VARS = ['SymP6', 'NWNorm5.1', 'RitP6', 'Dep3.3Econ', 'Int2.4Hist', 'ID1.1Over', 
                'NWVal4.1', 'Int2.2Econ', 'Dep3.2Ecol', 'CM8.5Peace', 'PSys']

In [177]:
NON_WAR_VARS

['SymP6',
 'NWNorm5.1',
 'RitP6',
 'Dep3.3Econ',
 'Int2.4Hist',
 'ID1.1Over',
 'NWVal4.1',
 'Int2.2Econ',
 'Dep3.2Ecol',
 'CM8.5Peace',
 'PSys']

In [178]:
non_war_dataset = peace_sys[NON_WAR_VARS]

In [179]:
non_war_dataset

Unnamed: 0_level_0,SymP6,NWNorm5.1,RitP6,Dep3.3Econ,Int2.4Hist,ID1.1Over,NWVal4.1,Int2.2Econ,Dep3.2Ecol,CM8.5Peace,PSys
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Gilbertese,2.923077,3.083333,2.565217,2.689655,2.913043,2.532258,3.171875,2.984375,2.5,3.1,2
Marshallese,2.0,3.083333,2.0,3.0,2.913043,1.0,4.0,1.0,2.5,3.1,2
E. Pomo,2.923077,3.083333,2.565217,1.0,2.913043,2.532258,2.5,2.984375,1.0,3.1,2
Popoluca,2.923077,3.083333,2.565217,3.0,2.913043,2.532258,3.0,3.0,3.0,3.1,2
Konso,3.0,3.083333,2.565217,3.0,4.0,2.532258,3.0,4.0,2.0,3.1,2
Bribri,2.923077,2.0,2.565217,2.689655,2.913043,2.532258,2.0,2.984375,2.5,3.1,2
Tallensi,2.923077,3.0,2.565217,2.0,3.0,1.0,4.0,2.984375,2.5,2.0,2
Russians,2.923077,3.083333,2.565217,2.689655,2.913043,4.0,3.171875,2.984375,2.5,2.0,2
Trukese,2.923077,3.0,2.0,3.0,2.913043,3.0,3.0,4.0,2.5,3.1,2
Toraja,2.923077,1.0,1.0,1.0,2.913043,1.0,3.171875,1.0,2.5,3.1,2


In [180]:
y = non_war_dataset.PSys
X = non_war_dataset.drop('PSys', axis=1)

In [181]:
# generating train_set and test_set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.4 , stratify=y, random_state=42)

# Random Forest

In [182]:
rfc = RandomForestClassifier(n_estimators=1000).fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)

In [183]:
print('Accuracy: ', accuracy_score(y_test, rfc_pred))
print('F1 Score: ', f1_score(y_test, rfc_pred, average='weighted'))
print('Recall Score: ', recall_score(y_test, rfc_pred, average='weighted'))

Accuracy:  0.8947368421052632
F1 Score:  0.8900134952766531
Recall Score:  0.8947368421052632


In [184]:
rfc.feature_importances_

array([0.06827039, 0.20018045, 0.08972711, 0.06970266, 0.12288338,
       0.06743962, 0.21606456, 0.06185313, 0.08468404, 0.01919467])

# Logistic Regression

In [185]:
logreg = LogisticRegression(random_state=42, C=1e9)

In [186]:
logreg.fit(X, y)

LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [187]:
logreg_pred = logreg.predict(X_test)

In [188]:
print('Accuracy: ', accuracy_score(y_test, logreg_pred))
print('F1 Score: ', f1_score(y_test, logreg_pred, average='weighted'))
print('Recall Score: ', recall_score(y_test, logreg_pred, average='weighted'))

Accuracy:  1.0
F1 Score:  1.0
Recall Score:  1.0


In [189]:
logreg.coef_

array([[ 0.59802361, -2.21508801, -1.16885619, -0.61278152,  0.38971173,
        -1.12348218, -0.72535515, -0.41568741, -1.34981695, -1.9064052 ]])

In [190]:
logreg.intercept_

array([26.1104215])

### Logistic Beta

In [191]:
logistic_beta = pd.DataFrame({'Variable' : X.columns, 'Logistic Beta' : logreg.coef_[0]})
logistic_beta = logistic_beta.set_index('Variable')

In [192]:
logistic_beta

Unnamed: 0_level_0,Logistic Beta
Variable,Unnamed: 1_level_1
SymP6,0.598024
NWNorm5.1,-2.215088
RitP6,-1.168856
Dep3.3Econ,-0.612782
Int2.4Hist,0.389712
ID1.1Over,-1.123482
NWVal4.1,-0.725355
Int2.2Econ,-0.415687
Dep3.2Ecol,-1.349817
CM8.5Peace,-1.906405


### Logistic Odds Ratio

In [193]:
odd_ratios = pd.DataFrame({'Variable' : X.columns, "Logistic Odds Ratio" : get_odds_ratios(logreg.coef_)})
odd_ratios = odd_ratios.set_index('Variable')

In [194]:
odd_ratios

Unnamed: 0_level_0,Logistic Odds Ratio
Variable,Unnamed: 1_level_1
SymP6,1.818521
NWNorm5.1,0.109144
RitP6,0.310722
Dep3.3Econ,0.541842
Int2.4Hist,1.476555
ID1.1Over,0.325146
NWVal4.1,0.484153
Int2.2Econ,0.659887
Dep3.2Ecol,0.259288
CM8.5Peace,0.148614


# Dividing to Peaceful and Non Peaceful

In [195]:
peaceful = non_war_dataset[non_war_dataset.PSys == 1]
peaceful.drop('PSys', axis=1, inplace=True)

non_peaceful = non_war_dataset[non_war_dataset.PSys == 2]
non_peaceful.drop('PSys', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [196]:
peaceful

Unnamed: 0_level_0,SymP6,NWNorm5.1,RitP6,Dep3.3Econ,Int2.4Hist,ID1.1Over,NWVal4.1,Int2.2Econ,Dep3.2Ecol,CM8.5Peace
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Switzerland,4.0,4.0,4.0,3.0,4.0,3.0,4.0,2.984375,3.0,2.0
Iroquois,4.0,3.083333,4.0,2.0,4.0,3.5,4.0,3.5,2.0,4.0
Orang Asli,2.923077,4.0,2.565217,2.5,3.0,3.0,4.0,2.0,2.0,3.1
Nilgiri Pla.,2.923077,4.0,4.0,4.0,4.0,2.0,3.171875,4.0,2.5,3.1
Wynaad Pla.,2.923077,4.0,3.0,3.0,3.0,3.0,4.0,4.0,2.5,3.1
Nordic Nats.,4.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0
Up Xingu RB,2.923077,4.0,3.0,4.0,2.913043,4.0,4.0,4.0,1.0,3.1
W. Australia,4.0,4.0,4.0,2.689655,4.0,4.0,4.0,2.0,4.0,4.0
Mon-Naskapi,2.923077,4.0,2.565217,2.689655,3.0,2.0,4.0,4.0,4.0,3.1
Italy,2.0,2.0,1.0,2.5,2.0,2.0,2.0,3.0,2.5,3.1


# T-Test

In [197]:
def compute_ttest(peaceful, non_peaceful):
    variables = peaceful.columns
    res_df = pd.DataFrame(columns=['feature', 't_statistic', 'p_value'])
    
    for var in variables:
        x = peaceful[var]
        y = non_peaceful[var]
        ttest = stats.ttest_ind(x, y)
        temp = pd.DataFrame({'feature' : [var] , 
                             't_statistic' : ttest.statistic, 
                             'p_value' : [ttest.pvalue]})
        res_df = res_df.append(temp)
        
    res_df.set_index('feature', inplace=True)
    return res_df

In [198]:
ttest = compute_ttest(peaceful, non_peaceful)

In [199]:
ttest

Unnamed: 0_level_0,t_statistic,p_value
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
SymP6,3.11532,0.00323
NWNorm5.1,4.507533,4.8e-05
RitP6,3.630994,0.000733
Dep3.3Econ,3.972121,0.00026
Int2.4Hist,2.948822,0.005091
ID1.1Over,3.461848,0.001206
NWVal4.1,3.109278,0.003285
Int2.2Econ,2.829371,0.007
Dep3.2Ecol,2.524637,0.015263
CM8.5Peace,2.646938,0.011228


# Mann-Whitney U-Test

In [200]:
def compute_mannwhitneyu(peaceful, non_peaceful):
    variables = peaceful.columns
    res_df = pd.DataFrame(columns=['feature', 'statistic', 'p_value'])
    
    for var in variables:
        x = peaceful[var]
        y = non_peaceful[var]
        utest = stats.mannwhitneyu(x, y)
        temp = pd.DataFrame({'feature' : [var] , 
                             'statistic' : utest.statistic, 
                             'p_value' : [utest.pvalue]})
        res_df = res_df.append(temp)
        
    res_df.set_index('feature', inplace=True)
    return res_df

In [201]:
mannwhitneyu = compute_mannwhitneyu(peaceful, non_peaceful)

In [202]:
mannwhitneyu

Unnamed: 0_level_0,statistic,p_value
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
SymP6,145.0,0.002968
NWNorm5.1,62.5,6e-06
RitP6,81.5,4.4e-05
Dep3.3Econ,110.0,0.000978
Int2.4Hist,92.0,0.000123
ID1.1Over,116.5,0.001772
NWVal4.1,106.5,0.00071
Int2.2Econ,108.5,0.000904
Dep3.2Ecol,150.5,0.01438
CM8.5Peace,149.5,0.00197
