In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import stats
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import seaborn as sns
import statsmodels.api as sm


In [2]:
def load_peace_sys_data(path='peace_sys.csv'):
    """Read the peace-systems table from a CSV file.

    Parameters
    ----------
    path : str or path-like, default 'peace_sys.csv'
        Location of the CSV file. The default is the file name that used to
        be hard-coded, so existing zero-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The parsed table, indexed by the file's first column. Cells whose
        text is exactly '(NA)' are read as missing values, in addition to
        the markers pandas treats as missing by default.
    """
    # read_csv already returns a DataFrame; no extra pd.DataFrame() wrapper is needed.
    return pd.read_csv(path, index_col=0, na_values=['(NA)'])

In [32]:
def get_odds_ratios(coefs):
    """Turn fitted log-odds coefficients into odds ratios.

    Only the first row of `coefs` is used; every entry of that row is
    exponentiated and the resulting array is returned.
    """
    first_row = coefs[0]
    odds = np.exp(first_row)
    return odds

In [4]:
# Load the table through the helper defined earlier in the notebook,
# then preview its first rows.
peace_sys = load_peace_sys_data()

peace_sys.head()

Unnamed: 0_level_0,SCCS,Coder,ID1.1Over,ID1.2Ethno,Int2.1Mar,Int2.2Econ,Int2.3Pol,Int2.4Hist,Dep3.1Sec,Dep3.2Ecol,...,CM8.5Peace,CM8.6War,Lead9.1P,Lead9.2War,Cult10.1Com,Cult10.2Diff,Comp10.3,InComp10.4,PSys,PSysRec
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Gilbertese,107,KA,9.0,2.0,3,9.0,9,9,9,9,...,9,9,9,9,9,9,9,9,2,0
Marshallese,108,EC,1.0,9.0,9,1.0,9,9,9,9,...,9,2,9,4,9,9,9,9,2,0
E. Pomo,135,"KA, EC",9.0,9.0,9,9.0,9,9,1,1,...,9,9,2,9,9,9,9,9,2,0
Popoluca,154,KA,9.0,3.0,9,3.0,9,9,9,3,...,9,9,9,9,9,9,9,9,2,0
Konso,35,KA,9.0,2.0,3,4.0,9,4,9,2,...,9,9,9,9,3,9,3,3,2,0


In [5]:
# Remove the 'SCCS' and 'Coder' columns. Because this cell reassigns its
# own input, errors='ignore' keeps it safe to re-run: once the columns are
# gone, a second execution is a no-op instead of raising KeyError.
peace_sys = peace_sys.drop(columns=['SCCS', 'Coder'], errors='ignore')

In [6]:
# Columns kept for the analysis. 'PSys' is later split off as the target;
# the remaining ten columns are used as features.
NON_WAR_VARS = [
    'SymP6',
    'NWNorm5.1',
    'RitP6',
    'Dep3.3Econ',
    'Int2.4Hist',
    'ID1.1Over',
    'NWVal4.1',
    'Int2.2Econ',
    'Dep3.2Ecol',
    'CM8.5Peace',
    'PSys',
]

In [7]:
# Show the selected column names.
NON_WAR_VARS

['SymP6',
 'NWNorm5.1',
 'RitP6',
 'Dep3.3Econ',
 'Int2.4Hist',
 'ID1.1Over',
 'NWVal4.1',
 'Int2.2Econ',
 'Dep3.2Ecol',
 'CM8.5Peace',
 'PSys']

In [8]:
# Restrict the table to the columns listed in NON_WAR_VARS
# (the ten feature columns plus 'PSys').
non_war_dataset = peace_sys[NON_WAR_VARS]

In [9]:
# Display the analysis subset.
# NOTE(review): this renders the whole frame; a .head() preview may be enough.
non_war_dataset

Unnamed: 0_level_0,SymP6,NWNorm5.1,RitP6,Dep3.3Econ,Int2.4Hist,ID1.1Over,NWVal4.1,Int2.2Econ,Dep3.2Ecol,CM8.5Peace,PSys
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Gilbertese,9,9,9,9.0,9,9.0,9.0,9.0,9,9,2
Marshallese,2,9,2,3.0,9,1.0,4.0,1.0,9,9,2
E. Pomo,9,9,9,1.0,9,9.0,2.5,9.0,1,9,2
Popoluca,9,9,9,3.0,9,9.0,3.0,3.0,3,9,2
Konso,3,9,9,3.0,4,9.0,3.0,4.0,2,9,2
Bribri,9,2,9,9.0,9,9.0,2.0,9.0,9,9,2
Tallensi,9,3,9,2.0,3,1.0,4.0,9.0,9,2,2
Russians,9,9,9,9.0,9,4.0,9.0,9.0,9,2,2
Trukese,9,3,2,3.0,9,3.0,3.0,4.0,9,9,2
Toraja,9,1,1,1.0,9,1.0,9.0,1.0,9,9,2


In [10]:
# Features are every column except 'PSys'; 'PSys' itself is the target.
X = non_war_dataset.drop(columns='PSys')
y = non_war_dataset['PSys']

In [11]:
# Hold out 40% of the rows as a test set. The split is stratified on the
# target and uses a fixed seed so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)

# Random Forest

In [12]:
# Fit a 1000-tree random forest on the training split. The forest is seeded
# (same seed as the train/test split and the logistic model) so the fitted
# trees, the metrics and the feature importances are reproducible.
rfc = RandomForestClassifier(n_estimators=1000, random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out rows.
rfc_pred = rfc.predict(X_test)

In [13]:
# Report test-set metrics for the random forest. F1 and recall are averaged
# across classes with weights proportional to each class's support; with that
# weighting, recall is numerically equal to accuracy.
print('Accuracy: ', accuracy_score(y_test, rfc_pred))
print('F1 Score: ', f1_score(y_test, rfc_pred, average='weighted'))
print('Recall Score: ', recall_score(y_test, rfc_pred, average='weighted'))

Accuracy:  0.8421052631578947
F1 Score:  0.8293460925039873
Recall Score:  0.8421052631578947


In [14]:
# Feature importances of the fitted forest, in the column order of X.
rfc.feature_importances_

array([0.05368608, 0.14938955, 0.07446974, 0.09466335, 0.14085132,
       0.08955129, 0.15642243, 0.07113672, 0.13095774, 0.03887178])

# Logistic Regression

In [15]:
# C is the inverse regularization strength, so C=1e9 makes the penalty
# negligible, i.e. this is close to an unpenalized logistic regression.
logreg = LogisticRegression(random_state=42, C=1e9)

In [16]:
# Fit the logistic model on the training split.
logreg.fit(X_train, y_train)

LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# Predicted labels for the held-out rows.
logreg_pred = logreg.predict(X_test)

In [18]:
# Report test-set metrics for the logistic regression, using the same
# support-weighted averaging as for the random forest.
print('Accuracy: ', accuracy_score(y_test, logreg_pred))
print('F1 Score: ', f1_score(y_test, logreg_pred, average='weighted'))
print('Recall Score: ', recall_score(y_test, logreg_pred, average='weighted'))

Accuracy:  0.6842105263157895
F1 Score:  0.6842105263157895
Recall Score:  0.6842105263157895


In [19]:
# Fitted coefficients (one row, one entry per column of X).
logreg.coef_

array([[ 0.11777175,  0.15140893, -0.22199729, -0.07219933,  0.16197307,
         0.0367737 ,  0.3554567 ,  0.57268005,  0.19116571,  0.54971107]])

In [20]:
# Fitted intercept term.
logreg.intercept_

array([-9.39998962])

### Logistic Beta

In [44]:
# One row per feature, indexed by feature name, holding the fitted
# logistic coefficient for that feature.
logistic_beta = pd.DataFrame(
    {'Variable': X.columns, 'Logistic Beta': logreg.coef_[0]}
).set_index('Variable')

In [45]:
# Show the coefficient table.
logistic_beta

Unnamed: 0_level_0,Logistic Beta
Variable,Unnamed: 1_level_1
SymP6,0.117772
NWNorm5.1,0.151409
RitP6,-0.221997
Dep3.3Econ,-0.072199
Int2.4Hist,0.161973
ID1.1Over,0.036774
NWVal4.1,0.355457
Int2.2Econ,0.57268
Dep3.2Ecol,0.191166
CM8.5Peace,0.549711


### Logistic Odds Ratio

In [41]:
# One row per feature, indexed by feature name, holding exp(coefficient),
# i.e. the odds ratio computed by get_odds_ratios.
odd_ratios = pd.DataFrame(
    {'Variable': X.columns, "Logistic Odds Ratio": get_odds_ratios(logreg.coef_)}
).set_index('Variable')

In [42]:
# Show the odds-ratio table.
odd_ratios

Unnamed: 0_level_0,Logistic Odds Ratio
Variable,Unnamed: 1_level_1
SymP6,1.124987
NWNorm5.1,1.163472
RitP6,0.800918
Dep3.3Econ,0.930345
Int2.4Hist,1.175829
ID1.1Over,1.037458
NWVal4.1,1.426832
Int2.2Econ,1.773012
Dep3.2Ecol,1.21066
CM8.5Peace,1.732752


# Dividing into Peaceful and Non-Peaceful

In [21]:
# Split the rows by the 'PSys' label (1 -> peaceful, 2 -> non_peaceful) and
# remove the label column from each group. drop() returns a new frame here
# rather than mutating a filtered selection in place, which is what
# triggered pandas' SettingWithCopyWarning.
peaceful = non_war_dataset[non_war_dataset.PSys == 1].drop(columns='PSys')

non_peaceful = non_war_dataset[non_war_dataset.PSys == 2].drop(columns='PSys')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [22]:
# Display the rows labelled PSys == 1.
peaceful

Unnamed: 0_level_0,SymP6,NWNorm5.1,RitP6,Dep3.3Econ,Int2.4Hist,ID1.1Over,NWVal4.1,Int2.2Econ,Dep3.2Ecol,CM8.5Peace
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Switzerland,4,4,4,3.0,4,3.0,4.0,9.0,3,2
Iroquois,4,9,4,2.0,4,3.5,4.0,3.5,2,4
Orang Asli,9,4,9,2.5,3,3.0,4.0,2.0,2,9
Nilgiri Pla.,9,4,4,4.0,4,2.0,9.0,4.0,9,9
Wynaad Pla.,9,4,3,3.0,3,3.0,4.0,4.0,9,9
Nordic Nats.,4,4,3,4.0,4,4.0,4.0,4.0,2,4
Up Xingu RB,9,4,3,4.0,9,4.0,4.0,4.0,1,9
W. Australia,4,4,4,9.0,4,4.0,4.0,2.0,4,4
Mon-Naskapi,9,4,9,9.0,3,2.0,4.0,4.0,4,9
Italy,2,2,1,2.5,2,2.0,2.0,3.0,9,9


# T-Test

In [23]:
def compute_ttest(peaceful, non_peaceful):
    """Run an independent two-sample t-test on every column of `peaceful`.

    Parameters
    ----------
    peaceful : pandas.DataFrame
        First group; its columns define which variables are tested.
    non_peaceful : pandas.DataFrame
        Second group; must contain every column present in `peaceful`.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'feature' (the column name), with columns 't_statistic'
        and 'p_value' as returned by `scipy.stats.ttest_ind` called with its
        default options. Empty (but with the same columns) when `peaceful`
        has no columns.

    Raises
    ------
    KeyError
        If a column of `peaceful` is missing from `non_peaceful`.
    """
    # Collect one record per variable and build the frame once, instead of
    # growing it row by row with DataFrame.append (removed in pandas 2.0).
    records = []
    for var in peaceful.columns:
        result = stats.ttest_ind(peaceful[var], non_peaceful[var])
        records.append({'feature': var,
                        't_statistic': result.statistic,
                        'p_value': result.pvalue})

    # Passing `columns` keeps the expected schema even when there are no records.
    res_df = pd.DataFrame(records, columns=['feature', 't_statistic', 'p_value'])
    return res_df.set_index('feature')

In [24]:
# Per-feature t-test comparing the two groups.
ttest = compute_ttest(peaceful, non_peaceful)

In [25]:
# Show the t-test results table.
ttest

Unnamed: 0_level_0,t_statistic,p_value
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
SymP6,-1.1637,0.250817
NWNorm5.1,-1.803568,0.078149
RitP6,-1.763857,0.084699
Dep3.3Econ,-1.50951,0.138317
Int2.4Hist,-2.522966,0.015326
ID1.1Over,-1.876233,0.067266
NWVal4.1,-1.689528,0.098193
Int2.2Econ,-1.667172,0.102584
Dep3.2Ecol,-2.265596,0.02845
CM8.5Peace,-2.101315,0.041375


# Mann-Whitney U-Test

In [26]:
def compute_mannwhitneyu(peaceful, non_peaceful):
    """Run a Mann-Whitney U test on every column of `peaceful`.

    Parameters
    ----------
    peaceful : pandas.DataFrame
        First group; its columns define which variables are tested.
    non_peaceful : pandas.DataFrame
        Second group; must contain every column present in `peaceful`.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'feature' (the column name), with columns 'statistic'
        and 'p_value' as returned by `scipy.stats.mannwhitneyu` called with
        its default options. Empty (but with the same columns) when
        `peaceful` has no columns.

    Raises
    ------
    KeyError
        If a column of `peaceful` is missing from `non_peaceful`.
    """
    # NOTE(review): `alternative` is left at SciPy's default, which appears to
    # have differed between SciPy releases -- confirm the intended sidedness
    # and pass it explicitly if the reported p-values must be comparable.

    # Collect one record per variable and build the frame once, instead of
    # growing it row by row with DataFrame.append (removed in pandas 2.0).
    records = []
    for var in peaceful.columns:
        result = stats.mannwhitneyu(peaceful[var], non_peaceful[var])
        records.append({'feature': var,
                        'statistic': result.statistic,
                        'p_value': result.pvalue})

    # Passing `columns` keeps the expected schema even when there are no records.
    res_df = pd.DataFrame(records, columns=['feature', 'statistic', 'p_value'])
    return res_df.set_index('feature')

In [27]:
# Per-feature Mann-Whitney U test comparing the two groups.
mannwhitneyu = compute_mannwhitneyu(peaceful, non_peaceful)

In [28]:
# Show the Mann-Whitney results table.
mannwhitneyu

Unnamed: 0_level_0,statistic,p_value
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
SymP6,200.0,0.125066
NWNorm5.1,192.5,0.122128
RitP6,197.5,0.148734
Dep3.3Econ,219.0,0.31199
Int2.4Hist,163.0,0.028627
ID1.1Over,214.5,0.27669
NWVal4.1,221.5,0.332944
Int2.2Econ,210.5,0.244872
Dep3.2Ecol,170.5,0.045005
CM8.5Peace,168.5,0.011475
