In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import stats
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import seaborn as sns
import statsmodels.api as sm


In [40]:
def load_peace_sys_data():
    df = pd.DataFrame(pd.read_csv('peace_sys.csv', index_col=0, na_values=['(NA)']))
    return df 

In [41]:
def get_odds_ratios(coefs):
    return np.exp(coefs[0])

In [76]:
def logit_pvalue(model, x):
    """ Calculate z-scores for scikit-learn LogisticRegression.
    parameters:
        model: fitted sklearn.linear_model.LogisticRegression with intercept and large C
        x:     matrix on which the model was fit
    This function uses asymtptics for maximum likelihood estimates.
    """
    # first index refers to proba that belongs to class 0
    # second index refers to proba that belongs to calss 1
    p = model.predict_proba(x) # return matrix (N,2)
    # number of samples
    n = len(p)
    # number of features + 1 
    m = len(model.coef_[0]) + 1
    coefs = np.concatenate([model.intercept_, model.coef_[0]]) # put intercept and coefs in same array
    x_full = np.matrix(np.insert(np.array(x), 0, 1, axis = 1)) # 
    ans = np.zeros((m, m))
    for i in range(n):
        # dot product of transposed row and row
        # then multiply by both probas
        # add it to ans
        ans = ans + np.dot(np.transpose(x_full[i, :]), x_full[i, :]) * p[i,1] * p[i, 0]
    # acovariance matrix
    vcov = np.linalg.inv(np.matrix(ans))
    # square root diagonal of covariace matrix is the standard error
    se = np.sqrt(np.diag(vcov))
    # divide coefs by standard error
    t =  coefs/se 
    # two tailed using normal dist
    p = (1 - stats.norm.cdf(abs(t))) * 2
    return p

In [43]:
peace_sys = load_peace_sys_data()

peace_sys.head()

Unnamed: 0_level_0,SCCS,Coder,ID1.1Over,ID1.2Ethno,Int2.1Mar,Int2.2Econ,Int2.3Pol,Int2.4Hist,Dep3.1Sec,Dep3.2Ecol,...,CM8.5Peace,CM8.6War,Lead9.1P,Lead9.2War,Cult10.1Com,Cult10.2Diff,Comp10.3,InComp10.4,PSys,PSysRec
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Gilbertese,107,KA,9.0,2.0,3,9.0,9,9,9,9,...,9,9,9,9,9,9,9,9,2,0
Marshallese,108,EC,1.0,9.0,9,1.0,9,9,9,9,...,9,2,9,4,9,9,9,9,2,0
E. Pomo,135,"KA, EC",9.0,9.0,9,9.0,9,9,1,1,...,9,9,2,9,9,9,9,9,2,0
Popoluca,154,KA,9.0,3.0,9,3.0,9,9,9,3,...,9,9,9,9,9,9,9,9,2,0
Konso,35,KA,9.0,2.0,3,4.0,9,4,9,2,...,9,9,9,9,3,9,3,3,2,0


In [44]:
peace_sys = peace_sys.drop(['SCCS','Coder'], axis=1)

In [45]:
WAR_VARS = ['WNorm5.2', 'Lead9.2War', 'SymWar6', 'RitWar6', 'WVal4.2', 'PSys']

In [46]:
WAR_VARS

['WNorm5.2', 'Lead9.2War', 'SymWar6', 'RitWar6', 'WVal4.2', 'PSys']

In [47]:
war_dataset = peace_sys[WAR_VARS]

war_dataset

Unnamed: 0_level_0,WNorm5.2,Lead9.2War,SymWar6,RitWar6,WVal4.2,PSys
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Gilbertese,9,9,9.0,9,9.0,2
Marshallese,9,4,2.0,9,3.0,2
E. Pomo,2,9,2.5,4,3.0,2
Popoluca,9,9,9.0,9,9.0,2
Konso,4,9,3.0,9,3.0,2
Bribri,3,9,2.0,2,3.0,2
Tallensi,9,9,9.0,9,2.0,2
Russians,9,9,9.0,9,9.0,2
Trukese,3,9,9.0,9,9.0,2
Toraja,3,9,9.0,9,3.0,2


In [48]:
y = war_dataset.PSys
X = war_dataset.drop('PSys', axis=1)

In [49]:
# generating train_set and test_set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.4 , stratify=y, random_state=42)

# Random Forest

In [50]:
rfc = RandomForestClassifier(n_estimators=1000).fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)

In [51]:
print('Accuracy: ', accuracy_score(y_test, rfc_pred))
print('F1 Score: ', f1_score(y_test, rfc_pred, average='weighted'))
print('Recall Score: ', recall_score(y_test, rfc_pred, average='weighted'))

Accuracy:  0.7368421052631579
F1 Score:  0.7398932112890922
Recall Score:  0.7368421052631579


In [52]:
rfc.feature_importances_

array([0.29900645, 0.1081733 , 0.13855106, 0.13954948, 0.31471971])

# Logistic Regression

In [53]:
logreg = LogisticRegression(random_state=42, C=1e9)

In [54]:
logreg.fit(X, y)

LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [55]:
logreg_pred = logreg.predict(X_test)

In [56]:
print('Accuracy: ', accuracy_score(y_test, logreg_pred))
print('F1 Score: ', f1_score(y_test, logreg_pred, average='weighted'))
print('Recall Score: ', recall_score(y_test, logreg_pred, average='weighted'))

Accuracy:  0.8947368421052632
F1 Score:  0.8947368421052632
Recall Score:  0.8947368421052632


In [57]:
logreg.coef_

array([[ 0.39154662,  0.07293433, -0.06173836,  0.04686843,  0.36535045]])

In [58]:
logreg.intercept_

array([-2.40948292])

## Logistic Beta

In [59]:
logistic_beta = pd.DataFrame({'Variable' : X.columns, 'Logistic Beta' : logreg.coef_[0]})
logistic_beta = logistic_beta.set_index('Variable')

In [60]:
logistic_beta

Unnamed: 0_level_0,Logistic Beta
Variable,Unnamed: 1_level_1
WNorm5.2,0.391547
Lead9.2War,0.072934
SymWar6,-0.061738
RitWar6,0.046868
WVal4.2,0.36535


## Logistic Odds Ratio

In [61]:
odd_ratios = pd.DataFrame({'Variable' : X.columns, "Logistic Odds Ratio" : get_odds_ratios(logreg.coef_)})
odd_ratios = odd_ratios.set_index('Variable')

In [62]:
odd_ratios

Unnamed: 0_level_0,Logistic Odds Ratio
Variable,Unnamed: 1_level_1
WNorm5.2,1.479267
Lead9.2War,1.07566
SymWar6,0.940129
RitWar6,1.047984
WVal4.2,1.441019


## P-values

In [63]:
logreg_pvalues = pd.DataFrame({'Variable' : X.columns, "Logistic Sig Level" : logit_pvalue(logreg, X)[1:]})
logreg_pvalues = logreg_pvalues.set_index('Variable')

In [64]:
logreg_pvalues

Unnamed: 0_level_0,Logistic Sig Level
Variable,Unnamed: 1_level_1
WNorm5.2,0.098579
Lead9.2War,0.55945
SymWar6,0.637344
RitWar6,0.733138
WVal4.2,0.276012


## Summary

In [74]:
logit = sm.Logit(y.replace(2, 0), sm.add_constant(X)).fit(disp=0)

In [66]:
logit.summary()

0,1,2,3
Dep. Variable:,PSys,No. Observations:,46.0
Model:,Logit,Df Residuals:,40.0
Method:,MLE,Df Model:,5.0
Date:,"Wed, 29 Jan 2020",Pseudo R-squ.:,0.31
Time:,20:55:13,Log-Likelihood:,-20.507
converged:,True,LL-Null:,-29.72
Covariance Type:,nonrobust,LLR p-value:,0.002456

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.4095,1.391,1.732,0.083,-0.317,5.136
WNorm5.2,-0.3916,0.237,-1.652,0.099,-0.856,0.073
Lead9.2War,-0.0729,0.125,-0.584,0.559,-0.318,0.172
SymWar6,0.0617,0.131,0.471,0.637,-0.195,0.318
RitWar6,-0.0469,0.137,-0.341,0.733,-0.316,0.223
WVal4.2,-0.3653,0.335,-1.089,0.276,-1.023,0.292


# Dividing to Peaceful and Non Peaceful

In [75]:
peaceful = war_dataset[war_dataset.PSys == 1]
peaceful.drop('PSys', axis=1, inplace=True)

non_peaceful = war_dataset[war_dataset.PSys == 2]
non_peaceful.drop('PSys', axis=1, inplace=True)

# T-Test

In [68]:
def compute_ttest(peaceful, non_peaceful):
    variables = peaceful.columns
    res_df = pd.DataFrame(columns=['feature', 't_statistic', 'p_value'])
    
    for var in variables:
        x = peaceful[var]
        y = non_peaceful[var]
        ttest = stats.ttest_ind(x, y)
        temp = pd.DataFrame({'feature' : [var] , 
                             't_statistic' : ttest.statistic, 
                             'p_value' : [ttest.pvalue]})
        res_df = res_df.append(temp)
        
    res_df.set_index('feature', inplace=True)
    return res_df

In [69]:
ttest = compute_ttest(peaceful, non_peaceful)

In [70]:
ttest

Unnamed: 0_level_0,t_statistic,p_value
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
WNorm5.2,-4.181508,0.000136
Lead9.2War,-1.492516,0.1427
SymWar6,-1.054722,0.29731
RitWar6,-1.557952,0.126409
WVal4.2,-3.403321,0.001429


# Mann-Whitney U-Test

In [71]:
def compute_mannwhitneyu(peaceful, non_peaceful):
    variables = peaceful.columns
    res_df = pd.DataFrame(columns=['feature', 'statistic', 'p_value'])
    
    for var in variables:
        x = peaceful[var]
        y = non_peaceful[var]
        utest = stats.mannwhitneyu(x, y)
        temp = pd.DataFrame({'feature' : [var] , 
                             'statistic' : utest.statistic, 
                             'p_value' : [utest.pvalue]})
        res_df = res_df.append(temp)
        
    res_df.set_index('feature', inplace=True)
    return res_df

In [72]:
mannwhitneyu = compute_mannwhitneyu(peaceful, non_peaceful)

In [73]:
mannwhitneyu

Unnamed: 0_level_0,statistic,p_value
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
WNorm5.2,66.0,2e-05
Lead9.2War,186.0,0.072436
SymWar6,189.0,0.098272
RitWar6,166.0,0.032931
WVal4.2,84.0,0.000104
