In [120]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import stats
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import seaborn as sns
import statsmodels.api as sm


In [121]:
def load_peace_sys_data():
    df = pd.DataFrame(pd.read_csv('peace_sys.csv', index_col=0, na_values=['(NA)']))
    return df

In [122]:
def get_odds_ratios(coefs):
    return np.exp(coefs[0])

In [123]:
def logit_pvalue(model, x):
    """ Calculate z-scores for scikit-learn LogisticRegression.
    parameters:
        model: fitted sklearn.linear_model.LogisticRegression with intercept and large C
        x:     matrix on which the model was fit
    This function uses asymtptics for maximum likelihood estimates.
    """
    # first index refers to proba that belongs to class 0
    # second index refers to proba that belongs to calss 1
    p = model.predict_proba(x) # return matrix (N,2)
    # number of samples
    n = len(p)
    # number of features + 1 
    m = len(model.coef_[0]) + 1
    coefs = np.concatenate([model.intercept_, model.coef_[0]]) # put intercept and coefs in same array
    x_full = np.matrix(np.insert(np.array(x), 0, 1, axis = 1)) # 
    ans = np.zeros((m, m))
    for i in range(n):
        # dot product of transposed row and row
        # then multiply by both probas
        # add it to ans
        ans = ans + np.dot(np.transpose(x_full[i, :]), x_full[i, :]) * p[i,1] * p[i, 0]
    # acovariance matrix
    vcov = np.linalg.inv(np.matrix(ans))
    # square root diagonal of covariace matrix
    se = np.sqrt(np.diag(vcov))
    # divide coefs by standard error
    t =  coefs/se 
    # two tailed using normal dist
    p = (1 - stats.norm.cdf(abs(t))) * 2
    return p

In [1]:
def logit_pvalues_with_nan(X, y):
    pvalues_df = pd.DataFrame(columns=['pvalue'])
    
    for i in X.columns:
        curr_col = X[[i]]
        curr_col[y.name] = y
        curr_col = curr_col.dropna()
        
        logreg = LogisticRegression(random_state=42, C=1e9)
        logreg.fit(curr_col.drop(y.name), curr_col[y.name])
        pvalue = logit_pvalue(logreg, curr_col.drop(y.name))
        
        pvalues.append({ str(i) : [pvalue]}, inplace=True)
        
    return pvalues_df

In [152]:
peace_sys = load_peace_sys_data()

peace_sys

Unnamed: 0_level_0,SCCS,Coder,ID1.1Over,ID1.2Ethno,Int2.1Mar,Int2.2Econ,Int2.3Pol,Int2.4Hist,Dep3.1Sec,Dep3.2Ecol,...,CM8.5Peace,CM8.6War,Lead9.1P,Lead9.2War,Cult10.1Com,Cult10.2Diff,Comp10.3,InComp10.4,PSys,PSysRec
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Gilbertese,107,KA,9.0,2.0,3,9.0,9,9,9,9,...,9,9,9,9,9,9,9,9,2,0
Marshallese,108,EC,1.0,9.0,9,1.0,9,9,9,9,...,9,2,9,4,9,9,9,9,2,0
E. Pomo,135,"KA, EC",9.0,9.0,9,9.0,9,9,1,1,...,9,9,2,9,9,9,9,9,2,0
Popoluca,154,KA,9.0,3.0,9,3.0,9,9,9,3,...,9,9,9,9,9,9,9,9,2,0
Konso,35,KA,9.0,2.0,3,4.0,9,4,9,2,...,9,9,9,9,3,9,3,3,2,0
Bribri,157,DG,9.0,2.0,2,9.0,9,9,9,9,...,9,9,9,9,9,9,9,9,2,0
Tallensi,23,DG,1.0,2.0,4,9.0,1,3,3,9,...,2,9,9,9,4,1,3,9,2,0
Russians,54,DG,4.0,9.0,1,9.0,9,9,9,9,...,2,1,1,9,2,9,9,9,2,0
Trukese,109,DG,3.0,9.0,4,4.0,3,9,3,9,...,9,9,9,9,9,9,9,9,2,0
Toraja,87,DG,1.0,9.0,1,1.0,9,9,1,9,...,9,9,9,9,9,9,9,9,2,0


In [153]:
peace_sys = peace_sys.replace(9, np.nan)

In [127]:
# mean of each column
peace_sys.mean()

SCCS            128.717391
ID1.1Over         1.706522
ID1.2Ethno        1.695652
Int2.1Mar         1.478261
Int2.2Econ        2.076087
Int2.3Pol         1.195652
Int2.4Hist        1.456522
Dep3.1Sec         1.413043
Dep3.2Ecol        1.304348
Dep3.3Econ        1.695652
NWVal4.1          2.206522
WVal4.2           2.163043
NWNorm5.1         1.608696
WNorm5.2          1.652174
MythsP6           1.152174
MythsWar6         0.771739
RitP6             1.282609
RitWar6           1.217391
SymP6             0.826087
SymWar6           1.010870
SuperOrd7         1.369565
CM8.1Neg          1.173913
CM8.2Med          0.500000
CM8.3Arb          0.326087
CM8.4Adj          0.663043
CM8.5Peace        0.673913
CM8.6War          0.326087
Lead9.1P          1.021739
Lead9.2War        1.021739
Cult10.1Com       1.326087
Cult10.2Diff      0.521739
Comp10.3          0.826087
InComp10.4        0.434783
PSys              1.652174
PSysRec           0.347826
dtype: float64

In [128]:
peace_sys = peace_sys.drop(['SCCS','Coder'], axis=1)

In [129]:
NON_WAR_VARS = ['SymP6', 'NWNorm5.1', 'RitP6', 'Dep3.3Econ', 'Int2.4Hist', 'ID1.1Over', 
                'NWVal4.1', 'Int2.2Econ', 'Dep3.2Ecol', 'CM8.5Peace', 'PSys']

In [130]:
NON_WAR_VARS

['SymP6',
 'NWNorm5.1',
 'RitP6',
 'Dep3.3Econ',
 'Int2.4Hist',
 'ID1.1Over',
 'NWVal4.1',
 'Int2.2Econ',
 'Dep3.2Ecol',
 'CM8.5Peace',
 'PSys']

In [131]:
non_war_dataset = peace_sys[NON_WAR_VARS]

In [132]:
non_war_dataset

Unnamed: 0_level_0,SymP6,NWNorm5.1,RitP6,Dep3.3Econ,Int2.4Hist,ID1.1Over,NWVal4.1,Int2.2Econ,Dep3.2Ecol,CM8.5Peace,PSys
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Gilbertese,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
Marshallese,2.0,0.0,2.0,3.0,0.0,1.0,4.0,1.0,0.0,0.0,2
E. Pomo,0.0,0.0,0.0,1.0,0.0,0.0,2.5,0.0,1.0,0.0,2
Popoluca,0.0,0.0,0.0,3.0,0.0,0.0,3.0,3.0,3.0,0.0,2
Konso,3.0,0.0,0.0,3.0,4.0,0.0,3.0,4.0,2.0,0.0,2
Bribri,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2
Tallensi,0.0,3.0,0.0,2.0,3.0,1.0,4.0,0.0,0.0,2.0,2
Russians,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,2.0,2
Trukese,0.0,3.0,2.0,3.0,0.0,3.0,3.0,4.0,0.0,0.0,2
Toraja,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,2


In [133]:
y = non_war_dataset.PSys
X = non_war_dataset.drop('PSys', axis=1)

In [134]:
# generating train_set and test_set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.4 , stratify=y, random_state=42)

# Random Forest

In [135]:
rfc = RandomForestClassifier(n_estimators=1000).fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)

In [136]:
print('Accuracy: ', accuracy_score(y_test, rfc_pred))
print('F1 Score: ', f1_score(y_test, rfc_pred, average='weighted'))
print('Recall Score: ', recall_score(y_test, rfc_pred, average='weighted'))

Accuracy:  0.8947368421052632
F1 Score:  0.8900134952766531
Recall Score:  0.8947368421052632


In [137]:
rfc.feature_importances_

array([0.04655456, 0.17976759, 0.06325209, 0.09726477, 0.12271868,
       0.07300231, 0.19817299, 0.05636493, 0.14666677, 0.01623532])

# Logistic Regression

In [138]:
logreg = LogisticRegression(random_state=42, C=1e9)

In [139]:
logreg.fit(X, y)

LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [140]:
logreg_pred = logreg.predict(X_test)

In [141]:
print('Accuracy: ', accuracy_score(y_test, logreg_pred))
print('F1 Score: ', f1_score(y_test, logreg_pred, average='weighted'))
print('Recall Score: ', recall_score(y_test, logreg_pred, average='weighted'))

Accuracy:  1.0
F1 Score:  1.0
Recall Score:  1.0


In [142]:
logreg.coef_

array([[-20.91396574, -37.8303433 ,   2.2225801 , -22.56553653,
        -16.34177001, -16.39019809,  13.2955359 ,  -1.35452563,
        -15.17980474,   0.92794881]])

In [143]:
logreg.intercept_

array([203.92777369])

### Logistic Beta

In [144]:
logistic_beta = pd.DataFrame({'Variable' : X.columns, 'Logistic Beta' : logreg.coef_[0]})
logistic_beta = logistic_beta.set_index('Variable')

In [145]:
logistic_beta

Unnamed: 0_level_0,Logistic Beta
Variable,Unnamed: 1_level_1
SymP6,-20.913966
NWNorm5.1,-37.830343
RitP6,2.22258
Dep3.3Econ,-22.565537
Int2.4Hist,-16.34177
ID1.1Over,-16.390198
NWVal4.1,13.295536
Int2.2Econ,-1.354526
Dep3.2Ecol,-15.179805
CM8.5Peace,0.927949


### Logistic Odds Ratio

In [146]:
odd_ratios = pd.DataFrame({'Variable' : X.columns, "Logistic Odds Ratio" : get_odds_ratios(logreg.coef_)})
odd_ratios = odd_ratios.set_index('Variable')

In [147]:
odd_ratios

Unnamed: 0_level_0,Logistic Odds Ratio
Variable,Unnamed: 1_level_1
SymP6,8.263805e-10
NWNorm5.1,3.7195520000000005e-17
RitP6,9.231117
Dep3.3Econ,1.584572e-10
Int2.4Hist,7.995755e-08
ID1.1Over,7.617762e-08
NWVal4.1,594535.6
Int2.2Econ,0.2580697
Dep3.2Ecol,2.55561e-07
CM8.5Peace,2.529316


## P-values

In [148]:
logreg_pvalues = pd.DataFrame({'Variable' : X.columns, "Logistic Sig Level" : logit_pvalue(logreg, X)[1:]})
logreg_pvalues = logreg_pvalues.set_index('Variable')

  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = (x >= np.asarray(_b)) & cond0


In [149]:
logreg_pvalues

Unnamed: 0_level_0,Logistic Sig Level
Variable,Unnamed: 1_level_1
SymP6,
NWNorm5.1,1.0
RitP6,
Dep3.3Econ,
Int2.4Hist,
ID1.1Over,
NWVal4.1,
Int2.2Econ,
Dep3.2Ecol,
CM8.5Peace,


## Summary

In [150]:
logit = sm.Logit(y.replace(2, 0), sm.add_constant(X)).fit(disp=0)

  return ptp(axis=axis, out=out, **kwargs)
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


LinAlgError: Singular matrix

In [None]:
logit.summary()

# Dividing to Peaceful and Non Peaceful

In [None]:
peaceful = non_war_dataset[non_war_dataset.PSys == 1]
peaceful.drop('PSys', axis=1, inplace=True)

non_peaceful = non_war_dataset[non_war_dataset.PSys == 2]
non_peaceful.drop('PSys', axis=1, inplace=True)

In [None]:
peaceful

# T-Test

In [None]:
def compute_ttest(peaceful, non_peaceful):
    variables = peaceful.columns
    res_df = pd.DataFrame(columns=['feature', 't_statistic', 'p_value'])
    
    for var in variables:
        x = peaceful[var]
        y = non_peaceful[var]
        ttest = stats.ttest_ind(x, y)
        temp = pd.DataFrame({'feature' : [var] , 
                             't_statistic' : ttest.statistic, 
                             'p_value' : [ttest.pvalue]})
        res_df = res_df.append(temp)
        
    res_df.set_index('feature', inplace=True)
    return res_df

In [None]:
ttest = compute_ttest(peaceful, non_peaceful)

In [None]:
ttest

# Mann-Whitney U-Test

In [None]:
def compute_mannwhitneyu(peaceful, non_peaceful):
    variables = peaceful.columns
    res_df = pd.DataFrame(columns=['feature', 'statistic', 'p_value'])
    
    for var in variables:
        x = peaceful[var]
        y = non_peaceful[var]
        utest = stats.mannwhitneyu(x, y)
        temp = pd.DataFrame({'feature' : [var] , 
                             'statistic' : utest.statistic, 
                             'p_value' : [utest.pvalue]})
        res_df = res_df.append(temp)
        
    res_df.set_index('feature', inplace=True)
    return res_df

In [None]:
mannwhitneyu = compute_mannwhitneyu(peaceful, non_peaceful)

In [None]:
mannwhitneyu