In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

df = pd.read_csv('cleaned.csv')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122376 entries, 0 to 122375
Data columns (total 24 columns):
Unnamed: 0     122376 non-null int64
iyear          122376 non-null int64
imonth         122376 non-null int64
region         122376 non-null int64
crit1          122376 non-null int64
crit2          122376 non-null int64
crit3          122376 non-null int64
success        122376 non-null int64
suicide        122376 non-null int64
attacktype1    122376 non-null float64
attacktype2    3533 non-null float64
attacktype3    214 non-null float64
targtype1      122376 non-null float64
targtype2      6685 non-null float64
targtype3      703 non-null float64
individual     122376 non-null int64
weaptype1      122376 non-null float64
weaptype2      8547 non-null float64
weaptype3      1127 non-null float64
weaptype4      63 non-null float64
nkill          122376 non-null float64
nwound         122376 non-null float64
property       122376 non-null float64
propextent     37549 non-null 

In [80]:
explanatory_vars = ['region','crit1','crit2','crit3','suicide','attacktype1','targtype1','individual','weaptype1']

Xtrain, Xtest, ytrain, ytest = train_test_split(df[explanatory_vars].values, df['success'], random_state = 42, test_size = 0.2)

Xtraining, Xholdout, ytraining, yholdout = train_test_split(Xtrain, ytrain, random_state = 42, test_size = 0.2)

print(type(Xtrain), type(ytrain))

<class 'numpy.ndarray'> <class 'pandas.core.series.Series'>


In [81]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def cv_score(clf, x, y, score_func=accuracy_score):
    result = 0
    nfold = 5
    for train, test in KFold(nfold, random_state = 42).split(x): # split data into train/test groups, 5 times
        #print("hello")
        #print(x[train])
        #print(y[train])
        clf.fit(x[train], y.iloc[train]) # fit
        result += score_func(clf.predict(x[test]), y.iloc[test]) # evaluate score function on held-out data
    return result / nfold # average

In [82]:
Log_clf = LogisticRegression()

#ytrain = ytrain.reset_index()

print(cv_score(Log_clf, Xtrain, ytrain))

0.899305413687


In [73]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

Log_clf.fit(Xtraining, ytraining)
print(confusion_matrix(yholdout, Log_clf.predict(Xholdout), labels = [1,0]))
print(classification_report(yholdout, Log_clf.predict(Xholdout)))

[[17591     0]
 [ 1989     0]]
             precision    recall  f1-score   support

          0       0.00      0.00      0.00      1989
          1       0.90      1.00      0.95     17591

avg / total       0.81      0.90      0.85     19580



  'precision', 'predicted', average, warn_for)


In [22]:
print(len(Log_clf.predict(Xholdout)), np.sum(Log_clf.predict(Xholdout)))
#the classifier predicts every attack as a success

19580 19580


In [70]:
Cs = [0.001, 0.1, 1, 10, 100]

for c in Cs:
    reg_clf = LogisticRegression(C = c)
    reg_clf.fit(Xtraining, ytraining)
    print(c)
    print(confusion_matrix(yholdout, reg_clf.predict(Xholdout), labels = [1,0]))

#for every regularization parameter, still predicts everything as success

0.001
[[17591     0]
 [ 1989     0]]
0.1
[[17591     0]
 [ 1989     0]]
1
[[17591     0]
 [ 1989     0]]
10
[[17591     0]
 [ 1989     0]]
100
[[17591     0]
 [ 1989     0]]


In [72]:
from sklearn.ensemble import RandomForestClassifier


Rf_clf = RandomForestClassifier()

Rf_clf.fit(Xtraining, ytraining)

print(confusion_matrix(yholdout, Rf_clf.predict(Xholdout),labels=[1,0]))
#hey this predicts some failures!

print(f1_score(yholdout, Rf_clf.predict(Xholdout)))

[[17417   174]
 [ 1726   263]]
0.948276800784


In [83]:
print(classification_report(yholdout, Rf_clf.predict(Xholdout)))
print(type(classification_report(yholdout, Rf_clf.predict(Xholdout))))

             precision    recall  f1-score   support

          0       0.60      0.13      0.22      1989
          1       0.91      0.99      0.95     17591

avg / total       0.88      0.90      0.87     19580

<class 'str'>


In [88]:
#i'm guessing tuning to prevent overfitting the model will mean the model will predict even more successes
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

param_grid = {'max_depth':[3,5,10,20,100], 'min_impurity_decrease':[1e-7,1e-6,1e-5, 1e-4, 1e-3, 1e-2]}
scorer = make_scorer(f1_score)
Rf_clf = RandomForestClassifier()
Rf_clf_cv = GridSearchCV(Rf_clf, param_grid, cv = 5, scoring = scorer)
Rf_clf_cv.fit(Xtrain, ytrain)

print(Rf_clf_cv.best_params_)

{'max_depth': 10, 'min_impurity_decrease': 1e-06}


In [90]:
Rf_clf_tuned = RandomForestClassifier(max_depth = 10, min_impurity_decrease = 1e-6)

Rf_clf_tuned.fit(Xtrain,ytrain)
print(classification_report(ytrain, Rf_clf_tuned.predict(Xtrain)))

             precision    recall  f1-score   support

          0       0.68      0.12      0.20      9858
          1       0.91      0.99      0.95     88042

avg / total       0.89      0.91      0.87     97900

