In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#models
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.svm import SVR
from xgboost import XGBRegressor

#post-modelling metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [3]:
# Load the incident-level dataset used by every cell below.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks
# like a saved index — confirm whether it should be read with index_col=0.
incidents_path = "../final_data/incidents.csv"
df = pd.read_csv(incidents_path)
df

Unnamed: 0,dispnum,incidnum,y_duration,year,a_country,a_rev_territory,a_rev_policy,a_rev_regime,a_rev_other,a_fatalities,...,a_hostlev,a_coalition,b_country,b_rev_territory,b_rev_policy,b_rev_regime,b_rev_other,b_fatalities,b_hiact,b_hostlev
0,3551,3551010,1.0,1993,2,0,1,0,0,0,...,2,0.0,345,0,0,0,0,0,0,1
1,3551,3551002,1404.0,1992,coalition,0,1,0,0,0,...,4,8.0,345,0,0,0,0,0,0,1
2,3551,3551001,1.0,1992,coalition,0,1,0,0,0,...,3,12.0,345,0,0,0,0,0,0,1
3,3551,3551009,185.0,1993,2,0,1,0,0,0,...,4,0.0,345,0,0,0,0,0,0,1
4,3551,3551008,1.0,1993,coalition,0,1,0,0,0,...,3,2.0,345,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4125,4723,4723002,1.0,2012,740,1,0,0,0,0,...,1,0.0,732,1,0,0,0,0,7,3
4126,4724,4724002,1.0,2013,740,0,1,0,0,0,...,3,0.0,731,0,1,0,0,0,0,1
4127,4724,4724001,2.0,2013,732,0,1,0,0,0,...,3,0.0,731,0,1,0,0,0,0,1
4128,4725,4725001,1.0,2014,732,0,1,0,0,0,...,3,0.0,731,0,1,0,0,0,0,1


In [4]:
# Region columns; not referenced by any of the visible cells.
mystery_meat = [
    'a_georegion',
    'a_poliregion',
    'b_georegion',
    'b_poliregion',
]

# Categorical columns that get expanded into dummy (one-hot) columns.
one_hots = [
    'year',
    'a_country',
    'b_country',
]

# Columns used as model inputs as-is (before the dummy columns are added).
features = [
    # side A
    'a_rev_territory',
    'a_rev_policy',
    'a_rev_regime',
    'a_rev_other',
    'a_fatalities',
    'a_hiact',
    'a_hostlev',
    'a_coalition',
    # side B
    'b_rev_territory',
    'b_rev_policy',
    'b_rev_regime',
    'b_rev_other',
    'b_fatalities',
    'b_hiact',
    'b_hostlev',
]

# Regression target.
target = "y_duration"

In [5]:
def onehot(df, hotlist):
    """Return ``df`` with dummy columns appended for every column in ``hotlist``.

    Each listed column is expanded with ``pd.get_dummies`` using the column
    name as prefix and ``drop_first=True`` (the first level gets no column).
    The source columns are kept, and the frame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``hotlist``.
    hotlist : iterable of str
        Names of the columns to expand.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the dummy columns, in ``hotlist`` order.
    """
    encoded = [
        pd.get_dummies(df[name], prefix=f'{name}', drop_first=True)
        for name in hotlist
    ]
    if not encoded:
        # Nothing to expand: hand the frame back untouched.
        return df
    # Single column-wise concat of the original frame and all dummy frames.
    return pd.concat([df, *encoded], axis=1)



def append_hot_features(featlist, df, hotlist):
    """Return ``featlist`` extended with the one-hot columns found in ``df``.

    A column of ``df`` is selected when its name equals one of the names in
    ``hotlist`` or starts with ``"<name>_"`` (the prefix ``pd.get_dummies``
    produces). Columns are added in ``df.columns`` order.

    Compared with a plain substring test this avoids picking up unrelated
    columns (``"year"`` no longer matches ``"years_active"``) and never adds
    the same column twice. The input list is left untouched and names already
    present in it are skipped, so calling the function again on its own
    output is a no-op.

    Note that the raw categorical column itself (e.g. ``"year"``) is still
    included when present in ``df``; callers that only want the dummy columns
    must drop it afterwards.

    Parameters
    ----------
    featlist : list
        Feature names selected so far.
    df : pandas.DataFrame
        Frame whose columns are scanned.
    hotlist : iterable of str
        Names of the categorical columns that were one-hot encoded.

    Returns
    -------
    list
        New list: ``featlist`` followed by the newly matched column names.
    """
    hot_names = list(hotlist)
    selected = list(featlist)  # copy so the caller's list is not mutated
    already = set(selected)
    for column in df.columns:
        if column in already:
            continue
        label = str(column)
        if any(label == hot or label.startswith(f'{hot}_') for hot in hot_names):
            selected.append(column)
            already.add(column)
    return selected

In [6]:
# Expand the categorical columns listed in one_hots into dummy columns.
hot_df = onehot(df=df, hotlist=one_hots)

In [7]:
# Add the dummy column names produced above to the model feature list.
features = append_hot_features(featlist=features, df=hot_df, hotlist=one_hots)

In [11]:
# Drop the raw categorical columns (year, a_country, b_country) from the
# feature list; only their dummy columns should reach the models.
# Written as a filter instead of list.remove() so the cell can be re-run
# without raising ValueError once the names are already gone.
features = [name for name in features if name not in one_hots]

In [12]:
# Design matrix and regression target.
X = hot_df[features]
y = hot_df[target]

# Fixed seed so the train/test partition, and therefore every metric below,
# is identical on each Restart & Run All. The split ratio is left at the
# scikit-learn default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [13]:
# Summary of the training design matrix: row/column counts, dtypes, memory use.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3097 entries, 2 to 2018
Columns: 285 entries, a_rev_territory to b_country_coalition
dtypes: float64(1), int64(14), uint8(270)
memory usage: 1.2 MB


---
---

In [14]:
# Baseline: predict the median of the training target for every row.
# The scaler step only mirrors the layout of the other model cells.
DummyPipe = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('dr', DummyRegressor(strategy="median")),
])

# Empty grid: GridSearchCV cross-validates the single default configuration
# and refits it on the full training split.
params = {}
DummyGrid = GridSearchCV(estimator=DummyPipe, param_grid=params, cv=5, verbose=0)
DummyGrid.fit(X_train, y_train)

# Mean absolute error of the baseline on the held-out split.
dummy_preds = DummyGrid.predict(X_test)
mean_absolute_error(y_test, dummy_preds)

3.655372700871249

---
---

In [15]:
# Ordinary least squares on standardised features.
# NOTE(review): the recorded MAE for this cell is ~3.1e12, far worse than the
# median baseline. Unregularised OLS on the wide one-hot matrix looks
# numerically unstable, and only the lower bound of the predictions is
# clipped — confirm before relying on this model.
LinearPipe = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('lr', LinearRegression()),
])

# Empty grid: a single default configuration, cross-validated and refit.
params = {}
LinearGrid = GridSearchCV(estimator=LinearPipe, param_grid=params, cv=5, verbose=0)
LinearGrid.fit(X_train, y_train)

preds = LinearGrid.predict(X_test)

# Floor predictions at 1.0, in place (presumably the shortest possible
# duration — confirm against the data dictionary).
preds[preds < 1.0] = 1.0

mean_absolute_error(y_test, preds)

3104519673238.5977

---
---

In [16]:
# Support-vector regression with default hyper-parameters on standardised features.
SVPipe = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('svr', SVR()),
])

# Empty grid: a single default configuration, cross-validated and refit.
params = {}
SVGrid = GridSearchCV(estimator=SVPipe, param_grid=params, cv=5, verbose=0)
SVGrid.fit(X_train, y_train)

preds = SVGrid.predict(X_test)

# Floor predictions at 1.0, in place (presumably the shortest possible
# duration — confirm against the data dictionary).
preds[preds < 1.0] = 1.0

mean_absolute_error(y_test, preds)

3.7034486945273715

---
---

In [17]:
# Random forest with default hyper-parameters.
# random_state is fixed because the forest's bootstrap sampling and feature
# selection are random; without a seed the reported error changes on every
# run of the notebook.
ForestPipe = Pipeline([
    ('ss', StandardScaler()),
    ('rf', RandomForestRegressor(random_state=42))
])

# Empty grid: a single default configuration, cross-validated and refit.
params = {}

ForestGrid = GridSearchCV(ForestPipe, param_grid=params, cv=5, verbose=0)

ForestGrid.fit(X_train, y_train)

preds = ForestGrid.predict(X_test)

# Floor predictions at 1.0, in place (vectorised replacement for the
# element-by-element loop; presumably 1 is the shortest possible duration —
# confirm against the data dictionary).
preds[preds < 1.0] = 1.0

mean_absolute_error(y_true=y_test, y_pred=preds)

6.171048136836309

---
---

In [18]:
# Gradient-boosted trees (XGBoost) with default hyper-parameters.
XGBPipe = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('xgb', XGBRegressor()),
])

# Empty grid: a single default configuration, cross-validated and refit.
params = {}
XGBGrid = GridSearchCV(estimator=XGBPipe, param_grid=params, cv=5, verbose=0)
XGBGrid.fit(X_train, y_train)

preds = XGBGrid.predict(X_test)

# Floor predictions at 1.0, in place (presumably the shortest possible
# duration — confirm against the data dictionary).
preds[preds < 1.0] = 1.0

mean_absolute_error(y_test, preds)

6.034665327450659

---
---

In [19]:
# L1-regularised linear model; LassoCV chooses its own alpha by internal
# cross-validation, so the outer grid stays empty.
LassoPipe = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('ls', LassoCV()),
])

# Empty grid: a single default configuration, cross-validated and refit.
params = {}
LassoGrid = GridSearchCV(estimator=LassoPipe, param_grid=params, cv=5, verbose=0)
LassoGrid.fit(X_train, y_train)

preds = LassoGrid.predict(X_test)

# Floor predictions at 1.0, in place (presumably the shortest possible
# duration — confirm against the data dictionary).
preds[preds < 1.0] = 1.0

mean_absolute_error(y_test, preds)

6.577802054225902

---
---

In [20]:
# L2-regularised linear model; RidgeCV chooses its own alpha internally, so
# the outer grid stays empty.
# NOTE(review): the step key 'ls' is the same one the Lasso pipeline uses and
# looks like a copy-paste leftover. It is kept unchanged so anything that
# addresses the step by name keeps working — consider renaming it.
RidgePipe = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('ls', RidgeCV()),
])

# Empty grid: a single default configuration, cross-validated and refit.
params = {}
RidgeGrid = GridSearchCV(estimator=RidgePipe, param_grid=params, cv=5, verbose=0)
RidgeGrid.fit(X_train, y_train)

preds = RidgeGrid.predict(X_test)

# Floor predictions at 1.0, in place (presumably the shortest possible
# duration — confirm against the data dictionary).
preds[preds < 1.0] = 1.0

mean_absolute_error(y_test, preds)

7.651810768151471