In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#models
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import RidgeCV
from sklearn.svm import SVR
from xgboost import XGBRegressor

#metrics
from sklearn.metrics import silhouette_score, r2_score, mean_squared_error, mean_absolute_error, accuracy_score

#additional models
from sklearn.cluster import DBSCAN
from autosklearn.regression import AutoSklearnRegressor
from autosklearn.classification import AutoSklearnClassifier
#from statsmodels.miscmodels.ordinal_model import OrderedModel

  self.re = re.compile(self.reString)


In [2]:
df = pd.read_csv("./final_data/incidents.csv")
df

Unnamed: 0,dispnum,incidnum,y_duration,year,a_country,a_rev_territory,a_rev_policy,a_rev_regime,a_rev_other,a_fatalities,...,a_hostlev,a_coalition,b_country,b_rev_territory,b_rev_policy,b_rev_regime,b_rev_other,b_fatalities,b_hiact,b_hostlev
0,3551,3551010,1.0,1993,2,0,1,0,0,0,...,2,0.0,345,0,0,0,0,0,0,1
1,3551,3551002,1404.0,1992,coalition,0,1,0,0,0,...,4,8.0,345,0,0,0,0,0,0,1
2,3551,3551001,1.0,1992,coalition,0,1,0,0,0,...,3,12.0,345,0,0,0,0,0,0,1
3,3551,3551009,185.0,1993,2,0,1,0,0,0,...,4,0.0,345,0,0,0,0,0,0,1
4,3551,3551008,1.0,1993,coalition,0,1,0,0,0,...,3,2.0,345,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4125,4723,4723002,1.0,2012,740,1,0,0,0,0,...,1,0.0,732,1,0,0,0,0,7,3
4126,4724,4724002,1.0,2013,740,0,1,0,0,0,...,3,0.0,731,0,1,0,0,0,0,1
4127,4724,4724001,2.0,2013,732,0,1,0,0,0,...,3,0.0,731,0,1,0,0,0,0,1
4128,4725,4725001,1.0,2014,732,0,1,0,0,0,...,3,0.0,731,0,1,0,0,0,0,1


In [3]:
###############################################################################################################################
#####  Data structures - lists & basics  ######################################################################################

one_hots = ['year', 'a_country', 'b_country']

###############################################################################################################################
#####  Data structures - pipelines & parameter grids  #########################################################################

PCA_SVRPipe = Pipeline([
    ('ss', StandardScaler()),
    ('pca', PCA(random_state = 42)),
    ('svr', SVR())
])

pca_grid = {
    "pca__n_components":[None, 8, 20]
}

svr_grid = {
    "svr__C":[0.1, 1, 10]
}

In [20]:
#Function definitions

def onehot(df, target):
    #Returns an augmented dataframe that has appended one-hot columns and removed the original column\
    dumdum = pd.get_dummies(df[target], prefix=target, drop_first=True)
    df = pd.concat([df, dumdum], axis=1)
    df.drop(columns=target, inplace=True)
    return df



def autohot(df, target_list):
    #returns a dataframe that has one-hot encoded the target list and appended
    for feature in target_list:
        df = onehot(df, feature)
    return df



def drop_low_freqs(df, prefix_targets):
    #drops all columns from the target list whose sum <= 2
    target_list = []
    for prefix in prefix_targets:
        targets = [feature for feature in df.columns if prefix in feature]
        target_list += targets
    
    low_freqs = [col for col in target_list if sum(df[col]) <= 3]
    
    return df.drop(columns=low_freqs)


def bounding_limits(preds, min_bound, max_bound):
    #limits the results of a regression prediction to minimum or maximum limits (such as if it is predicting time or ordinals)
    if min_bound != None:
        for i in range(len(preds)):
            if preds[i] < min_bound:
                preds[i] = min_bound
    
    if max_bound != None:
        for i in range(len(preds)):
            if preds[i] > max_bound:
                preds[i] = max_bound
    
    return preds

def autogrid(X, y, Pipe, model_grid, estimator_grid={}, min_bound=None, max_bound=None):
    #runs a grid on the pipe, based on parameters, and then returns predictions from the best model. 
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    model_grid.update(estimator_grid)
    
    Grid = GridSearchCV(Pipe, param_grid=model_grid, cv=5, verbose=0)
    Grid.fit(X_train, y_train)
    
    preds = Grid.predict(X_test)
    preds = bounding_limits(preds, min_bound, max_bound)
    
    return y_test, preds



def make_whole(array):
    #rounds an array
    int_array = array
    for i in range(len(int_array)):
        int_array[i] = int(round(int_array[i]))
    return int_array



def model_metrics(title, true, preds, mae=False, acc=False):
    #title is the name of the variable being selected.  mae or acc= True to choose metrics.  Accuracy expects an ordinal variable predicted by regression and will round before running.
    if mae==True:
        print(f'Mean Absolute Error for predicting {title}: {mean_absolute_error(y_true=true, y_pred=preds)}')

    if acc==True:
        preds = make_whole(preds)
        print(f'Precision accuracy score for {title} (after rounding):  {round(accuracy_score(y_true=true, y_pred=preds), 3) * 100}%')

---
---

In [8]:
#Data setup
prep = autohot(df, one_hots)
prep = drop_low_freqs(prep, one_hots)

---
---

## Modelling & Code 

---
---

1. Target: hiact (sides a & b)
2. Preprocessing: Drop low frequencies + PCA
3. Models: SVR

In [10]:
drop_columns = ['dispnum', 'incidnum', 'y_duration', 'a_hiact']
target = "a_hiact"

X = prep.drop(columns=drop_columns)
y = prep[target]

In [14]:
a_svr_true, a_svr_preds = autogrid(X, y, PCA_SVRPipe, model_grid=svr_grid, estimator_grid=pca_grid, min_bound=0.0, max_bound=21.0)

---
---


In [15]:
drop_columns = ['dispnum', 'incidnum', 'y_duration', 'b_hiact']
target = "b_hiact"

X = prep.drop(columns=drop_columns)
y = prep[target]

In [16]:
b_svr_true, b_svr_preds = autogrid(X, y, PCA_SVRPipe, model_grid=svr_grid, estimator_grid=pca_grid, min_bound=0.0, max_bound=21.0)

In [21]:
model_metrics("a_hiact", a_svr_true, a_svr_preds, mae=True, acc=True)
print('')
model_metrics("b_hiact", b_svr_true, b_svr_preds, mae=True, acc=True)

Mean Absolute Error for predicting a_hiact: 1.3155856727976767
Precision accuracy score for a_hiact (after rounding):  57.699999999999996%

Mean Absolute Error for predicting b_hiact: 0.9244917715392061
Precision accuracy score for b_hiact (after rounding):  75.6%


---
---

## Modelling & Code 

---
---

1. Target: hiact (sides a & b)
2. Preprocessing & Models: AutoSklearn

In [25]:
def autolearn(X, y, Machine):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    Machine.fit(X_train, y_train, dataset_name='MIDs')

    preds = Machine.predict(X_test)

    for i in range(len(preds)):
        if preds[i] < 0.0:
            preds[i] = 0.0
        if preds[i] > 21.0:
            preds[i] = 21.0

    return X_test, y_test, preds

In [26]:
AutoML = AutoSklearnRegressor(
    time_left_for_this_task=60,
    per_run_time_limit=30,
)

a_drops = ['dispnum', 'incidnum', 'y_duration', 'a_hiact']
a_target = "a_hiact"

b_drops = ['dispnum', 'incidnum', 'y_duration', 'b_hiact']
b_target = "b_hiact"

In [27]:
X_a = prep.drop(columns=a_drops)
y_a = prep[a_target]

X_b = prep.drop(columns=b_drops)
y_b = prep[b_target]

In [29]:
df_a, ya_true, a_preds = autolearn(X_a, y_a, AutoML)
df_b, yb_true, b_preds = autolearn(X_b, y_b, AutoML)

In [30]:
model_metrics("a_hiact", ya_true, a_preds, mae=True, acc=True)
print('')
model_metrics("b_hiact", yb_true, b_preds, mae=True, acc=True)

Mean Absolute Error for predicting a_hiact: 0.5674796159944032
Precision accuracy score for a_hiact (after rounding):  67.10000000000001%

Mean Absolute Error for predicting b_hiact: 0.17262684690844726
Precision accuracy score for b_hiact (after rounding):  89.60000000000001%


---
---

In [31]:
#calculate mode (for accuracy)
from scipy.stats import mode
a_mode = mode(ya_true)[0][0]
b_mode = mode(yb_true)[0][0]
a_mode_baseline = [a_mode for i in range(len(ya_true))]
b_mode_baseline = [b_mode for i in range(len(yb_true))]

#calculate mean (for MAE)
a_mean = ya_true.mean()
b_mean = yb_true.mean()
a_mean_baseline = [a_mean for i in range(len(ya_true))]
b_mean_baseline = [b_mean for i in range(len(yb_true))]

In [32]:
print(f'(as compared to mean a_hiact MAE baseline): {mean_absolute_error(y_true=ya_true, y_pred=a_mean_baseline)}')
print(f'(as compared to b_hiact MAE baseline):      {mean_absolute_error(y_true=yb_true, y_pred=b_mean_baseline)}')

print('')

print(f'(as compared to mode a_hiact accuracy baseline):        {round(accuracy_score(y_true=ya_true, y_pred=a_mode_baseline), 3) * 100}%')
print(f'(as compared to mode b_hiact accuracy baseline):        {round(accuracy_score(y_true=yb_true, y_pred=b_mode_baseline), 3) * 100}%')

(as compared to mean a_hiact MAE baseline): 5.769563738357344
(as compared to b_hiact MAE baseline):      6.3013094502895255

(as compared to mode a_hiact accuracy baseline):        20.7%
(as compared to mode b_hiact accuracy baseline):        62.9%


---
---
### Modelling to try and predict highest hostility level *by country* by incident

In [34]:
action_df = pd.read_csv('./final_data/incids_by_country.csv')

In [35]:
act_drops = ['dispnum', 'incidnum', 'action']
act_target = "action"

act_prep = autohot(action_df, ['ccode', 'year'])
act_prep = drop_low_freqs(act_prep, ['ccode', 'year'])

X_act = act_prep.drop(columns=act_drops)
y_act = act_prep[act_target]

In [36]:
df_act, y_act_true, act_preds = autolearn(X_act, y_act, AutoML)

In [37]:
#calculate mode (for accuracy)
act_mode = mode(y_act_true)[0][0]
act_mode_baseline = [act_mode for i in range(len(y_act_true))]

#calculate mean (for MAE)
act_mean = y_act_true.mean()
act_mean_baseline = [act_mean for i in range(len(y_act_true))]

In [38]:
model_metrics("a_hiact", y_act_true, act_preds, mae=True, acc=True)
print('')

print(f'(as compared to mean per-country hiact MAE baseline): {mean_absolute_error(y_true=y_act_true, y_pred=act_mean_baseline)}')
print(f'(as compared to mode action accuracy baseline):        {round(accuracy_score(y_true=y_act_true, y_pred=act_mode_baseline), 3) * 100}%')

Mean Absolute Error for predicting a_hiact: 0.43995632808354934
Precision accuracy score for a_hiact (after rounding):  74.8%

(as compared to mean per-country hiact MAE baseline): 6.265370568073271
(as compared to mode action accuracy baseline):        39.4%


---
---

In [39]:
subdf = action_df[action_df['side_a']==0]
print("avg no. of allies on side b:", subdf['allies'].mean())
print("total number of side b states:", len(subdf))

avg no. of allies on side b: 0.015971606033717833
total number of side b states: 4508


In [40]:
subdf = action_df[action_df['side_a']==1]
print("avg no. of allies on side a:", subdf['allies'].mean())
print("total number of side a states:", len(subdf))

avg no. of allies on side a: 1.4474662492662884
total number of side a states: 5111


---
---

### Strict modelling:  less data

In [41]:
a_drops = ['dispnum', 'incidnum', 'y_duration', 'a_hiact', 'a_hostlev']
a_target = "a_hiact"

b_drops = ['dispnum', 'incidnum', 'y_duration', 'b_hiact', 'b_hostlev']
b_target = "b_hiact"

##################################################################################

X_a_strict = prep.drop(columns=a_drops)
y_a_strict = prep[a_target]

X_b_strict = prep.drop(columns=b_drops)
y_b_strict = prep[b_target]

##################################################################################

df_a_strict, ya_true_strict, a_preds_strict = autolearn(X_a_strict, y_a_strict, AutoML)
df_b_strict, yb_true_strict, b_preds_strict = autolearn(X_b_strict, y_b_strict, AutoML)

In [42]:
model_metrics("a_hiact", ya_true_strict, a_preds_strict, mae=True, acc=True)
print('')
model_metrics("b_hiact", yb_true_strict, b_preds_strict, mae=True, acc=True)

Mean Absolute Error for predicting a_hiact: 1.7131220556271898
Precision accuracy score for a_hiact (after rounding):  42.4%

Mean Absolute Error for predicting b_hiact: 0.6751223967335624
Precision accuracy score for b_hiact (after rounding):  82.3%


---
---

### Strictest modelling:  no information about state disposition or outcome

In [43]:
a_drops = ['dispnum', 'incidnum', 'y_duration', 'a_hiact', 'a_hostlev', 'b_hiact', 'b_hostlev']
a_target = "a_hiact"

b_drops = ['dispnum', 'incidnum', 'y_duration', 'a_hiact', 'a_hostlev', 'b_hiact', 'b_hostlev']
b_target = "b_hiact"

##################################################################################

X_a_strict2 = prep.drop(columns=a_drops)
y_a_strict2 = prep[a_target]

X_b_strict2 = prep.drop(columns=b_drops)
y_b_strict2 = prep[b_target]

##################################################################################

df_a_strict2, ya_true_strict2, a_preds_strict2 = autolearn(X_a_strict2, y_a_strict2, AutoML)
df_b_strict2, yb_true_strict2, b_preds_strict2 = autolearn(X_b_strict2, y_b_strict2, AutoML)

In [44]:
model_metrics("a_hiact", ya_true_strict2, a_preds_strict2, mae=True, acc=True)
print('')
model_metrics("b_hiact", yb_true_strict2, b_preds_strict2, mae=True, acc=True)

Mean Absolute Error for predicting a_hiact: 3.2020648885353777
Precision accuracy score for a_hiact (after rounding):  18.7%

Mean Absolute Error for predicting b_hiact: 2.551986944895156
Precision accuracy score for b_hiact (after rounding):  42.5%


---
---

### Add'l Grids (for the project)

In [45]:
###############################################################################################################################
#####  Data structures - pipelines ############################################################################################

PCA_RidgePipe = Pipeline([
    ('ss', StandardScaler()),
    ('pca', PCA(random_state = 42)),
    ('rdg', RidgeCV())
])

PCA_XGBPipe = Pipeline([
    ('ss', StandardScaler()),
    ('pca', PCA(random_state = 42)),
    ('xgb', XGBRegressor())
])


###############################################################################################################################
#####  Data structures - parameter grids  #####################################################################################

pca_grid = {
    "pca__n_components":[None, 8, 20]
}

ridge_grid = {
#    "rdg__alphas":np.logspace(0.1, 1, 10),
    "rdg__alpha_per_target":[False, True]
}

xgb_grid = {
    "xgb__gamma":[0, 0.1, 1, 5],
#    "xgb__max_depth":[5, 6, 8]
}

---

In [47]:
drop_columns = ['dispnum', 'incidnum', 'y_duration', 'a_hiact']
target = "a_hiact"

X = prep.drop(columns=drop_columns)
y = prep[target]

a_rdg_true, a_rdg_preds = autogrid(X, y, PCA_RidgePipe, model_grid=ridge_grid, estimator_grid=pca_grid, min_bound=0.0, max_bound=21.0)

In [48]:
drop_columns = ['dispnum', 'incidnum', 'y_duration', 'b_hiact']
target = "b_hiact"

X = prep.drop(columns=drop_columns)
y = prep[target]

b_rdg_true, b_rdg_preds = autogrid(X, y, PCA_RidgePipe, model_grid=ridge_grid, estimator_grid=pca_grid, min_bound=0.0, max_bound=21.0)

In [49]:
a_rdg_preds = make_whole(a_rdg_preds)
b_rdg_preds = make_whole(b_rdg_preds)

print(f'MAE a_hiact:  {mean_absolute_error(y_true=a_rdg_true, y_pred=a_rdg_preds)}')
print(f'ACC a_hiact:  {round(accuracy_score(y_true=a_rdg_true, y_pred=a_rdg_preds), 3)*100}%')
print(f'MAE b_hiact:  {mean_absolute_error(y_true=b_rdg_true, y_pred=b_rdg_preds)}')
print(f'ACC b_hiact:  {round(accuracy_score(y_true=b_rdg_true, y_pred=b_rdg_preds), 3)*100}%')

MAE a_hiact:  0.8363988383349468
ACC a_hiact:  41.9%
MAE b_hiact:  0.3707647628267183
ACC b_hiact:  74.2%


---

In [50]:
drop_columns = ['dispnum', 'incidnum', 'y_duration', 'a_hiact']
target = "a_hiact"

X = prep.drop(columns=drop_columns)
y = prep[target]

a_xgb_true, a_xgb_preds = autogrid(X, y, PCA_XGBPipe, model_grid=xgb_grid, estimator_grid=pca_grid, min_bound=0.0, max_bound=21.0)

In [51]:
drop_columns = ['dispnum', 'incidnum', 'y_duration', 'b_hiact']
target = "b_hiact"

X = prep.drop(columns=drop_columns)
y = prep[target]

b_xgb_true, b_xgb_preds = autogrid(X, y, PCA_XGBPipe, model_grid=xgb_grid, estimator_grid=pca_grid, min_bound=0.0, max_bound=21.0)

In [52]:
a_xgb_preds = make_whole(a_xgb_preds)
b_xgb_preds = make_whole(b_xgb_preds)

print(f'MAE a_hiact:  {mean_absolute_error(y_true=a_xgb_true, y_pred=a_xgb_preds)}')
print(f'ACC a_hiact:  {round(accuracy_score(y_true=a_xgb_true, y_pred=a_xgb_preds), 3)*100}%')
print(f'MAE b_hiact:  {mean_absolute_error(y_true=b_xgb_true, y_pred=b_xgb_preds)}')
print(f'ACC b_hiact:  {round(accuracy_score(y_true=b_xgb_true, y_pred=b_xgb_preds), 3)*100}%')

MAE a_hiact:  1.2032913843175217
ACC a_hiact:  51.7%
MAE b_hiact:  0.9912875121006777
ACC b_hiact:  70.1%


---
---

### DBScan Modelling

In [53]:
df.columns

Index(['dispnum', 'incidnum', 'y_duration', 'year', 'a_country',
       'a_rev_territory', 'a_rev_policy', 'a_rev_regime', 'a_rev_other',
       'a_fatalities', 'a_hiact', 'a_hostlev', 'a_coalition', 'b_country',
       'b_rev_territory', 'b_rev_policy', 'b_rev_regime', 'b_rev_other',
       'b_fatalities', 'b_hiact', 'b_hostlev'],
      dtype='object')

In [54]:
dbscan_drops = ['dispnum', 'incidnum', 'a_country', 'b_country',]


X_scaled = StandardScaler().fit_transform(df.drop(columns=dbscan_drops))

dbscan = DBSCAN(eps=3.2, min_samples=4)
dbscan.fit(X_scaled)
set(dbscan.labels_)

{-1, 0, 1, 2, 3, 4, 5, 6, 7}

In [55]:
silhouette_score(X_scaled, dbscan.labels_)

0.47900503487897406

In [56]:
df['cluster'] = dbscan.labels_
df['cluster'].value_counts()

 0    3913
 2      76
 4      45
-1      31
 3      27
 5      15
 1      15
 7       4
 6       4
Name: cluster, dtype: int64

In [57]:
df.groupby('cluster').mean().T[[-1, 0, 1, 2, 3, 4, 5, 6]]

cluster,-1,0,1,2,3,4,5,6
dispnum,4399.903,4362.677,4048.667,4148.039,4119.852,4275.444,4242.2,4479.0
incidnum,4399928.0,4362730.0,4048672.0,4148047.0,4119865.0,4275454.0,4242211.0,4479065.75
y_duration,266.2581,3.449783,1.266667,1.75,4.703704,6.533333,3.533333,107.0
year,2004.258,2004.087,1996.6,1997.158,1997.185,2001.111,1997.267,2005.75
a_rev_territory,0.06451613,0.2811142,0.0,0.01315789,0.0,0.0,0.0,0.0
a_rev_policy,0.483871,0.5849732,0.0,0.1184211,0.0,0.2888889,0.0,0.5
a_rev_regime,0.2903226,0.0,0.0,0.0,0.0,1.0,1.0,1.0
a_rev_other,0.06451613,0.0,1.0,1.0,0.0,0.0,0.0,0.0
a_fatalities,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
a_hiact,12.32258,9.674163,17.0,11.15789,0.0,10.88889,17.0,9.25
