## DISPUTES MODELLING

Several models will be tested here on duration; the best amongst them will be used to test highest military action & for incidents modelling.
1. Remove the low-frequency occurences of country codes.  Run models: RidgeCV, XGBoost, & SVR. 
2. Run PCA on the data after country codes are one-hot encoded. Run models: RidgeCV, XGBoost, & SVR.  
3. Run DBScan to try and group the country code categories, then run models: RidgeCV, XGBoost, & SVR.


4. Take the highest-performance preprocess/model pair and run that combination on the data to predict for highest action (side a, then side b)
5. Take the highest performance preprocess and use that with ordinal models (Statsmodel + custom) for HiAct.
6. Run AutoSklearn Regressor on HiAct.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#models
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import RidgeCV
from sklearn.svm import SVR
from xgboost import XGBRegressor

#metrics
from sklearn.metrics import silhouette_score, mean_absolute_error, accuracy_score

#additional models
from sklearn.cluster import DBSCAN
from autosklearn.regression import AutoSklearnRegressor
#from statsmodels.miscmodels.ordinal_model import OrderedModel

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  self.re = re.compile(self.reString)


In [3]:
df = pd.read_csv("../final_data/disputes.csv")
df

Unnamed: 0,dispnum,y_avgdur,year,outcome,settle,recip,a_country,a_rev_territory,a_rev_policy,a_rev_regime,...,b_country,b_rev_territory,b_rev_policy,b_rev_regime,b_rev_other,b_fatalities,b_hiact,b_hostlev,b_orig,b_coalition
0,3551,1540.0,1992,1,1,1,coalition,0,1,0,...,345,0,0,0,0,0,15,4,1,0.0
1,3552,515.0,1992,5,3,0,coalition,0,0,0,...,645,0,1,0,0,0,17,4,1,0.0
2,3554,23.0,1992,5,3,0,339,0,1,0,...,345,0,0,0,0,0,0,1,1,0.0
3,3555,173.0,1992,5,3,1,344,0,0,0,...,345,1,0,0,0,0,17,4,1,0.0
4,3556,7.0,1992,6,1,1,344,1,0,0,...,346,0,0,0,0,0,17,4,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
842,4722,1.0,2011,8,3,1,740,1,0,0,...,732,1,0,0,0,0,7,3,1,0.0
843,4723,1.0,2012,8,3,1,740,1,0,0,...,732,1,0,0,0,0,7,3,1,0.0
844,4724,2.0,2013,8,3,0,coalition,0,1,0,...,731,0,1,0,0,0,0,1,1,0.0
845,4725,1.0,2014,8,3,0,732,0,1,0,...,731,0,1,0,0,0,0,1,1,0.0


In [3]:
###############################################################################################################################
#####  Data structures - lists & basics  ######################################################################################

one_hots = ['year', 'a_country', 'b_country']

###############################################################################################################################
#####  Data structures - pipelines ############################################################################################

BasicRidgePipe = Pipeline([
    ('ss', StandardScaler()),
    ('rdg', RidgeCV())
])

BasicXGBPipe = Pipeline([
    ('ss', StandardScaler()),
    ('xgb', XGBRegressor())
])

BasicSVRPipe = Pipeline([
    ('ss', StandardScaler()),
    ('svr', SVR())
])

#######################################################

PCA_RidgePipe = Pipeline([
    ('pca', PCA()),
    ('rdg', RidgeCV())
])

PCA_XGBPipe = Pipeline([
    ('pca', PCA()),
    ('xgb', XGBRegressor())
])

PCA_SVRPipe = Pipeline([
    ('pca', PCA()),
    ('svr', SVR())
])


###############################################################################################################################
#####  Data structures - parameter grids  #####################################################################################

pca_grid = {
    "pca__n_components":[None, 8, 20]
}

ridge_grid = {
#    "rdg__alphas":np.logspace(0.1, 1, 10),
    "rdg__alpha_per_target":[False, True]
}

xgb_grid = {
    "xgb__gamma":[0, 0.1, 1, 5],
#    "xgb__max_depth":[5, 6, 8]
}

svr_grid = {
    "svr__C":np.logspace(0.01, 10, 5)
}

In [4]:
#Function definitions

def onehot(df, target):
    #Returns an augmented dataframe that has appended one-hot columns and removed the original column\
    dumdum = pd.get_dummies(df[target], prefix=target, drop_first=True)
    df = pd.concat([df, dumdum], axis=1)
    df.drop(columns=target, inplace=True)
    return df



def autohot(df, target_list):
    #returns a dataframe that has one-hot encoded the target list and appended
    for feature in target_list:
        df = onehot(df, feature)
    return df



def drop_low_freqs(df, prefix_targets):
    #drops all columns from the target list whose sum <= 2
    target_list = []
    for prefix in prefix_targets:
        targets = [feature for feature in df.columns if prefix in feature]
        target_list += targets
    
    low_freqs = [col for col in target_list if sum(df[col]) <= 3]
    
    return df.drop(columns=low_freqs)


def autogrid(X, y, Pipe, model_grid, estimator_grid={}, min_bound=None, max_bound=None):
    #runs a grid on the pipe, based on parameters, and then returns predictions from the best model. 
    
    model_grid.update(estimator_grid)
    
    Grid = GridSearchCV(Pipe, param_grid=model_grid, cv=5, verbose=0)
    Grid.fit(X_train, y_train)
    preds = Grid.predict(X_test)
    
    if min_bound != None:
        for i in range(len(preds)):
            if preds[i] < min_bound:
                preds[i] = min_bound
    
    if max_bound != None:
        for i in range(len(preds)):
            if preds[i] > max_bound:
                preds[i] = max_bound
    
    return preds



def make_whole(array):
    #rounds an array
    int_array = array
    for i in range(len(int_array)):
        int_array[i] = int(round(int_array[i]))
    return int_array



def model_metrics(title, true, preds, mae=False, acc=False)
    #title is the name of the variable being selected.  mae or acc= True to choose metrics.  Accuracy expects an ordinal variable predicted by regression and will round before running.
    if mae=True:
        print(f'Mean Absolute Error for predicting {title}: {mean_absolute_error(y_true=ya_true_strict2, y_pred=a_preds_strict2)}')

    if acc=True:
        preds = make_whole(preds)
        print('')
        print(f'Precision accuracy score for {title} (after rounding):  {round(accuracy_score(y_true=ya_true_strict2, y_pred=a_preds_int_strict2), 3) * 100}%')

---
---

## Modelling & Code 

---
---

1. Target: Duration
2. Preprocessing: Drop low frequencies
3. Models: Linear regression, XGB, and SVR

In [5]:
drop_columns = ['outcome', 'settle', 'y_avgdur']
target = "y_avgdur"

In [6]:
prep = autohot(df, one_hots)
prep = drop_low_freqs(prep, one_hots)

In [7]:
X = prep.drop(columns=drop_columns)
y = prep[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
autogrid(X, y, BasicRidgePipe, model_grid=ridge_grid, min_bound=1.0)

164.92256620290092

In [11]:
autogrid(X, y, BasicSVRPipe, model_grid=svr_grid, min_bound=1.0)

143.32965411189272

In [12]:
autogrid(X, y, BasicXGBPipe, model_grid=xgb_grid, min_bound=1.0)

168.73151737395324

---
---

## Modelling & Code 

---
---

1. Target: Duration
2. Preprocessing: PCA + drop low frequencies
3. Models: Linear regression, XGB, and SVR

In [13]:
autogrid(X, y, PCA_RidgePipe, model_grid=ridge_grid, min_bound=1.0)

160.955961284223

In [21]:
autogrid(X, y, PCA_SVRPipe, model_grid=svr_grid, min_bound=1.0)

141.70929645643466

In [16]:
autogrid(X, y, PCA_XGBPipe, model_grid=xgb_grid, min_bound=1.0)

197.2388922515905

---
---

## Modelling & Code 

---
---

1. Target: Duration
2. Preprocessing: *only* PCA
3. Models: Linear regression, XGB, and SVR

In [18]:
prep2 = autohot(df, one_hots)
X = prep2.drop(columns=drop_columns)
y = prep2[target]

In [19]:
autogrid(X, y, PCA_RidgePipe, model_grid=ridge_grid, min_bound=1.0)

161.20292219418428

In [22]:
autogrid(X, y, PCA_SVRPipe, model_grid=svr_grid, min_bound=1.0)

In [20]:
autogrid(X, y, PCA_XGBPipe, model_grid=xgb_grid, min_bound=1.0)

209.3377030424352

---
---

## Modelling & Code 

---
---

1. Target: Duration
2. Preprocessing: Setting labels via DBScan
3. Models: Linear regression, XGB, and SVR

In [12]:
dbscan_drops = ['dispnum', 'outcome', 'settle', 'a_country', 'b_country']


X_scaled = StandardScaler().fit_transform(df.drop(columns=dbscan_drops))

dbscan = DBSCAN(eps=3, min_samples=4)
dbscan.fit(X_scaled)
set(dbscan.labels_)

{-1, 0, 1, 2, 3, 4, 5, 6}

In [13]:
silhouette_score(X_scaled, dbscan.labels_)

0.16219043146079573

In [14]:
df['cluster'] = dbscan.labels_

In [15]:
df['cluster'].value_counts()

 0    682
 1     84
-1     46
 3     11
 5      7
 4      7
 2      6
 6      4
Name: cluster, dtype: int64

In [16]:
df['b_hiact'].mean()

6.1676505312868946

In [17]:
df.groupby('cluster').mean().T[[-1, 0, 1, 2, 3, 4, 5, 6]]

cluster,-1,0,1,2,3,4,5,6
dispnum,4192.782609,4247.958944,4292.559524,4066.5,4145.545455,4186.571429,4281.0,4450.5
y_avgdur,641.717391,110.556452,159.529762,315.166667,151.136364,67.571429,240.714286,298.5
y_hiact,14.086957,13.294721,13.238095,16.333333,14.545455,13.142857,17.0,7.0
year,1991.543478,1994.055718,2001.714286,1994.5,1988.818182,1998.857143,2000.857143,2005.25
outcome,4.913043,5.601173,6.119048,2.333333,4.909091,5.142857,5.428571,5.75
settle,2.76087,2.903226,2.809524,2.166667,2.909091,2.857143,2.428571,3.0
recip,0.630435,0.3739,0.654762,1.0,0.0,0.0,1.0,1.0
a_georegion,,,,,,,,
a_poliregion,,,,,,,,
a_rev_territory,0.195652,0.127566,0.72619,0.0,0.0,0.0,1.0,1.0


In [51]:
prep = df.drop(columns=dbscan_drops)

X = prep.drop(columns=target)
y = prep[target]

In [52]:
autogrid(X, y, BasicRidgePipe, model_grid=ridge_grid, min_bound=1.0)

169.93161340522818

In [55]:
autogrid(X, y, BasicSVRPipe, model_grid=svr_grid, min_bound=1.0)

137.38821433168877

In [56]:
autogrid(X, y, BasicXGBPipe, model_grid=xgb_grid,estimator_grid=bound=1.0)

179.09194840575165

---
---

## Modelling & Code 

---
---

1. Target: "hiact" for sides a & b
2. Preprocessing: PCA + drop low freqs
3. Models: SVR

In [28]:
#a_hiact prediction:

latest_drops = ['dispnum', 'outcome', 'settle', 'a_hiact']
latest_target = "a_hiact"

In [29]:
prep = autohot(df, one_hots)
prep = drop_low_freqs(prep, one_hots)

In [30]:
X = prep.drop(columns=latest_drops)
y = prep[latest_target]

In [31]:
pca_grid = {
    "pca__n_components":[None, 10]
}

svr_grid = {
    "svr__C":[0.1, 1, 10]
}

autogrid(X, y, PCA_SVRPipe, model_grid=svr_grid, estimator_grid=pca_grid, min_bound=0.0, max_bound=21.0)

2.2018935885990123

---
---

In [32]:
latest_drops = ['dispnum', 'outcome', 'settle', 'b_hiact']
latest_target = "b_hiact"

In [33]:
prep = autohot(df, one_hots)
prep = drop_low_freqs(prep, one_hots)

In [34]:
X = prep.drop(columns=latest_drops)
y = prep[latest_target]

In [35]:
pca_grid = {
    "pca__n_components":[None, 10]
}

svr_grid = {
    "svr__C":[0.1, 1, 10]
}


autogrid(X, y, PCA_SVRPipe, model_grid=svr_grid, estimator_grid=pca_grid, min_bound=0.0, max_bound=21.0)

4.331207244148111