In [80]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge, RidgeCV, ElasticNet, ElasticNetCV, MultiTaskElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE



In [133]:
acled_df = pd.read_csv('../data/acled/africa.csv') 
# acled_df = acled_df.drop(['SOURCE', 'SOURCE_SCALE', 'TAGS', 'EVENT_DATE', 'YEAR', 'GEO_PRECISION'], axis=1)

data = acled_df
X = data.drop('FATALITIES', axis=1)


# vector = np.array(data['FATALITIES'])
# y = np.where(vector > 0, np.log(vector), 0)
y = data['FATALITIES']


  acled_df = pd.read_csv('../data/acled/africa.csv')


In [134]:
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a column transformer to apply different preprocessing to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X = preprocessor.fit_transform(X)

# # SVD does not work very well
# truncatedSVD=TruncatedSVD(20)
# X = truncatedSVD.fit_transform(X)


In [144]:
variance = np.var(y, axis=0)
variance

mean = np.mean(y, axis=0)
mean

2.6818631602406953

In [145]:
variance = np.var(y, axis=0)
variance

716.9301328335149

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = RidgeCV(alphas=np.arange(0.01, 15, 1), cv=cv, scoring='neg_mean_squared_error')

model.fit(X_train, y_train)

print(model.alpha_)

14.01


In [140]:
y_pred = model.predict(X_test)
y_pred_zero = np.where(model.predict(X_test) < 0, 0, model.predict(X_test))

yp = y_pred

mae = mean_absolute_error(y_test, yp)
print("Mean Absolute Error:", mae)

mse = mean_squared_error(y_test, yp)
print(f"Mean Squared Error: {mse}")

# very low r2 for svd, means that there is very little correlation between variables 
r2 = r2_score(y_test, yp)
print(f"R-squared: {r2}")

# should not use MAPE if i have a lot of zeros
mape = mean_absolute_percentage_error(y_test, yp)
print(f"Mean Absolute Percentage Error: {mape}")

Mean Absolute Error: 4.52183153963852
Mean Squared Error: 721.2470978040858
R-squared: 0.020148526708258285
Mean Absolute Percentage Error: 9159360885341690.0


In [58]:
## PCA doesn't work, data too sparse
## trying Truncated SVD, which provides a low rank approximation for high dimensional 
## reference: https://rukshanpramoditha.medium.com/truncated-svd-for-dimensionality-reduction-in-sparse-feature-matrices-c083b4af7ddc#:~:text=Just%20like%20normal%20SVD%2C%20truncated,Truncated%20SVD%20accepts%20sparse%20matrices.

truncatedSVD=TruncatedSVD(10)
X_truncated = truncatedSVD.fit_transform(X)

# Percentage of variance explained by each of the selected components.
truncatedSVD.explained_variance_ratio_.sum()


0.5589956249665272

**raw fatality numbers**

Mean Absolute Error: 5.482602426749992
Mean Squared Error: 284.5993788017796
R-squared: 0.48320824865133005
Mean Absolute Percentage Error: 9857729264887162.0

**log percentage errors**

Mean Absolute Error: 0.6384085004082225
Mean Squared Error: 0.785817688746483
R-squared: 0.2814913864849885
Mean Absolute Percentage Error: 1470050233078672.5