In [27]:
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge, RidgeCV, ElasticNet, ElasticNetCV, MultiTaskElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


In [6]:
acled_df = pd.read_csv('../data/acled/south_sudan.csv') 
acled_df = acled_df.drop(['SOURCE', 'SOURCE_SCALE', 'TAGS', 'EVENT_DATE', 'YEAR', 'GEO_PRECISION'], axis=1)

data = acled_df
X = data.drop('FATALITIES', axis=1)
y = data['FATALITIES']


In [10]:
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a column transformer to apply different preprocessing to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X = preprocessor.fit_transform(X)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = RidgeCV(alphas=np.arange(0.01, 1, 0.1), cv=cv, scoring='neg_mean_absolute_error')

model.fit(X_train, y_train)

print(model.alpha_)

0.01


In [25]:
y_pred_train = model.predict(X_train)
mae = mean_absolute_error(y_train, y_pred_train)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.02322991032613606


In [26]:

y_pred = model.predict(X_test)
y_pred_zero = np.where(model.predict(X_test) < 0, 0, model.predict(X_test))

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2}")

Mean Absolute Error: 5.482602426749992
Mean Squared Error: 284.5993788017796
R-squared: 0.48320824865133005


array([ 4.84818091,  2.10359722, 44.82386471, ...,  7.95253243,
       -0.12371995, -1.33318915])

In [28]:

pca = PCA(n_components=2)
pca_result = pca.fit_transform(X)

# df['pca-one'] = pca_result[:,0]
# df['pca-two'] = pca_result[:,1] 

print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))


TypeError: PCA does not support sparse input. See TruncatedSVD for a possible alternative.