# Imports

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Data preparation

Read data

In [21]:
# Load the raw dataset; the path is resolved relative to the current working directory.
data = pd.read_excel('data.xlsx')

Sort by year so that the per-group shift in the next step picks up the preceding year's values

In [22]:
# Order rows chronologically; the lag features built next rely on this ordering.
data = data.sort_values('Year')

Define the target columns for the model and add a previous-year (lagged) column for every PC (pending cases) and CC column

In [23]:
# Targets: every per-type 'CC' column; the aggregate 'CC_all' is tracked separately.
target_all = 'CC_all'
target_specific = [name for name in data.columns if name != target_all and 'CC' in name]

# Lag features: for each PC/CC column, the value from the preceding row of the same
# (Court, Municipality, Bench) group.
# NOTE(review): this equals "previous year" only if every group has exactly one row
# per consecutive year — confirm against the data.
lag_sources = [name for name in data.columns if 'CC' in name or 'PC' in name]
for column in lag_sources:
    lagged = data.groupby(['Court', 'Municipality', 'Bench'])[column].shift(1)
    data[f'{column}_prev_year'] = lagged

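Drop the Incoming Cases (IC) and current-year Pending Cases (PC) columns, which are unknown at prediction time, together with the aggregate `PC_all` / `CC_all` columns (including their previous-year versions)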

In [24]:
# Remove: all IC columns, the aggregate PC_all / CC_all columns (lagged versions
# included), and every current-year PC column. Lagged per-type PC columns are kept.
columns_to_drop = [
    col for col in data.columns
    if 'IC' in col
    or 'PC_all' in col
    or 'CC_all' in col
    or ('PC' in col and '_prev_year' not in col)
]
data = data.drop(columns=columns_to_drop)

Encode categorical columns

In [25]:
# One-hot encode the identifier columns, keeping every level (no category is dropped).
categorical_columns = ['Court', 'Municipality', 'Bench']
data = pd.get_dummies(data=data, columns=categorical_columns)

Drop 'Informatic People' variable - only 1 entry in 2017, with value 1

In [26]:
# Discard the near-empty 'Informatic People' feature.
data = data.drop('Informatic People', axis=1)

Drop Justice Officials - sum of 6 prior columns

In [27]:
# Discard the redundant 'Justice Officials' total.
data = data.drop('Justice Officials', axis=1)

In [28]:
# Keep only fully populated rows; this also removes rows whose lag columns are NaN
# because no earlier row existed for their group.
data = data.dropna()


In [29]:
# Save the processed dataset (still including 'Year' at this point) without the index column.
data.to_csv('data_processed.csv', index=False)

In [30]:
# Keep the 2023 rows aside before 'Year' is dropped from the modelling frame.
is_2023 = data['Year'] == 2023
data_2023 = data.loc[is_2023]

In [31]:
# 'Year' is not used as a model feature.
data = data.drop('Year', axis=1)

Train test split

In [33]:
# Random 80/20 split of features vs. the per-type CC targets.
features = data.drop(columns=target_specific)
targets = data[target_specific]
train_X, test_X, train_y, test_y = train_test_split(
    features, targets, test_size=0.2, random_state=42
)

Train test split for ensemble


In [34]:
# 75% of the training rows feed the base models; the remaining 25% form the
# validation set used later to fit ensemble weights and meta-models.
train_X_ensemble = train_X.sample(frac=0.75, random_state=42)
validation_X_ensemble = train_X.drop(index=train_X_ensemble.index)
train_y_ensemble = train_y.loc[train_X_ensemble.index]
validation_y_ensemble = train_y.loc[validation_X_ensemble.index]

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Carve the base-model rows into three parts selected by index label:
# 33% for linear regression, then the remainder split evenly between RF and XGBoost.
train_X_lr = train_X_ensemble.sample(frac=0.33, random_state=42)
remaining_X = train_X_ensemble.drop(index=train_X_lr.index)
train_X_rf = remaining_X.sample(frac=0.5, random_state=42)
train_X_xgb = remaining_X.drop(index=train_X_rf.index)

train_y_lr = train_y_ensemble.loc[train_X_lr.index]
train_y_rf = train_y_ensemble.loc[train_X_rf.index]
train_y_xgb = train_y_ensemble.loc[train_X_xgb.index]


Features

In [35]:
# List the one-hot-encoded identifiers once under their original names ...
for label in ('Court', 'Municipality', 'Bench'):
    print(label)

# ... followed by every feature column that is not one of their dummy columns.
for col in train_X.columns:
    if not any(key in col for key in ('Court', 'Municipality', 'Bench')):
        print(col)

Court
Municipality
Bench
Judges
Justice Secretary
Law Clerck
Auxiliar Clerck
Administrative/Technical People
Operational/Auxiliar People
CC_Civil_prev_year
CC_Criminal_prev_year
CC_labor_prev_year
CC_criminal_labor_prev_year
CC_tutelar_prev_year
CC_militar_prev_year
PC_Civil_prev_year
PC_Criminal_prev_year
PC_labor_prev_year
PC_criminal_labor_prev_year
PC_tutelar_prev_year
PC_militar_prev_year


# First models

Linear Regression

In [36]:
# Baseline multi-output linear regression; predictions are rounded to whole numbers
# before scoring.
lr = LinearRegression().fit(train_X, train_y)

predictions_lr = np.round(lr.predict(test_X))

mse_lr, mae_lr, r2_lr = (
    metric(test_y, predictions_lr)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error: {mse_lr}, Mean Absolute Error: {mae_lr}, R2 Score: {r2_lr}")

Mean Squared Error: 74590.44995759119, Mean Absolute Error: 54.07209499575911, R2 Score: 0.9299592752109356


In [74]:
# Persist the fitted linear regression model to disk.
joblib.dump(lr, 'linear_regression.joblib')

['linear_regression.joblib']

Decision Tree

In [38]:
# Baseline decision tree (seeded); predictions are rounded to whole numbers before scoring.
dt = DecisionTreeRegressor(random_state=42).fit(train_X, train_y)

predictions_dt = np.round(dt.predict(test_X))

mse_dt, mae_dt, r2_dt = (
    metric(test_y, predictions_dt)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error: {mse_dt}, Mean Absolute Error: {mae_dt}, R2 Score: {r2_dt}")

Mean Squared Error: 101205.20483460561, Mean Absolute Error: 44.749787955894824, R2 Score: 0.7953032935568632


In [39]:
# Inspect the fitted tree: feature/output counts, then its depth as the cell output.
for attribute in ('max_features_', 'n_features_in_', 'n_outputs_'):
    print(getattr(dt, attribute))
dt.tree_.max_depth

248
248
6


33

XGBoost

In [40]:
# Baseline XGBoost regressor (seeded); predictions are rounded to whole numbers before scoring.
xgb = XGBRegressor(random_state=42).fit(train_X, train_y)

predictions_xgb = np.round(xgb.predict(test_X))

mse_xgb, mae_xgb, r2_xgb = (
    metric(test_y, predictions_xgb)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error: {mse_xgb}, Mean Absolute Error: {mae_xgb}, R2 Score: {r2_xgb}")

Mean Squared Error: 112877.15625, Mean Absolute Error: 38.06340408325195, R2 Score: 0.9024385809898376


In [69]:
# Persist the XGBoost model currently bound to `xgb`; this file is reloaded later
# in the "XGBoost Linear Approximation" section.
joblib.dump(xgb, 'xgb.joblib')

['xgb.joblib']

Random Forest

In [41]:
# Baseline random forest (seeded); predictions are rounded to whole numbers before scoring.
rf = RandomForestRegressor(random_state=42).fit(train_X, train_y)

predictions_rf = np.round(rf.predict(test_X))

mse_rf, mae_rf, r2_rf = (
    metric(test_y, predictions_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error: {mse_rf}, Mean Absolute Error: {mae_rf}, R2 Score: {r2_rf}")

Mean Squared Error: 80186.38486005088, Mean Absolute Error: 35.82251908396946, R2 Score: 0.8795833922943278


In [42]:
# Inspect the fitted forest: feature/output counts, then the number of trees as the cell output.
for attribute in ('n_features_in_', 'n_outputs_'):
    print(getattr(rf, attribute))
len(rf.estimators_)

248
6


100

In [43]:
# Deepest tree in the forest (0 if the forest has no trees).
maxd = max((tree.tree_.max_depth for tree in rf.estimators_), default=0)
print(maxd)

41


Check correlations between the errors of each model

In [44]:
# Per-sample, per-target residuals (actual minus rounded prediction) for each model.
actual = test_y.values
residual_lr, residual_xgb, residual_rf = (
    actual - predicted
    for predicted in (predictions_lr, predictions_xgb, predictions_rf)
)

In [45]:
# Correlation between the three models' residuals, computed once per target and
# reused for both the per-target printout and the cross-target averages.
correlation_results = {}

for i, target in enumerate(target_specific):
    # Residuals of every model for this target, one column per model
    df_target = pd.DataFrame({
        'Linear Regression': residual_lr[:, i],
        'XGBoost': residual_xgb[:, i],
        'Random Forest': residual_rf[:, i]
    })

    correlation_results[target] = df_target.corr()

for target, corr_matrix in correlation_results.items():
    print(f"Correlation matrix for {target}:")
    print(corr_matrix)
    print("\n")


model_pairs = [
    ('Linear Regression', 'XGBoost'),
    ('Linear Regression', 'Random Forest'),
    ('XGBoost', 'Random Forest')
]

# For each model pair, collect its residual correlation on every target
# (read from the matrices above instead of rebuilding them).
avg_correlations = {
    (first, second): [
        correlation_results[target].loc[first, second]
        for target in target_specific
    ]
    for first, second in model_pairs
}


print("Average Correlations Across Targets:")
for pair, values in avg_correlations.items():
    avg_corr = np.mean(values)
    print(f"{pair}: {avg_corr:.3f}")

Correlation matrix for CC_Civil:
                   Linear Regression   XGBoost  Random Forest
Linear Regression           1.000000  0.491921       0.663481
XGBoost                     0.491921  1.000000       0.894176
Random Forest               0.663481  0.894176       1.000000


Correlation matrix for CC_Criminal:
                   Linear Regression   XGBoost  Random Forest
Linear Regression           1.000000  0.736867       0.810291
XGBoost                     0.736867  1.000000       0.775679
Random Forest               0.810291  0.775679       1.000000


Correlation matrix for CC_labor:
                   Linear Regression   XGBoost  Random Forest
Linear Regression           1.000000  0.815744       0.899546
XGBoost                     0.815744  1.000000       0.857196
Random Forest               0.899546  0.857196       1.000000


Correlation matrix for CC_criminal_labor:
                   Linear Regression   XGBoost  Random Forest
Linear Regression           1.000000  0.8495

# Hyper parameter tuning

Random Forest

In [24]:
# Define the parameter grid: 20 values of n_estimators x 20 values of max_depth
# = 400 combinations, each cross-validated 5 times, so this cell is slow.
param_grid = {'n_estimators': range(10, 201, 10), 'max_depth': range(1, 21)}

# Template estimator for the search. It gets its own name so the fitted baseline
# `rf` from the "First models" section is not replaced by an unfitted instance
# (GridSearchCV fits clones of the estimator it is given, never the instance itself).
rf_search_estimator = RandomForestRegressor(random_state=42)

# Perform Grid Search with Cross-Validation
grid_search_rf = GridSearchCV(estimator=rf_search_estimator, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_rf.fit(train_X, train_y)

# Get the best model and its hyper-parameters
best_rf = grid_search_rf.best_estimator_
best_n_estimators = grid_search_rf.best_params_['n_estimators']
best_max_depth = grid_search_rf.best_params_['max_depth']
print(f"Best number of estimators: {best_n_estimators}")
print(f"Best max depth: {best_max_depth}")

KeyboardInterrupt: 

In [21]:
# Score the tuned random forest on the test set. Predict once and reuse the result.
# Unlike the baseline cells, these predictions are not rounded.
predictions_brf = best_rf.predict(test_X)

mse_brf = mean_squared_error(test_y, predictions_brf)
mae_brf = mean_absolute_error(test_y, predictions_brf)
r2_brf = r2_score(test_y, predictions_brf)

print(f"Mean Squared Error: {mse_brf}, Mean Absolute Error: {mae_brf}, R2 Score: {r2_brf}")

Mean Squared Error: 101820.5675597646, Mean Absolute Error: 39.17148055688731, R2 Score: 0.9073725601697918


XGBoost

In [22]:
# 19 values of n_estimators x 3 values of max_depth, each cross-validated 5 times.
param_grid_xgb = {
    'n_estimators': range(10, 101, 5),
    'max_depth': [3, 5, 7],
}

# Template estimator for the search. It gets its own name so the fitted baseline
# `xgb` (the model saved to 'xgb.joblib') is not replaced by an unfitted instance;
# GridSearchCV fits clones of the estimator it is given, never the instance itself.
xgb_search_estimator = XGBRegressor(random_state=42)

grid_search_xgb = GridSearchCV(estimator=xgb_search_estimator, param_grid=param_grid_xgb, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_xgb.fit(train_X, train_y)

best_xgb = grid_search_xgb.best_estimator_
grid_search_xgb.best_params_

{'max_depth': 5, 'n_estimators': 10}

In [23]:
# Score the tuned XGBoost model on the test set. Predict once and reuse the result.
# Unlike the baseline cells, these predictions are not rounded.
predictions_bxgb = best_xgb.predict(test_X)

mse_bxgb = mean_squared_error(test_y, predictions_bxgb)
mae_bxgb = mean_absolute_error(test_y, predictions_bxgb)
r2_bxgb = r2_score(test_y, predictions_bxgb)

print(f"Mean Squared Error: {mse_bxgb}, Mean Absolute Error: {mae_bxgb}, R2 Score: {r2_bxgb}")

Mean Squared Error: 107589.7109375, Mean Absolute Error: 46.88821029663086, R2 Score: -0.8458577990531921


Same takeaway

# Ensemble

Weighted Average Ensemble

In [46]:
def ensemble_mse_target(weights, predictions_target, y_true_target):
    """
    Mean squared error of a weighted model blend for one target.

    weights: one weight per model.
    predictions_target: per-model predictions, shape (n_models, n_samples).
    y_true_target: true values, one per sample.
    Returns the MSE between y_true_target and the weighted sum of the model predictions.
    """
    # Blend the models: weights (n_models,) dotted with predictions (n_models, n_samples)
    blended = np.asarray(weights).dot(predictions_target)
    errors = y_true_target - blended
    return np.mean(np.square(errors))

def ensemble_r2_target(weights, predictions_target, y_true_target):
    """
    Negated R2 of a weighted model blend for one target.

    The sign is flipped so the value can be passed to a minimizer:
    minimizing this function maximizes R2.

    weights: one weight per model.
    predictions_target: per-model predictions, shape (n_models, n_samples).
    y_true_target: true values, one per sample.
    """
    # Blend the models: weights (n_models,) dotted with predictions (n_models, n_samples)
    blended = np.dot(weights, predictions_target)
    score = r2_score(y_true_target, blended)
    return -score

def ensemble_mae_target(weights, predictions_target, y_true_target):
    """
    Mean absolute error of a weighted model blend for one target.

    weights: one weight per model.
    predictions_target: per-model predictions, shape (n_models, n_samples).
    y_true_target: true values, one per sample.
    Returns the MAE between y_true_target and the weighted sum of the model predictions.
    """
    # Blend the models: weights (n_models,) dotted with predictions (n_models, n_samples)
    blended = np.asarray(weights).dot(predictions_target)
    absolute_errors = np.abs(y_true_target - blended)
    return np.mean(absolute_errors)

In [47]:
def fit_fold_models(make_model, X, y, splitter):
    """
    Fit one fresh model per fold of `splitter`, each on that fold's training rows only.

    make_model: zero-argument callable returning a new, unfitted estimator.
    X, y: feature and target frames with matching row order.
    splitter: object with a `split(X)` method yielding (train_idx, val_idx) positions.
    Returns the list of fitted models, one per fold.
    """
    models = []
    for train_idx, _ in splitter.split(X):
        model = make_model()
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        models.append(model)
    return models


def mean_prediction(models, X):
    """Average the predictions of `models` on X, element-wise across models."""
    return np.mean([fitted.predict(X) for fitted in models], axis=0)


def print_metrics(y_true, y_pred):
    """Print and return (MSE, MAE, R2) of y_pred against y_true."""
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"Mean Squared Error: {mse}, Mean Absolute Error: {mae}, R2 Score: {r2}")
    return mse, mae, r2


# Each base model family is trained on its own subset of the ensemble training rows
# and scored on the shared validation set (rounded predictions).
print('Linear Regression:')
lrs = fit_fold_models(LinearRegression, train_X_lr, train_y_lr, kf)
predictions_lre = mean_prediction(lrs, validation_X_ensemble)
predictions_lre_rounded = predictions_lre.round(0)
mse_lre, mae_lre, r2_lre = print_metrics(validation_y_ensemble, predictions_lre_rounded)

print('Random Forest:')
rfs = fit_fold_models(lambda: RandomForestRegressor(random_state=42), train_X_rf, train_y_rf, kf)
predictions_rfe = mean_prediction(rfs, validation_X_ensemble)
predictions_rfe_rounded = predictions_rfe.round(0)
mse_rfe, mae_rfe, r2_rfe = print_metrics(validation_y_ensemble, predictions_rfe_rounded)

print('XGBoost:')
xgbs = fit_fold_models(lambda: XGBRegressor(random_state=42), train_X_xgb, train_y_xgb, kf)
predictions_xgbe = mean_prediction(xgbs, validation_X_ensemble)
predictions_xgbe_rounded = predictions_xgbe.round(0)
mse_xgbe, mae_xgbe, r2_xgbe = print_metrics(validation_y_ensemble, predictions_xgbe_rounded)

Linear Regression:
Mean Squared Error: 198420.1257421544, Mean Absolute Error: 81.97688719253605, R2 Score: 0.8881033336990193
Random Forest:
Mean Squared Error: 165908.79558948262, Mean Absolute Error: 41.24300254452926, R2 Score: 0.7868392271797399
XGBoost:
Mean Squared Error: 172991.46875, Mean Absolute Error: 45.73176193237305, R2 Score: 0.7128705978393555


In [48]:
# Stack the fold-averaged test predictions of each model family along axis 0,
# in the order: linear regression, XGBoost, random forest.
test_predictions = np.array([
    np.mean([fitted.predict(test_X) for fitted in family], axis=0)
    for family in (lrs, xgbs, rfs)
])
test_predictions_rounded = np.round(test_predictions)

MSE as target

In [49]:
# Per-target blend weights that minimise validation MSE.
# Model order along axis 0: linear regression, XGBoost, random forest.
predictions = np.array([predictions_lre, predictions_xgbe, predictions_rfe])
y_val = validation_y_ensemble.values

# Dimensions are read from the stacked predictions, (n_models, n_samples, n_targets),
# rather than hard-coded; n_samples is the number of validation rows.
n_models, n_samples, n_targets = predictions.shape

optimal_weights_per_target = np.zeros((n_models, n_targets))

# Optimisation setup shared by every target: weights lie in [0, 1] and sum to 1,
# starting from equal weights.
constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
bounds = [(0, 1)] * n_models
initial_weights = np.full(n_models, 1.0 / n_models)

for i in range(n_targets):
    # Extract predictions for target i: shape becomes (n_models, n_samples)
    predictions_i = predictions[:, :, i]
    y_true_i = y_val[:, i]

    result = minimize(ensemble_mse_target, initial_weights, args=(predictions_i, y_true_i),
                      bounds=bounds, constraints=constraints)
    if not result.success:
        # Surface optimiser failures instead of silently keeping unreliable weights
        print(f"Warning: weight optimisation did not converge for target index {i}: {result.message}")

    optimal_weights_per_target[:, i] = result.x

model_names = ['Linear Regression', 'XGBoost', 'Random Forest']
print("Optimal Weights and Corresponding Models per Target:")
for i, target in enumerate(target_specific):
    print(f"{target}:")
    for weight, model in zip(optimal_weights_per_target[:, i], model_names):
        print(f"  {model}: {weight:.4f}")


Optimal Weights and Corresponding Models per Target:
CC_Civil:
  Linear Regression: 0.3659
  XGBoost: 0.4276
  Random Forest: 0.2065
CC_Criminal:
  Linear Regression: 0.5608
  XGBoost: 0.3027
  Random Forest: 0.1365
CC_labor:
  Linear Regression: 0.4654
  XGBoost: 0.0058
  Random Forest: 0.5288
CC_criminal_labor:
  Linear Regression: 0.8567
  XGBoost: 0.0360
  Random Forest: 0.1073
CC_tutelar:
  Linear Regression: 0.5048
  XGBoost: 0.2437
  Random Forest: 0.2514
CC_militar:
  Linear Regression: 1.0000
  XGBoost: 0.0000
  Random Forest: 0.0000


In [50]:
# Blend the test-set predictions target by target with the MSE-optimised weights;
# each column is the weighted sum of the models' predictions for that target.
n_test_samples = test_predictions.shape[1]  # Number of samples in the test set
ensemble_pred_separate = np.column_stack([
    np.dot(optimal_weights_per_target[:, i], test_predictions[:, :, i])
    for i in range(n_targets)
])

In [45]:
# Round the blended predictions to whole numbers, then score them on the test set.
ensemble_pred_separate = np.round(ensemble_pred_separate)
ensemble_mse, ensemble_mae, ensemble_r2 = (
    metric(test_y, ensemble_pred_separate)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error: {ensemble_mse}, Mean Absolute Error: {ensemble_mae}, R2 Score: {ensemble_r2}")

Mean Squared Error: 87949.09783989836, Mean Absolute Error: 54.04997882253283, R2 Score: 0.9045132708098724


R^2 as target

In [51]:
# Per-target blend weights that maximise validation R2 (by minimising its negation).
# Model order along axis 0: linear regression, XGBoost, random forest.
predictions = np.array([predictions_lre, predictions_xgbe, predictions_rfe])
y_val = validation_y_ensemble.values

# Dimensions are read from the stacked predictions, (n_models, n_samples, n_targets),
# rather than hard-coded; n_samples is the number of validation rows.
n_models, n_samples, n_targets = predictions.shape

optimal_weights_per_target_r2 = np.zeros((n_models, n_targets))

# Optimisation setup shared by every target: weights lie in [0, 1] and sum to 1,
# starting from equal weights.
constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
bounds = [(0, 1)] * n_models
initial_weights = np.full(n_models, 1.0 / n_models)

for i in range(n_targets):
    # Extract predictions for target i: shape becomes (n_models, n_samples)
    predictions_i = predictions[:, :, i]
    y_true_i = y_val[:, i]

    result = minimize(ensemble_r2_target, initial_weights, args=(predictions_i, y_true_i),
                      bounds=bounds, constraints=constraints)
    if not result.success:
        # Surface optimiser failures instead of silently keeping unreliable weights
        print(f"Warning: weight optimisation did not converge for target index {i}: {result.message}")

    optimal_weights_per_target_r2[:, i] = result.x

model_names = ['Linear Regression', 'XGBoost', 'Random Forest']
print("Optimal Weights and Corresponding Models per Target:")
for i, target in enumerate(target_specific):
    print(f"{target}:")
    for weight, model in zip(optimal_weights_per_target_r2[:, i], model_names):
        print(f"  {model}: {weight:.4f}")

Optimal Weights and Corresponding Models per Target:
CC_Civil:
  Linear Regression: 0.3663
  XGBoost: 0.4280
  Random Forest: 0.2057
CC_Criminal:
  Linear Regression: 0.5608
  XGBoost: 0.3027
  Random Forest: 0.1365
CC_labor:
  Linear Regression: 0.4654
  XGBoost: 0.0058
  Random Forest: 0.5288
CC_criminal_labor:
  Linear Regression: 0.8571
  XGBoost: 0.0355
  Random Forest: 0.1074
CC_tutelar:
  Linear Regression: 0.5064
  XGBoost: 0.2439
  Random Forest: 0.2497
CC_militar:
  Linear Regression: 1.0000
  XGBoost: 0.0000
  Random Forest: 0.0000


In [52]:
# Blend the test-set predictions target by target with the R2-optimised weights;
# each column is the weighted sum of the models' predictions for that target.
n_test_samples = test_predictions.shape[1]  # Number of samples in the test set
ensemble_pred_separate_r2 = np.column_stack([
    np.dot(optimal_weights_per_target_r2[:, i], test_predictions[:, :, i])
    for i in range(n_targets)
])

In [53]:
# Round the blended predictions to whole numbers, then score them on the test set.
ensemble_pred_separate_r2 = np.round(ensemble_pred_separate_r2)
ensemble_mse, ensemble_mae, ensemble_r2 = (
    metric(test_y, ensemble_pred_separate_r2)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error: {ensemble_mse}, Mean Absolute Error: {ensemble_mae}, R2 Score: {ensemble_r2}")

Mean Squared Error: 85549.26950805767, Mean Absolute Error: 48.944232400339274, R2 Score: 0.8954370560710793


MAE as target

In [54]:
# Per-target blend weights that minimise validation MAE.
# Model order along axis 0: linear regression, XGBoost, random forest.
predictions = np.array([predictions_lre, predictions_xgbe, predictions_rfe])
y_val = validation_y_ensemble.values

# Dimensions are read from the stacked predictions, (n_models, n_samples, n_targets),
# rather than hard-coded; n_samples is the number of validation rows.
n_models, n_samples, n_targets = predictions.shape

optimal_weights_per_target_mae = np.zeros((n_models, n_targets))

# Optimisation setup shared by every target: weights lie in [0, 1] and sum to 1,
# starting from equal weights.
constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
bounds = [(0, 1)] * n_models
initial_weights = np.full(n_models, 1.0 / n_models)

for i in range(n_targets):
    # Extract predictions for target i: shape becomes (n_models, n_samples)
    predictions_i = predictions[:, :, i]
    y_true_i = y_val[:, i]

    result = minimize(ensemble_mae_target, initial_weights, args=(predictions_i, y_true_i),
                      bounds=bounds, constraints=constraints)
    if not result.success:
        # MAE is not smooth, so a gradient-based optimiser may stop early; report it
        print(f"Warning: weight optimisation did not converge for target index {i}: {result.message}")

    optimal_weights_per_target_mae[:, i] = result.x

model_names = ['Linear Regression', 'XGBoost', 'Random Forest']
print("Optimal Weights and Corresponding Models per Target:")
for i, target in enumerate(target_specific):
    print(f"{target}:")
    for weight, model in zip(optimal_weights_per_target_mae[:, i], model_names):
        print(f"  {model}: {weight:.4f}")

Optimal Weights and Corresponding Models per Target:
CC_Civil:
  Linear Regression: 0.0111
  XGBoost: 0.2143
  Random Forest: 0.7746
CC_Criminal:
  Linear Regression: 0.0989
  XGBoost: 0.5991
  Random Forest: 0.3019
CC_labor:
  Linear Regression: 0.0000
  XGBoost: 0.0000
  Random Forest: 1.0000
CC_criminal_labor:
  Linear Regression: 0.0000
  XGBoost: 0.6529
  Random Forest: 0.3471
CC_tutelar:
  Linear Regression: 0.0031
  XGBoost: 0.7822
  Random Forest: 0.2147
CC_militar:
  Linear Regression: 1.0000
  XGBoost: 0.0000
  Random Forest: 0.0000


In [55]:
# Blend the test-set predictions target by target with the MAE-optimised weights;
# each column is the weighted sum of the models' predictions for that target.
n_test_samples = test_predictions.shape[1]  # Number of samples in the test set
ensemble_pred_separate_mae = np.column_stack([
    np.dot(optimal_weights_per_target_mae[:, i], test_predictions[:, :, i])
    for i in range(n_targets)
])

In [56]:
# Round the blended predictions to whole numbers, then score them on the test set.
ensemble_pred_separate_mae = np.round(ensemble_pred_separate_mae)
ensemble_mse, ensemble_mae, ensemble_r2 = (
    metric(test_y, ensemble_pred_separate_mae)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error: {ensemble_mse}, Mean Absolute Error: {ensemble_mae}, R2 Score: {ensemble_r2}")

Mean Squared Error: 156108.75254452927, Mean Absolute Error: 40.25424088210348, R2 Score: 0.8576586121294735


Meta-Learner

In [57]:
# Stacking: one Ridge meta-model per target, fed with every base model's validation
# predictions for all targets (columns ordered: linear regression, XGBoost, random forest).
meta_X = np.hstack([predictions_lre, predictions_xgbe, predictions_rfe])

# Defined here so this cell does not depend on a weighted-average cell having run first
y_val = validation_y_ensemble.values

meta_models = []
for i in range(y_val.shape[1]):  # one meta-model per target column
    y_val_i = y_val[:, i]
    meta_model = Ridge()
    meta_model.fit(meta_X, y_val_i)
    meta_models.append(meta_model)

# To predict on the test set: rebuild the same feature layout from the
# fold-averaged test predictions of each model family, in the same order.
meta_X_test = np.hstack([
    np.mean([fitted.predict(test_X) for fitted in family], axis=0)
    for family in (lrs, xgbs, rfs)
])

y_pred_meta_ensemble = np.column_stack([
    fitted_meta.predict(meta_X_test) for fitted_meta in meta_models
])

In [58]:
# Score the stacked predictions on the test set (no rounding is applied here).
meta_mse, meta_mae, meta_r2 = (
    metric(test_y, y_pred_meta_ensemble)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error: {meta_mse}, Mean Absolute Error: {meta_mae}, R2 Score: {meta_r2}")

Mean Squared Error: 91558.73700924998, Mean Absolute Error: 55.617985590518686, R2 Score: 0.849659525528371


In [34]:
# Fit ten random forests without a fixed seed and print each one's test metrics,
# showing how much the scores vary from run to run.
for _ in range(10):
    rfr = RandomForestRegressor().fit(train_X, train_y)
    predictions_rfr = np.round(rfr.predict(test_X))

    mse_rfr, mae_rfr, r2_rfr = (
        metric(test_y, predictions_rfr)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )
    print(f"Mean Squared Error: {mse_rfr}, Mean Absolute Error: {mae_rfr}, R2 Score: {r2_rfr}")

Mean Squared Error: 116259.46293943243, Mean Absolute Error: 39.79500211774671, R2 Score: 0.9051820007427479
Mean Squared Error: 103286.13002964847, Mean Absolute Error: 38.64464210080474, R2 Score: 0.9106146356425548
Mean Squared Error: 112395.8958068615, Mean Absolute Error: 39.664548919949176, R2 Score: 0.9097691849497639
Mean Squared Error: 114282.16539601861, Mean Absolute Error: 39.40724269377383, R2 Score: 0.9069180521948127
Mean Squared Error: 105342.02117746715, Mean Absolute Error: 38.56077933079204, R2 Score: 0.9101732744556772
Mean Squared Error: 106317.28759000421, Mean Absolute Error: 39.302414231257934, R2 Score: 0.9036262974177296
Mean Squared Error: 100831.59085133417, Mean Absolute Error: 38.49047013977129, R2 Score: 0.9070415002184289
Mean Squared Error: 104960.95235069886, Mean Absolute Error: 38.71812791190174, R2 Score: 0.9073506505111171
Mean Squared Error: 109341.61562897079, Mean Absolute Error: 38.71685726387125, R2 Score: 0.9065626985094976
Mean Squared Error

# More Linear Models

Ridge

In [59]:
# Ridge regression with alpha=8; predictions are rounded to whole numbers before scoring.
ridge = Ridge(alpha=8).fit(train_X, train_y)

predictions_ridge = np.round(ridge.predict(test_X))

mse_ridge, mae_ridge, r2_ridge = (
    metric(test_y, predictions_ridge)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error: {mse_ridge}, Mean Absolute Error: {mae_ridge}, R2 Score: {r2_ridge}")

Mean Squared Error: 73380.14609838846, Mean Absolute Error: 50.528201865988116, R2 Score: 0.9322181909701354


In [60]:
# Persist the Ridge model currently bound to `ridge` to disk.
joblib.dump(ridge, 'ridge.joblib')

['ridge.joblib']

In [65]:
# Export the Ridge parameters: one coefficient row per target with feature names as
# the header, and the intercepts as a single header-less column.
ridge_coefs = pd.DataFrame(ridge.coef_, columns=train_X.columns)
ridge_coefs.to_csv("ridge_coefs.csv", index=False)

ridge_intercepts = pd.Series(ridge.intercept_)
ridge_intercepts.to_csv("ridge_intercepts.csv", index=False, header=False)

In [62]:
# Display the fitted coefficient matrix (one row per target, one column per feature).
ridge.coef_

array([[ 1.39000000e+02, -2.46700546e+02,  1.21828996e+01, ...,
         1.81429945e+02, -8.24160618e+01,  1.83553667e+02],
       [-3.86849419e+00,  1.66277722e-01,  2.55719266e+00, ...,
        -1.25373922e+01,  2.63867711e+01,  8.18934644e+01],
       [-2.85420270e-01,  1.31426447e+01, -1.50482171e+00, ...,
         1.44971568e+00,  2.23058544e+00,  3.75510078e+00],
       [ 1.87330938e-01, -7.29540189e-01, -8.48531890e-02, ...,
        -9.86956239e-01, -1.11177661e+00,  5.89731450e-01],
       [ 4.63146699e-01, -2.67706791e-02,  1.17250350e+00, ...,
        -1.27717462e+01, -1.36651894e+01, -2.63664207e+00],
       [ 3.09421589e-02,  1.45514777e-02,  4.92650419e-03, ...,
        -3.67662728e-02,  4.18753837e-02, -5.00430271e-02]],
      shape=(6, 248))

In [63]:
# Display the fitted intercepts (one per target).
ridge.intercept_

array([-1.56524355e+02,  9.22888930e+00,  3.07393969e+00,  6.95010588e-01,
        5.55776228e+00, -2.74380571e-02])

Lasso

In [66]:
# Lasso with default settings; predictions are rounded to whole numbers before scoring.
lasso = Lasso().fit(train_X, train_y)

predictions_lasso = np.round(lasso.predict(test_X))

mse_lasso, mae_lasso, r2_lasso = (
    metric(test_y, predictions_lasso)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error: {mse_lasso}, Mean Absolute Error: {mae_lasso}, R2 Score: {r2_lasso}")

Mean Squared Error: 72029.77353689568, Mean Absolute Error: 48.008057675996604, R2 Score: 0.9120933094787619


Elastic Net (mix of Ridge and Lasso)

In [67]:
# Elastic Net with default settings; predictions are rounded to whole numbers before scoring.
en = ElasticNet().fit(train_X, train_y)

predictions_en = np.round(en.predict(test_X))

mse_en, mae_en, r2_en = (
    metric(test_y, predictions_en)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error: {mse_en}, Mean Absolute Error: {mae_en}, R2 Score: {r2_en}")

Mean Squared Error: 81382.8159457167, Mean Absolute Error: 46.14249363867685, R2 Score: 0.9122718619047429


Ridge alpha optimization

In [68]:
# Define the parameter grid for alpha: 0.01 up to (but excluding) 15 in steps of 0.01
param_grid_ridge = {'alpha': np.arange(0.01, 15, 0.01)}

# Template estimator for the search. It gets its own name so the fitted `ridge`
# model (already saved and exported above) is not replaced by an unfitted instance;
# GridSearchCV fits clones of the estimator it is given, never the instance itself.
ridge_search_estimator = Ridge()

# Perform Grid Search with Cross-Validation
grid_search_ridge = GridSearchCV(estimator=ridge_search_estimator, param_grid=param_grid_ridge, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_ridge.fit(train_X, train_y)

# Get the best alpha value
best_ridge = grid_search_ridge.best_estimator_
best_alpha = grid_search_ridge.best_params_['alpha']
print(f"Best alpha: {best_alpha}")

# Evaluate the best model on the test set (rounded predictions, as in the other cells)
predictions_best_ridge = best_ridge.predict(test_X)
predictions_best_ridge = predictions_best_ridge.round(0)

mse_best_ridge = mean_squared_error(test_y, predictions_best_ridge)
mae_best_ridge = mean_absolute_error(test_y, predictions_best_ridge)
r2_best_ridge = r2_score(test_y, predictions_best_ridge)

print(f"Mean Squared Error: {mse_best_ridge}, Mean Absolute Error: {mae_best_ridge}, R2 Score: {r2_best_ridge}")

Best alpha: 10.17
Mean Squared Error: 73358.20038167939, Mean Absolute Error: 49.98409669211196, R2 Score: 0.9324519894076184


# XGBoost Linear Approximation

In [70]:
# Fit linear surrogates that imitate the saved XGBoost model: each linear model is
# trained on XGBoost's own predictions rather than on the true targets.
linear_model = LinearRegression()
ridge_model = Ridge()
lasso_model = Lasso()
en_model = ElasticNet()

linear_models = [linear_model, ridge_model, lasso_model, en_model]

# Reload the model written earlier in this notebook by joblib.dump(xgb, 'xgb.joblib')
xgb = joblib.load('xgb.joblib')

xgb_Y = xgb.predict(train_X)
xgb_Y_test = xgb.predict(test_X)

for model in linear_models:

    model.fit(train_X, xgb_Y)
    # Surrogate-specific names: `test_predictions` already holds the stacked ensemble
    # predictions used by the weighted-average cells and must not be overwritten.
    surrogate_train_predictions = model.predict(train_X)
    surrogate_test_predictions = model.predict(test_X)

    # Agreement with XGBoost on the training rows
    train_mse = mean_squared_error(xgb_Y, surrogate_train_predictions)
    train_mae = mean_absolute_error(xgb_Y, surrogate_train_predictions)
    train_r2 = r2_score(xgb_Y, surrogate_train_predictions)

    # Agreement with XGBoost on the test rows
    test_mse = mean_squared_error(xgb_Y_test, surrogate_test_predictions)
    test_mae = mean_absolute_error(xgb_Y_test, surrogate_test_predictions)
    test_r2 = r2_score(xgb_Y_test, surrogate_test_predictions)

    print(f"Model: {model.__class__.__name__}\nTrain:\nMean Squared Error: {train_mse}, Mean Absolute Error: {train_mae}, R2 Score: {train_r2}")
    print(f"Test:\nMean Squared Error: {test_mse}, Mean Absolute Error: {test_mae}, R2 Score: {test_r2}\n")




Model: LinearRegression
Train:
Mean Squared Error: 108516.89344272856, Mean Absolute Error: 51.47972445958654, R2 Score: 0.9384480986302227
Test:
Mean Squared Error: 95556.38385120589, Mean Absolute Error: 52.06407352788963, R2 Score: 0.947195127269571

Model: Ridge
Train:
Mean Squared Error: 108579.84410303934, Mean Absolute Error: 50.815370375577636, R2 Score: 0.938416689353522
Test:
Mean Squared Error: 94677.04082883865, Mean Absolute Error: 51.08124179038601, R2 Score: 0.9478272208966402

Model: Lasso
Train:
Mean Squared Error: 109886.87263619372, Mean Absolute Error: 47.375098502699494, R2 Score: 0.9187974022642801
Test:
Mean Squared Error: 92647.37427532733, Mean Absolute Error: 45.63315575801159, R2 Score: 0.9283937215441126

Model: ElasticNet
Train:
Mean Squared Error: 124136.38616147464, Mean Absolute Error: 46.36644964378157, R2 Score: 0.9199475304544128
Test:
Mean Squared Error: 106389.18800733976, Mean Absolute Error: 41.39918619411195, R2 Score: 0.9301377577020761



In [72]:
# Persist the Ridge surrogate that was fitted on XGBoost's training predictions.
joblib.dump(ridge_model, 'xgb_ridge_aprox.joblib')

['xgb_ridge_aprox.joblib']

In [71]:
# Persist the linear-regression surrogate that was fitted on XGBoost's training predictions.
joblib.dump(linear_model, 'xgb_linreg_aprox.joblib')

['xgb_linreg_aprox.joblib']

In [73]:
# Score the Ridge surrogate against the true test targets (unrounded predictions).
p = ridge_model.predict(test_X)

mse, mae, r2 = (
    metric(test_y, p)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(f"Mean Squared Error: {mse}, Mean Absolute Error: {mae}, R2 Score: {r2}")

Mean Squared Error: 74026.29111514364, Mean Absolute Error: 53.16997193188106, R2 Score: 0.9306869567365544
