In [44]:
import pandas as pd
import numpy as np

from typing import List
import matplotlib.pyplot as plt
from eval_plots import plot_prediction, convert_to_datetime, shade_between

# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

In [45]:
import pmdarima as pm  # numpy==1.26.4
from typing import List
from dateutil.relativedelta import relativedelta

def __arima_feed(series, h=6):
    series = series.dropna()
    arima_model = pm.auto_arima(series, stepwise=True)
    forecast = arima_model.predict(n_periods=h)
    forecast_index = pd.date_range(
        series.index[-1] + relativedelta(months=3), periods=h, freq="QS"
    )
    forecast_series = pd.Series(forecast, index=forecast_index)
    # series = series.append(forecast_series)
    # return series
    return forecast_series

In [46]:
target_col_name = "GDPC1"

In [47]:
X_train = pd.read_csv("./data/1_art_data_prep/2017-01-01/X_train.csv", index_col=0)
X_test = pd.read_csv("./data/1_art_data_prep/2017-01-01/X_test.csv", index_col=0)
y_train = pd.read_csv("./data/1_art_data_prep/2017-01-01/y_train.csv", index_col=0)
y_test = pd.read_csv("./data/1_art_data_prep/2017-01-01/y_test.csv", index_col=0)

In [49]:
X_train = convert_to_datetime(X_train.reset_index().rename(columns={"index": "date"}), ["date"])
X_test = convert_to_datetime(X_test.reset_index().rename(columns={"index": "date"}), ["date"])
y_train = convert_to_datetime(y_train.reset_index().rename(columns={"index": "date"}), ["date"])
X_train = X_train.set_index("date")
X_test = X_test.set_index("date")
y_train = y_train.set_index("date")

X_train = X_train.sort_index()
X_test = X_test.sort_index()
y_train = y_train.sort_index()

In [62]:
oom_data = pd.read_excel("./data/US/2017-01-01.xlsx", sheet_name="other")
oom_data = convert_to_datetime(oom_data, ["ReferenceDate"])
oom_data = oom_data.pivot(index="ReferenceDate", columns="VariableCode", values="VariableValue")

exog = oom_data[['ICSA']].sort_index().pct_change().resample('MS').sum()

X_train = X_train.merge(exog, left_index=True, right_index=True, how="left")
X_test = X_test.merge(exog, left_index=True, right_index=True, how="left")


The default fill_method='pad' in DataFrame.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.



In [63]:
scaler = StandardScaler()

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)  # Fit scaler on training data
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)        # Apply the same scaler to test data

### Linear model

In [79]:
model = Ridge()

model.fit(X_train_scaled.drop(columns=["ICSA"]), y_train)

ols_predictions = model.predict(X_train_scaled.drop(columns=["ICSA"])).reshape(1, -1)[0]

train_preds = pd.DataFrame.from_dict({
    "date": X_train_scaled.index,
    "predictions": ols_predictions,
    "actuals": y_train[target_col_name].values
})
# plt.plot(train_preds.date, train_preds.actuals, label="actual") 
# plt.plot(train_preds.date, train_preds.predictions, label="predictions")
# plt.legend();
plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode = "lines+markers", legend_position=(1, 1.075))

print(model.predict(X_test_scaled.drop(columns=["ICSA"])))

[[0.70689488]]


In [86]:
from sklearn.linear_model import Ridge
from lineartree import LinearTreeRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

# model = LinearTreeRegressor(base_estimator=LinearRegression())
# model.fit(X_train_scaled, y_train)

# Define hyperparameters to tune for the underlying DecisionTree in LinearTreeRegressor
param_grid = {
    'max_depth': [3, 5, 10, 15],  # Maximum depth of the individual trees
    'min_samples_split': [6, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 3, 5],  # Minimum samples required in a leaf node
    'min_impurity_decrease': [0.0, 0.01, 0.1],  # A node will be split if this decrease in impurity is observed
}

tscv = TimeSeriesSplit(n_splits=5)
lin_tree = LinearTreeRegressor(base_estimator=Ridge())
grid_search = GridSearchCV(estimator=lin_tree, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)

grid_search.fit(X_train_scaled, y_train)

print("Best hyperparameters:", grid_search.best_params_)

model = grid_search.best_estimator_
model.fit(X_train_scaled, y_train)

train_preds = pd.DataFrame.from_dict({
    "date": X_train.index,
    "predictions": model.predict(X_train_scaled),
    "actuals": y_train[target_col_name].values
})

plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode="lines+markers", legend_position=(1, 1.075))

model.predict(X_test_scaled)[0]




120 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lineartree/lineartree.py", line 187, in fit
    self._fit(X, y, sample_weight)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lineartree/_classes.py", line 493, in _fit
    raise ValueError(
ValueError: min_samples_leaf must be an integer greate

Best hyperparameters: {'max_depth': 3, 'min_impurity_decrease': 0.01, 'min_samples_leaf': 5, 'min_samples_split': 6}


0.672461748123169

In [90]:
from sklearn.linear_model import Ridge
from lineartree import LinearForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

# model = LinearForestRegressor(base_estimator=LinearRegression(), max_features=None)
# model.fit(X_train_scaled, y_train)

param_grid = {
    # 'n_estimators': [50, 100],  # Number of trees in the forest
    'max_depth': [3, 5, 10, 15],  # Maximum depth of the individual trees
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 3, 4, 5],  # Minimum samples required in a leaf node
    'max_features': ['sqrt', 'log2'],
    'min_impurity_decrease': [1e-6, 0.01, 0.1],  # A node will be split if this decrease in impurity is observed
    'min_weight_fraction_leaf': [1e-6, 0.01, 0.1]  # Minimum fraction of weight required to be in a leaf node
}


tscv = TimeSeriesSplit(n_splits=5)
lin_forest = LinearForestRegressor(base_estimator=Ridge(), random_state=42)
grid_search = GridSearchCV(estimator=lin_forest, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)

grid_search.fit(X_train_scaled.drop(columns=["ICSA"]), y_train)

print("Best hyperparameters:", grid_search.best_params_)
model = grid_search.best_estimator_

model.fit(X_train_scaled.drop(columns=["ICSA"]), y_train)

train_preds = pd.DataFrame.from_dict({
    "date": X_train.index,
    "predictions": model.predict(X_train_scaled.drop(columns=["ICSA"])),
    "actuals": y_train[target_col_name].values
})

plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode="lines+markers", legend_position=(1, 1.075))

model.predict(X_test_scaled.drop(columns=["ICSA"]))[0]

Best hyperparameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_impurity_decrease': 1e-06, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.1}



invalid value encountered in cast



0.5657948990683761

In [91]:
from sklearn.linear_model import Ridge
from lineartree import LinearBoostRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import pandas as pd

# model = LinearBoostRegressor(base_estimator=LinearRegression())
# model.fit(X_train_scaled, y_train)

# Define hyperparameters to tune for LinearBoostRegressor
param_grid = {
    # 'n_estimators': [50, 100],  # Number of trees in the forest
    'max_depth': [3, 5, 10, 15],  # Maximum depth of the individual trees
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 3, 5],  # Minimum samples required in a leaf node
    'max_features': ['sqrt', 'log2'],
    'min_impurity_decrease': [0.0, 0.01, 0.1],  # A node will be split if this decrease in impurity is observed
    'min_weight_fraction_leaf': [0.0, 0.01, 0.1]  # Minimum fraction of weight required to be in a leaf node
}

tscv = TimeSeriesSplit(n_splits=5)
lin_boost = LinearBoostRegressor(base_estimator=Ridge(), random_state=42)
grid_search = GridSearchCV(estimator=lin_boost, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)

grid_search.fit(X_train_scaled.drop(columns=["ICSA"]), y_train)

print("Best hyperparameters:", grid_search.best_params_)
model = grid_search.best_estimator_

model.fit(X_train_scaled.drop(columns=["ICSA"]), y_train)

train_preds = pd.DataFrame.from_dict({
    "date": X_train.index,
    "predictions": model.predict(X_train_scaled.drop(columns=["ICSA"])),
    "actuals": y_train[target_col_name].values
})

plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode="lines+markers", legend_position=(1, 1.075))

model.predict(X_test_scaled.drop(columns=["ICSA"]))[0]

Best hyperparameters: {'max_depth': 3, 'max_features': 'log2', 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0}



invalid value encountered in cast



0.74329937

In [92]:
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor

param_grid = {
    # 'hidden_layer_sizes': [(50,), (100,), (100, 50), (100, 100)],  # Different layer configurations
    # 'activation': ['relu', 'tanh'],  # Activation functions
    # 'solver': ['adam', 'lbfgs'],  # Optimization algorithms
    # 'alpha': np.arange(0.001, 1, 0.01),  # L2 regularization term
    # 'learning_rate_init': [0.001, 0.01, 0.1],  # Initial learning rate
    # 'max_iter': [500, 1000]  # Maximum number of iterations
}

tscv = TimeSeriesSplit(n_splits=5)
grid_search = GridSearchCV(
    estimator=MLPRegressor(random_state=42, activation='relu', solver='adam', alpha=0.001, learning_rate='adaptive', max_iter=1000),
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1
    )
grid_search.fit(X_train_scaled, y_train)

print("Best hyperparameters:", grid_search.best_params_)

model = grid_search.best_estimator_
model.fit(X_train_scaled, y_train)

# Create predictions on the training set
train_preds = pd.DataFrame.from_dict({
    "date": X_train.index,
    "predictions": model.predict(X_train_scaled),
    "actuals": y_train[target_col_name].values
})

plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode="lines+markers", legend_position=(1, 1.075))

# Test set predictions
model.predict(X_test_scaled)[0]

Best hyperparameters: {}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



0.8979318816169372

#### Error correction

In [85]:
ols_residuals = pd.DataFrame(y_train.values.reshape(1, -1)[0]-ols_predictions, index=y_train.index, columns=['y_resid'])

ols_residuals.apply(__arima_feed, h=y_test.shape[0], axis=0)

Unnamed: 0,y_resid
2017-01-01,0.0


### Error analysis

In [21]:
from resid_analysis import plot_resid, qqplot, qqplotly

In [23]:
%%capture
residuals = ols_residuals["y_resid"]
standarized_resid = (residuals - residuals.mean()) / residuals.std()
qqplot_data = qqplot(standarized_resid, line='s').gca().lines

In [24]:
shade_between(dt=ols_residuals.index, y=ols_residuals["y_resid"], yaxis_label="", plt_title="", date_ranges=[("2020-01-01", "2021-01-01")])

In [25]:
plot_resid(predicted=ols_predictions, residuals=residuals)

In [26]:
qqplotly(qqplot_data=qqplot_data)