In [1]:
import pandas as pd
import numpy as np
import random
from typing import List
import matplotlib.pyplot as plt

from eval_plots import plot_prediction, convert_to_datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor

In [2]:
import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning

warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

In [3]:
target_col_name = "GDPC1"
predictions = []

In [4]:
X_train = pd.read_csv("./data/1_art_data_prep/2017-01-01/X_train.csv", index_col=0)
X_test = pd.read_csv("./data/1_art_data_prep/2017-01-01/X_test.csv", index_col=0)
y_train = pd.read_csv("./data/1_art_data_prep/2017-01-01/y_train.csv", index_col=0)
y_test = pd.read_csv("./data/1_art_data_prep/2017-01-01/y_test.csv", index_col=0)

In [5]:
X_train = convert_to_datetime(X_train.reset_index().rename(columns={"index": "date"}), ["date"])
X_test = convert_to_datetime(X_test.reset_index().rename(columns={"index": "date"}), ["date"])
y_train =convert_to_datetime(y_train.reset_index().rename(columns={"index": "date"}), ["date"])
X_train = X_train.set_index("date")
X_test = X_test.set_index("date")
y_train = y_train.set_index("date")

X_train = X_train.sort_index()
X_test = X_test.sort_index()
y_train = y_train.sort_index()

In [6]:
oom_data = pd.read_excel("./data/US/2017-01-01.xlsx", sheet_name="other")
oom_data = oom_data.pivot(index="ReferenceDate", columns="VariableCode", values="VariableValue")

exog = oom_data[['ICSA']].sort_index().pct_change().resample('MS').sum()

  exog = oom_data[['ICSA']].sort_index().pct_change().resample('MS').sum()


In [7]:
X_train = X_train.merge(exog, left_index=True, right_index=True, how="left")
X_test = X_test.merge(exog, left_index=True, right_index=True, how="left")

In [8]:
scaler = StandardScaler()

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)  # Fit scaler on training data
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)        # Apply the same scaler to test data

### Linear models

In [9]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X_train_scaled, y_train)
train_preds = pd.DataFrame.from_dict({
    "date": X_train_scaled.index,
    "predictions": model.predict(X_train_scaled).reshape(1, -1)[0],
    "actuals": y_train[target_col_name].values
})

# plt.plot(train_preds.date, train_preds.actuals, label="actual") 
# plt.plot(train_preds.date, train_preds.predictions, label="predictions")
# plt.legend();
plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode = "lines+markers", legend_position=(1, 1.075))

predictions.append(model.predict(X_test_scaled)[0][0])
print(predictions[-1])

0.0935983365874371


In [10]:
from sklearn.linear_model import Ridge

model = Ridge()

model.fit(X_train_scaled, y_train)
train_preds = pd.DataFrame.from_dict({
    "date": X_train_scaled.index,
    "predictions": model.predict(X_train_scaled).reshape(1, -1)[0],
    "actuals": y_train[target_col_name].values
})

# plt.plot(train_preds.date, train_preds.actuals, label="actual") 
# plt.plot(train_preds.date, train_preds.predictions, label="predictions")
# plt.legend();
plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode = "lines+markers", legend_position=(1, 1.075))

predictions.append(model.predict(X_test_scaled)[0][0])
print(predictions[-1])

0.5862885182117477


In [11]:
from sklearn.linear_model import BayesianRidge

model = BayesianRidge()

model.fit(X_train_scaled, y_train)

ols_predictions = model.predict(X_train_scaled).reshape(1, -1)[0]

train_preds = pd.DataFrame.from_dict({
    "date": X_train_scaled.index,
    "predictions": ols_predictions,
    "actuals": y_train[target_col_name].values
})
# plt.plot(train_preds.date, train_preds.actuals, label="actual") 
# plt.plot(train_preds.date, train_preds.predictions, label="predictions")
# plt.legend();
plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode = "lines+markers", legend_position=(1, 1.075))

print(model.predict(X_test_scaled))

[1.30342694]


In [12]:
from sklearn.linear_model import Lasso

model = Lasso()

model.fit(X_train_scaled, y_train)
train_preds = pd.DataFrame.from_dict({
    "date": X_train_scaled.index,
    "predictions": model.predict(X_train_scaled),
    "actuals": y_train[target_col_name].values
})

# plt.plot(train_preds.date, train_preds.actuals, label="actual") 
# plt.plot(train_preds.date, train_preds.predictions, label="predictions")
# plt.legend();
plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode = "lines+markers", legend_position=(1, 1.075))

predictions.append(model.predict(X_test_scaled)[0])
print(predictions[-1])

2.1595667187963814


In [13]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

# model = ElasticNet()

param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],  # Regularization strength
    'l1_ratio': [0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]  # The balance between L1 and L2 regularization (Lasso and Ridge)
}

grid_search = GridSearchCV(estimator=ElasticNet(random_state=42), param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

print("Best hyperparameters:", grid_search.best_params_)
model = grid_search.best_estimator_


model.fit(X_train_scaled, y_train)
train_preds = pd.DataFrame.from_dict({
    "date": X_train_scaled.index,
    "predictions": model.predict(X_train_scaled),
    "actuals": y_train[target_col_name].values
})

# plt.plot(train_preds.date, train_preds.actuals, label="actual") 
# plt.plot(train_preds.date, train_preds.predictions, label="predictions")
# plt.legend();
plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode = "lines+markers", legend_position=(1, 1.075))

predictions.append(model.predict(X_test_scaled)[0])
print(predictions[-1])

Best hyperparameters: {'alpha': 0.1, 'l1_ratio': 0.5}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

1.7609944828942292


### Tree-based models

In [14]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor()

model.fit(X_train, y_train)
train_preds = pd.DataFrame.from_dict({
    "date": X_train_scaled.index,
    "predictions": model.predict(X_train_scaled),
    "actuals": y_train[target_col_name].values
})

# plt.plot(train_preds.date, train_preds.actuals, label="actual") 
# plt.plot(train_preds.date, train_preds.predictions, label="predictions")
# plt.legend();
plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode = "lines+markers", legend_position=(1, 1.075))

predictions.append(model.predict(X_test_scaled)[0])
print(predictions[-1])

3.6861971002505545


In [15]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# model = RandomForestRegressor()
# model.fit(X_train_scaled, y_train)

param_grid = {
    'n_estimators': [50, 100],  # Number of trees in the forest
    'max_depth': [3, 5, 10],  # Maximum depth of the individual trees
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 3, 5],  # Minimum samples required in a leaf node
    'max_features': ['sqrt', 'log2'],
    'min_impurity_decrease': [0.0, 0.01, 0.1],  # A node will be split if this decrease in impurity is observed
    'min_weight_fraction_leaf': [0.0, 0.01, 0.1]  # Minimum fraction of weight required to be in a leaf node
}

tscv = TimeSeriesSplit(n_splits=5)

rf = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)
model = grid_search.best_estimator_

model.fit(X_train_scaled, y_train)
train_preds = pd.DataFrame.from_dict({
    "date": X_train_scaled.index,
    "predictions": model.predict(X_train_scaled),
    "actuals": y_train[target_col_name].values
})

# plt.plot(train_preds.date, train_preds.actuals, label="actual") 
# plt.plot(train_preds.date, train_preds.predictions, label="predictions")
# plt.legend();
plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode = "lines+markers", legend_position=(1, 1.075))

predictions.append(model.predict(X_test_scaled)[0])
print(predictions[-1])

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

Best hyperparameters: {'max_depth': 10, 'max_features': 'log2', 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50}


2.457981080976688


In [16]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd

# model = GradientBoostingRegressor()

# Define hyperparameters to tune for Gradient Boosting
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of boosting stages
    'max_depth': [3, 5, 10],  # Maximum depth of individual trees
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate shrinks the contribution of each tree
    'subsample': [0.8, 1.0]  # Fraction of samples used for fitting individual trees
}

# Set up time series split
tscv = TimeSeriesSplit(n_splits=5)

gb = GradientBoostingRegressor(random_state=42)

grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

print("Best hyperparameters:", grid_search.best_params_)
model = grid_search.best_estimator_

model.fit(X_train_scaled, y_train)

train_preds = pd.DataFrame.from_dict({
    "date": X_train.index,
    "predictions": model.predict(X_train_scaled),
    "actuals": y_train[target_col_name].values
})

plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode="lines+markers", legend_position=(1, 1.075))

predictions.append(model.predict(X_test_scaled)[0])
print(predictions[-1])


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = colu

Best hyperparameters: {'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 50, 'subsample': 0.8}


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


2.0599427395387524


In [17]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import pandas as pd

# model = XGBRegressor()

# Define hyperparameters to tune for XGBRegressor
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of boosting rounds (trees)
    'learning_rate': [0.01, 0.1, 0.2],  # Step size shrinkage
    'max_depth': [3, 5, 10],  # Maximum depth of the trees
    'subsample': [0.8, 1.0],  # Fraction of samples used for each tree
    'colsample_bytree': [0.8, 1.0],  # Fraction of features used for each tree
    'gamma': [0, 0.1, 0.5]  # Minimum loss reduction to make a split
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=XGBRegressor(random_state=42), param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model using grid search
grid_search.fit(X_train_scaled, y_train)

# Print best parameters
print("Best hyperparameters:", grid_search.best_params_)

# Use the best model
model = grid_search.best_estimator_

# Fit the model with the best parameters (optional, already fitted during grid search)
model.fit(X_train_scaled, y_train)

# Create predictions on the training set
train_preds = pd.DataFrame.from_dict({
    "date": X_train.index,
    "predictions": model.predict(X_train_scaled),
    "actuals": y_train[target_col_name].values
})

# Plot the predictions and actuals (use your custom plot function)
plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode="lines+markers", legend_position=(1, 1.075))

# Predict on the test set
predictions.append(model.predict(X_test_scaled)[0])
print(predictions[-1])


invalid value encountered in cast



Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}


2.5113804


In [18]:
import xgboost as xgb

# XGBoost with linear booster (ridge-like model in boosted iterations)
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
param = {'booster': 'gblinear', 'objective': 'reg:squarederror'}
bst = xgb.train(param, dtrain, num_boost_round=100)

train_preds = pd.DataFrame.from_dict({
    "date": X_train_scaled.index,
    "predictions": bst.predict(dtrain),
    "actuals": y_train[target_col_name].values
})

# # plt.plot(train_preds.date, train_preds.actuals, label="actual") 
# # plt.plot(train_preds.date, train_preds.predictions, label="predictions")
# # plt.legend();
plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode = "lines+markers", legend_position=(1, 1.075))

dtest = xgb.DMatrix(X_test)
predictions.append(bst.predict(dtest)[0])
print(predictions[-1])

-1.1236893


In [19]:
from sklearn.linear_model import Ridge
from lineartree import LinearTreeRegressor

# model = LinearTreeRegressor(base_estimator=LinearRegression())
# model.fit(X_train_scaled, y_train)

# Define hyperparameters to tune for the underlying DecisionTree in LinearTreeRegressor
param_grid = {
    'max_depth': [3, 5, 10],  # Maximum depth of the individual trees
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 3, 5],  # Minimum samples required in a leaf node
    'min_impurity_decrease': [0.0, 0.01, 0.1],  # A node will be split if this decrease in impurity is observed
}

tscv = TimeSeriesSplit(n_splits=5)
lin_tree = LinearTreeRegressor(base_estimator=Ridge())
grid_search = GridSearchCV(estimator=lin_tree, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)

grid_search.fit(X_train_scaled, y_train)

print("Best hyperparameters:", grid_search.best_params_)

model = grid_search.best_estimator_
model.fit(X_train_scaled, y_train)

train_preds = pd.DataFrame.from_dict({
    "date": X_train.index,
    "predictions": model.predict(X_train_scaled),
    "actuals": y_train[target_col_name].values
})

plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode="lines+markers", legend_position=(1, 1.075))

predictions.append(model.predict(X_test_scaled)[0])
print(predictions[-1])




315 fits failed out of a total of 405.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lineartree/lineartree.py", line 187, in fit
    self._fit(X, y, sample_weight)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lineartree/_classes.py", line 476, in _fit
    raise ValueError(
ValueError: min_samples_split must be an integer great

Best hyperparameters: {'max_depth': 3, 'min_impurity_decrease': 0.01, 'min_samples_leaf': 5, 'min_samples_split': 10}


0.672461748123169


In [20]:
from sklearn.linear_model import Ridge
from lineartree import LinearForestRegressor

# model = LinearForestRegressor(base_estimator=LinearRegression(), max_features=None)
# model.fit(X_train_scaled, y_train)

param_grid = {
    'n_estimators': [50, 100],  # Number of trees in the forest
    'max_depth': [3, 5, 10],  # Maximum depth of the individual trees
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 3, 4, 5],  # Minimum samples required in a leaf node
    'max_features': ['sqrt', 'log2'],
    'min_impurity_decrease': [0.0, 0.01, 0.1],  # A node will be split if this decrease in impurity is observed
    'min_weight_fraction_leaf': [0.0, 0.01, 0.1]  # Minimum fraction of weight required to be in a leaf node
}


tscv = TimeSeriesSplit(n_splits=5)
lin_forest = LinearForestRegressor(base_estimator=Ridge(), random_state=42)
grid_search = GridSearchCV(estimator=lin_forest, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)

grid_search.fit(X_train_scaled, y_train)

print("Best hyperparameters:", grid_search.best_params_)
model = grid_search.best_estimator_

model.fit(X_train_scaled, y_train)

train_preds = pd.DataFrame.from_dict({
    "date": X_train.index,
    "predictions": model.predict(X_train_scaled),
    "actuals": y_train[target_col_name].values
})

plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode="lines+markers", legend_position=(1, 1.075))

predictions.append(model.predict(X_test_scaled)[0])
print(predictions[-1])

Best hyperparameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_impurity_decrease': 0.01, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.1, 'n_estimators': 50}



invalid value encountered in cast



0.4057857284393367


In [21]:
from sklearn.linear_model import Ridge
from lineartree import LinearBoostRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import pandas as pd

# model = LinearBoostRegressor(base_estimator=LinearRegression())
# model.fit(X_train_scaled, y_train)

# Define hyperparameters to tune for LinearBoostRegressor
param_grid = {
    'n_estimators': [50, 100],  # Number of trees in the forest
    'max_depth': [3, 5, 10],  # Maximum depth of the individual trees
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 3, 5],  # Minimum samples required in a leaf node
    'max_features': ['sqrt', 'log2'],
    'min_impurity_decrease': [0.0, 0.01, 0.1],  # A node will be split if this decrease in impurity is observed
    'min_weight_fraction_leaf': [0.0, 0.01, 0.1]  # Minimum fraction of weight required to be in a leaf node
}

tscv = TimeSeriesSplit(n_splits=5)
lin_boost = LinearBoostRegressor(base_estimator=Ridge(), random_state=42)
grid_search = GridSearchCV(estimator=lin_boost, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)

grid_search.fit(X_train_scaled, y_train)

print("Best hyperparameters:", grid_search.best_params_)
model = grid_search.best_estimator_

model.fit(X_train_scaled, y_train)

train_preds = pd.DataFrame.from_dict({
    "date": X_train.index,
    "predictions": model.predict(X_train_scaled),
    "actuals": y_train[target_col_name].values
})

plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode="lines+markers", legend_position=(1, 1.075))

predictions.append(model.predict(X_test_scaled)[0])
print(predictions[-1])


Best hyperparameters: {'max_depth': 3, 'max_features': 'log2', 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50}



invalid value encountered in cast



1.1037949


### Neural Networks

In [22]:
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor

alpha_values = np.arange(0.001, 1, 0.01)

# Initialize variables to store the best configuration and score
best_model = None
best_score = float('inf')  # Initialize to a large number
best_alpha = None

# Loop through alpha values
for alpha in alpha_values:
    # print(f"Training with alpha: {alpha}")
    
    # Create MLPRegressor model with current alpha and default parameters
    model = MLPRegressor(activation='relu',
                         solver='adam',
                         learning_rate='adaptive',
                         max_iter=1000,
                         alpha=alpha,
                         random_state=42)
    
    # Train the model on the training data
    model.fit(X_train_scaled, y_train.values)
    
    # Make predictions on the test data
    test_predictions = model.predict(X_test_scaled)
    
    # Calculate the mean squared error on the test data
    mse = mean_squared_error(y_test.values, test_predictions)
    # print(f"MSE on test data with alpha={alpha}: {mse}")
    
    # If the current configuration has a lower error, store it as the best
    if mse < best_score:
        best_score = mse
        best_model = model
        best_alpha = alpha

print(f"Best alpha: {best_alpha}")
print(f"Best test MSE: {best_score}")

# Now you can use the best model to make predictions on the training data
train_preds = pd.DataFrame.from_dict({
    "date": X_train.index,
    "predictions": best_model.predict(X_train_scaled),
    "actuals": y_train[target_col_name].values
})

# Plot the predictions and actuals
plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode="lines+markers", legend_position=(1, 1.075))

# Test set predictions
predictions.append(best_model.predict(X_test_scaled)[0])
print(predictions[-1])

Best alpha: 0.001
Best test MSE: 0.041543977410961705


0.8979318816169372


In [23]:
import torch
from nowcast_lstm.LSTM import LSTM
    

data = pd.merge(X_train_scaled, y_train, left_index=True, right_index=True)
data = convert_to_datetime(data.reset_index(), ["date"])

model = LSTM(
        data = data,
        target_variable = "GDPC1",
        n_timesteps = 1,
        # fill_na_func = np.nanmean,
        # fill_ragged_edges_func = np.nanmean,
        # n_models = 10,
        # train_episodes = 500,
        # batch_size = 50,
        # decay = 0.98,
        # n_hidden = 10,
        # n_layers = 1,
        # dropout = 0.0,
        criterion = torch.nn.MSELoss(),
        optimizer = torch.optim.Adam,
        optimizer_parameters = {"lr":1e-2, "weight_decay":0.0}
    )
model.train(quiet=True)

test_data = X_test_scaled.copy()
test_data["GDPC1"] = np.nan
test_data = test_data.reset_index()

pred = model.predict(pd.concat([data, test_data]).reset_index(drop=True))
pred = convert_to_datetime(pred, ["date"])
pred = pred.set_index("date")

train_preds = pd.DataFrame.from_dict({
    "date": pred.drop(y_test.index).index,
    "predictions": pred.drop(y_test.index)["predictions"],
    "actuals": pred.drop(y_test.index)["actuals"]
})

# # # plt.plot(train_preds.date, train_preds.actuals, label="actual") 
# # # plt.plot(train_preds.date, train_preds.predictions, label="predictions")
# # # plt.legend();
plot_prediction(dt=train_preds.date, y_pred=train_preds.predictions, y_actual=train_preds.actuals, mode = "lines+markers", legend_position=(1, 1.075))

predictions.append(pred["predictions"].tail(1))
print(predictions[-1])

date
2017-01-01    2.461874
Name: predictions, dtype: float32


### Traditional methods

In [24]:
import pandas as pd
from statsmodels.tsa.api import VAR
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

train_data = pd.concat([X_train_scaled, y_train], axis=1)
test_data = pd.concat([X_test_scaled, y_test], axis=1)

var_model = VAR(train_data)
var_fit = var_model.fit(maxlags=2)  # You can adjust the maxlags based on the model's AIC/BIC criteria

train_preds = var_fit.fittedvalues

lag_order = var_fit.k_ar
forecast_input = train_data.values[-lag_order:]
test_forecasts = var_fit.forecast(y=forecast_input, steps=len(test_data))

train_preds_df = pd.DataFrame(train_preds, columns=train_data.columns)
test_preds_df = pd.DataFrame(test_forecasts, columns=test_data.columns)

plot_prediction(train_data.index, y_pred=train_preds_df[target_col_name], y_actual=train_data[target_col_name], mode = "lines+markers", legend_position=(1, 1.075))

print(test_preds_df[target_col_name])  # Test predictions



No frequency information was provided, so inferred frequency QS-OCT will be used.



0    10.297863
1     6.043810
Name: GDPC1, dtype: float64


In [25]:
from statsmodels.tsa.statespace.dynamic_factor import DynamicFactor

train_data = pd.concat([X_train_scaled, y_train], axis=1)
test_data = pd.concat([X_test_scaled, y_test], axis=1)

dfm_model = DynamicFactor(train_data, k_factors=1, factor_order=2)  # Adjust factors and factor order as necessary
dfm_fit = dfm_model.fit()

# Step 2: Make predictions on training data
train_preds = dfm_fit.fittedvalues

# Step 3: Forecast on test data
test_forecasts = dfm_fit.forecast(steps=len(test_data))

# Step 4: Convert predictions to DataFrame and plot
train_preds_df = pd.DataFrame(train_preds, columns=train_data.columns)
test_preds_df = pd.DataFrame(test_forecasts, columns=test_data.columns)

# Plot predictions vs actuals for the target variable
plot_prediction(train_data.index, y_pred=train_preds_df[target_col_name], y_actual=train_data[target_col_name], mode = "lines+markers", legend_position=(1, 1.075))

print(test_preds_df[target_col_name]) 

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           54     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  3.63294D+01    |proj g|=  6.07298D-01

At iterate    5    f=  3.49602D+01    |proj g|=  3.43089D-02



No frequency information was provided, so inferred frequency QS-OCT will be used.

 This problem is unconstrained.



At iterate   10    f=  3.48277D+01    |proj g|=  1.78698D-01

At iterate   15    f=  3.47508D+01    |proj g|=  1.57436D-01

At iterate   20    f=  3.46027D+01    |proj g|=  4.03277D-01

At iterate   25    f=  3.44257D+01    |proj g|=  1.28111D-01

At iterate   30    f=  3.43140D+01    |proj g|=  9.58460D-02

At iterate   35    f=  3.40735D+01    |proj g|=  2.93811D-01

At iterate   40    f=  3.33082D+01    |proj g|=  5.22361D-01

At iterate   45    f=  3.29310D+01    |proj g|=  1.74179D-01

At iterate   50    f=  3.29122D+01    |proj g|=  1.11160D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
   54     50     62   


Maximum Likelihood optimization failed to converge. Check mle_retvals



2017-01-01    0.417783
2017-04-01    0.250839
Freq: QS-OCT, Name: GDPC1, dtype: float64
