In [1]:
import pandas as pd
import numpy as np
from functions import *
from models import *
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
import warnings
from sklearn.exceptions import ConvergenceWarning
# set default plotting parameters
plt.rcParams.update({'font.size': 12, 'figure.figsize': (10, 4), 'figure.dpi': 300})

%load_ext autoreload
%autoreload 2

# Load and prepare the data

In [2]:
# Load the raw dataset from disk; the preparation cell works on its own copy of `df`.
df = pd.read_csv(filepath_or_buffer='data/data.csv')

In [3]:
# --- Prepare dataframe ---
# For every evaluation period this cell builds a winsorized, standardized
# train/test design matrix (float32) and the matching target vectors.
# Results are stored in X_train / X_test / y_train / y_test / preprocessors,
# all keyed by the period label ('20', '21', ...).
feature_cols = [col for col in df.columns if col not in ['timestamp', 'ticker', 'target']]

# Work on an independent copy so the raw `df` is never mutated and the cell
# can be re-run safely.
n_rows_raw = len(df)
df_norm = df.dropna(subset=feature_cols).copy()
if len(df_norm) != n_rows_raw:
    # The features are expected to be complete; report it instead of
    # silently losing observations.
    warnings.warn(
        f"Dropped {n_rows_raw - len(df_norm)} rows with missing feature values."
    )
# NOTE(review): rows with a missing 'target' are not removed here — confirm
# the target column has no NaNs before fitting.
df_norm['timestamp'] = pd.to_datetime(df_norm['timestamp'])

# drop data from 2025
df_norm = df_norm[df_norm['timestamp'] < '2025-01-01']

# Identify dummy vs. numeric columns
# (dummies are passed through unscaled; only numeric columns are winsorized/scaled)
cat_cols = [c for c in feature_cols if c.startswith('NACE_') or c.startswith('month_')] + ['divi','divo','sin']
numeric_cols = [c for c in feature_cols if c not in cat_cols]
# Column order of the final design matrix: numeric first, then dummies
feature_cols = numeric_cols + cat_cols


# Prepare containers
# label -> reference date; training ends one year after it, testing covers
# the year after that
periods = {
    '20': '2020-01-01',
    '21': '2021-01-01',
    '22': '2022-01-01',
    '23': '2023-01-01'
}

X_train, X_test = {}, {}
y_train, y_test = {}, {}
preprocessors = {}

# Positional target array, aligned row-by-row with df_norm
y_values = df_norm['target'].values.astype('float32')

for name, period in periods.items():
    period = pd.to_datetime(period)

    # Compute the cutoffs once on the scalar instead of shifting the whole
    # timestamp column three times. For the January-1st reference dates used
    # here this selects exactly the same rows as shifting the timestamps back.
    train_end = period + pd.DateOffset(years=1)
    test_end = period + pd.DateOffset(years=2)
    tr_mask = df_norm['timestamp'] < train_end
    te_mask = (df_norm['timestamp'] >= train_end) & (df_norm['timestamp'] < test_end)

    # extract feature DataFrames; explicit copies because they are modified
    # in place below (avoids SettingWithCopyWarning / writes into a slice)
    X_tr_df = df_norm.loc[tr_mask, feature_cols].copy()
    X_te_df = df_norm.loc[te_mask, feature_cols].copy()
    y_tr = y_values[tr_mask.to_numpy()]
    y_te = y_values[te_mask.to_numpy()]

    # compute winsorization bounds (1st / 99th percentile) on train only
    lower = X_tr_df[numeric_cols].quantile(0.01)
    upper = X_tr_df[numeric_cols].quantile(0.99)

    # apply the train-based clipping to both train and test
    X_tr_df[numeric_cols] = X_tr_df[numeric_cols].clip(lower=lower, upper=upper, axis=1)
    X_te_df[numeric_cols] = X_te_df[numeric_cols].clip(lower=lower, upper=upper, axis=1)

    # fit scaler only on training set
    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), numeric_cols),
        ('cat', 'passthrough', cat_cols)
    ])
    preprocessor.fit(X_tr_df)
    preprocessors[name] = preprocessor

    # transform splits
    X_train[name] = preprocessor.transform(X_tr_df).astype('float32')
    X_test[name]  = preprocessor.transform(X_te_df).astype('float32')

    # targets
    y_train[name] = y_tr
    y_test[name]  = y_te

# OLS

In [4]:
# OLS benchmark: fit one model per evaluation period and keep the estimates
# together with the in-sample and out-of-sample predictions, keyed by period.
ols_est, ols_pred_train, ols_pred_test = {}, {}, {}

for year in periods:
    print(f"Estimating OLS for {year}...")
    x_tr, y_tr = X_train[year], y_train[year]
    x_te, y_te = X_test[year], y_test[year]

    # `estimate` is a project helper; its result is indexed by 'b_hat' below
    # (presumably the coefficient vector — verify against its definition).
    ols_est[year] = estimate(y_tr, x_tr)

    # fitted values: coefficients times the transposed design matrix
    ols_pred_train[year], ols_pred_test[year] = (
        ols_est[year]['b_hat'] @ x_tr.T,
        ols_est[year]['b_hat'] @ x_te.T,
    )

Estimating OLS for 20...
Estimating OLS for 21...
Estimating OLS for 22...
Estimating OLS for 23...


  se = np.sqrt(np.diag(cov)).reshape(-1, 1)


# Lasso

In [5]:
# Lasso with a cross-validated penalty: one LassoCV fit per evaluation period.
# The fitted estimators are kept in `lasso_est` and the in-sample and
# out-of-sample predictions in `lasso_pred_train` / `lasso_pred_test`,
# all keyed by period.

# log-spaced penalty grid built with numpy.geomspace
penalty_grid = np.geomspace(1e-7, 100, num = 1000)

lasso_est = {}
lasso_pred_train = {}
lasso_pred_test = {}


# NOTE(review): ConvergenceWarning is silenced for the whole loop. With
# max_iter=1000 some fits on the grid may stop before converging — confirm
# this is acceptable, or raise max_iter.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", ConvergenceWarning)
    for year in periods.keys():
        print(f"Estimating Lasso for {year}...")
        x_tr = X_train[year]
        y_tr = y_train[year]
        x_te = X_test[year]
        y_te = y_test[year]

        # estimate the model using LassoCV (5-fold CV over penalty_grid)
        # NOTE(review): per the scikit-learn docs `eps` only shapes the
        # automatically generated alpha path, so it should have no effect
        # when an explicit grid is passed — confirm.
        fit_CV = LassoCV(cv=5, alphas=penalty_grid, max_iter=1000, eps=1e-3, n_jobs=-1).fit(x_tr,y_tr)

        # keep the fitted estimator; previously `lasso_est` was created but
        # never filled, so the fitted models were lost
        lasso_est[year] = fit_CV
        lasso_pred_train[year] = fit_CV.predict(x_tr)
        lasso_pred_test[year] = fit_CV.predict(x_te)

Estimating Lasso for 20...
Estimating Lasso for 21...
Estimating Lasso for 22...
Estimating Lasso for 23...
