In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import Ridge


Load datasets

In [165]:
train = pd.read_csv("./data/train.csv")
train_labels = pd.read_csv("./data/train_labels.csv")
test = pd.read_csv("./data/test.csv")
target_pairs = pd.read_csv("./data/target_pairs.csv")

In [166]:
train = train[0:1827] #ensure the train and test set are independent for CV
test_labels = train_labels[1827::]
train_labels = train_labels[0:1827]

In [103]:
target_pairs[~target_pairs['pair'].str.contains(' - ')] # only 4 targets are just log returns (not pairwise differences)

Unnamed: 0,target,lag,pair
0,target_0,1,US_Stock_VT_adj_close
106,target_106,2,US_Stock_VXUS_adj_close
212,target_212,3,FX_ZARUSD
318,target_318,4,FX_NOKEUR


### Functions copied from Kaggle comp:
This shows that at each date_id, t, the target label is the (t+lag+1)/(t+1)

In [104]:
import warnings

def generate_log_returns(data, lag):
    log_returns = pd.Series(np.nan, index=data.index)

    # Compute log returns based on the rules
    for t in range(len(data)):
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            try:
                log_returns.iloc[t] = np.log(data.iloc[t + lag + 1] / data.iloc[t + 1])
            except Exception:
                log_returns.iloc[t] = np.nan
    return log_returns


def generate_targets(column_a: pd.Series, column_b: pd.Series, lag: int) -> pd.Series:
    a_returns = generate_log_returns(column_a, lag)
    b_returns = generate_log_returns(column_b, lag)
    return a_returns - b_returns

### Multivariate Ridge Regression
#### Most basic model: Using only predefined targets as features. X = target labels at time t and y = target labels at time t+1 (same datasets, just offset by 1)
** This is NOT valid because it uses look ahead/leakage!!
target at time t has information up to 5 days ahead, when we are predicting information up to 6 days ahead, so we're never actually predicting more than 1 day ahead (so targets with lag>1 are not valid)

In [105]:
#generate datasets
num_features = len(target_pairs)
num_samples = len(train)-1
y_train = np.zeros((num_samples, num_features)) #for multivariate, we are predicting all features from single model
X_train = np.zeros((num_samples, num_features))

X_train = train_labels.drop(columns=['date_id'])[0:-1].fillna(0).values
y_train = train_labels.drop(columns=['date_id'])[1::].fillna(0).values

#fit
clf = Ridge(alpha=1.0)
clf.fit(X_train, y_train)

Predict using test set and compare to a naiive baseline (y_pred=X_test)

In [106]:
#get test sets
X_test = test_labels.drop(columns=['date_id'])[0:-1].fillna(0).values
y_test = test_labels.drop(columns=['date_id'])[1::].fillna(0).values

#make predictions
y_pred = clf.predict(X_test)

#get metrics for model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Multivariate ridge metrics: ')
print(f"RMSE: {rmse:.6f}")
print(f"MAE: {mae:.6f}")
print(f"R²: {r2:.6f}")

#compare to naiive baseline (y_pred=X_test)
rmse = mean_squared_error(y_test, X_test, squared=False)
mae = mean_absolute_error(y_test, X_test)
r2 = r2_score(y_test, X_test)

print('Naive baseline metrics: ')
print(f"RMSE: {rmse:.6f}")
print(f"MAE: {mae:.6f}")
print(f"R²: {r2:.6f}")


Multivariate ridge metrics: 
RMSE: 0.014013
MAE: 0.010333
R²: 0.689933
Naive baseline metrics: 
RMSE: 0.027213
MAE: 0.019427
R²: -0.306022




#### Separate models per lag (4 total models)

In [107]:
target_pairs_lag1 = target_pairs[target_pairs['lag']==1]
target_pairs_lag2 = target_pairs[target_pairs['lag']==2]
target_pairs_lag3 = target_pairs[target_pairs['lag']==3]
target_pairs_lag4 = target_pairs[target_pairs['lag']==4]
train_labels_lag1 = train_labels[target_pairs_lag1['target']]
train_labels_lag2 = train_labels[target_pairs_lag2['target']]
train_labels_lag3 = train_labels[target_pairs_lag3['target']]
train_labels_lag4 = train_labels[target_pairs_lag4['target']]
test_labels_lag1 = test_labels[target_pairs_lag1['target']]
test_labels_lag2 = test_labels[target_pairs_lag2['target']]
test_labels_lag3 = test_labels[target_pairs_lag3['target']]
test_labels_lag4 = test_labels[target_pairs_lag4['target']]
train_dfs = [train_labels_lag1, train_labels_lag2, train_labels_lag3, train_labels_lag4]
test_dfs = [test_labels_lag1, test_labels_lag2, test_labels_lag3, test_labels_lag4]
#106 pairs each

Train model for each subset of training data

In [108]:
clfs = {}
#each model should be trained on entire training set
X_train = train_labels.drop(columns=['date_id']).iloc[:-1].fillna(0).values
for lag, train_df in enumerate(train_dfs):
    y_train = train_df.iloc[1:].fillna(0).values

    clf = Ridge(alpha=1)
    clf.fit(X_train, y_train)
    clfs[lag] = clf
    

Evaluate each model

In [None]:
rmses = []
maes = []
r2s = []
X_test = test_labels.drop(columns=['date_id']).iloc[:-1].fillna(0).values
for (lag, clf), test_df in zip(clfs.items(), test_dfs):
    
    y_test = test_df.iloc[1:].fillna(0).values

    y_pred = clf.predict(X_test)

    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

  
    rmses.append(rmse)
    maes.append(mae)
    r2s.append(r2)

print(f"Average RMSE: {np.mean(rmses):.6f}")
print(f"Average MAE: {np.mean(maes):.6f}")
print(f"Average R2: {np.mean(r2s):.6f}")
# performs exactly the same as model for all lags

Average RMSE: 0.014013
Average MAE: 0.010333
Average R2: 0.689933




#### Full model using raw prices (train) as input rather than targets

In [181]:
#generate datasets

X_train = train[:len(train)//2-1].drop(columns=['date_id']).fillna(0).values
print(X_train)
y_train = train_labels[1:len(train_labels)//2].drop(columns=['date_id']).fillna(0).values
print(y_train)

#fit
clf = Ridge(alpha=1.0)
clf.fit(X_train, y_train)

[[2.2645000e+03 7.2050000e+03 2.5700000e+03 ... 7.8135000e-02
  1.3822740e+01 5.9163000e-02]
 [2.2280000e+03 7.1470000e+03 2.5790000e+03 ... 7.9066000e-02
  1.3888146e+01 5.9895000e-02]
 [2.2500000e+03 7.1885000e+03 2.5870000e+03 ... 7.9287000e-02
  1.3983675e+01 6.0037000e-02]
 ...
 [2.5560000e+03 9.5110000e+03 2.2865000e+03 ... 6.4707000e-02
  1.2953394e+01 5.0644000e-02]
 [2.5300000e+03 9.3120000e+03 2.2935000e+03 ... 6.4341000e-02
  1.2708448e+01 5.0404000e-02]
 [2.4995000e+03 9.4550000e+03 2.2900000e+03 ... 6.4714000e-02
  1.2665867e+01 5.0664000e-02]]
[[ 0.00578281 -0.02411763 -0.00705199 ...  0.02502068  0.00354846
   0.02094043]
 [ 0.00104825  0.02383639 -0.00893406 ...  0.00483537 -0.00907498
   0.00170587]
 [ 0.00169973 -0.02461781  0.01194331 ... -0.01510236  0.
  -0.03301016]
 ...
 [-0.0114905   0.01454261 -0.0055731  ... -0.01492095  0.03621293
   0.0508364 ]
 [ 0.01465372 -0.00017481  0.00613803 ... -0.01473078  0.02352966
   0.03467502]
 [ 0.00238936 -0.00153154 -0.00239

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [175]:
#get test sets
X_test = train[len(train)//2-1:-1].drop(columns=['date_id']).fillna(0).values
y_test = train_labels[len(train)//2:].drop(columns=['date_id']).fillna(0).values

#make predictions
y_pred = clf.predict(X_test)

#get metrics for model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Multivariate ridge metrics: ')
print(f"RMSE: {rmse:.6f}")
print(f"MAE: {mae:.6f}")
print(f"R²: {r2:.6f}")


Multivariate ridge metrics: 
RMSE: 0.402171
MAE: 0.315654
R²: -304.792437




## Time-series Cross Validation

In [190]:
def time_series_cv(train_df, target_df, n_splits=5, alpha=1.0):
    """
    Perform rolling-origin CV for one set of targets.
    train_df should be features from train.csv
    target_df should be targets from train_labels.csv
    """
    n_samples = len(train_df)
    fold_size = n_samples // (n_splits + 1)  # size of each validation block

    rmses, maes, r2s = [], [], []

    for i in range(1, n_splits + 1):
        # Define train/val split
        split_point = fold_size * i
        X_train = train_df.drop(columns=['date_id']).iloc[:split_point-1].fillna(0).values
        y_train = target_df.drop(columns=['date_id']).iloc[1:split_point].fillna(0).values
        print(X_train)
        print(y_train)

        X_val = train_df.drop(columns=['date_id']).iloc[split_point-1:-1].fillna(0).values
        y_val = target_df.drop(columns=['date_id']).iloc[split_point:].fillna(0).values

        # Fit model
        clf = Ridge(alpha=alpha)
        clf.fit(X_train, y_train)

        # Predict & evaluate
        y_pred = clf.predict(X_val)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)

        rmses.append(rmse)
        maes.append(mae)
        r2s.append(r2)

    return np.mean(rmses), np.mean(maes), np.mean(r2s)


In [191]:
rmse, mae, r2 = time_series_cv(train, train_labels, alpha=1.0, n_splits=1)
print(f"RMSE: {rmse:.6f}")
print(f"MAE: {mae:.6f}")
print(f"R²: {r2:.6f}")


[[2.2645000e+03 7.2050000e+03 2.5700000e+03 ... 7.8135000e-02
  1.3822740e+01 5.9163000e-02]
 [2.2280000e+03 7.1470000e+03 2.5790000e+03 ... 7.9066000e-02
  1.3888146e+01 5.9895000e-02]
 [2.2500000e+03 7.1885000e+03 2.5870000e+03 ... 7.9287000e-02
  1.3983675e+01 6.0037000e-02]
 ...
 [2.5560000e+03 9.5110000e+03 2.2865000e+03 ... 6.4707000e-02
  1.2953394e+01 5.0644000e-02]
 [2.5300000e+03 9.3120000e+03 2.2935000e+03 ... 6.4341000e-02
  1.2708448e+01 5.0404000e-02]
 [2.4995000e+03 9.4550000e+03 2.2900000e+03 ... 6.4714000e-02
  1.2665867e+01 5.0664000e-02]]
[[ 0.00578281 -0.02411763 -0.00705199 ...  0.02502068  0.00354846
   0.02094043]
 [ 0.00104825  0.02383639 -0.00893406 ...  0.00483537 -0.00907498
   0.00170587]
 [ 0.00169973 -0.02461781  0.01194331 ... -0.01510236  0.
  -0.03301016]
 ...
 [-0.0114905   0.01454261 -0.0055731  ... -0.01492095  0.03621293
   0.0508364 ]
 [ 0.01465372 -0.00017481  0.00613803 ... -0.01473078  0.02352966
   0.03467502]
 [ 0.00238936 -0.00153154 -0.00239

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
