In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeClassifier,Ridge
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from catboost import CatBoostClassifier,CatBoostRegressor
from category_encoders import TargetEncoder
from sklearn.linear_model import LinearRegression

In [30]:
X = pd.read_csv('data/X_train.csv',index_col='ROW_ID')
X_test_final = pd.read_csv('data/X_test.csv',index_col='ROW_ID')
y = pd.read_csv('data/y_train.csv',index_col='ROW_ID')
y_bin = (y > 0).astype(int) #binariser l outcome pour faire de la classification et pas regression
sample_submission = pd.read_csv('data/sample_submission.csv',index_col='ROW_ID')

In [31]:

RET_features = [f'RET_{i}' for i in range(1, 20)]
SIGNED_VOLUME_features = [f'SIGNED_VOLUME_{i}' for i in range(1, 20)]
TURNOVER_features = ['AVG_DAILY_TURNOVER']

for i in range(2, 21):
    X[f'AVERAGE_PERF_{i}'] = X[RET_features[:i]].mean(1)
    X[f'ALLOCATIONS_AVERAGE_PERF_{i}'] = X.groupby('TS')[f'AVERAGE_PERF_{i}'].transform('mean')
    X_test_final[f'AVERAGE_PERF_{i}'] = X_test_final[RET_features[:i]].mean(1)
    X_test_final[f'ALLOCATIONS_AVERAGE_PERF_{i}'] = X_test_final.groupby('TS')[f'AVERAGE_PERF_{i}'].transform('mean')


In [32]:
X["RET_VOLATILITY_20"] = X[RET_features].std(axis=1)
X_test_final["RET_VOLATILITY_20"] = X_test_final[RET_features].std(axis=1)

X["RET_MOMENTUM"] = X["RET_1"] - X[RET_features].mean(axis=1)
X_test_final["RET_MOMENTUM"] = X_test_final["RET_1"] - X_test_final[RET_features].mean(axis=1)

# Ratio perf/vol (Sharpe-like)
X["RET_SHARPE"] = X["RET_MOMENTUM"] / (X["RET_VOLATILITY_20"] + 1e-6)
X_test_final["RET_SHARPE"] = X_test_final["RET_MOMENTUM"] / (X_test_final["RET_VOLATILITY_20"] + 1e-6)

# Liquidity volatility
X["SIGNED_VOLUME_VOL"] = X[SIGNED_VOLUME_features].std(axis=1)
X_test_final["SIGNED_VOLUME_VOL"] = X_test_final[SIGNED_VOLUME_features].std(axis=1)

X["TS_num"] = X["TS"].str.extract("(\d+)").astype(int)
X_test_final["TS_num"] = X_test_final["TS"].str.extract("(\d+)").astype(int)

X["RET_TREND_5"] = X["RET_20"] - X["RET_15"]
X["VOL_TREND_5"] = X["SIGNED_VOLUME_20"] - X["SIGNED_VOLUME_15"]
X_test_final["RET_TREND_5"] = X_test_final["RET_20"] - X_test_final["RET_15"]
X_test_final["VOL_TREND_5"] = X_test_final["SIGNED_VOLUME_20"] - X_test_final["SIGNED_VOLUME_15"]

# Décalage du dernier retour
X["RET_LAG1"] = X["RET_1"]
X["RET_DIFF1"] = X["RET_1"] - X["RET_2"]
X_test_final["RET_LAG1"] = X_test_final["RET_1"]
X_test_final["RET_DIFF1"] = X_test_final["RET_1"] - X_test_final["RET_2"]
# Skewness et kurtosis sur les 20 derniers jours
X["RET_SKEW_20"] = X[RET_features].skew(axis=1)
X["RET_KURT_20"] = X[RET_features].kurt(axis=1)
X_test_final["RET_SKEW_20"] = X_test_final[RET_features].skew(axis=1)
X_test_final["RET_KURT_20"] = X_test_final[RET_features].kurt(axis=1)
# Ratio volume/performance (liquidité relative)
X["VOL_PERF_RATIO"] = X[SIGNED_VOLUME_features].mean(axis=1) / (X[RET_features].std(axis=1) + 1e-6)

# Moyenne pondérée des performances récentes (momentum lissé)
weights = np.linspace(1, 2, 19)
X["RET_WEIGHTED_MOMENTUM"] = (X[RET_features].values * weights).sum(axis=1) / weights.sum()

X_test_final["VOL_PERF_RATIO"] = X_test_final[SIGNED_VOLUME_features].mean(axis=1) / (X_test_final[RET_features].std(axis=1) + 1e-6)

# Moyenne pondérée des performances récentes (momentum lissé)
weights = np.linspace(1, 2, 19)
X_test_final["RET_WEIGHTED_MOMENTUM"] = (X_test_final[RET_features].values * weights).sum(axis=1) / weights.sum()




  X["TS_num"] = X["TS"].str.extract("(\d+)").astype(int)
  X_test_final["TS_num"] = X_test_final["TS"].str.extract("(\d+)").astype(int)


In [33]:
features = (
    RET_features
    + SIGNED_VOLUME_features
    + TURNOVER_features
    + [f'AVERAGE_PERF_{i}' for i in range(2, 21)]
    + [f'ALLOCATIONS_AVERAGE_PERF_{i}' for i in range(2, 21)]
    + ["RET_VOLATILITY_20", "RET_MOMENTUM", "RET_SHARPE", "SIGNED_VOLUME_VOL","VOL_TREND_5","RET_TREND_5"]
    + ["alloc_enc"]
    +["VOL_PERF_RATIO","RET_KURT_20","RET_SKEW_20","RET_DIFF1","RET_LAG1"]
)

In [34]:
alloc_col = "ALLOCATION"
ret_cols = [f'RET_{i}' for i in range(1, 21)]
def compute_time_features_for_allocation(df_alloc):
    mean_rets = df_alloc[ret_cols].mean()

    x = np.arange(1, 21).reshape(-1, 1)
    model = LinearRegression().fit(x, mean_rets.values)
    slope = model.coef_[0]

    short_mean = mean_rets[:5].mean()
    long_mean = mean_rets[5:].mean()
    short_long_ratio = short_mean / (long_mean + 1e-6)

    autocorr = np.corrcoef(mean_rets[:-1], mean_rets[1:])[0, 1]

    fft_vals = np.fft.rfft(mean_rets - mean_rets.mean())
    spectral_energy = np.sum(np.abs(fft_vals)**2)

    # 5. Volatilité et asymétrie
    volatility = mean_rets.std()
    skew = ((mean_rets - mean_rets.mean())**3).mean() / (volatility**3 + 1e-6)

    return pd.Series({
        "ALLOC_TREND_SLOPE": slope,
        "ALLOC_SHORT_LONG_RATIO": short_long_ratio,
        "ALLOC_AUTOCORR": autocorr,
        "ALLOC_SPECTRAL_ENERGY": spectral_energy,
        "ALLOC_VOLATILITY": volatility,
        "ALLOC_SKEW": skew
    })

In [35]:
unique_dates = np.sort(X['TS'].unique())
n = len(unique_dates)
train_dates = unique_dates[: int(0.8 * n)]   # 80% train
test_dates = unique_dates[int(0.8 * n):]     # 20% test

train_idx = X['TS'].isin(train_dates)
test_idx = X['TS'].isin(test_dates)

X_train, y_train_bin, y_train_cont = X.loc[train_idx, :], y_bin.loc[train_idx], y.loc[train_idx]
X_test, y_test_bin, y_test_cont = X.loc[test_idx, :], y_bin.loc[test_idx], y.loc[test_idx]


In [36]:
enc = TargetEncoder()
X_train["alloc_enc"] = enc.fit_transform(X_train["ALLOCATION"], y_train_bin)
X_test["alloc_enc"] = enc.transform(X_test["ALLOCATION"], y_test_bin)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["alloc_enc"] = enc.fit_transform(X_train["ALLOCATION"], y_train_bin)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["alloc_enc"] = enc.transform(X_test["ALLOCATION"], y_test_bin)


In [37]:
features=[f for f in features if f in X_train.columns]

In [38]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[features].fillna(0))
X_test_scaled = scaler.transform(X_test[features].fillna(0))

In [39]:
results_classif = {}

name, model="RidgeClassifier",RidgeClassifier(alpha=1e-2, fit_intercept=False)
model.fit(X_train_scaled, y_train_bin)
y_pred = model.predict(X_test_scaled)


# Évaluation
acc = accuracy_score(y_test_bin, y_pred)
results_classif[name] = acc
print(f"✅ Test Accuracy ({name}): {acc:.4f}")

# Importance des features
print("\n🔍 Top 10 feature importances:")

if hasattr(model, "feature_importances_"):
    importances = model.feature_importances_
elif hasattr(model, "coef_"):
    importances = np.abs(model.coef_.ravel())  # abs pour Ridge

importance_df = pd.DataFrame({
    "feature": features,
    "importance": importances
}).sort_values("importance", ascending=False)
top_half = importance_df.iloc[:importance_df.shape[0] // 2]["feature"].tolist()
best_ridge = top_half



  y = column_or_1d(y, warn=True)


✅ Test Accuracy (RidgeClassifier): 0.5304

🔍 Top 10 feature importances:


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
