In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

# 1) Synthetic stock price data
np.random.seed(0)
dates = pd.date_range(start='2021-01-01', periods=200, freq='B')
trend = np.linspace(50, 150, 200)
seasonal = 10 * np.sin(np.arange(200) * 2 * np.pi / 20)
noise = np.random.normal(scale=2, size=200)
price = trend + seasonal + noise
df = pd.DataFrame({'date': dates, 'price': price}).set_index('date')

# 2) Feature engineering
for lag in [1,2,3]:
    df[f'lag{lag}'] = df['price'].shift(lag)
df['roll_mean5'] = df['price'].rolling(5).mean()
df.dropna(inplace=True)

# 3) Train/test split
n_train = int(len(df)*0.7)
train, test = df.iloc[:n_train], df.iloc[n_train:]
X_train, y_train = train.drop('price', axis=1), train['price']
X_test, y_test = test.drop('price', axis=1), test['price']

# 4) Candidate models
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(n_estimators=50, random_state=0),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, random_state=0),
    'HistGB': HistGradientBoostingRegressor(max_iter=20, learning_rate=0.1, random_state=0)
}

# 5) TimeSeriesSplit CV
tscv = TimeSeriesSplit(n_splits=3)
cv_results = {}
for name, model in models.items():
    rmses = []
    for train_idx, val_idx in tscv.split(X_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        model.fit(X_tr, y_tr)
        preds = model.predict(X_val)
        rmses.append(np.sqrt(mean_squared_error(y_val, preds)))
    cv_results[name] = np.mean(rmses)

# 6) Select best
best_name = min(cv_results, key=cv_results.get)
best_model = models[best_name]
best_model.fit(X_train, y_train)
y_pred_test = best_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

# Results
results = pd.DataFrame.from_dict(cv_results, orient='index', columns=['CV_RMSE'])
results['Test_RMSE'] = np.nan
results.loc[best_name, 'Test_RMSE'] = test_rmse
results


Unnamed: 0,CV_RMSE,Test_RMSE
LinearRegression,2.682004,2.746601
RandomForest,9.323978,
GradientBoosting,9.133009,
HistGB,17.179023,
