In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from pathlib import Path

data_dir = Path("data/features")
files = list(data_dir.glob("*.csv"))
stock_data = {f.stem.replace("_features", ""): pd.read_csv(f) for f in files}

# Standardize the datasets
scaler = StandardScaler()
targets = ["Date","MA5_inc", "MA10_inc", "MA20_inc"]

scaled_data = {}

for ticker, df in stock_data.items():
    X = df.drop(columns = targets)

    X_scaled = scaler.fit_transform(X)
    scaled_X = pd.DataFrame(X_scaled, columns = X.columns)

    for target in targets:
        scaled_X[target] = df[target].values

    scaled_data[ticker] = scaled_X

In [6]:
# Now we need to split the data into training and testing sets
# But since we have 3 different targets, we need to split the data for each target
# And since this is the time series data, we need to split the data by the index, not the random split

split_data = {}
results = []

targets = ["MA5_inc", "MA10_inc", "MA20_inc"]

for ticker, df in scaled_data.items():
    split_data[ticker] = {}

    X_all = df.drop(columns = ["Date"] +targets)

    split_idx = int(len(df)*0.8)

    for target in targets:
        y_all = df[target]

        X_train, X_test = X_all.iloc[:split_idx], X_all.iloc[split_idx:]
        y_train, y_test = y_all.iloc[:split_idx], y_all.iloc[split_idx:]

        split_data[ticker][target] = {
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test
        }



In [18]:
# Running the Decision Tree Regression
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


dt_results = []
dt_models = {}

for ticker in split_data.keys():
    dt_models[ticker] = {}
    for target in split_data[ticker].keys():
        parts = split_data[ticker][target]
        X_train = parts["X_train"]
        X_test = parts["X_test"]
        y_train = parts["y_train"]
        y_test = parts["y_test"]


        dt = DecisionTreeRegressor(
            random_state = 42,
            max_depth = None,
            min_samples_leaf = 5,
            criterion = "squared_error",
            splitter = "best"
        )

        dt.fit(X_train, y_train)
        y_pred = dt.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

        dt_results.append({
            "Ticker": ticker,
            "Target": target,
            "Model": "DecisionTree",
            "MAE": mae,
            "RMSE": rmse,
            "MSE": mse,
            "R2": r2
        })

        dt_models[ticker][target] = dt

dt_results_df = pd.DataFrame(dt_results).sort_values(['Ticker', 'Target'])

dt_mae_summary = dt_results_df.pivot_table(
    index="Ticker", columns="Target", values="MAE", aggfunc="first"
)[["MA5_inc","MA10_inc","MA20_inc"]]
dt_mae_summary.columns = ["MA5","MA10","MA20"]
print(dt_mae_summary)


             MA5      MA10      MA20
Ticker                              
AAPL    0.312747  0.057872  0.120768
ADBE    0.320800  0.033481  0.165223
AMD     0.185820  0.012725  0.091052
CRM     0.271548  0.040811  0.119819
MSFT    0.327437  0.114777  0.201887
NOW     1.863551  0.413278  1.057560
NVDA    0.456628  0.104673  0.237987
ORCL    0.756167  0.320955  0.403340


In [17]:
# Running the SVM
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


svm_results = []
svm_models = {}

for ticker in split_data.keys():
    svm_models[ticker] = {}
    for target in split_data[ticker].keys():
        parts = split_data[ticker][target]
        X_train = parts["X_train"]
        X_test = parts["X_test"]
        y_train = parts["y_train"]
        y_test = parts["y_test"]


        svm = SVR(kernel = 'linear', C = 1.0, epsilon = 0.1)
        svm.fit(X_train, y_train)

        y_pred = svm.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

        svm_results.append({
            "Ticker": ticker,
            "Target": target,
            "Model": "DecisionTree",
            "MAE": mae,
            "RMSE": rmse,
            "MSE": mse,
            "R2": r2
        })

        svm_models[ticker][target] = svm

svm_results_df = pd.DataFrame(svm_results).sort_values(['Ticker', 'Target'])

svm_mae_summary = svm_results_df.pivot_table(
    index="Ticker", columns="Target", values="MAE", aggfunc="first"
)[["MA5_inc","MA10_inc","MA20_inc"]]
svm_mae_summary.columns = ["MA5","MA10","MA20"]
print(svm_mae_summary)


             MA5      MA10      MA20
Ticker                              
AAPL    0.319266  0.060262  0.164251
ADBE    0.335103  0.030571  0.202472
AMD     0.247962  0.032725  0.130087
CRM     0.360316  0.044951  0.176435
MSFT    0.559192  0.045457  0.243366
NOW     2.370290  0.073778  1.044309
NVDA    0.622107  0.062690  0.200815
ORCL    0.557643  0.106114  0.199934


In [25]:
# Runnding Bagging 

from sklearn.ensemble import BaggingRegressor

bag_results = []
bag_models = {}

for ticker in split_data.keys():
    bag_models[ticker] = {}
    for target in split_data[ticker].keys():
        parts = split_data[ticker][target]
        X_train = parts["X_train"]
        X_test  = parts["X_test"]
        y_train = parts["y_train"]
        y_test  = parts["y_test"]

        base = DecisionTreeRegressor(random_state=42)
        model = BaggingRegressor(
            estimator=base,        
            n_estimators=300,
            bootstrap=True,
            n_jobs=-1,
            random_state=42
        )
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        mae  = mean_absolute_error(y_test, y_pred)
        mse  = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2   = r2_score(y_test, y_pred)

        bag_results.append({
            "Ticker": ticker,
            "Target": target,
            "Model": "Bagging(DecisionTree)",
            "MAE": mae,
            "RMSE": rmse,
            "MSE": mse,
            "R2": r2,
            "n_train": len(X_train),
            "n_test": len(X_test),
        })

        bag_models[ticker][target] = model


bag_results_df = pd.DataFrame(bag_results).sort_values(["Ticker","Target"])


bag_mae_summary = bag_results_df.pivot_table(
    index="Ticker", columns="Target", values="MAE", aggfunc="first"
)[["MA5_inc","MA10_inc","MA20_inc"]]
bag_mae_summary.columns = ["MA5","MA10","MA20"]
print(bag_mae_summary)


             MA5      MA10      MA20
Ticker                              
AAPL    0.274978  0.055038  0.111958
ADBE    0.164815  0.015926  0.086919
AMD     0.108200  0.004944  0.067416
CRM     0.184294  0.025279  0.095650
MSFT    0.296082  0.098720  0.175781
NOW     1.776831  0.366421  1.086662
NVDA    0.404223  0.073146  0.171551
ORCL    0.721141  0.306405  0.390842


In [27]:
# Random forest

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

rf_results = []
rf_models = {}

for ticker in split_data.keys():
    rf_models[ticker] = {}
    for target in split_data[ticker].keys():
        parts = split_data[ticker][target]
        X_train = parts["X_train"]
        X_test  = parts["X_test"]
        y_train = parts["y_train"]
        y_test  = parts["y_test"]

        model = RandomForestRegressor(
            n_estimators=300,       
            random_state=42,
            n_jobs=-1,              
            bootstrap=True
        )
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        mae  = mean_absolute_error(y_test, y_pred)
        mse  = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2   = r2_score(y_test, y_pred)

        rf_results.append({
            "Ticker": ticker,
            "Target": target,
            "Model": "RandomForest",
            "MAE": mae,
            "RMSE": rmse,
            "MSE": mse,
            "R2": r2,
            "n_train": len(X_train),
            "n_test": len(X_test),
        })

        rf_models[ticker][target] = model


rf_results_df = pd.DataFrame(rf_results).sort_values(["Ticker","Target"])


rf_mae_summary = rf_results_df.pivot_table(
    index="Ticker", columns="Target", values="MAE", aggfunc="first"
)[["MA5_inc", "MA10_inc", "MA20_inc"]]
rf_mae_summary.columns = ["MA5", "MA10", "MA20"]
print(rf_mae_summary)


             MA5      MA10      MA20
Ticker                              
AAPL    0.273985  0.056460  0.111620
ADBE    0.164232  0.015552  0.087073
AMD     0.108902  0.005162  0.066935
CRM     0.184380  0.025994  0.096264
MSFT    0.297040  0.099078  0.175446
NOW     1.789605  0.367137  1.082375
NVDA    0.404057  0.073129  0.171807
ORCL    0.716777  0.308605  0.389071


In [26]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

ada_results = []
ada_models = {}

for ticker in split_data.keys():
    ada_models[ticker] = {}
    for target in split_data[ticker].keys():
        parts = split_data[ticker][target]
        X_train = parts["X_train"]
        X_test  = parts["X_test"]
        y_train = parts["y_train"]
        y_test  = parts["y_test"]

        
        weak = DecisionTreeRegressor(max_depth=3, random_state=42)

       
        try:
            model = AdaBoostRegressor(
                estimator=weak,          
                n_estimators=300,
                learning_rate=0.1,       
                random_state=42
            )
        except TypeError:
            model = AdaBoostRegressor(
                base_estimator=weak,     
                n_estimators=300,
                learning_rate=0.1,
                random_state=42
            )

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mae  = mean_absolute_error(y_test, y_pred)
        mse  = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2   = r2_score(y_test, y_pred)

        ada_results.append({
            "Ticker": ticker,
            "Target": target,
            "Model": "AdaBoost(Depth3Tree)",
            "MAE": mae,
            "RMSE": rmse,
            "MSE": mse,
            "R2": r2,
            "n_train": len(X_train),
            "n_test": len(X_test),
        })

        ada_models[ticker][target] = model


ada_results_df = pd.DataFrame(ada_results).sort_values(["Ticker","Target"])


ada_mae_summary = ada_results_df.pivot_table(
    index="Ticker", columns="Target", values="MAE", aggfunc="first"
)[["MA5_inc","MA10_inc","MA20_inc"]]
ada_mae_summary.columns = ["MA5","MA10","MA20"]
print(ada_mae_summary)


             MA5      MA10      MA20
Ticker                              
AAPL    0.474284  0.087056  0.170282
ADBE    0.584767  0.203223  0.326664
AMD     0.311776  0.048736  0.122051
CRM     0.525038  0.097813  0.225014
MSFT    0.621268  0.167596  0.327135
NOW     2.799714  0.467483  1.558773
NVDA    0.527903  0.079493  0.212349
ORCL    0.880081  0.329979  0.427005


In [24]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor, Pool

cat_results = []
cat_models = {}

for ticker in split_data.keys():
    cat_models[ticker] = {}
    for target in split_data[ticker].keys():
        parts = split_data[ticker][target]
        X_train = parts["X_train"]
        X_test  = parts["X_test"]
        y_train = parts["y_train"]
        y_test  = parts["y_test"]

        
        train_pool = Pool(X_train, y_train)
        test_pool  = Pool(X_test,  y_test)

        
        model = CatBoostRegressor(
            iterations=300,            
            depth=6,                   
            learning_rate=0.1,         
            loss_function="RMSE",
            random_state=42,
            verbose=False,             
            allow_writing_files=False, 
            thread_count=-1
        )

        model.fit(train_pool)
        y_pred = model.predict(test_pool)

        mae  = mean_absolute_error(y_test, y_pred)
        mse  = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2   = r2_score(y_test, y_pred)

        cat_results.append({
            "Ticker": ticker,
            "Target": target,
            "Model": "CatBoost",
            "MAE": mae,
            "RMSE": rmse,
            "MSE": mse,
            "R2": r2,
            "n_train": len(X_train),
            "n_test": len(X_test),
        })

        cat_models[ticker][target] = model


cat_results_df = pd.DataFrame(cat_results).sort_values(["Ticker","Target"])



cat_mae_summary = cat_results_df.pivot_table(
    index="Ticker", columns="Target", values="MAE", aggfunc="first"
)[["MA5_inc","MA10_inc","MA20_inc"]]
cat_mae_summary.columns = ["MA5","MA10","MA20"]
print(cat_mae_summary)


             MA5      MA10      MA20
Ticker                              
AAPL    0.277150  0.129817  0.155087
ADBE    0.217386  0.128760  0.145361
AMD     0.103473  0.053092  0.078677
CRM     0.240012  0.094920  0.111946
MSFT    0.301970  0.170170  0.196163
NOW     1.675655  0.910292  1.325264
NVDA    0.298510  0.148360  0.176845
ORCL    0.686432  0.461932  0.396456
