In [13]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


data_dir = Path("data/features/only features")
files = list(data_dir.glob("*.csv"))
stock_data = {f.stem.replace("_features", ""): pd.read_csv(f) for f in files}



def make_direction_labels(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["Date"] = pd.to_datetime(df["Date"], errors = "coerce")

    for d in [5,10,20]:
        ma_col = f"MA{d}"
        ma_next = df[ma_col].shift(-1)
        dir_col = f"MA{d}_dir"
        df[dir_col] = np.where(ma_next > df[ma_col], 1 , -1)

    df = df.iloc[:-1].reset_index(drop = True)
    return df

labeled_data = {ticker: make_direction_labels(df) for ticker, df in stock_data.items()}

In [15]:
# Now we need to split the data into training and testing sets
# But since we have 3 different targets, we need to split the data for each target
# And since this is the time series data, we need to split the data by the index, not the random split

targets_dir = ["MA5_dir", "MA10_dir", "MA20_dir"]

split_data = {}

for ticker, df in labeled_data.items():

    df = df.copy()
    df = df.sort_values("Date")

    # 스케일 제외 컬럼
    exclude_cols = ["Date"] + targets_dir

    # 시계열 8:2 분할 (순서 유지)
    split_idx = int(len(df) * 0.8)

    X_all = df.drop(columns=exclude_cols, errors="ignore")  # 기술지표들만 남김
    X_train_raw = X_all.iloc[:split_idx].copy()
    X_test_raw  = X_all.iloc[split_idx:].copy()

    # 스케일링 (train fit → train/test transform)
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train_raw), 
                           columns=X_train_raw.columns, index=X_train_raw.index)
    X_test  = pd.DataFrame(scaler.transform(X_test_raw), 
                           columns=X_test_raw.columns, index=X_test_raw.index)

    # 각 타깃(MA5/10/20 방향)에 대해 y 생성
    split_data[ticker] = {"X_train": X_train, "X_test": X_test}
    for t in targets_dir:
        y_all = df[t]
        y_train = y_all.iloc[:split_idx]
        y_test  = y_all.iloc[split_idx:]
        split_data[ticker][t] = {"y_train": y_train, "y_test": y_test}

In [19]:
# Running the Decision Tree Regression
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

results= []
models = {}

for ticker, parts in split_data.items():
    X_train = parts["X_train"]
    X_test  = parts["X_test"]

    models[ticker] = {}

    for target in targets_dir:
        y_train = parts[target]["y_train"]
        y_test  = parts[target]["y_test"]

        reg = DecisionTreeRegressor(
            random_state=42,
            max_depth=None,
            min_samples_leaf=5,
            criterion="squared_error" 
        )

        reg.fit(X_train, y_train)

        y_pred_cont = reg.predict(X_test)
        y_pred = np.where(y_pred_cont > 0, 1, -1)   # 부호(sign)

        mae  = mean_absolute_error(y_test, y_pred)
        mse  = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2   = r2_score(y_test, y_pred)


        results.append({
            "Ticker": ticker,
            "Target": target.replace("_dir",""),  # 보기 좋게
            "Model": "DecisionTreeClassifier",
            "MAE": mae,                    # 연속 예측 vs ±1
            "MSE": mse,
            "RMSE": rmse,
            "R2": r2
        })

        models[ticker][target] = reg

# 결과 요약
res_df = pd.DataFrame(results).sort_values(["Ticker", "Target"])

res_df["Target"] = pd.Categorical(res_df["Target"], categories=["MA5","MA10","MA20"], ordered=True)
res_df = res_df.sort_values(["Ticker","Target"])

table = (
    res_df
    .set_index(["Ticker","Target","Model"])
    [["MAE","MSE","RMSE","R2"]]
    .sort_index()
)

print(table.to_string())

                                           MAE       MSE      RMSE        R2
Ticker Target Model                                                         
AAPL   MA5    DecisionTreeClassifier  0.535211  1.070423  1.034612 -0.079470
       MA10   DecisionTreeClassifier  0.366197  0.732394  0.855800  0.258188
       MA20   DecisionTreeClassifier  0.295775  0.591549  0.769122  0.387931
ADBE   MA5    DecisionTreeClassifier  0.443662  0.887324  0.941979  0.112280
       MA10   DecisionTreeClassifier  0.345070  0.690141  0.830747  0.298589
       MA20   DecisionTreeClassifier  0.232394  0.464789  0.681754  0.499359
AMD    MA5    DecisionTreeClassifier  0.725352  1.450704  1.204452 -0.453299
       MA10   DecisionTreeClassifier  0.387324  0.774648  0.880141  0.224390
       MA20   DecisionTreeClassifier  0.140845  0.281690  0.530745  0.716906
CRM    MA5    DecisionTreeClassifier  0.436620  0.873239  0.934473  0.126067
       MA10   DecisionTreeClassifier  0.338028  0.676056  0.822226  0.320574

In [20]:
acc = (y_test.values == y_pred).mean()
print("sanity:", mae, "vs", 2*(1-acc))

sanity: 0.352112676056338 vs 0.352112676056338


In [17]:
# Running the SVM
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


svm_results = []
svm_models = {}

for ticker in split_data.keys():
    svm_models[ticker] = {}
    for target in split_data[ticker].keys():
        parts = split_data[ticker][target]
        X_train = parts["X_train"]
        X_test = parts["X_test"]
        y_train = parts["y_train"]
        y_test = parts["y_test"]


        svm = SVR(kernel = 'linear', C = 1.0, epsilon = 0.1)
        svm.fit(X_train, y_train)

        y_pred = svm.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

        svm_results.append({
            "Ticker": ticker,
            "Target": target,
            "Model": "DecisionTree",
            "MAE": mae,
            "RMSE": rmse,
            "MSE": mse,
            "R2": r2
        })

        svm_models[ticker][target] = svm

svm_results_df = pd.DataFrame(svm_results).sort_values(['Ticker', 'Target'])

svm_mae_summary = svm_results_df.pivot_table(
    index="Ticker", columns="Target", values="MAE", aggfunc="first"
)[["MA5_inc","MA10_inc","MA20_inc"]]
svm_mae_summary.columns = ["MA5","MA10","MA20"]
print(svm_mae_summary)


             MA5      MA10      MA20
Ticker                              
AAPL    0.319266  0.060262  0.164251
ADBE    0.335103  0.030571  0.202472
AMD     0.247962  0.032725  0.130087
CRM     0.360316  0.044951  0.176435
MSFT    0.559192  0.045457  0.243366
NOW     2.370290  0.073778  1.044309
NVDA    0.622107  0.062690  0.200815
ORCL    0.557643  0.106114  0.199934


In [25]:
# Runnding Bagging 

from sklearn.ensemble import BaggingRegressor

bag_results = []
bag_models = {}

for ticker in split_data.keys():
    bag_models[ticker] = {}
    for target in split_data[ticker].keys():
        parts = split_data[ticker][target]
        X_train = parts["X_train"]
        X_test  = parts["X_test"]
        y_train = parts["y_train"]
        y_test  = parts["y_test"]

        base = DecisionTreeRegressor(random_state=42)
        model = BaggingRegressor(
            estimator=base,        
            n_estimators=300,
            bootstrap=True,
            n_jobs=-1,
            random_state=42
        )
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        mae  = mean_absolute_error(y_test, y_pred)
        mse  = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2   = r2_score(y_test, y_pred)

        bag_results.append({
            "Ticker": ticker,
            "Target": target,
            "Model": "Bagging(DecisionTree)",
            "MAE": mae,
            "RMSE": rmse,
            "MSE": mse,
            "R2": r2,
            "n_train": len(X_train),
            "n_test": len(X_test),
        })

        bag_models[ticker][target] = model


bag_results_df = pd.DataFrame(bag_results).sort_values(["Ticker","Target"])


bag_mae_summary = bag_results_df.pivot_table(
    index="Ticker", columns="Target", values="MAE", aggfunc="first"
)[["MA5_inc","MA10_inc","MA20_inc"]]
bag_mae_summary.columns = ["MA5","MA10","MA20"]
print(bag_mae_summary)


             MA5      MA10      MA20
Ticker                              
AAPL    0.274978  0.055038  0.111958
ADBE    0.164815  0.015926  0.086919
AMD     0.108200  0.004944  0.067416
CRM     0.184294  0.025279  0.095650
MSFT    0.296082  0.098720  0.175781
NOW     1.776831  0.366421  1.086662
NVDA    0.404223  0.073146  0.171551
ORCL    0.721141  0.306405  0.390842


In [27]:
# Random forest

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

rf_results = []
rf_models = {}

for ticker in split_data.keys():
    rf_models[ticker] = {}
    for target in split_data[ticker].keys():
        parts = split_data[ticker][target]
        X_train = parts["X_train"]
        X_test  = parts["X_test"]
        y_train = parts["y_train"]
        y_test  = parts["y_test"]

        model = RandomForestRegressor(
            n_estimators=300,       
            random_state=42,
            n_jobs=-1,              
            bootstrap=True
        )
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        mae  = mean_absolute_error(y_test, y_pred)
        mse  = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2   = r2_score(y_test, y_pred)

        rf_results.append({
            "Ticker": ticker,
            "Target": target,
            "Model": "RandomForest",
            "MAE": mae,
            "RMSE": rmse,
            "MSE": mse,
            "R2": r2,
            "n_train": len(X_train),
            "n_test": len(X_test),
        })

        rf_models[ticker][target] = model


rf_results_df = pd.DataFrame(rf_results).sort_values(["Ticker","Target"])


rf_mae_summary = rf_results_df.pivot_table(
    index="Ticker", columns="Target", values="MAE", aggfunc="first"
)[["MA5_inc", "MA10_inc", "MA20_inc"]]
rf_mae_summary.columns = ["MA5", "MA10", "MA20"]
print(rf_mae_summary)


             MA5      MA10      MA20
Ticker                              
AAPL    0.273985  0.056460  0.111620
ADBE    0.164232  0.015552  0.087073
AMD     0.108902  0.005162  0.066935
CRM     0.184380  0.025994  0.096264
MSFT    0.297040  0.099078  0.175446
NOW     1.789605  0.367137  1.082375
NVDA    0.404057  0.073129  0.171807
ORCL    0.716777  0.308605  0.389071


In [26]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

ada_results = []
ada_models = {}

for ticker in split_data.keys():
    ada_models[ticker] = {}
    for target in split_data[ticker].keys():
        parts = split_data[ticker][target]
        X_train = parts["X_train"]
        X_test  = parts["X_test"]
        y_train = parts["y_train"]
        y_test  = parts["y_test"]

        
        weak = DecisionTreeRegressor(max_depth=3, random_state=42)

       
        try:
            model = AdaBoostRegressor(
                estimator=weak,          
                n_estimators=300,
                learning_rate=0.1,       
                random_state=42
            )
        except TypeError:
            model = AdaBoostRegressor(
                base_estimator=weak,     
                n_estimators=300,
                learning_rate=0.1,
                random_state=42
            )

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mae  = mean_absolute_error(y_test, y_pred)
        mse  = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2   = r2_score(y_test, y_pred)

        ada_results.append({
            "Ticker": ticker,
            "Target": target,
            "Model": "AdaBoost(Depth3Tree)",
            "MAE": mae,
            "RMSE": rmse,
            "MSE": mse,
            "R2": r2,
            "n_train": len(X_train),
            "n_test": len(X_test),
        })

        ada_models[ticker][target] = model


ada_results_df = pd.DataFrame(ada_results).sort_values(["Ticker","Target"])


ada_mae_summary = ada_results_df.pivot_table(
    index="Ticker", columns="Target", values="MAE", aggfunc="first"
)[["MA5_inc","MA10_inc","MA20_inc"]]
ada_mae_summary.columns = ["MA5","MA10","MA20"]
print(ada_mae_summary)


             MA5      MA10      MA20
Ticker                              
AAPL    0.474284  0.087056  0.170282
ADBE    0.584767  0.203223  0.326664
AMD     0.311776  0.048736  0.122051
CRM     0.525038  0.097813  0.225014
MSFT    0.621268  0.167596  0.327135
NOW     2.799714  0.467483  1.558773
NVDA    0.527903  0.079493  0.212349
ORCL    0.880081  0.329979  0.427005


In [24]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor, Pool

cat_results = []
cat_models = {}

for ticker in split_data.keys():
    cat_models[ticker] = {}
    for target in split_data[ticker].keys():
        parts = split_data[ticker][target]
        X_train = parts["X_train"]
        X_test  = parts["X_test"]
        y_train = parts["y_train"]
        y_test  = parts["y_test"]

        
        train_pool = Pool(X_train, y_train)
        test_pool  = Pool(X_test,  y_test)

        
        model = CatBoostRegressor(
            iterations=300,            
            depth=6,                   
            learning_rate=0.1,         
            loss_function="RMSE",
            random_state=42,
            verbose=False,             
            allow_writing_files=False, 
            thread_count=-1
        )

        model.fit(train_pool)
        y_pred = model.predict(test_pool)

        mae  = mean_absolute_error(y_test, y_pred)
        mse  = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2   = r2_score(y_test, y_pred)

        cat_results.append({
            "Ticker": ticker,
            "Target": target,
            "Model": "CatBoost",
            "MAE": mae,
            "RMSE": rmse,
            "MSE": mse,
            "R2": r2,
            "n_train": len(X_train),
            "n_test": len(X_test),
        })

        cat_models[ticker][target] = model


cat_results_df = pd.DataFrame(cat_results).sort_values(["Ticker","Target"])



cat_mae_summary = cat_results_df.pivot_table(
    index="Ticker", columns="Target", values="MAE", aggfunc="first"
)[["MA5_inc","MA10_inc","MA20_inc"]]
cat_mae_summary.columns = ["MA5","MA10","MA20"]
print(cat_mae_summary)


             MA5      MA10      MA20
Ticker                              
AAPL    0.277150  0.129817  0.155087
ADBE    0.217386  0.128760  0.145361
AMD     0.103473  0.053092  0.078677
CRM     0.240012  0.094920  0.111946
MSFT    0.301970  0.170170  0.196163
NOW     1.675655  0.910292  1.325264
NVDA    0.298510  0.148360  0.176845
ORCL    0.686432  0.461932  0.396456
