In [1]:
# ==============================
# Cell 1: Import libraries
# ==============================
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# ==============================
# Cell 2: Load data
# ==============================
# Choose one stock file from processed_data (e.g., AAPL.csv)
file_path = "processed_data/AAPL.csv"

df = pd.read_csv(file_path)
print("Data shape:", df.shape)
df.head()

Data shape: (10965, 19)


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Return,MA5,MA20,MA50,EMA12,EMA26,MACD,BB_Middle,BB_Upper,BB_Lower,Volatility20,RSI14
0,1981-02-24,0.107143,0.107143,0.106027,0.106027,0.081736,16979200,-0.035529,0.086382,0.093867,0.102204,0.089401,0.094904,-0.005503,0.093867,0.108697,0.079037,0.033941,31.324209
1,1981-02-25,0.112723,0.113281,0.112723,0.112723,0.086898,19488000,0.063155,0.085005,0.092706,0.101963,0.089016,0.094311,-0.005295,0.092706,0.105699,0.079713,0.038141,34.482285
2,1981-02-26,0.114397,0.114955,0.114397,0.114397,0.088188,10841600,0.014845,0.085005,0.091781,0.101851,0.088889,0.093858,-0.004969,0.091781,0.10311,0.080451,0.038251,36.665362
3,1981-02-27,0.118304,0.11942,0.118304,0.118304,0.0912,14761600,0.034154,0.086554,0.0912,0.101937,0.089244,0.093661,-0.004417,0.0912,0.101268,0.081131,0.038823,40.625213
4,1981-03-02,0.118862,0.11942,0.118862,0.118862,0.09163,11760000,0.004715,0.08793,0.09092,0.101989,0.089611,0.09351,-0.003899,0.09092,0.100587,0.081253,0.037097,47.058645


In [3]:
# ==============================
# Cell 3: Create Target
# ==============================
# Label = 1 if stock rises >= threshold within 'period' days, else 0
period = 30
threshold = 0.1

future_price = df["Adj Close"].shift(-period)
df["Target"] = ((future_price / df["Adj Close"] - 1) >= threshold).astype(int)

# Drop rows with NaN (due to shifting)
df.dropna(inplace=True)

df[["Date", "Adj Close", "Target"]].tail(10)

Unnamed: 0,Date,Adj Close,Target
10955,2024-08-08,213.063385,0
10956,2024-08-09,215.990005,0
10957,2024-08-12,217.529999,0
10958,2024-08-13,221.270004,0
10959,2024-08-14,221.720001,0
10960,2024-08-15,224.720001,0
10961,2024-08-16,226.050003,0
10962,2024-08-19,225.889999,0
10963,2024-08-20,226.509995,0
10964,2024-08-21,226.399994,0


In [4]:
# ==============================
# Cell 4: Split Features / Target
# ==============================
drop_cols = ["Date","Open","High","Low","Close","Adj Close","Target"]
feature_cols = [c for c in df.columns if c not in drop_cols]

X = df[feature_cols]
y = df["Target"]

print("Features used:", feature_cols)
print("Number of features:", len(feature_cols))

Features used: ['Volume', 'Return', 'MA5', 'MA20', 'MA50', 'EMA12', 'EMA26', 'MACD', 'BB_Middle', 'BB_Upper', 'BB_Lower', 'Volatility20', 'RSI14']
Number of features: 13


In [5]:
# ==============================
# Cell 5: Train/Test Split
# ==============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (8772, 13)
Test shape: (2193, 13)


In [6]:
# ==============================
# Cell 6: Train Model (Random Forest)
# ==============================
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

print("Model training complete ✅")

Model training complete ✅


In [7]:
# ==============================
# Cell 7: Evaluate Model
# ==============================
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

Accuracy: 0.7341541267669859

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.74      1.00      0.85      1613
           1       0.38      0.01      0.02       580

    accuracy                           0.73      2193
   macro avg       0.56      0.50      0.43      2193
weighted avg       0.64      0.73      0.63      2193



In [8]:
# ==============================
# Cell 8: Probability-based Recommendation
# ==============================
proba = model.predict_proba(X_test)[:,1]  # probability of upward movement

df_test = df.iloc[X_test.index].copy()
df_test["ProbUp"] = proba

# Show last 10 rows with prediction probability
df_test.tail(10)[["Date","Adj Close","ProbUp","Target"]]

Unnamed: 0,Date,Adj Close,ProbUp,Target
10955,2024-08-08,213.063385,0.011798,0
10956,2024-08-09,215.990005,0.011798,0
10957,2024-08-12,217.529999,0.015818,0
10958,2024-08-13,221.270004,0.011798,0
10959,2024-08-14,221.720001,0.015818,0
10960,2024-08-15,224.720001,0.014134,0
10961,2024-08-16,226.050003,0.014134,0
10962,2024-08-19,225.889999,0.014134,0
10963,2024-08-20,226.509995,0.014134,0
10964,2024-08-21,226.399994,0.004134,0
