# ==============================
# Cell 1: Import libraries
# ==============================
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# ==============================
# Cell 2: Load data
# ==============================
# Location of a single stock's CSV under processed_data; point this at
# another file in that folder to analyse a different stock.
file_path = "processed_data/AAPL.csv"

# Read the CSV into a DataFrame and report its (rows, columns) size.
df = pd.read_csv(filepath_or_buffer=file_path)
print("Data shape:", df.shape)

# Preview the first five rows (shown as the cell output in a notebook).
df.head(5)

In [None]:
# ==============================
# Cell 3: Create Target
# ==============================
# Label = 1 if the price `period` rows ahead is at least `threshold`
# (fractional return) above the current price, else 0.
period = 30
threshold = 0.1

# Price `period` rows later; NaN for the final `period` rows of the frame.
future_price = df["Adj Close"].shift(-period)
future_return = future_price / df["Adj Close"] - 1

# A comparison against NaN evaluates to False, so rows without a known future
# price would silently be labelled 0. Keep them as NaN instead so the dropna
# below removes them rather than training/evaluating on made-up labels.
df["Target"] = (
    (future_return >= threshold).astype(float).where(future_return.notna())
)

# Drop rows with NaN (unlabelled rows from shifting, plus NaN in any other column)
df.dropna(inplace=True)

# All remaining labels are 0.0 / 1.0; store them as integers.
df["Target"] = df["Target"].astype(int)

df[["Date", "Adj Close", "Target"]].tail(10)

In [None]:
# ==============================
# Cell 4: Split Features / Target
# ==============================
# Columns that must not be fed to the model: the date, the raw price columns
# and the label itself.
drop_cols = ["Date", "Open", "High", "Low", "Close", "Adj Close", "Target"]

# Every remaining column, in its original order, is used as a feature.
feature_cols = df.columns[~df.columns.isin(drop_cols)].tolist()

# Feature matrix and label vector.
X = df.loc[:, feature_cols]
y = df["Target"]

print("Features used:", feature_cols)
print("Number of features:", len(feature_cols))

In [None]:
# ==============================
# Cell 5: Train/Test Split
# ==============================
# Ordered split without shuffling: the first 80% of rows become the training
# set and the final 20% the test set.
# NOTE(review): the labels are built from prices `period` rows ahead, so the
# last training rows' labels appear to depend on prices inside the test
# window -- consider leaving a gap between the two sets; confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

In [None]:
# ==============================
# Cell 6: Train Model (Random Forest)
# ==============================
# Build the classifier (100 trees, depth capped at 6, fixed seed, all CPU
# cores) and fit it on the training split in a single chained expression;
# `fit` returns the fitted estimator itself.
model = RandomForestClassifier(
    n_estimators=100, max_depth=6, random_state=42, n_jobs=-1
).fit(X_train, y_train)

print("Model training complete ✅")

In [None]:
# ==============================
# Cell 7: Evaluate Model
# ==============================
# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Overall accuracy, followed by the per-class precision / recall / F1 table.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(
    "\n=== Classification Report ===",
    classification_report(y_test, y_pred),
    sep="\n",
)

In [None]:
# ==============================
# Cell 8: Probability-based Recommendation
# ==============================
# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (label 1, i.e. upward movement, when both labels occur in
# the training data).
proba = model.predict_proba(X_test)[:, 1]

# X_test.index holds index *labels*, not positions. Rows were removed from df
# with dropna without resetting the index, so labels and positions can differ;
# select by label (.loc) to fetch exactly the rows that were scored.
df_test = df.loc[X_test.index].copy()
df_test["ProbUp"] = proba

# Show last 10 rows with prediction probability
df_test.tail(10)[["Date", "Adj Close", "ProbUp", "Target"]]