In [1]:
import sys
from pathlib import Path
import importlib
import pandas as pd
import numpy as np

here = Path().resolve()
repo_root = None
for p in [here] + list(here.parents):
    if (p / "src" / "__init__.py").exists():
        repo_root = p
        break

if repo_root is None:
    raise RuntimeError("Could not find top-level 'src/__init__.py' from this notebook.")

sys.path.insert(0, str(repo_root))
print("[OK] sys.path[0] =", sys.path[0])

import src
print("Using src from:", getattr(src, "__file__", "<package>"))

for mod in ("src.data", "src.features", "src.io"):
    spec = importlib.util.find_spec(mod)
    print(f"{mod}: {'FOUND' if spec else 'missing'}")

[OK] sys.path[0] = /Users/hitakshikulhare/bootcamp_hitakshi_kulhare
Using src from: /Users/hitakshikulhare/bootcamp_hitakshi_kulhare/src/__init__.py
src.data: FOUND
src.features: FOUND
src.io: FOUND


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report
from src.io import save_artifacts, load_artifacts
from src import download_data, build_features, FEATURE_COLUMNS
from src.io import save_artifacts, load_artifacts

In [4]:
# 1) Data
df = download_data("AAPL", "2020-01-01", "2025-01-01")
df = build_features(df)

X = df[FEATURE_COLUMNS].copy()
y = df["Target"].astype(int)

# 2) Split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42)

# 3) Impute -> SMOTE -> Scale
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
X_train_imp = imputer.fit_transform(X_train)
X_val_imp   = imputer.transform(X_val)
X_test_imp  = imputer.transform(X_test)

smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train_imp, y_train)

scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train_sm)
X_val_sc   = scaler.transform(X_val_imp)
X_test_sc  = scaler.transform(X_test_imp)

# 4) Model
rf = RandomForestClassifier(n_estimators=300, class_weight="balanced_subsample", random_state=42)
rf.fit(X_train_sm, y_train_sm)  # RF OK on unscaled

proba_val = rf.predict_proba(X_val_imp)[:,1]
print("VAL AUC:", round(roc_auc_score(y_val, proba_val), 4))
print(classification_report(y_val, (proba_val>=0.5).astype(int), zero_division=0))

VAL AUC: 0.4794
              precision    recall  f1-score   support

           0       0.45      0.48      0.46        88
           1       0.52      0.49      0.50       101

    accuracy                           0.48       189
   macro avg       0.48      0.48      0.48       189
weighted avg       0.48      0.48      0.48       189

