In [1]:
import pandas as pd
import numpy as np
import json
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib


In [2]:
PROCESSED_DIR = "../../data/processed"
MODEL_DIR = "../../models"


In [3]:
data_path = os.path.join(PROCESSED_DIR, 'training_data.csv')
print(f"Loading training data from {data_path}...")
df = pd.read_csv(data_path)
df.info()


Loading training data from ../../data/processed\training_data.csv...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10804 entries, 0 to 10803
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Close              10804 non-null  float64
 1   High               10804 non-null  float64
 2   Low                10804 non-null  float64
 3   Open               10804 non-null  float64
 4   Volume             10804 non-null  int64  
 5   Date               10804 non-null  object 
 6   Ticker             10804 non-null  object 
 7   SMA_50             10804 non-null  float64
 8   Trend              10804 non-null  object 
 9   Target             10804 non-null  int64  
 10  Price_Change       10804 non-null  float64
 11  Distance_from_SMA  10804 non-null  float64
 12  Momentum_5d        10804 non-null  float64
 13  Volatility         10804 non-null  float64
 14  Next_Day_Target    10804 non-null  float64
dtypes

In [4]:
feature_cols = ['SMA_50', 'Price_Change', 'Distance_from_SMA', 'Momentum_5d', 'Volatility']
    
target_col = 'Next_Day_Target'
    
X = df[feature_cols].copy()
y = df[target_col].copy()
    
print(f"Features: {feature_cols}")
print(f"Feature matrix shape: {X.shape}")
print(f"Target distribution: {y.value_counts().to_dict()}")
    

Features: ['SMA_50', 'Price_Change', 'Distance_from_SMA', 'Momentum_5d', 'Volatility']
Feature matrix shape: (10804, 5)
Target distribution: {1.0: 7129, 0.0: 3675}


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

Train set: 8643 samples
Test set: 2161 samples


In [6]:
model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    )
    
model.fit(X_train, y_train)
print("Model training complete!")
    

Model training complete!


In [7]:
feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
print("Feature Importance:")
print(feature_importance.to_string(index=False))
    

Feature Importance:
          feature  importance
Distance_from_SMA    0.814530
      Momentum_5d    0.094738
     Price_Change    0.040903
       Volatility    0.025320
           SMA_50    0.024509


Evaluate Model

In [8]:
y_pred = model.predict(X_test)
    
metrics = {
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'f1_score': f1_score(y_test, y_pred)
}
    
print("Model Performance:")
for metric_name, metric_value in metrics.items():
    print(f"  {metric_name.capitalize()}: {metric_value:.4f}")
    
print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

Model Performance:
  Accuracy: 0.9348
  Precision: 0.9573
  Recall: 0.9432
  F1_score: 0.9502
Confusion Matrix:
[[ 675   60]
 [  81 1345]]


Save Metrics

In [9]:
metrics_path = os.path.join(MODEL_DIR, 'model_metrics.json')
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2)
print(f"Metrics saved to {metrics_path}")


Metrics saved to ../../models\model_metrics.json
