In [221]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder # For 'subreddit' or 'ticker'
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import torch
import numpy as np

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [222]:
#Use the correctly merged and lagged file
data_file = '../data/stock_reddit_merged_lagged_corrected.csv'
data = pd.read_csv(data_file)

# Convert date_only to datetime if needed for sorting
data['date_only'] = pd.to_datetime(data['date_only'])
data = data.sort_values('date_only')

print(f"Loaded data shape: {data.shape}")
print("Columns:", data.columns.tolist())
print("Data sample:\n", data.head())
print("Check for NaNs:\n", data.isna().sum())

Loaded data shape: (35, 12)
Columns: ['date_only', 'ticker', 'Open', 'High', 'Low', 'Close', 'Volume', 'Movement', 'mean_sentiment_score', 'mean_vader_score', 'mean_finbert_score', 'post_count']
Data sample:
     date_only ticker        Open        High         Low       Close  \
23 2025-03-31   NVDA  105.129997  110.959999  103.650002  108.379997   
0  2025-04-01   TSLA  263.799988  277.450012  259.250000  268.459991   
24 2025-04-01   NVDA  108.519997  110.199997  106.470001  110.150002   
13 2025-04-01   AAPL  219.809998  223.679993  218.899994  223.190002   
1  2025-04-02   TSLA  254.600006  284.989990  251.270004  282.760010   

       Volume  Movement  mean_sentiment_score  mean_vader_score  \
23  299212700         1             -0.034150          -0.06830   
0   146486900         1              0.363325           0.72665   
24  222614000         1              0.242775           0.48555   
13   36412700         1              0.081800           0.16360   
1   212787800         1

In [224]:
 # Define Target (should already be 'Movement' from the merge script)
target = 'Movement'

# Define Features
# SELECT FEATURES CAREFULLY TO AVOID LEAKAGE
# Example: Use aggregated sentiment, post count, and SAME DAY OHLCV for the SPECIFIC ticker
# Avoid using OHLCV from OTHER tickers for the same day unless properly lagged.

# Get unique tickers to potentially build per-ticker models or use 'ticker' as a feature
unique_tickers = data['ticker'].unique()
print(f"Unique tickers in data: {unique_tickers}")

# For this example, let's predict for NVDA and use NVDA's data + sentiment
# If you want a general model, 'ticker' could be a feature (needs one-hot encoding)
data_predict_ticker = data[data['ticker'] == 'NVDA'].copy() # Example: Filter for one ticker
if data_predict_ticker.empty:
     raise ValueError("No data found for the specified ticker 'NVDA'. Check the merged data.")


# Define feature columns for the chosen ticker model
# Features are from day D, Target ('Movement') is derived from day D+1
numeric_features = [
    'Open', 'High', 'Low', 'Close', 'Volume', # Stock data for NVDA on day D
    'mean_sentiment_score', 'mean_vader_score', # Aggregated sentiment for NVDA on day D
    'mean_finbert_score', 'post_count'         # Aggregated sentiment for NVDA on day D
]
# Add categorical features if applicable (e.g., 'subreddit' if aggregated)
# categorical_features = ['subreddit_agg'] # Example if you aggregated subreddit info

# Ensure all selected features exist
missing_features = [f for f in numeric_features if f not in data_predict_ticker.columns]
if missing_features:
    raise ValueError(f"Missing required feature columns: {missing_features}")

X = data_predict_ticker[numeric_features]
y = data_predict_ticker[target]

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

Unique tickers in data: ['NVDA' 'TSLA' 'AAPL' 'GME']
Features (X) shape: (12, 9)
Target (y) shape: (12,)


In [225]:
# Use TimeSeriesSplit for realistic evaluation
n_splits = 5 # Number of splits for cross-validation
tscv = TimeSeriesSplit(n_splits=n_splits)

# Optional: Define a final hold-out test set (e.g., last 20% of time)
test_size_percentage = 0.20
split_index = int(len(X) * (1 - test_size_percentage))
X_train_val, X_test = X[:split_index], X[split_index:]
y_train_val, y_test = y[:split_index], y[split_index:]

print(f"Train/Validation set size: {len(X_train_val)}")
print(f"Hold-out Test set size: {len(X_test)}")


Train/Validation set size: 9
Hold-out Test set size: 3


In [226]:
# Handle NaNs and Scale features AFTER splitting
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # Impute NaNs
    ('scaler', StandardScaler()) # Scale features
])

# If you had categorical features:
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# Combine transformers
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numeric_features),
#         #('cat', categorical_transformer, categorical_features) # Uncomment if using categorical
#     ])

# Using only numeric features in this example
preprocessor = numeric_transformer


In [227]:
# Create the full pipeline including preprocessing and classifier
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])

# Define parameter grid (adjust based on compute resources and desired thoroughness)
param_grid = {
    'classifier__n_estimators': [50, 100], # Reduced for speed
    'classifier__max_depth': [5, 10, None],
    'classifier__min_samples_split': [5, 10],
    'classifier__min_samples_leaf': [3, 5],
    'classifier__max_features': ['sqrt', 0.5], # Use float for percentage
    # 'classifier__ccp_alpha': [0.0, 0.001] # CCP Pruning
}

# Use TimeSeriesSplit in GridSearchCV
grid_search = GridSearchCV(rf_pipeline, param_grid, cv=tscv, scoring='accuracy', n_jobs=-1, verbose=1)

print("Starting Grid Search CV with TimeSeriesSplit...")
# Fit on the training/validation part of the data
grid_search.fit(X_train_val, y_train_val)

print('Best Hyperparameters: ', grid_search.best_params_)
best_model_pipeline = grid_search.best_estimator_

Starting Grid Search CV with TimeSeriesSplit...
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Hyperparameters:  {'classifier__max_depth': 5, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 50}


In [228]:
# A. Cross-Validation Score (using TimeSeriesSplit on train/validation data)
# Refit CV on the train/validation set to get a score distribution
cv_scores = cross_val_score(best_model_pipeline, X_train_val, y_train_val, cv=tscv, scoring='accuracy')
print(f'\nTime-Series Cross-Validated Accuracy (on train/val data): {np.mean(cv_scores):.4f} +/- {np.std(cv_scores):.4f}')

# B. Final Evaluation on Hold-Out Test Set
print("\nEvaluating best model on the hold-out test set...")
y_pred_test = best_model_pipeline.predict(X_test)

accuracy_test = accuracy_score(y_test, y_pred_test)
# Specify labels for precision/recall if target is binary (0, 1)
# Use zero_division=0 to avoid warnings if a class has no predicted samples
precision_test = precision_score(y_test, y_pred_test, labels=[0, 1], average='binary', pos_label=1, zero_division=0)
recall_test = recall_score(y_test, y_pred_test, labels=[0, 1], average='binary', pos_label=1, zero_division=0)

print('\nHold-Out Test Set Performance:')
print(f'Accuracy:  {accuracy_test:.4f}')
print(f'Precision (for class 1): {precision_test:.4f}')
print(f'Recall (for class 1):    {recall_test:.4f}')

print('\nClassification Report (Test Set):')
print(classification_report(y_test, y_pred_test, labels=[0, 1], target_names=['Down/NoChange', 'Up'], zero_division=0))

print('\nConfusion Matrix (Test Set):')
print(confusion_matrix(y_test, y_pred_test, labels=[0, 1]))


Time-Series Cross-Validated Accuracy (on train/val data): 0.6000 +/- 0.4899

Evaluating best model on the hold-out test set...

Hold-Out Test Set Performance:
Accuracy:  0.0000
Precision (for class 1): 0.0000
Recall (for class 1):    0.0000

Classification Report (Test Set):
               precision    recall  f1-score   support

Down/NoChange       0.00      0.00      0.00       3.0
           Up       0.00      0.00      0.00       0.0

     accuracy                           0.00       3.0
    macro avg       0.00      0.00      0.00       3.0
 weighted avg       0.00      0.00      0.00       3.0


Confusion Matrix (Test Set):
[[0 3]
 [0 0]]


In [None]:
# Access the classifier step in the pipeline
rf_model = best_model_pipeline.named_steps['classifier']
# If you used ColumnTransformer, get feature names correctly
# feature_names = best_model_pipeline.named_steps['preprocessor']...? # Need to get names after transform
# For simple numeric pipeline:
feature_names = numeric_features
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importance_df)


Feature Importances:
                Feature  Importance
2                   Low    0.212121
1                  High    0.151515
5  mean_sentiment_score    0.151515
0                  Open    0.090909
6      mean_vader_score    0.090909
7    mean_finbert_score    0.090909
8            post_count    0.090909
3                 Close    0.060606
4                Volume    0.060606


In [214]:
missing_columns = [col for col in features if col not in data.columns]
if missing_columns:
    print(f'Missing required columns : {missing_columns}')
    exit()


y = data['Movement']
X = data.drop(columns = ['subreddit'])

In [215]:
X_numeric = X.select_dtypes(include=['number'])

# Apply scaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)

In [216]:
# ─── Time‑aware train/test split ───
data['date'] = pd.to_datetime(data['created_day_before'])  # or reuse whichever date column survived
df = data.sort_values('date')
cutoff = pd.Timestamp('2025-03-31')  # train up through March 31
train = df[df['date'] <= cutoff]
test  = df[df['date'] >  cutoff]

X_train = train.drop(['Movement','date', 'subreddit'], axis=1)
y_train = train['Movement']
X_test  = test .drop(['Movement','date','subreddit'], axis=1)
y_test  = test ['Movement']

In [217]:
param_grid = {
    'n_estimators' : [20,40,60],
    'max_depth' : [5,8,10],
    'min_samples_split' : [2,5,10],
    'min_samples_leaf' : [5,10],
    'max_features' : ['sqrt'],
    'ccp_alpha' : [0.001, 0.01,]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=1)
grid_search.fit(X_train, y_train)

ValueError: 
All the 540 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
108 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 348, in fit
    X, y = self._validate_data(
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/sklearn/base.py", line 622, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/sklearn/utils/validation.py", line 1146, in check_X_y
    X = check_array(
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/sklearn/utils/validation.py", line 915, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/sklearn/utils/_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/pandas/core/generic.py", line 1998, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: '2025-03-27'

--------------------------------------------------------------------------------
432 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 348, in fit
    X, y = self._validate_data(
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/sklearn/base.py", line 622, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/sklearn/utils/validation.py", line 1146, in check_X_y
    X = check_array(
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/sklearn/utils/validation.py", line 915, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/sklearn/utils/_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/Users/jonathanermias/miniforge3/envs/RedditStocksV2/lib/python3.8/site-packages/pandas/core/generic.py", line 1998, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: '2025-03-26'


In [None]:
best_model = grid_search.best_estimator_
print('Best Hyperparameters : ', grid_search.best_params_)

Best Hyperparameters :  {'ccp_alpha': 0.001, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 40}


In [None]:
y_pred = best_model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='up')
recall = recall_score(y_test, y_pred, pos_label='up')

In [None]:
print('Model Performance')
print(f'Accuracy : {accuracy}')
print(f'Precision : {precision}')
print(f'Recall : {recall}')

Model Performance
Accuracy : 0.9993961352657005
Precision : 0.9959839357429718
Recall : 1.0


In [None]:
print('Classification Report')
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

        down       1.00      1.00      1.00      1408
          up       1.00      1.00      1.00       248

    accuracy                           1.00      1656
   macro avg       1.00      1.00      1.00      1656
weighted avg       1.00      1.00      1.00      1656



In [None]:
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Confusion Matrix
[[1407    1]
 [   0  248]]


In [None]:
cv_scores = cross_val_score(best_model, X_scaled, y, cv=5, scoring='accuracy')
print('Cross-Validated Accuracy : ',cv_scores.mean())

Cross-Validated Accuracy :  0.9288265685888174
