In [81]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
import torch 

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [82]:
data= pd.read_csv('/Users/jonathanermias/Documents/GitHub/RedditStocks_v2.0/data/stock_reddit_merge.csv')

In [83]:
drop_columns = ['title','id', 'score', 'upvote_ratio', 'num_comments', 'url', 'selftext','clean_selftext','clean_title' 'has_target_stock', 'mentioned_tickers', 'ticker_', 'sentiment_category', 'combined_text',]
data = data.drop(columns=drop_columns, errors='ignore')

In [84]:
print(f"New data shape:" ,{data.shape})

New data shape: {(8276, 34)}


In [85]:
data.fillna(data.mean(numeric_only=True), inplace=True)

In [86]:
print('Columns in the dataset: ', data.columns)

Columns in the dataset:  Index(['subreddit', 'clean_title', 'author', 'has_target_stock',
       'title_sentiment_vader', 'post_sentiment_vader',
       'title_sentiment_finbert', 'post_sentiment_finbert', 'title_weight',
       'post_weight', 'vader_score', 'finbert_score', 'sentiment_score',
       'sentiment_normalized', 'Close_TSLA', 'High_TSLA', 'Low_TSLA',
       'Open_TSLA', 'Volume_TSLA', 'Close_GME', 'High_GME', 'Low_GME',
       'Open_GME', 'Volume_GME', 'Close_AAPL', 'High_AAPL', 'Low_AAPL',
       'Open_AAPL', 'Volume_AAPL', 'Close_NVDA', 'High_NVDA', 'Low_NVDA',
       'Open_NVDA', 'Volume_NVDA'],
      dtype='object')


In [87]:
data['Movement'] = data.apply(lambda row: 'up' if row['Close_NVDA'] > row['Open_NVDA'] else 'down', axis=1)

In [88]:
features = ['subreddit', 'title_sentiment_vader', 'post_sentiment_vader',
       'title_sentiment_finbert', 'post_sentiment_finbert', 'title_weight',
       'post_weight', 'vader_score', 'finbert_score', 'sentiment_score',
       'sentiment_normalized', 'Close_TSLA', 'High_TSLA', 'Low_TSLA',
       'Open_TSLA', 'Volume_TSLA', 'Close_GME', 'High_GME', 'Low_GME',
       'Open_GME', 'Volume_GME', 'Close_AAPL', 'High_AAPL', 'Low_AAPL',
       'Open_AAPL', 'Volume_AAPL', 'Close_NVDA', 'High_NVDA', 'Low_NVDA',
       'Open_NVDA', 'Volume_NVDA']

In [89]:
missing_columns = [col for col in features if col not in data.columns]
if missing_columns:
    print(f'Missing required columns : {missing_columns}')
    exit()

X = data[features]
y = data['Movement']

In [90]:
X_numeric = X.select_dtypes(include=['number'])

# Apply scaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)

In [None]:
# ─── Time‑aware train/test split ───
df['date'] = pd.to_datetime(df['date_only'])  # or reuse whichever date column survived
df = df.sort_values('date')
cutoff = pd.Timestamp('2025-03-31')  # train up through March 31
train = df[df['date'] <= cutoff]
test  = df[df['date'] >  cutoff]

X_train = train.drop(['Movement','date'], axis=1)
y_train = train['Movement']
X_test  = test .drop(['Movement','date'], axis=1)
y_test  = test ['Movement']

In [92]:
param_grid = {
    'n_estimators' : [20,40,60],
    'max_depth' : [5,8,10],
    'min_samples_split' : [2,5,10],
    'min_samples_leaf' : [5,10],
    'max_features' : ['sqrt'],
    'ccp_alpha' : [0.001, 0.01,]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=1)
grid_search.fit(X_train, y_train)

In [93]:
best_model = grid_search.best_estimator_
print('Best Hyperparameters : ', grid_search.best_params_)

Best Hyperparameters :  {'ccp_alpha': 0.001, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 40}


In [94]:
y_pred = best_model.predict(X_test)

In [95]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='up')
recall = recall_score(y_test, y_pred, pos_label='up')

In [96]:
print('Model Performance')
print(f'Accuracy : {accuracy}')
print(f'Precision : {precision}')
print(f'Recall : {recall}')

Model Performance
Accuracy : 0.9993961352657005
Precision : 0.9959839357429718
Recall : 1.0


In [97]:
print('Classification Report')
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

        down       1.00      1.00      1.00      1408
          up       1.00      1.00      1.00       248

    accuracy                           1.00      1656
   macro avg       1.00      1.00      1.00      1656
weighted avg       1.00      1.00      1.00      1656



In [98]:
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Confusion Matrix
[[1407    1]
 [   0  248]]


In [99]:
cv_scores = cross_val_score(best_model, X_scaled, y, cv=5, scoring='accuracy')
print('Cross-Validated Accuracy : ',cv_scores.mean())

Cross-Validated Accuracy :  0.9288265685888174
