In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
import torch 

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [3]:
data= pd.read_csv('../data/stock_reddit_merge.csv')

In [4]:
drop_columns = ['title','id', 'score', 'upvote_ratio', 'num_comments', 'url', 'selftext','clean_selftext','clean_title' 'has_target_stock', 'mentioned_tickers', 'ticker_', 'sentiment_category', 'combined_text',]
data = data.drop(columns=drop_columns, errors='ignore')

In [5]:
print(f"New data shape:" ,{data.shape})

New data shape: {(8244, 33)}


In [6]:
data.fillna(data.mean(numeric_only=True), inplace=True)

In [7]:
print('Columns in the dataset: ', data.columns)

Columns in the dataset:  Index(['subreddit', 'has_target_stock', 'title_sentiment_vader',
       'post_sentiment_vader', 'title_sentiment_finbert',
       'post_sentiment_finbert', 'title_weight', 'post_weight', 'vader_score',
       'finbert_score', 'sentiment_score', 'sentiment_normalized',
       'Close_TSLA', 'High_TSLA', 'Low_TSLA', 'Open_TSLA', 'Volume_TSLA',
       'Close_GME', 'High_GME', 'Low_GME', 'Open_GME', 'Volume_GME',
       'Close_AAPL', 'High_AAPL', 'Low_AAPL', 'Open_AAPL', 'Volume_AAPL',
       'Close_NVDA', 'High_NVDA', 'Low_NVDA', 'Open_NVDA', 'Volume_NVDA',
       'created_day_before'],
      dtype='object')


In [8]:
data['Movement'] = data.apply(lambda row: 1 if row['Close_NVDA'] > row['Open_NVDA'] else 0, axis=1)

In [23]:
features = ['title_sentiment_vader', 'post_sentiment_vader',
       'title_sentiment_finbert', 'post_sentiment_finbert', 'title_weight',
       'post_weight', 'vader_score', 'finbert_score', 'sentiment_score',
       'sentiment_normalized', 'Close_TSLA', 'High_TSLA', 'Low_TSLA',
       'Open_TSLA', 'Volume_TSLA', 'Close_GME', 'High_GME', 'Low_GME',
       'Open_GME', 'Volume_GME', 'Close_AAPL', 'High_AAPL', 'Low_AAPL',
       'Open_AAPL', 'Volume_AAPL', 'Close_NVDA', 'High_NVDA', 'Low_NVDA',
       'Open_NVDA', 'Volume_NVDA']

In [24]:
missing_columns = [col for col in features if col not in data.columns]
if missing_columns:
    print(f'Missing required columns : {missing_columns}')
    exit()

X = data[features]
y = data['Movement']

In [25]:
X_numeric = X.select_dtypes(include=['number'])

# Apply scaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)

In [35]:
# ─── Time‑aware train/test split ───
data['date'] = pd.to_datetime(data['created_day_before'])  # or reuse whichever date column survived
df = data.sort_values('date')
cutoff = pd.Timestamp('2025-03-31')  # train up through March 31
train = df[df['date'] <= cutoff]
test  = df[df['date'] >  cutoff]


X_train = train.drop(['Movement', 'created_day_before', 'subreddit','date'], axis=1)
y_train = train['Movement']
X_test  = test.drop(['Movement', 'created_day_before', 'subreddit','date'], axis=1)
y_test  = test['Movement']

In [36]:

print(X_train.iloc[0])

has_target_stock           0.000000e+00
title_sentiment_vader      0.000000e+00
post_sentiment_vader       7.845000e-01
title_sentiment_finbert    0.000000e+00
post_sentiment_finbert     0.000000e+00
title_weight               7.500000e-01
post_weight                2.500000e-01
vader_score                1.961250e-01
finbert_score              0.000000e+00
sentiment_score            9.806250e-02
sentiment_normalized       5.490313e+01
Close_TSLA                 2.504100e+02
High_TSLA                  2.617068e+02
Low_TSLA                   2.345629e+02
Open_TSLA                  2.455436e+02
Volume_TSLA                1.595629e+08
Close_GME                  2.469226e+01
High_GME                   2.527634e+01
Low_GME                    2.348528e+01
Open_GME                   2.400714e+01
Volume_GME                 1.369777e+07
Close_AAPL                 1.947361e+02
High_AAPL                  2.014655e+02
Low_AAPL                   1.870791e+02
Open_AAPL                  1.918628e+02


In [37]:
param_grid = {
    'n_estimators' : [20,40,60],
    'max_depth' : [5,8,10],
    'min_samples_split' : [2,5,10],
    'min_samples_leaf' : [5,10],
    'max_features' : ['sqrt'],
    'ccp_alpha' : [0.001, 0.01,]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=1)

# Fit the grid search with the encoded data
grid_search.fit(X_train, y_train)

In [38]:
best_model = grid_search.best_estimator_
print('Best Hyperparameters : ', grid_search.best_params_)

Best Hyperparameters :  {'ccp_alpha': 0.001, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 20}


In [39]:
y_pred = best_model.predict(X_test)

In [41]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)

In [42]:
print('Model Performance')
print(f'Accuracy : {accuracy}')
print(f'Precision : {precision}')
print(f'Recall : {recall}')

Model Performance
Accuracy : 0.8645144890696492
Precision : 0.8753437213565536
Recall : 0.9832328283571113


In [43]:
print('Classification Report')
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.51      0.11      0.18      1069
           1       0.88      0.98      0.93      6799

    accuracy                           0.86      7868
   macro avg       0.69      0.55      0.55      7868
weighted avg       0.83      0.86      0.82      7868



In [44]:
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Confusion Matrix
[[ 117  952]
 [ 114 6685]]


In [45]:
cv_scores = cross_val_score(best_model, X_scaled, y, cv=5, scoring='accuracy')
print('Cross-Validated Accuracy : ',cv_scores.mean())

Cross-Validated Accuracy :  0.9942961165048544
