In [125]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    confusion_matrix, classification_report
)
import torch

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)


Using device: mps


In [126]:
data = pd.read_csv('../data/00/merged/sentiment_stock_merged_GME.csv')
print('Shape of data after loading:', data.shape)
print('Columns in the dataset:', data.columns.tolist())


Shape of data after loading: (89, 11)
Columns in the dataset: ['date_only', 'posts_per_day', 'avg_sentiment', 'avg_sentiment_normalized', 'date_only_stock', 'ticker', 'Open', 'High', 'Low', 'Close', 'Volume']


In [127]:
data['Movement'] = data.apply(lambda row: 'up' if row['Close'] > row['Open'] else 'down', axis=1)

features = [
    'date_only', 'posts_per_day', 'avg_sentiment', 'avg_sentiment_normalized',
    'date_only_stock', 'ticker', 'Open', 'High', 'Low', 'Close', 'Volume', 'Movement'
]
missing = [col for col in features if col not in data.columns]
if missing:
    print(f'Missing required columns: {missing}')
    raise SystemExit


In [128]:
date_cols = ['date_only', 'date_only_stock']
for col in date_cols:
    data[col] = pd.to_datetime(data[col])
    data[f'{col}_day']       = data[col].dt.day
    data[f'{col}_month']     = data[col].dt.month
    data[f'{col}_year']      = data[col].dt.year
    data[f'{col}_dayofweek'] = data[col].dt.dayofweek


In [129]:
numeric_features = [
    'posts_per_day', 'avg_sentiment', 'avg_sentiment_normalized',
    'Open', 'High', 'Low', 'Close', 'Volume',
    'date_only_day', 'date_only_month', 'date_only_year', 'date_only_dayofweek'
]
numeric_features = [c for c in numeric_features if c in data.columns]
print("Numeric features to be used:", numeric_features)

X = data[numeric_features]
y = data['Movement']
print(f'X shape: {X.shape}, y shape: {y.shape}')


Numeric features to be used: ['posts_per_day', 'avg_sentiment', 'avg_sentiment_normalized', 'Open', 'High', 'Low', 'Close', 'Volume', 'date_only_day', 'date_only_month', 'date_only_year', 'date_only_dayofweek']
X shape: (89, 12), y shape: (89,)


In [130]:
for col in numeric_features:
    X[col] = X[col].fillna(X[col].mean())
print('Remaining missing values per column:\n', X.isnull().sum())


Remaining missing values per column:
 posts_per_day               0
avg_sentiment               0
avg_sentiment_normalized    0
Open                        0
High                        0
Low                         0
Close                       0
Volume                      0
date_only_day               0
date_only_month             0
date_only_year              0
date_only_dayofweek         0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna(X[col].mean())


In [131]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
param_grid = {
    'n_estimators':      [200, 500, 1000],
    'max_depth':         [2, 3, 4],
    'min_samples_split': [20, 50, 100],
    'min_samples_leaf':  [10,20, 50],
    'max_features':      ['sqrt', 'log2', 0.3]
}
print("Starting randomized search...")
rand_search = RandomizedSearchCV(
    RandomForestClassifier(
        random_state=42,
        oob_score=True,
        bootstrap=True
    ),
    param_distributions=param_grid,
    n_iter=50,
    cv=TimeSeriesSplit(n_splits=5),
    scoring='accuracy',
    return_train_score=True,
    n_jobs=-1,
    random_state=42
)

rand_search.fit(X_train, y_train)

results = pd.DataFrame(rand_search.cv_results_)
results['gap'] = results['mean_train_score'] - results['mean_test_score']
print(results[['params', 'mean_train_score', 'mean_test_score', 'gap']])

best_model = rand_search.best_estimator_
print('Best Hyperparameters:', rand_search.best_params_)


Starting randomized search...
                                               params  mean_train_score  \
0   {'n_estimators': 1000, 'min_samples_split': 5,...          1.000000   
1   {'n_estimators': 1000, 'min_samples_split': 5,...          0.865413   
2   {'n_estimators': 500, 'min_samples_split': 2, ...          0.865584   
3   {'n_estimators': 1000, 'min_samples_split': 5,...          0.815820   
4   {'n_estimators': 1000, 'min_samples_split': 5,...          0.815820   
5   {'n_estimators': 1000, 'min_samples_split': 5,...          1.000000   
6   {'n_estimators': 500, 'min_samples_split': 5, ...          1.000000   
7   {'n_estimators': 1000, 'min_samples_split': 2,...          1.000000   
8   {'n_estimators': 1000, 'min_samples_split': 2,...          1.000000   
9   {'n_estimators': 1000, 'min_samples_split': 2,...          0.865413   
10  {'n_estimators': 1000, 'min_samples_split': 5,...          0.815820   
11  {'n_estimators': 500, 'min_samples_split': 2, ...          0.82639

In [133]:
y_pred = best_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, pos_label='up'))
print('Recall:',    recall_score(y_test, y_pred, pos_label='up'))
print('\nClassification Report:\n', classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))


Accuracy: 0.7222222222222222
Precision: 0.3333333333333333
Recall: 0.25

Classification Report:
               precision    recall  f1-score   support

        down       0.80      0.86      0.83        14
          up       0.33      0.25      0.29         4

    accuracy                           0.72        18
   macro avg       0.57      0.55      0.56        18
weighted avg       0.70      0.72      0.71        18

Confusion Matrix:
 [[12  2]
 [ 3  1]]


In [134]:
# Cell 10: Feature Importances & Cross‑Validation
importances = pd.Series(best_model.feature_importances_, index=numeric_features)
print("Feature Importances:\n", importances.sort_values(ascending=False))

cv_scores = cross_val_score(best_model, X, y, cv=5)
print(f"\nCross-validation accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


Feature Importances:
 Volume                      0.282284
date_only_dayofweek         0.195824
Close                       0.174041
Open                        0.155499
date_only_day               0.054229
Low                         0.039505
High                        0.029359
avg_sentiment               0.019945
avg_sentiment_normalized    0.019340
posts_per_day               0.016342
date_only_month             0.011863
date_only_year              0.001771
dtype: float64

Cross-validation accuracy: 0.6974 ± 0.1127
