In [None]:
# Import data

import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV file inside the Kaggle dataset that should be loaded
file_path = "sp500_headlines_2008_2024.csv"

# Load the latest version of the dataset as a pandas DataFrame
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "dyutidasmahaptra/s-and-p-500-with-financial-news-headlines-20082024",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documentation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

# Work on the first 100 rows only. Take an explicit copy: `head()` returns a
# slice of the loaded frame, and the notebook later adds columns to `df`
# (e.g. df['TitleClean'] = ...), which on a slice triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous which object is modified.
df = df.head(100).copy()

#print("First 5 records:", df.head())

In [None]:
# importing libraries

import pandas as pd
import numpy as np

# For BERT
import random
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# Cleaning text
from bs4 import BeautifulSoup
import re

# Plots
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [None]:
def text_cleaning(text):
    """Strip markup and unwanted characters from a piece of text.

    The text is parsed with BeautifulSoup to obtain its plain-text content,
    every square-bracketed segment is removed, and finally each character
    that is not an ASCII letter, a digit, whitespace, a comma or an
    apostrophe is deleted.

    Returns the cleaned string.
    """
    plain = BeautifulSoup(text, "html.parser").get_text()
    without_brackets = re.sub(r'\[[^]]*\]', '', plain)
    return re.sub(r"[^a-zA-Z0-9\s,']", '', without_brackets)

# Store a cleaned version of every headline next to the raw one
df['TitleClean'] = df['Title'].map(text_cleaning)

# Plain Python list of the cleaned headlines (input for the tokenizer)
text_list = df['TitleClean'].tolist()

def mean_pooling(model_output, attention_mask):
    """
    Average the token embeddings of each sequence, ignoring masked positions.

    The first element of ``model_output`` is taken as the token embeddings.
    ``attention_mask`` is broadcast to the same shape and used as weights, so
    positions with mask 0 contribute nothing to the mean. The divisor is
    clamped to at least 1e-9 to avoid division by zero when a mask is empty.

    Approach taken from:
    https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
    """
    hidden_states = model_output[0]
    # Give the mask a trailing axis and stretch it over the embedding dimension
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = torch.sum(hidden_states * weights, 1)
    token_counts = torch.clamp(weights.sum(1), min=1e-9)
    return masked_total / token_counts

# Load BERT tokenizer and model
model_name = "bert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Inference only: make sure dropout is disabled so embeddings are deterministic
model.eval()

# Tokenize every headline on its own (one sequence per call, so no padding is
# needed). truncation=True caps a sequence at the tokenizer's maximum length
# instead of letting an over-long input fail inside the model.
encoded_input = [
    tokenizer(expr, return_tensors="pt", truncation=True) for expr in text_list
]

# Create one mean-pooled embedding vector per headline
print('Creating word embeddings')
sentence_embeddings = []
# no_grad: gradients are never used here, so skip building the autograd graph
# (saves memory and time; the outputs can be converted to NumPy directly).
with torch.no_grad():
    for i, encoded in enumerate(encoded_input):
        print(str(i+1)+'/'+str(len(encoded_input)))
        model_output = model(**encoded)
        pooled = mean_pooling(model_output, encoded['attention_mask'])
        # pooled has a leading batch axis of size 1; keep only the vector
        sentence_embeddings.append(pooled.numpy()[0])
print('Done!')

In [None]:
# Stack the per-headline embedding vectors into one NumPy array and print it
print(np.array(sentence_embeddings))

In [None]:
# Function to generate word cloud
def generate_wordcloud(text,Title):
    """Draw a word cloud for a collection of strings.

    Parameters
    ----------
    text : iterable of str
        Pieces of text; they are joined with single spaces before the cloud
        is generated.
    Title : str
        Title placed above the figure.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    corpus = " ".join(text)
    cloud_builder = WordCloud(
        width=800,
        height=400,
        stopwords=set(STOPWORDS),
        background_color='black',
    )
    cloud_image = cloud_builder.generate(corpus)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis("off")
    plt.title(Title)
    plt.show()


# Reuse the already-cleaned headlines stored in df['TitleClean'] instead of
# running text_cleaning over every title a second time (same input, same plot).
generate_wordcloud(df['TitleClean'], '  ')

In [None]:
# One row per distinct (Date, CP) pair, in the order the dates appear in df
df_unique = df[['Date', 'CP']].drop_duplicates().reset_index(drop = True)

print(df_unique)
# Target: natural log of the CP value on the following row. shift(-1) relies on
# df_unique being in chronological order (assumed from the data, not enforced
# here), and leaves NaN for the last row, which has no following value.
df_unique['lnCP1'] = np.log(df_unique['CP'].shift(-1))

# Attach the target to every headline of the same date. Any 'lnCP1' column left
# over from a previous run of this cell is dropped first; otherwise re-running
# the cell would produce 'lnCP1_x' / 'lnCP1_y' and break the cells below.
df = pd.merge(
    df.drop(columns=['lnCP1'], errors='ignore'),
    df_unique[['Date', 'lnCP1']],
    on='Date',
    how='left'
)


In [None]:
# Build the modelling table: original columns followed by one column per
# embedding dimension (columns of the embedding frame are labelled 0, 1, 2, ...)
embedding_frame = pd.DataFrame(np.array(sentence_embeddings))
df_new = pd.concat([df, embedding_frame], axis=1)

# Persist the combined table, without the row index
df_new.to_csv('EmbeddedData.csv', index=False)

print(df_new)

In [None]:
# Evaluation settings
step_size = 1
test_size = 0.3

# Target column, and feature columns = every column of df_new that is not one
# of the listed non-feature columns
y_vars = ['lnCP1']
X_vars = [
    name
    for name in df_new.columns
    if name not in ("Title", "Date", "CP", 'lnCP1', "TitleClean")
]

# Number of distinct dates, scaled by the non-test share and truncated to int
init_test = int(len(df_unique) * (1 - test_size))

print(init_test)


In [None]:
# Seed both Python's and NumPy's global RNGs. scikit-learn estimators created
# without an explicit random_state draw from NumPy's global generator, so
# seeding only `random` does not make their results reproducible.
random.seed(41223)
np.random.seed(41223)

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score

# Initialize models as (name, estimator) pairs
rf2 = RandomForestRegressor(n_estimators=100, criterion='squared_error',
                           max_depth=10, random_state=0, max_features=None)
models = [
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
    ("SVR", SVR()),
    ("KNeighbors", KNeighborsRegressor()),
    ("DecisionTree", DecisionTreeRegressor()),
    ("RandomForest", RandomForestRegressor()),
    ("RandomForest2", rf2),
    ("GradientBoosting", GradientBoostingRegressor()),
    ("MLPRegressor", MLPRegressor(solver='lbfgs', random_state=0)),
]

# One flat list of predictions per model, filled window by window
model_pred = {name: [] for name, _ in models}
# Test rows of every window, in the same order as the predictions
test_frames = []

# Each window trains on init_test-1 consecutive dates and tests on the next one,
# so at least one training date (init_test >= 2) is required.
if init_test < 2:
    raise ValueError(
        f"init_test={init_test} leaves no dates to train on; "
        "need at least 2 distinct dates in the training share"
    )

# Rolling walk-forward evaluation over the distinct dates in df_unique.
# For window start a: train on dates a .. a+init_test-2, test on date
# a+init_test-1. The window start advances by step_size dates each iteration;
# the last window tests on the final date.
for a in range(0, len(df_unique) - init_test + 1, step_size):
    train_dates = df_unique.iloc[a:(a + init_test - 1)]['Date'].to_list()
    test_dates = [df_unique.iloc[a + init_test - 1]['Date']]

    # All headline rows belonging to the selected dates
    train = df_new[df_new['Date'].isin(train_dates)]
    test = df_new[df_new['Date'].isin(test_dates)]
    test_frames.append(test)

    X_train = train[X_vars]
    # 1-D target: avoids sklearn's column-vector warnings and keeps every
    # model's predictions 1-D as well
    y_train = train[y_vars].to_numpy().ravel()
    X_test = test[X_vars]

    for name, estimator in models:
        estimator.fit(X_train, y_train)  # Train the model
        # ravel() guarantees a flat list of floats regardless of estimator
        model_pred[name] += np.ravel(estimator.predict(X_test)).tolist()

df_test = pd.concat(test_frames)

In [None]:
# Print the number of test rows and the number of prediction rows
print(len(df_test))
print(len(pd.DataFrame(model_pred)))

# Place the actual targets and each model's predictions side by side, row by row
actuals = df_test[['Date', 'lnCP1']].reset_index(drop=True)
predictions = pd.DataFrame(model_pred).reset_index(drop=True)
comparison_df = pd.concat([actuals, predictions], axis = 1)

# First convert array columns to regular float columns
def extract_single_value(x):
    """Unwrap a one-element list or NumPy array; return anything else unchanged."""
    is_container = isinstance(x, (list, np.ndarray))
    if not is_container or len(x) != 1:
        return x
    return x[0]

# Unwrap single-element containers in every column other than Date
value_columns = [name for name in comparison_df.columns if name != 'Date']
for name in value_columns:
    comparison_df[name] = comparison_df[name].apply(extract_single_value)

# Collapse to one row per date by averaging all rows that share the date
comparison_df = comparison_df.groupby('Date', as_index=False).mean()
print(comparison_df)

In [None]:
# Parse Date so matplotlib treats the x axis as dates
comparison_df['Date'] = pd.to_datetime(comparison_df['Date'])

# Every column other than Date and lnCP1 holds one model's predictions
model_cols = [col for col in comparison_df.columns if col not in ['Date', 'lnCP1']]

# One figure per model: actual values against that model's predictions
for model_name_col in model_cols:
    fig, ax = plt.subplots(figsize=(12, 6))

    # Actual values
    ax.plot(comparison_df['Date'], comparison_df['lnCP1'],
            'b-', linewidth=2, label='lnCP1 (Actual)')

    # Model predictions
    ax.plot(comparison_df['Date'], comparison_df[model_name_col],
            'r--', linewidth=1.5, label=f'{model_name_col} (Predicted)')

    # Formatting
    ax.set_title(f'lnCP1 vs {model_name_col} Predictions')
    ax.set_xlabel('Date')
    ax.set_ylabel('Value')
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()

# Display all figures created above
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Score only the dates whose target is known. The final date has no following
# value, so its lnCP1 is NaN; selecting by notna() removes it explicitly
# instead of relying on it being the last row.
has_target = comparison_df["lnCP1"].notna()
y_true = comparison_df.loc[has_target, "lnCP1"]

for model in model_pred.keys() :
    y_pred = comparison_df.loc[has_target, model]
    # sklearn metrics take (y_true, y_pred). The order matters for MAPE, which
    # divides the absolute error by |y_true|; passing the predictions first
    # would normalise by the predictions instead of the actual values.
    mape = mean_absolute_percentage_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"--------------------------\nModel: {model}\nTest MAPE: {mape:.2f}\nTest RMSE: {rmse:.2f}")