# Project Overview
This notebook implements a BERT-based model to analyze financial news headlines and predict their effect on stock prices.

In [1]:
import json
from datetime import datetime

import numpy as np
import optuna
import pandas as pd
import torch
import yfinance as yf
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments



2025-06-12 22:34:12.664471: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749782052.680282 2431904 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749782052.685332 2431904 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749782052.697736 2431904 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749782052.697753 2431904 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749782052.697754 2431904 computation_placer.cc:177] computation placer alr

# Data Preprocessing
This section covers data loading, cleaning, tokenization, and label encoding.

## How I Preprocessed the Raw Data

In [3]:
def extract_sentiment(contents_str):
    """Return the sentiment of the first entry in a JSON-encoded list.

    Args:
        contents_str: JSON text expected to decode to a non-empty list whose
            first element is a dict with a 'sentiment' key.

    Returns:
        The value stored under 'sentiment' in the first list element, or
        None when the input is not valid JSON, is not a string/bytes value
        (e.g. None or NaN), is not a non-empty list, or its first element
        has no 'sentiment' key.
    """
    try:
        data = json.loads(contents_str)
    except (TypeError, ValueError):
        # TypeError: non-string input such as None/NaN.
        # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
        # Deliberately NOT a blanket `except Exception`: that would also hide
        # programming errors (e.g. a missing import) and make every row None.
        return None

    if not isinstance(data, list) or len(data) == 0:
        return None

    first = data[0]
    if isinstance(first, dict) and 'sentiment' in first:
        return first['sentiment']
    return None

In [4]:
# Pull the sentiment string out of each row's JSON `contents` payload.
df['sentiment'] = df['contents'].apply(extract_sentiment)

# Signed polarity (-1 / 0 / +1).
label_map = {'neutral': 0, 'negative': -1, 'positive': 1}
# Zero-based class ids (0 = negative, 1 = neutral, 2 = positive).
label_encoded_map = {'neutral': 1, 'positive': 2, 'negative': 0}

df['label'] = df['sentiment'].map(label_map)
df['label_encoded'] = df['sentiment'].map(label_encoded_map)

# Drop rows with no sentiment, and also rows whose sentiment string is not one
# of the three known classes: .map() leaves those as NaN, and a NaN label
# cannot be cast to int afterwards.
df = df.dropna(subset=['sentiment', 'label', 'label_encoded'])

In [5]:
# Model input is "<title> <summary>". Missing values are replaced with ''
# first, because astype(str) on NaN would inject the literal word "nan" into
# the text.
df['text'] = (
    df['title'].fillna('').astype(str) + ' ' + df['summary'].fillna('').astype(str)
)
# .map() produced float columns (NaN-capable); cast the labels back to integers.
df['label'] = df['label'].astype(int)
df['label_encoded'] = df['label_encoded'].astype(int)


## Loading the Processed Data

In [2]:
# Read the dataset from disk.
df = pd.read_csv("dataset.csv")
# Parse the publication timestamp and keep only its calendar-date part.
df['date'] = df['published_at'].pipe(pd.to_datetime).dt.date

In [3]:
# Checkpoint name; reused below when the classification model is instantiated,
# so the tokenizer and the model come from the same pretrained checkpoint.
model_name = "bert-base-multilingual-cased"
# Notebook-level tokenizer; NewsDataset reads this global when encoding texts.
tokenizer = BertTokenizer.from_pretrained(model_name)



In [4]:
class NewsDataset(Dataset):
    """Torch dataset pairing tokenized texts with integer class labels.

    Every text is tokenized once, at construction time, with the
    notebook-level ``tokenizer`` (truncated to at most 256 tokens and padded
    to the longest sequence in this dataset).
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` and store ``labels`` alongside them.

        Args:
            texts: iterable of strings to encode.
            labels: iterable of integer class ids, positionally aligned
                with ``texts``.

        Raises:
            ValueError: if ``texts`` and ``labels`` differ in length.
        """
        # Materialise as plain lists so positional indexing in __getitem__ is
        # well defined for any input (e.g. a pandas Series would otherwise be
        # indexed by label, not position).
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )

        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=256)
        self.labels = labels

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a 'labels' entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.labels)

In [6]:
# Model inputs and integer class targets as plain Python lists.
texts, labels = df['text'].tolist(), df['label_encoded'].tolist()

# Hold out 20% of the examples for validation; the fixed seed keeps the split
# identical across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.2,
)

# Tokenize each split once, up front.
train_dataset, val_dataset = (
    NewsDataset(train_texts, train_labels),
    NewsDataset(val_texts, val_labels),
)

# Model Hyperparameter Tuning
This section covers how I performed hyperparameter tuning using Optuna. I tuned the number of training epochs, batch size, learning rate, and weight decay to maximize evaluation accuracy.

In [7]:
def compute_metrics(eval_pred):
    """Compute accuracy from a (predictions, labels) pair.

    The predicted class for each row is the index of its largest score
    along axis 1.

    Returns:
        dict with a single key, "accuracy".
    """
    scores, gold = eval_pred
    predicted_classes = scores.argmax(axis=1)
    accuracy = accuracy_score(gold, predicted_classes)
    return {"accuracy": accuracy}

def model_init():
    """Build a fresh 3-class sequence classifier from the `model_name` checkpoint."""
    fresh_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
    return fresh_model

def objective(trial):
    """Optuna objective: train with trial-suggested settings, return eval accuracy.

    Search space:
        num_train_epochs: integer in [2, 6]
        batch_size:       one of 8, 16, 32 (per-device train batch size)
        learning_rate:    float in [2e-5, 5e-5], sampled on a log scale
        weight_decay:     float in [0.0, 0.3]

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The "eval_accuracy" value reported by evaluating on `val_dataset`
        after training on `train_dataset`.
    """
    args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        # No checkpoints are written during the search.
        save_strategy="no",
        num_train_epochs=trial.suggest_int("num_train_epochs", 2, 6),
        per_device_train_batch_size=trial.suggest_categorical("batch_size", [8, 16, 32]),
        learning_rate=trial.suggest_float("learning_rate", 2e-5, 5e-5, log=True),
        weight_decay=trial.suggest_float("weight_decay", 0.0, 0.3),
        logging_dir="./logs",
        logging_steps=10,
    )

    # model_init (rather than a shared model instance) gives every trial a
    # newly loaded model instead of continuing from the previous trial's weights.
    trainer = Trainer(
        model_init=model_init,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )
    trainer.train()
    return trainer.evaluate()["eval_accuracy"]


In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every run (same seed as the train/validation split). TPE is Optuna's default
# sampler, so only the seeding changes.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

print("Best hyperparameters:", study.best_params)


[I 2025-06-12 22:38:28,304] A new study created in memory with name: no-name-67aa6451-8e2b-49ec-be53-1d4dc506da3a
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7959,0.749129,0.673743


In [None]:
# Fresh 3-class classifier for the final training run.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# NOTE(review): the hyperparameters below are hard-coded; presumably they are
# the best values reported by the Optuna study -- confirm against
# study.best_params.
training_args = TrainingArguments(
    output_dir="./best_model",
    per_device_train_batch_size=16,
    num_train_epochs=4,
    learning_rate=2.3897116952456707e-05,
    weight_decay=0.21227675946613164,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only machines instead of failing at construction time.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Report accuracy at each evaluation, consistent with the tuning runs.
    compute_metrics=compute_metrics,
)

In [None]:
# Run the final training with the model, arguments and datasets configured on
# `trainer`; as the last expression, the return value is displayed by the cell.
trainer.train()

In [None]:
# Score the validation split with the trained model.
preds = trainer.predict(val_dataset)

# Ground truth comes straight from the split; the predicted class is the index
# of the highest score in each row.
y_true, y_pred = val_labels, preds.predictions.argmax(axis=1)

# Per-class precision / recall / F1 summary.
print(classification_report(y_true, y_pred))

In [None]:
# Write the model and the tokenizer files into the same directory so the pair
# can be loaded back from a single path.
trainer.save_model("./saved_model")
tokenizer.save_pretrained("./saved_model")