#Stock Price Prediction using LSTM Networks and Financial News Sentiment Analysis
## Abhimanyu Kumar

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

dir = "/content/drive/MyDrive/stockPricePrediction"


In [None]:
#library
import pandas as pd
import numpy as np
import os

In [None]:
from datasets import load_dataset

ds = load_dataset("zeroshot/twitter-financial-news-sentiment")

In [None]:
type(ds)

datasets.dataset_dict.DatasetDict

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9543
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2388
    })
})

In [None]:
train=ds['train']
train_ds=train.to_pandas()
validation=ds['validation']
validation_ds=validation.to_pandas()


data=pd.concat([train_ds,validation_ds], ignore_index=True)

In [None]:
data

Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0
...,...,...
11926,Stocks making the biggest moves midday: TD Ame...,2
11927,Stocks making the biggest moves premarket: Fit...,2
11928,Stocks making the biggest moves premarket: Hom...,2
11929,Stocks making the biggest moves premarket: TD ...,2


In [None]:
data['text'][1]

'$CCL $RCL - Nomura points to bookings weakness at Carnival and Royal Caribbean https://t.co/yGjpT2ReD3'

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11931 entries, 0 to 11930
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11931 non-null  object
 1   label   11931 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 186.6+ KB


sentiments = {
    "LABEL_0": "Bearish",
    "LABEL_1": "Bullish",
    "LABEL_2": "Neutral"
}

In [None]:
data.duplicated().sum()

np.int64(0)

In [None]:
data.isnull().sum()

Unnamed: 0,0
text,0
label,0


In [None]:
data['text'] = data['text'].str.lower()

# FIX HYPERLINKS
data['text'] = data['text'].replace(r'https?://.*[\r\n]*', ' ',regex=True)
data['text'] = data['text'].replace(r'www.*[\r\n]*', ' ',regex=True)
data['text'] = data['text'].str.replace('https', '', regex=False)

In [None]:
	## FIX $ SYMBOL
	data['text'] = data['text'].str.replace('[$][0-9\.]', ' dollars ', regex=True)
	data['text'] = data['text'].str.replace('$', '', regex=False)

  data['text'] = data['text'].str.replace('[$][0-9\.]', ' dollars ', regex=True)


In [None]:
data.head()

Unnamed: 0,text,label
0,bynd - jpmorgan reels in expectations on beyon...,0
1,ccl rcl - nomura points to bookings weakness a...,0
2,"cx - cemex cut at credit suisse, j.p. morgan o...",0
3,ess: btig research cuts to neutral,0
4,fnko - funko slides after piper jaffray pt cut,0


In [None]:
total_positve=len(data[data['label'] == 1])
total_positve

2398

In [None]:
total_neutral=len(data[data['label'] == 2])
total_neutral

7744

In [None]:
total_negative=len(data[data['label'] ==0])
total_negative

1789

In [None]:
data.head()

Unnamed: 0,text,label
0,bynd - jpmorgan reels in expectations on beyon...,0
1,ccl rcl - nomura points to bookings weakness a...,0
2,"cx - cemex cut at credit suisse, j.p. morgan o...",0
3,ess: btig research cuts to neutral,0
4,fnko - funko slides after piper jaffray pt cut,0


In [None]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(
    data,
    test_size=0.2,
    stratify=data["label"],
    random_state=42
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=42
)


In [None]:
train_df.head()


Unnamed: 0,text,label
3444,"estee lauder eps beats by dollars .21, beats ...",1
2650,celyad reports q3 results,2
11758,stock market news: netflix climbs on subscribe...,2
10164,ceix - consol energy q4 2019 earnings preview,2
4192,big four/government: no freeze on fees,2


In [None]:
test_df.head()

Unnamed: 0,text,label
2288,mtsc - mts systems declares dollars .30 divid...,2
1855,"santana minerals : petrology confirms nickel, ...",2
1670,mcewen mining prices public offering at dolla...,2
8937,admp - adamis pharma -61% on fda's zimhi rejec...,0
11189,"grain prices to hold steady, trade within a ra...",2


In [None]:
val_df.head()

Unnamed: 0,text,label
9351,jpmorgan positive on five below into earnings,1
7347,"sen. kamala harris wants to ""modernize"" the sc...",2
6414,"u.s. economy adds 225,000 jobs in january; 201...",1
11526,charttrader's recent ymm0 short took off for a...,1
10901,top trending: assange rape investigation dropp...,2


In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="ProsusAI/finbert")

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: ProsusAI/finbert
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [None]:
texts = train_df["text"].tolist()

results = pipe(
    texts,
    batch_size=16,
    truncation=True
)

train_df["sentiment"] = [r["label"] for r in results]


In [None]:
train_df.count()

Unnamed: 0,0
text,9544
label,9544
sentiment,9544


In [None]:
label_map = {
    "negative": 0,
    "neutral": 2,
    "positive": 1
}


In [None]:
y_true = train_df["label"]
y_pred = train_df["sentiment"].map(label_map)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_true, y_pred))


Accuracy: 0.711756077116513

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.76      0.60      1431
           1       0.58      0.59      0.58      1918
           2       0.85      0.74      0.79      6195

    accuracy                           0.71      9544
   macro avg       0.64      0.70      0.66      9544
weighted avg       0.74      0.71      0.72      9544



# Fine Tuning

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [None]:
# Load model directly
model_name = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: ProsusAI/finbert
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [None]:
# 1. Ensure labels are int
train_df["label"] = train_df["label"].astype(int)
val_df["label"] = val_df["label"].astype(int)


# 3. Ensure model is trainable
model.train()
for param in model.parameters():
    param.requires_grad = True


In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9544 entries, 3444 to 656
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       9544 non-null   object
 1   label      9544 non-null   int64 
 2   sentiment  9544 non-null   object
dtypes: int64(1), object(2)
memory usage: 298.2+ KB


In [None]:
def tokenize_fn(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )


In [None]:
# Ensure labels are int
train_df["label"] = train_df["label"].astype(int)
val_df["label"] = val_df["label"].astype(int)

# Recreate datasets
train_dataset = Dataset.from_pandas(train_df[["text", "label"]])
val_dataset   = Dataset.from_pandas(val_df[["text", "label"]])

# Tokenize
train_dataset = train_dataset.map(tokenize_fn, batched=True)
val_dataset   = val_dataset.map(tokenize_fn, batched=True)

#  IMPORTANT: rename label → labels
train_dataset = train_dataset.rename_column("label", "labels")
val_dataset   = val_dataset.rename_column("label", "labels")

# Set torch format
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

val_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)


Map:   0%|          | 0/9544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1193 [00:00<?, ? examples/s]

In [None]:
pip install -U transformers




In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=dir + "/finbert_finetuned",
    do_train=True,
    do_eval=True,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir=dir + "/logs",
    logging_steps=100,
    eval_steps=500,      # evaluation every 500 steps
    save_steps=500,      # save every 500 steps
    report_to="none"
)


`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted"
    )
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [None]:
device = model.device  # gets cuda or cpu automatically
import torch
torch.set_grad_enabled(True)

batch = train_dataset[0]

out = model(
    input_ids=batch["input_ids"].unsqueeze(0).to(device),
    attention_mask=batch["attention_mask"].unsqueeze(0).to(device),
    labels=batch["labels"].unsqueeze(0).to(device)
)

print(out.loss, out.loss.requires_grad)


tensor(3.3029, device='cuda:0', grad_fn=<NllLossBackward0>) True


In [None]:
trainer.train()


Step,Training Loss
100,0.679545
200,0.536308
300,0.492861
400,0.308914
500,0.306398
600,0.290441
700,0.195554
800,0.175684


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=897, training_loss=0.35383676764957084, metrics={'train_runtime': 574.4059, 'train_samples_per_second': 49.846, 'train_steps_per_second': 1.562, 'total_flos': 1883365844097024.0, 'train_loss': 0.35383676764957084, 'epoch': 3.0})

In [None]:
print(train_dataset[0])
print(model.training)
print(any(p.requires_grad for p in model.parameters()))


{'labels': tensor(1), 'input_ids': tensor([  101, 28517,  2063, 21602,  2099, 20383, 10299,  2011,  6363,  1012,
         2538,  1010, 10299,  2006,  6599,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [None]:
trainer.evaluate()


{'eval_loss': 0.439889520406723,
 'eval_accuracy': 0.8608549874266554,
 'eval_precision': 0.8637813280633725,
 'eval_recall': 0.8608549874266554,
 'eval_f1': 0.862006657707248,
 'eval_runtime': 8.2094,
 'eval_samples_per_second': 145.32,
 'eval_steps_per_second': 4.629,
 'epoch': 3.0}

In [None]:
test_dataset = Dataset.from_pandas(test_df[["text", "label"]])
test_dataset = test_dataset.map(tokenize_fn, batched=True)
test_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)

trainer.evaluate(test_dataset)


Map:   0%|          | 0/1194 [00:00<?, ? examples/s]

{'eval_loss': 0.45504769682884216,
 'eval_accuracy': 0.855108877721943,
 'eval_precision': 0.8567378365980753,
 'eval_recall': 0.855108877721943,
 'eval_f1': 0.8556490165822588,
 'eval_runtime': 8.7262,
 'eval_samples_per_second': 136.829,
 'eval_steps_per_second': 4.355,
 'epoch': 3.0}