In [None]:
from datasets import load_dataset

In [None]:
# Fetch the tweet sentiment dataset by its identifier; the recorded output
# below shows a train and a validation split being generated.
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment")

Downloading readme:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/859k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/217k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9543 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2388 [00:00<?, ? examples/s]

In [None]:
import pandas as pd

In [None]:
# Convert the training split through the dataset's own pandas exporter instead
# of letting the DataFrame constructor iterate the dataset, then preview the
# first rows.
df = dataset['train'].to_pandas()
print(df.head())

                                                text  label
0  $BYND - JPMorgan reels in expectations on Beyo...      0
1  $CCL $RCL - Nomura points to bookings weakness...      0
2  $CX - Cemex cut at Credit Suisse, J.P. Morgan ...      0
3  $ESS: BTIG Research cuts to Neutral https://t....      0
4  $FNKO - Funko slides after Piper Jaffray PT cu...      0


In [None]:
import re

def process_tweet(text):
  """Normalize a single tweet before it is tokenized.

  Applied in order:
    1. every run starting with ``http`` up to the next whitespace becomes
       the literal placeholder ``[URL]``;
    2. ``#`` characters are removed (the hashtag word itself is kept);
    3. a run of two or more ``!``, ``?`` or ``.`` characters is collapsed to
       the last character of that run.

  Non-string inputs are returned unchanged.
  """
  if isinstance(text, str):
    # (pattern, replacement) pairs; order matters because URLs must be
    # masked before punctuation runs are collapsed.
    substitutions = (
      (r'http\S+', '[URL]'),
      (r'#', ''),
      (r'([!?.]){2,}', r'\1'),
    )
    for pattern, replacement in substitutions:
      text = re.sub(pattern, replacement, text)
  return text


In [None]:
def process_dataset(dataset):
  """Return a copy of `dataset` whose 'text' column has been cleaned.

  Args:
    dataset: Object exposing a Hugging Face style `map(function, batched=...)`
      method and a 'text' column.

  Returns:
    Whatever `dataset.map` returns, with every 'text' entry passed through
    `process_tweet`.
  """
  def clean_batch(batch):
    # With batched=True, `map` hands over a dict of columns, so batch['text']
    # is a list of tweets. Passing that list straight to process_tweet would
    # hit its non-string guard and leave every tweet untouched, so clean the
    # elements one by one.
    batch['text'] = [process_tweet(tweet) for tweet in batch['text']]
    return batch
  return dataset.map(clean_batch, batched=True)

In [None]:
# Run the tweet-cleaning map over the loaded dataset; the recorded output
# below shows one progress bar per split.
processed_dataset = process_dataset(dataset)

Map:   0%|          | 0/9543 [00:00<?, ? examples/s]

Map:   0%|          | 0/2388 [00:00<?, ? examples/s]

In [None]:
def tokenize(row):
  """Encode the 'text' field of `row` with the notebook-level `tokenizer`.

  The tokenizer is called with ``padding="max_length"`` and
  ``truncation=True``; its result is returned as is.

  NOTE(review): `tokenizer` is a global that this notebook only assigns in a
  later cell, so on a fresh kernel it must be created before this function
  is called.
  """
  texts = row['text']
  encoded = tokenizer(texts, padding="max_length", truncation=True)
  return encoded

In [None]:
# Tokenize the cleaned dataset in batches. `tokenize` reads the global
# `tokenizer`, so that object has to exist before this cell runs.
tokenized_dataset = processed_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/9543 [00:00<?, ? examples/s]

Map:   0%|          | 0/2388 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Single source of truth for the checkpoint id, so the tokenizer and the model
# are always loaded from the same repository.
MODEL_NAME = "ahmedrachid/FinancialBERT-Sentiment-Analysis"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [None]:
import torch

def predict_sentiment(text):
  """Classify one tweet with the notebook-level `tokenizer` and `model`.

  Args:
    text: Raw tweet; it is cleaned with `process_tweet` before tokenization.

  Returns:
    A ``(sentiment, probabilities)`` tuple. `sentiment` is the int index of
    the highest-probability class; it is an index into the model's own label
    set (see `model.config.id2label`). `probabilities` is a NumPy array
    holding the softmax of the logits.
  """
  processed_text = process_tweet(text)
  inputs = tokenizer(processed_text, return_tensors="pt", padding="max_length", truncation=True)
  # Inference only: disable autograd so no computation graph is built or kept
  # alive, and the result can be converted to NumPy directly.
  with torch.no_grad():
    output = model(**inputs)
    prob = torch.nn.functional.softmax(output.logits, dim=-1)
  # Reduce over the same (class) axis that softmax normalized.
  sentiment = torch.argmax(prob, dim=-1).item()

  return sentiment, prob.numpy()

In [None]:
# Spot check with a negatively worded tweet; prints the predicted class index
# and the per-class probabilities.
tweet = "TSLA went down today this is so very bad"
sentiment, probabilities = predict_sentiment(tweet)
print(f"Sentiment:{sentiment}, Probabilities: {probabilities}")

Sentiment:0, Probabilities: [[0.8936528  0.10510557 0.00124165]]


In [None]:
# Spot check with a positively worded tweet; prints the predicted class index
# and the per-class probabilities.
tweet = "TSLA went up! time to buy!"
sentiment, probabilities = predict_sentiment(tweet)
print(f"Sentiment:{sentiment}, Probabilities: {probabilities}")

Sentiment:2, Probabilities: [[6.3871342e-04 2.0467307e-01 7.9468822e-01]]


In [None]:
# Spot check with a tweet that has no clear direction; prints the predicted
# class index and the per-class probabilities.
tweet = "up down up down up down"
sentiment, probabilities = predict_sentiment(tweet)
print(f"Sentiment:{sentiment}, Probabilities: {probabilities}")

Sentiment:1, Probabilities: [[5.9857923e-03 9.9324292e-01 7.7125075e-04]]
