In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from textblob import TextBlob
from tqdm import tqdm
import torch

### 1. Preprocessing the Data

For text classification tasks, the first step is preprocessing the data to convert raw text into a format suitable for the model. This involves tokenizing the text and creating appropriate input tensors.

In [2]:
df = pd.read_csv('dataset/analyst_ratings_processed.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,date,stock
0,0.0,Stocks That Hit 52-Week Highs On Friday,2020-06-05 10:30:00-04:00,A
1,1.0,Stocks That Hit 52-Week Highs On Wednesday,2020-06-03 10:45:00-04:00,A
2,2.0,71 Biggest Movers From Friday,2020-05-26 04:30:00-04:00,A
3,3.0,46 Stocks Moving In Friday's Mid-Day Session,2020-05-22 12:45:00-04:00,A
4,4.0,B of A Securities Maintains Neutral on Agilent...,2020-05-22 11:38:00-04:00,A


In [3]:
# Function to determine sentiment
def get_sentiment(title):
    analysis = TextBlob(title)
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity == 0:
        return 'Neutral'
    else:
        return 'Negative'

# Apply the function to our dataset
tqdm.pandas()
# df['sentiment'] = df['title'].apply(get_sentiment)
df['sentiment'] = df['title'].progress_apply(get_sentiment)
df.head()

100%|██████████| 1400469/1400469 [01:29<00:00, 15587.19it/s]


Unnamed: 0.1,Unnamed: 0,title,date,stock,sentiment
0,0.0,Stocks That Hit 52-Week Highs On Friday,2020-06-05 10:30:00-04:00,A,Neutral
1,1.0,Stocks That Hit 52-Week Highs On Wednesday,2020-06-03 10:45:00-04:00,A,Neutral
2,2.0,71 Biggest Movers From Friday,2020-05-26 04:30:00-04:00,A,Neutral
3,3.0,46 Stocks Moving In Friday's Mid-Day Session,2020-05-22 12:45:00-04:00,A,Neutral
4,4.0,B of A Securities Maintains Neutral on Agilent...,2020-05-22 11:38:00-04:00,A,Neutral


In [4]:
# Count how many positive, negative and neutral sentiments we have
df['sentiment'].value_counts()

sentiment
Neutral     930937
Positive    339045
Negative    130487
Name: count, dtype: int64

In [5]:
# Load the pre-trained RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')



In [6]:
# Tokenization function
def tokenize_data(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

# Tokenize the data
encoded_texts = encoded_texts = [tokenize_data(text) for text in tqdm(df['title'], desc="Tokenizing data")]

Tokenizing data: 100%|██████████| 1400469/1400469 [03:26<00:00, 6783.86it/s]


### 2. Fine-Tuning RoBERTa

Fine-tuning involves adjusting the weights of a pre-trained model on your specific dataset. You would usually split your data into training and validation sets, and then train the model.

In [7]:
# Label encoding (convert labels to integers)
label_dict = {'Positive': 0, 'Neutral': 1, 'Negative': 2}
df['label'] = df['sentiment'].replace(label_dict)
df.head()

  df['label'] = df['sentiment'].replace(label_dict)


Unnamed: 0.1,Unnamed: 0,title,date,stock,sentiment,label
0,0.0,Stocks That Hit 52-Week Highs On Friday,2020-06-05 10:30:00-04:00,A,Neutral,1
1,1.0,Stocks That Hit 52-Week Highs On Wednesday,2020-06-03 10:45:00-04:00,A,Neutral,1
2,2.0,71 Biggest Movers From Friday,2020-05-26 04:30:00-04:00,A,Neutral,1
3,3.0,46 Stocks Moving In Friday's Mid-Day Session,2020-05-22 12:45:00-04:00,A,Neutral,1
4,4.0,B of A Securities Maintains Neutral on Agilent...,2020-05-22 11:38:00-04:00,A,Neutral,1


In [8]:
# Prepare dataset for the model
class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [9]:
# Split data into train and validation
train_texts, val_texts, train_labels, val_labels = train_test_split(df['title'], df['label'], test_size=0.1)

In [10]:
# Tokenize and create datasets
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=512)
train_dataset = NewsDataset(train_encodings, train_labels.tolist())
val_dataset = NewsDataset(val_encodings, val_labels.tolist())

In [11]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch"
)



In [13]:
# Initialize Trainer
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_dict))
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Train the model
trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/472659 [00:00<?, ?it/s]

{'loss': 1.1329, 'grad_norm': 6.12992525100708, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}
{'loss': 1.1017, 'grad_norm': 6.898188591003418, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}
{'loss': 1.0884, 'grad_norm': 3.1596133708953857, 'learning_rate': 3e-06, 'epoch': 0.0}
{'loss': 1.0675, 'grad_norm': 3.6739771366119385, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}
{'loss': 1.0053, 'grad_norm': 3.58829402923584, 'learning_rate': 5e-06, 'epoch': 0.0}
{'loss': 0.9335, 'grad_norm': 6.8375420570373535, 'learning_rate': 6e-06, 'epoch': 0.0}
{'loss': 0.8061, 'grad_norm': 7.543118476867676, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.0}
{'loss': 0.8409, 'grad_norm': 11.672341346740723, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.0}
{'loss': 0.8353, 'grad_norm': 7.882238864898682, 'learning_rate': 9e-06, 'epoch': 0.0}
{'loss': 0.7949, 'grad_norm': 6.433279037475586, 'learning_rate': 1e-05, 'epoch': 0.0}
{'loss': 0.6632, 'grad_norm': 11.6812009811401

KeyboardInterrupt: 

### 3. Evaluation

After training, you can evaluate the model using the validation set to see how well it performs.

In [12]:
results = trainer.evaluate()
print(results)

SyntaxError: invalid syntax (393824984.py, line 12)