In [38]:
import pandas as pd
import numpy as np
from scipy.special import softmax

from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification

from datasets import load_metric
from datasets import load_dataset
from datasets import load_from_disk

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import AutoTokenizer, DataCollatorWithPadding, BertForSequenceClassification

In [2]:
import torch
torch.cuda.empty_cache()

In [3]:
import os
print(os.getcwd())

d:\Cornell\course\CS6386\Analyzing-the-Correlation-Between-Retail-Traders--Sentiments-and-Equity-Market-Movements\Sentiment_Analysis


In [4]:
# !pip install datasets

In [5]:
# raw_datasets = load_from_disk("data")
raw_datasets = load_from_disk("data/finetune_data/")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["text"], padding=True, truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # pad all the examples to the length of the longest element when we batch elements together — dynamic padding.

Map: 100%|██████████| 160/160 [00:00<00:00, 5058.18 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 2660.94 examples/s]
Map: 100%|██████████| 5656/5656 [00:00<00:00, 16784.82 examples/s]


In [6]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 160
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5656
    })
})

In [7]:
raw_datasets['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 160
})

In [9]:
tokenized_datasets['train']['text']

['Technopolis and the St. Petersburg government signed a cooperation memorandum in October 2005 to set up a techno-park in the Neudorf production zone in the village of Strelny , in the St. Petersburg suburbs .',
 'Shorting $GS for a quick profit. Bearish sentiment is strong. ????',
 '$SQ $NVDA paying today',
 "Retail investors celebrate $GME's stock gains, while skeptics warn of an impending market correction.",
 "Glaxo's ViiV Healthcare Signs China Manufacturing Deal With Desano",
 'Business boomed after Ostrom helped plant a small story about Rapala lures in a Life magazine issue that featured Marilyn Monroe on the cover .',
 'ArcelorMittal Chief Executive Officer Lakshmi Mittal has already cut output at some furnaces .',
 'Net sales rose by 25.5 % year-on-year to EUR59 .6 m , as the number of chargers delivered went up by 41 % to 65.9 million pieces .',
 'In this case , the effect would be negative in Finland .',
 '$PPSI getting tight. I think we see a small bounce',
 "Short-seller

In [10]:
metric = load_metric("glue", "mrpc")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("glue", "mrpc")


In [11]:
# !pip install transformers[torch]
# !pip install accelerate -U

In [12]:
# !pip install accelerate==0.24.0

In [13]:
import accelerate
print(accelerate.__version__)

0.24.0


In [14]:
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=256,
    # During the first 500 training steps, the learning rate gradually increases from 0 (or a small base rate) to the specified learning rate.
    # This gradual increase helps in stabilizing the training process and often leads to better performance, as it prevents the model from making too large updates too quickly.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='logs',
    logging_steps=10,  # how frequently the training progress is logged
    save_strategy="epoch",  # Set save strategy to match evaluation strategy
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # fp16=True,  # Enable mixed precision training
)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
trainer.train()

  0%|          | 0/50 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 20%|██        | 10/50 [01:18<04:57,  7.45s/it]

{'loss': 0.741, 'learning_rate': 1.0000000000000002e-06, 'epoch': 1.0}


                                               
 20%|██        | 10/50 [01:30<04:57,  7.45s/it]

{'eval_loss': 0.8050757646560669, 'eval_accuracy': 0.375, 'eval_f1': 0.0, 'eval_runtime': 12.2982, 'eval_samples_per_second': 3.253, 'eval_steps_per_second': 0.081, 'epoch': 1.0}


 40%|████      | 20/50 [02:55<04:25,  8.85s/it]

{'loss': 0.7121, 'learning_rate': 2.0000000000000003e-06, 'epoch': 2.0}


                                               
 40%|████      | 20/50 [03:05<04:25,  8.85s/it]

{'eval_loss': 0.7745421528816223, 'eval_accuracy': 0.375, 'eval_f1': 0.0, 'eval_runtime': 10.2105, 'eval_samples_per_second': 3.918, 'eval_steps_per_second': 0.098, 'epoch': 2.0}


 60%|██████    | 30/50 [04:22<02:35,  7.79s/it]

{'loss': 0.71, 'learning_rate': 3e-06, 'epoch': 3.0}


                                               
 60%|██████    | 30/50 [04:35<02:35,  7.79s/it]

{'eval_loss': 0.7279138565063477, 'eval_accuracy': 0.425, 'eval_f1': 0.14814814814814814, 'eval_runtime': 12.6072, 'eval_samples_per_second': 3.173, 'eval_steps_per_second': 0.079, 'epoch': 3.0}


 80%|████████  | 40/50 [06:02<01:23,  8.32s/it]

{'loss': 0.681, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}


                                               
 80%|████████  | 40/50 [06:13<01:23,  8.32s/it]

{'eval_loss': 0.7155649065971375, 'eval_accuracy': 0.475, 'eval_f1': 0.5116279069767442, 'eval_runtime': 11.5777, 'eval_samples_per_second': 3.455, 'eval_steps_per_second': 0.086, 'epoch': 4.0}


100%|██████████| 50/50 [07:28<00:00,  7.47s/it]

{'loss': 0.6452, 'learning_rate': 5e-06, 'epoch': 5.0}


                                               
100%|██████████| 50/50 [07:39<00:00,  7.47s/it]

{'eval_loss': 0.6888055801391602, 'eval_accuracy': 0.55, 'eval_f1': 0.6538461538461539, 'eval_runtime': 10.6295, 'eval_samples_per_second': 3.763, 'eval_steps_per_second': 0.094, 'epoch': 5.0}


100%|██████████| 50/50 [07:42<00:00,  9.24s/it]

{'train_runtime': 462.14, 'train_samples_per_second': 1.731, 'train_steps_per_second': 0.108, 'train_loss': 0.6978430080413819, 'epoch': 5.0}





TrainOutput(global_step=50, training_loss=0.6978430080413819, metrics={'train_runtime': 462.14, 'train_samples_per_second': 1.731, 'train_steps_per_second': 0.108, 'train_loss': 0.6978430080413819, 'epoch': 5.0})

In [16]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

100%|██████████| 23/23 [08:32<00:00, 22.30s/it]

(5656, 2) (5656,)





In [17]:
predictions

PredictionOutput(predictions=array([[-0.16926989,  0.06104108],
       [-0.27627718,  0.00562283],
       [-0.1999255 ,  0.01546691],
       ...,
       [-0.14601804, -0.01470591],
       [-0.27364278,  0.02283587],
       [-0.23677611, -0.03390081]], dtype=float32), label_ids=array([1, 1, 1, ..., 1, 1, 1], dtype=int64), metrics={'test_loss': 0.6770923137664795, 'test_accuracy': 0.5797383309759547, 'test_f1': 0.6955296528756244, 'test_runtime': 539.4233, 'test_samples_per_second': 10.485, 'test_steps_per_second': 0.043})

In [39]:
# Apply softmax to convert logits to probabilities
probabilities = softmax(predictions.predictions, axis=1)

# Get the predicted class labels
predicted_labels = np.argmax(probabilities, axis=1)

print("Probabilities:\n", probabilities)
print("Predicted Labels:\n", predicted_labels)

Probabilities:
 [[0.44267544 0.5573246 ]
 [0.42998803 0.570012  ]
 [0.4463591  0.55364084]
 ...
 [0.46721905 0.53278095]
 [0.4264185  0.57358146]
 [0.44945443 0.5505456 ]]
Predicted Labels:
 [1 1 1 ... 1 1 1]


In [40]:
true_labels = tokenized_datasets["test"]["label"]

accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.5797383309759547
Precision: 0.650455198850024
Recall: 0.5797383309759547
F1 Score: 0.561865833073609


In [24]:
data = {'Predicted_Labels': predicted_labels}
df = pd.DataFrame(data)
df.to_csv('data/predicted_labels.csv', index=False)