In [60]:
import numpy as np
from scipy.special import softmax

from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification

from datasets import load_metric
from datasets import load_dataset
from datasets import load_from_disk

from sklearn.metrics import accuracy_score, recall_score, f1_score

from transformers import AutoTokenizer, DataCollatorWithPadding, BertForSequenceClassification

In [61]:
import torch
torch.cuda.empty_cache()

In [62]:
import os
print(os.getcwd())

d:\Cornell\course\CS6386\Analyzing-the-Correlation-Between-Retail-Traders--Sentiments-and-Equity-Market-Movements\Sentiment_Analysis


In [63]:
# !pip install datasets

In [64]:
# raw_datasets = load_from_disk("data")
raw_datasets = load_from_disk("data/finetune_data/")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["text"], padding=True, truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # pad all the examples to the length of the longest element when we batch elements together — dynamic padding.

Map: 100%|██████████| 8/8 [00:00<00:00, 369.99 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 124.10 examples/s]
Map: 100%|██████████| 5656/5656 [00:00<00:00, 14789.49 examples/s]


In [65]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5656
    })
})

In [66]:
raw_datasets['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 8
})

In [67]:
raw_datasets['train']['label']

[1, 1, 0, 0, 0, 0, 1, 1]

In [68]:
tokenized_datasets['train']['text']

['According to Deputy MD Pekka Silvennoinen the aim is double turnover over the next three years .',
 "$AAPL's product announcement is live now. Hoping for some game-changing news! ????",
 "I'm surprised the indices are not reacting to $WMT so far. That is a very ugly report.",
 'According to Swedish authorities , traces of the very toxic osmium tetroxide have been found on the coast of Per+Ã±meri , the Northernmost part of the Gulf of Bothnia .',
 "Controversial decisions by $FB's leadership lead to a sharp decline in the social media giant's stock.",
 "Concerns about $HCA's exposure to changes in healthcare policy and regulatory scrutiny impact its stock.",
 'Componenta increased its stake in Turkish steel company Doktas Dokumculuk Ticaret ve Sanayi A.S. to 92.6 pct stake in March 2007 .',
 "The new factory working model and reorganisations would decrease Nokian Tyres ' costs in the factory by EUR 30 million ( USD 38.7 m ) ."]

In [69]:
metric = load_metric("glue", "mrpc")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [70]:
# !pip install transformers[torch]
# !pip install accelerate -U

In [71]:
# !pip install accelerate==0.24.0

In [72]:
import accelerate
print(accelerate.__version__)

0.24.0


In [73]:
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=256,
    # During the first 500 training steps, the learning rate gradually increases from 0 (or a small base rate) to the specified learning rate.
    # This gradual increase helps in stabilizing the training process and often leads to better performance, as it prevents the model from making too large updates too quickly.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='logs',
    logging_steps=10,  # how frequently the training progress is logged
    save_strategy="epoch",  # Set save strategy to match evaluation strategy
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # fp16=True,  # Enable mixed precision training
)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
trainer.train()

 20%|██        | 10/50 [32:24<2:09:36, 194.42s/it]
  0%|          | 0/5 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                             
 20%|██        | 1/5 [00:02<00:09,  2.39s/it]

{'eval_loss': 0.8058676719665527, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_runtime': 0.1098, 'eval_samples_per_second': 18.208, 'eval_steps_per_second': 9.104, 'epoch': 1.0}


                                             
 40%|████      | 2/5 [00:06<00:09,  3.07s/it]

{'eval_loss': 0.8058640360832214, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_runtime': 0.1071, 'eval_samples_per_second': 18.682, 'eval_steps_per_second': 9.341, 'epoch': 2.0}


                                             
 60%|██████    | 3/5 [00:09<00:06,  3.24s/it]

{'eval_loss': 0.8058950304985046, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_runtime': 0.0997, 'eval_samples_per_second': 20.064, 'eval_steps_per_second': 10.032, 'epoch': 3.0}


                                             
 80%|████████  | 4/5 [00:12<00:03,  3.32s/it]

{'eval_loss': 0.8059375882148743, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_runtime': 0.1012, 'eval_samples_per_second': 19.765, 'eval_steps_per_second': 9.883, 'epoch': 4.0}


                                             
100%|██████████| 5/5 [00:17<00:00,  3.70s/it]

{'eval_loss': 0.8060104250907898, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_runtime': 0.1164, 'eval_samples_per_second': 17.175, 'eval_steps_per_second': 8.588, 'epoch': 5.0}


100%|██████████| 5/5 [00:20<00:00,  4.02s/it]

{'train_runtime': 20.1157, 'train_samples_per_second': 1.988, 'train_steps_per_second': 0.249, 'train_loss': 0.7375632286071777, 'epoch': 5.0}





TrainOutput(global_step=5, training_loss=0.7375632286071777, metrics={'train_runtime': 20.1157, 'train_samples_per_second': 1.988, 'train_steps_per_second': 0.249, 'train_loss': 0.7375632286071777, 'epoch': 5.0})

In [76]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

100%|██████████| 23/23 [08:04<00:00, 21.05s/it]

(5656, 2) (5656,)





In [77]:
predictions

PredictionOutput(predictions=array([[-0.2597048 ,  0.5020186 ],
       [-0.3130621 ,  0.5282108 ],
       [-0.17977655,  0.5649787 ],
       ...,
       [-0.2120085 ,  0.58435404],
       [-0.19780362,  0.5197418 ],
       [-0.29525024,  0.5603512 ]], dtype=float32), label_ids=array([1, 1, 1, ..., 1, 1, 1], dtype=int64), metrics={'test_loss': 0.6593472361564636, 'test_accuracy': 0.6423267326732673, 'test_f1': 0.7822155237377544, 'test_runtime': 508.0984, 'test_samples_per_second': 11.132, 'test_steps_per_second': 0.045})

In [78]:
# Apply softmax to convert logits to probabilities
probabilities = softmax(predictions.predictions, axis=1)

# Get the predicted class labels
predicted_labels = np.argmax(probabilities, axis=1)

print("Probabilities:\n", probabilities)
print("Predicted Labels:\n", predicted_labels)

Probabilities:
 [[0.31827223 0.68172777]
 [0.30126676 0.69873327]
 [0.3219652  0.67803484]
 ...
 [0.31080416 0.6891959 ]
 [0.32793376 0.6720663 ]
 [0.29825914 0.70174086]]
Predicted Labels:
 [1 1 1 ... 1 1 1]


In [79]:
true_labels = tokenized_datasets["test"]["label"]

accuracy = accuracy_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print("Accuracy:", accuracy)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.6423267326732673
Recall: 0.6423267326732673
F1 Score: 0.5024379416087803
