In [78]:
import pandas as pd
import numpy as np
from scipy.special import softmax
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, DataCollatorWithPadding, BertForSequenceClassification

from datasets import load_metric
from datasets import load_dataset
from datasets import load_from_disk

import helper_data, helper_model

In [79]:
import torch
torch.cuda.empty_cache()

In [80]:
import os
print(os.getcwd())

d:\Cornell\course\CS6386\Analyzing-the-Correlation-Between-Retail-Traders--Sentiments-and-Equity-Market-Movements\Sentiment_Analysis


In [81]:
# !pip install datasets

In [82]:
# raw_datasets = load_from_disk("data")
raw_datasets = load_from_disk("data/finetune_data/")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["text"], padding=True, truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # pad all the examples to the length of the longest element when we batch elements together — dynamic padding.

In [83]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5656
    })
})

In [84]:
raw_datasets['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 8
})

In [85]:
tokenized_datasets['train']['text']

["We are honored to be acknowledged for our commitment to the industry , especially in Asia Pacific . ''",
 "Intensified competition in the streaming market sends $ROKU's stock tumbling, frustrating investors.",
 "Investor confidence dwindles as $GE's stock continues its downward trajectory, reflecting company struggles.",
 "$AMC's stock price experiences extreme volatility, leaving investors uncertain about the company's future.",
 'Morrisons finance chief to fill gap as CEO leaves early',
 "$AMZN's Prime Day is coming up. Expecting a sales frenzy. ????",
 'Lots of metal stocks look like a 5-10% run awaits. Long $SSRI,$PAAS and $NEM but most high quality names should go if metal prices cooperate',
 "The Lithuanian beer market made up 14.41 million liters in January , a rise of 0.8 percent from the year-earlier figure , the Lithuanian Brewers ' Association reporting citing the results from its members ."]

In [86]:
metric = load_metric("glue", "mrpc")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [87]:
# !pip install transformers[torch]
# !pip install accelerate -U

In [88]:
# !pip install accelerate==0.24.0

In [89]:
import accelerate
print(accelerate.__version__)

0.24.0


In [90]:
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=256,
    # During the first 500 training steps, the learning rate gradually increases from 0 (or a small base rate) to the specified learning rate.
    # This gradual increase helps in stabilizing the training process and often leads to better performance, as it prevents the model from making too large updates too quickly.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='logs',
    logging_steps=10,  # how frequently the training progress is logged
    save_strategy="epoch",  # Set save strategy to match evaluation strategy
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",  # disable wandb
    # fp16=True,  # Enable mixed precision training
)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [91]:
trainer.train()

  0%|          | 0/5 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                             
 20%|██        | 1/5 [00:02<00:07,  2.00s/it]

{'eval_loss': 0.7398532032966614, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_runtime': 0.1198, 'eval_samples_per_second': 16.691, 'eval_steps_per_second': 8.346, 'epoch': 1.0}


                                             
 40%|████      | 2/5 [00:06<00:10,  3.54s/it]

{'eval_loss': 0.739892303943634, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_runtime': 0.1334, 'eval_samples_per_second': 14.991, 'eval_steps_per_second': 7.495, 'epoch': 2.0}


                                             
 60%|██████    | 3/5 [00:11<00:08,  4.24s/it]

{'eval_loss': 0.7399743795394897, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_runtime': 0.1277, 'eval_samples_per_second': 15.658, 'eval_steps_per_second': 7.829, 'epoch': 3.0}


                                             
 80%|████████  | 4/5 [00:16<00:04,  4.45s/it]

{'eval_loss': 0.7401058077812195, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_runtime': 0.1614, 'eval_samples_per_second': 12.389, 'eval_steps_per_second': 6.194, 'epoch': 4.0}


                                             
100%|██████████| 5/5 [00:21<00:00,  4.63s/it]

{'eval_loss': 0.7402050495147705, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_runtime': 0.1405, 'eval_samples_per_second': 14.237, 'eval_steps_per_second': 7.119, 'epoch': 5.0}


100%|██████████| 5/5 [00:24<00:00,  4.81s/it]

{'train_runtime': 24.0668, 'train_samples_per_second': 1.662, 'train_steps_per_second': 0.208, 'train_loss': 0.7106709957122803, 'epoch': 5.0}





TrainOutput(global_step=5, training_loss=0.7106709957122803, metrics={'train_runtime': 24.0668, 'train_samples_per_second': 1.662, 'train_steps_per_second': 0.208, 'train_loss': 0.7106709957122803, 'epoch': 5.0})

In [92]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

100%|██████████| 23/23 [09:23<00:00, 24.50s/it]

(5656, 2) (5656,)





In [93]:
predictions

PredictionOutput(predictions=array([[-0.14666545,  0.05393648],
       [-0.2648054 ,  0.04353234],
       [ 0.01113193,  0.12815395],
       ...,
       [-0.17355008, -0.07594016],
       [-0.09329125,  0.07237624],
       [-0.1479097 ,  0.00770286]], dtype=float32), label_ids=array([1, 1, 1, ..., 1, 1, 1], dtype=int64), metrics={'test_loss': 0.6704440712928772, 'test_accuracy': 0.6417963224893918, 'test_f1': 0.7816810344827586, 'test_runtime': 587.269, 'test_samples_per_second': 9.631, 'test_steps_per_second': 0.039})

In [94]:
# Apply softmax to convert logits to probabilities
probabilities = softmax(predictions.predictions, axis=1)

# Get the predicted class labels
predicted_labels = np.argmax(probabilities, axis=1)

print("Probabilities:\n", probabilities)
print("Predicted Labels:\n", predicted_labels)

Probabilities:
 [[0.45001704 0.549983  ]
 [0.42352054 0.57647943]
 [0.47077784 0.5292222 ]
 ...
 [0.47561684 0.5243831 ]
 [0.45867756 0.5413224 ]
 [0.46117514 0.5388248 ]]
Predicted Labels:
 [1 1 1 ... 1 1 1]


In [95]:
true_labels = tokenized_datasets["test"]["label"]

accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.6417963224893918
Precision: 0.6422879404993802
Recall: 0.6417963224893918
F1 Score: 0.5031507467243126


In [96]:
helper_model.print_wrong_classifications(predicted_labels, true_labels, raw_datasets['test'])

Predicted Label: 1, True Label: 0
Original Text: AAP - user if so then the current downtrend will break. Otherwise just a short-term correction in med-term downtrend.

Predicted Label: 1, True Label: 0
Original Text: Monday's relative weakness. NYX WIN TIE TAP ICE INT BMC AON C CHK BIIB  

Predicted Label: 1, True Label: 0
Original Text: Won't believe AAP uptrend is back until it crosses above MA(50)

Predicted Label: 1, True Label: 0
Original Text:  red, not ready for break out.

Predicted Label: 1, True Label: 0
Original Text: user: been adding VXY long off the bottom today for trade, also got WPI near low

Predicted Label: 1, True Label: 0
Original Text: NKD looking like a good short. Failed to break price level resistance at 116 today.

Predicted Label: 1, True Label: 0
Original Text: Too early to short into this move. Stock Market needs a few days to settle down. #Patience  COH BWD DT AAP PAY

Predicted Label: 1, True Label: 0
Original Text: PHM PulteGroup Option Bear bets 1.5 Mil

In [97]:
data = {'Predicted_Labels': predicted_labels}
df = pd.DataFrame(data)
df.to_csv('data/predicted_labels.csv', index=False)