In [49]:
import pandas as pd
import numpy as np
from scipy.special import softmax
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, DataCollatorWithPadding, BertForSequenceClassification

from datasets import load_metric
from datasets import load_dataset
from datasets import load_from_disk

import helper_data, helper_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [50]:
import torch
torch.cuda.empty_cache()

In [51]:
import os
print(os.getcwd())

d:\OneDrive\Cornell\Fall 2023\Analyzing-the-Correlation-Between-Retail-Traders--Sentiments-and-Equity-Market-Movements\Sentiment_Analysis


In [52]:
# !pip install datasets

In [53]:
# raw_datasets = load_from_disk("data")
raw_datasets = load_from_disk("data/finetune_data/")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["text"], padding=True, truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # pad all the examples to the length of the longest element when we batch elements together — dynamic padding.

In [63]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5656
    })
})

In [55]:
raw_datasets['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 8
})

In [56]:
tokenized_datasets['train']['text']

['Day trading $SPY options again. Market volatility is a playground for traders. ????',
 '$AAPL http://stks.co/1LDx Clearing this descending trend line / buy area now',
 "I have taken 2 vacations this Yr and planning 3rd and 4th. There isn't a single location where $ABNB is a better deal than what I find on $BKNG",
 "Shell's $70 Billion BG Deal Meets Shareholder Skepticism",
 "We are now in a position to pursue novel clinical candidates going forward . ''",
 "@asdfLLC. Nice pop today. Told ya we would be green haha. Funny you bring that up. I'm actually working on a story about $NOK at the moment",
 "$OXY is gasping for air. Occidental Petroleum's stock is tied to volatile oil prices.",
 "$TSLA's Elon Musk tweeted something cryptic. Brace for impact! ????"]

In [57]:
metric = load_metric("glue", "mrpc")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [58]:
# !pip install transformers[torch]
# !pip install accelerate -U

In [59]:
# !pip install accelerate==0.24.0

In [60]:
import accelerate
print(accelerate.__version__)

0.24.1


In [61]:
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=256,
    # During the first 500 training steps, the learning rate gradually increases from 0 (or a small base rate) to the specified learning rate.
    # This gradual increase helps in stabilizing the training process and often leads to better performance, as it prevents the model from making too large updates too quickly.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='logs',
    logging_steps=10,  # how frequently the training progress is logged
    save_strategy="epoch",  # Set save strategy to match evaluation strategy
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",  # disable wandb
    fp16=True,  # Enable mixed precision training
)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [62]:
trainer.train()



You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.679443359375, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_runtime': 0.0309, 'eval_samples_per_second': 64.689, 'eval_steps_per_second': 32.344, 'epoch': 1.0}


{'eval_loss': 0.67919921875, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_runtime': 0.0309, 'eval_samples_per_second': 64.687, 'eval_steps_per_second': 32.344, 'epoch': 2.0}


{'eval_loss': 0.6787109375, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_runtime': 0.03, 'eval_samples_per_second': 66.715, 'eval_steps_per_second': 33.357, 'epoch': 3.0}


{'eval_loss': 0.67822265625, 'eval_accuracy': 0.5, 'eval_f1': 0.6666666666666666, 'eval_runtime': 0.0289, 'eval_samples_per_second': 69.152, 'eval_steps_per_second': 34.576, 'epoch': 4.0}


KeyboardInterrupt: 

In [None]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

(5656, 2) (5656,)


In [None]:
predictions

PredictionOutput(predictions=array([[ 0.01392853, -0.2796782 ],
       [-0.04717556, -0.33258662],
       [ 0.06373881, -0.22268581],
       ...,
       [-0.01922658, -0.20969233],
       [-0.0109576 , -0.24027182],
       [-0.03525391, -0.31576952]], dtype=float32), label_ids=array([1, 1, 1, ..., 1, 1, 1], dtype=int64), metrics={'test_loss': 0.7367212772369385, 'test_accuracy': 0.3587340876944837, 'test_f1': 0.005483959418700302, 'test_runtime': 10.4748, 'test_samples_per_second': 539.96, 'test_steps_per_second': 2.196})

In [None]:
# Apply softmax to convert logits to probabilities
probabilities = softmax(predictions.predictions, axis=1)

# Get the predicted class labels
predicted_labels = np.argmax(probabilities, axis=1)

print("Probabilities:\n", probabilities)
print("Predicted Labels:\n", predicted_labels)

Probabilities:
 [[0.5728789  0.4271211 ]
 [0.5708723  0.42912766]
 [0.5711206  0.42887935]
 ...
 [0.547473   0.452527  ]
 [0.5570786  0.44292134]
 [0.5696726  0.43032736]]
Predicted Labels:
 [0 0 0 ... 0 0 0]


In [None]:
true_labels = tokenized_datasets["test"]["label"]

accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.3587340876944837
Precision: 0.7142857142857143
Recall: 0.3587340876944837
F1 Score: 0.1919484106912143


In [None]:
helper_model.print_wrong_classifications(predicted_labels, true_labels, raw_datasets['test'])

Predicted Label: 0, True Label: 1
Original Text: Kickers on my watchlist XIDE TIT SOQ PNK CPW BPZ AJ  trade method 1 or method 2, see prev posts

Predicted Label: 0, True Label: 1
Original Text: user: AAP MOVIE. 55% return for the FEA/GEED indicator just 15 trades for the year.  AWESOME.  

Predicted Label: 0, True Label: 1
Original Text: user I'd be afraid to short AMZN - they are looking like a near-monopoly in eBooks and infrastructure-as-a-service

Predicted Label: 0, True Label: 1
Original Text: MNTA Over 12.00  

Predicted Label: 0, True Label: 1
Original Text: OI  Over 21.37  

Predicted Label: 0, True Label: 1
Original Text: PGNX  Over 3.04  

Predicted Label: 0, True Label: 1
Original Text: GOOG - ower trend line channel test & volume support.   

Predicted Label: 0, True Label: 1
Original Text: AAP will watch tomorrow for ONG entry.

Predicted Label: 0, True Label: 1
Original Text: i'm assuming FCX opens tomorrow above the 34.25 trigger buy. still very much like this setup.  

In [None]:
data = {'Predicted_Labels': predicted_labels}
df = pd.DataFrame(data)
df.to_csv('data/predicted_labels.csv', index=False)