In [1]:
import os
import numpy as np
import pandas as pd 
from tqdm import tqdm
from transformers import BertTokenizer, Trainer, BertForSequenceClassification, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
from torch.utils.data import DataLoader, TensorDataset

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

  from .autonotebook import tqdm as notebook_tqdm


### load dataset

In [2]:
RANDOM_STATE = 2023
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
SEQUENCE_LENGTH = 300
CURRENT_DIRECTORY = os.getcwd()
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

In [3]:
DATASETS = [
    {
        'name': "training.1600000.processed.noemoticon.csv",
        'api': "kazanova/sentiment140",
        'location': "data",
        'url': "https://www.kaggle.com/datasets/kazanova/sentiment140"
    },
    {
        'name': "reddit_wsb.csv",
        'api': "gpreda/reddit-wallstreetsbets-posts",
        'location': "data",
        'url': "https://www.kaggle.com/datasets/gpreda/reddit-wallstreetsbets-posts"
    },
    {
        'name': "stock_data.csv",
        'api': "yash612/stockmarket-sentiment-dataset",
        'location': "data",
        'url': "https://www.kaggle.com/datasets/yash612/stockmarket-sentiment-dataset"
    },
    {
        'name': "stock_tweets.csv",
        'api': "equinxx/stock-tweets-for-sentiment-analysis-and-prediction",
        'location': "data/unorganized/Stock Tweets for Sentiment Analysis and Prediction",
        'url': "https://www.kaggle.com/datasets/equinxx/stock-tweets-for-sentiment-analysis-and-prediction"
    },
    {
        'name': "Company_Tweet.csv",
        'api': "omermetinn/tweets-about-the-top-companies-from-2015-to-2020",
        'location': "data/unorganized/Tweets about the Top Companies from 2015 to 2020",
        'url': "https://www.kaggle.com/datasets/omermetinn/tweets-about-the-top-companies-from-2015-to-2020"
    },
    {
        'name': "stockerbot-export.csv",
        'api': "davidwallach/financial-tweets",
        'location': "data/unorganized/Financial Tweets",
        'url': "https://www.kaggle.com/datasets/davidwallach/financial-tweets"
    }
]

for dataset_info in DATASETS:
    dataset_name = dataset_info['name']
    dataset_location = dataset_info['location']

    if not os.path.exists(os.path.join(dataset_info['location'], dataset_name)):
        print(f"Downloading {dataset_name} from {dataset_info['url']} to {dataset_location}...")
        kaggle.api.dataset_download_files(dataset_info['api'], path=dataset_location, unzip=True)


"sentiment.csv" from https://github.com/surge-ai/stock-sentiment

In [26]:
dataset_filename = {
    '0': ("training.1600000.processed.noemoticon.csv", ["target", "ids", "date", "flag", "user", "text"]),
    '1': ("stock_data.csv", ["text", "target"]),
    '2': ("sentiment.csv", ["Stock Ticker", "Tweet Text", "Sentiment", "Tweet URL"])
}

# dataset_path = os.path.join("", "data", dataset_filename["0"][0])
# df = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["0"][1])
# print(df['target'].value_counts())

dataset_path = os.path.join("", "data", dataset_filename["1"][0])
train_df = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["1"][1], skiprows=1)
train_df['target'] = train_df['target'].replace({-1: 0, 1: 1})
print(train_df['target'].value_counts())

dataset_path = os.path.join("", "data", dataset_filename["2"][0])
test_df = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["2"][1], skiprows=1)
test_df.rename(columns={"Sentiment": "target"}, inplace=True)
test_df['target'] = test_df['target'].replace({'Negative': 0, 'Positive': 1})
test_df = test_df[['target', 'Tweet Text']]
test_df.rename(columns={'Tweet Text': 'text'}, inplace=True)
print(test_df['target'].value_counts())

target
1    3685
0    2106
Name: count, dtype: int64
target
1    327
0    173
Name: count, dtype: int64


In [6]:
# label_mapping = {0: 0, 4: 1}
# test_df['target'] = test_df['target'].map(label_mapping)

In [27]:
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
print(train_df.shape, val_df.shape, test_df.shape)

(4632, 2) (1159, 2) (500, 2)


In [28]:
def preprocess_and_tokenize(tokenizer, texts, labels, batch_size=32):
    texts = texts.tolist()
    labels = labels.tolist()

    tokenized_texts = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)

    input_ids = tokenized_texts["input_ids"]
    attention_mask = tokenized_texts["attention_mask"]
    labels = torch.tensor(labels)

    dataset = TensorDataset(input_ids, attention_mask, labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    return dataloader

tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-pretrain')

train_dataloader = preprocess_and_tokenize(tokenizer, train_df['text'], train_df['target'])
val_dataloader = preprocess_and_tokenize(tokenizer, val_df['text'], val_df['target'])
test_dataloader = preprocess_and_tokenize(tokenizer, test_df['text'], test_df['target'])

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-pretrain', num_labels=2)
model.to(device) 

tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-pretrain')

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 10
best_f1 = 0.0

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    with tqdm(train_dataloader, unit="batch") as t:
        for batch in t:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            t.set_postfix(loss=loss.item())

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} - Average Loss: {average_loss:.4f}")

    model.eval()
    val_predictions = []
    val_targets = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, unit="batch"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            predicted_class = torch.round(torch.sigmoid(logits)).cpu().numpy()
            val_predictions.extend(predicted_class)
            val_targets.extend(labels.cpu().numpy())

    val_predictions = [int(round(val[0])) for val in val_predictions]

    accuracy = accuracy_score(val_targets, val_predictions)
    precision = precision_score(val_targets, val_predictions)
    recall = recall_score(val_targets, val_predictions)
    f1 = f1_score(val_targets, val_predictions)
    roc_auc = roc_auc_score(val_targets, val_predictions)

    print(f"Validation - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, ROC AUC: {roc_auc:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        model.save_pretrained('best_model')
        tokenizer.save_pretrained('best_model')


best_model = BertForSequenceClassification.from_pretrained('best_model')
best_model.to(device)  

best_tokenizer = BertTokenizer.from_pretrained('best_model')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at yiyanghkust/finbert-pretrain and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 145/145 [26:24<00:00, 10.92s/batch, loss=0.521]


Epoch 1 - Average Loss: 0.5496


  3%|▎         | 1/37 [00:11<06:53, 11.50s/batch]


KeyboardInterrupt: 

In [12]:
def evaluate_model_on_test(model, test_dataloader):
    model.eval()
    val_predictions = []
    val_targets = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, unit="batch"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            predicted_class = torch.round(torch.sigmoid(logits)).cpu().numpy()
            val_predictions.extend(predicted_class)
            val_targets.extend(labels.cpu().numpy())

    val_predictions = [int(round(val[0])) for val in val_predictions]

    accuracy = accuracy_score(val_targets, val_predictions)
    precision = precision_score(val_targets, val_predictions)
    recall = recall_score(val_targets, val_predictions)
    f1 = f1_score(val_targets, val_predictions)
    roc_auc = roc_auc_score(val_targets, val_predictions)

    print(f"Validation - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, ROC AUC: {roc_auc:.4f}")

evaluate_model_on_test(best_model, test_dataloader)

Testing - Accuracy: 0.2690, Precision: 0.2993, Recall: 0.1111, F1 Score: 0.1621, ROC AUC: 0.3281
