In [None]:
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer

from tqdm import tqdm

from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import string
import time
import os

!pip install -U datasets
from datasets import load_dataset

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [None]:
# Point the TMPDIR environment variable at a scratch directory for this process
# (and any child processes it spawns).
# NOTE(review): presumably meant to redirect temporary/cache files away from the
# default temp location; the path is absolute and machine-specific — confirm it
# exists on the machine running this notebook.
os.environ['TMPDIR'] = '/ephemeral/tplx-dojo-evals/tmpdir'

In [None]:
# 2. Data Preprocessing
class TextPreprocessor:
    """Clean, split and tokenize text-classification data for a BERT model.

    Labels are handed to ``torch.tensor`` unchanged, so the label column must
    already hold numeric class ids; ``encode_labels`` is a pass-through.
    """

    # Absolute tolerance used when checking that the split ratios add up to 1.
    _RATIO_TOLERANCE = 1e-9

    def __init__(self, pretrained_model_name='bert-base-uncased', max_len=50):
        """
        Args:
            pretrained_model_name (str): Tokenizer checkpoint to load.
            max_len (int): Every sample is padded/truncated to this many tokens.
        """
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
        self.max_len = max_len

    def clean_text(self, text):
        """
        Normalizes one text sample.

        Args:
            text (str): Raw text, possibly containing HTML markup.

        Returns:
            str: Text with HTML tags stripped, lower-cased, and with every
            character of ``string.punctuation`` removed.
        """
        # Remove HTML tags
        text = BeautifulSoup(text, "html.parser").get_text()

        # Convert to lowercase
        text = text.lower()

        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))

        return text

    def split_data(self, df, text_column, label_column, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
        """
        Splits a DataFrame into stratified train/validation/test subsets.

        Args:
            df (pd.DataFrame): Full dataset.
            text_column (str): Column with text data (unused here; kept for a uniform signature).
            label_column (str): Column used for stratification.
            train_ratio (float): Fraction of rows for training.
            val_ratio (float): Fraction of rows for validation.
            test_ratio (float): Fraction of rows for testing.

        Returns:
            tuple: (train_data, val_data, test_data) DataFrames.

        Raises:
            AssertionError: If the three ratios do not sum to 1.
        """
        # Compare with a tolerance: exact float equality rejects valid inputs
        # such as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
        # An explicit raise (rather than `assert`) also survives `python -O`.
        total = train_ratio + val_ratio + test_ratio
        if abs(total - 1.0) > self._RATIO_TOLERANCE:
            raise AssertionError("Ratios must sum to 1.")

        # Split into train and temp (validation + test)
        train_data, temp_data = train_test_split(
            df, test_size=val_ratio + test_ratio, random_state=42, stratify=df[label_column]
        )

        # Split temp into validation and test
        val_data, test_data = train_test_split(
            temp_data, test_size=test_ratio / (val_ratio + test_ratio), random_state=42, stratify=temp_data[label_column]
        )

        return train_data, val_data, test_data

    def encode_labels(self, df, label_column):
        """
        Hook for converting labels to numeric values.

        Currently a pass-through: the DataFrame is returned unchanged, so the
        label column must already be numeric.

        Args:
            df (pd.DataFrame): Input DataFrame.
            label_column (str): Column containing the labels.

        Returns:
            pd.DataFrame: The same DataFrame that was passed in.
        """
        return df

    def tokenize_and_encode(self, texts):
        """
        Tokenizes and encodes the texts using BERT's tokenizer.

        Args:
            texts (list of str): List of text samples.

        Returns:
            dict-like: Tokenizer output holding ``input_ids`` and
            ``attention_mask`` as PyTorch tensors, each sample padded or
            truncated to ``self.max_len`` tokens.
        """
        return self.tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

    def _clean_frame(self, df, text_column):
        """Return a copy of ``df`` whose ``text_column`` has been run through ``clean_text``."""
        # Work on a copy so the caller's DataFrame is never modified.
        cleaned = df.copy()
        cleaned[text_column] = cleaned[text_column].apply(self.clean_text)
        return cleaned

    def preprocess_dataset(self, df_or_dfs, text_column, label_column):
        """
        Cleans the text, obtains train/validation/test splits and tokenizes them.

        The input DataFrames are not modified; cleaning is applied to copies.

        Args:
            df_or_dfs (pd.DataFrame | list | tuple): Either a single DataFrame,
                which is split with ``split_data`` using its default ratios, or
                exactly three DataFrames ordered as (train, validation, test).
            text_column (str): Column with text data.
            label_column (str): Column with numeric labels.

        Returns:
            tuple: Nine items ordered as
            (train_input_ids, train_attention_mask, train_labels,
             val_input_ids, val_attention_mask, val_labels,
             test_input_ids, test_attention_mask, test_labels).

        Raises:
            ValueError: If ``df_or_dfs`` is neither a DataFrame nor a
                list/tuple of exactly three DataFrames.
        """
        if isinstance(df_or_dfs, pd.DataFrame):
            print("Cleaning text data...")
            df = self._clean_frame(df_or_dfs, text_column)

            print("Encoding labels...")
            df = self.encode_labels(df, label_column)

            print("Splitting dataset into train, validation, and test sets...")
            train_data, val_data, test_data = self.split_data(df, text_column, label_column)
        else:
            if not isinstance(df_or_dfs, (list, tuple)) or len(df_or_dfs) != 3:
                raise ValueError("df_or_dfs must be a DataFrame or a list of DataFrames of size 3.")

            # Same clean -> encode steps for each pre-made split, in the
            # (train, validation, test) order supplied by the caller.
            prepared = []
            for frame in df_or_dfs:
                print("Cleaning text data...")
                frame = self._clean_frame(frame, text_column)
                print("Encoding labels...")
                frame = self.encode_labels(frame, label_column)
                prepared.append(frame)
            train_data, val_data, test_data = prepared

        print("Tokenizing and encoding text data...")
        train_encodings = self.tokenize_and_encode(train_data[text_column].tolist())
        val_encodings = self.tokenize_and_encode(val_data[text_column].tolist())
        test_encodings = self.tokenize_and_encode(test_data[text_column].tolist())

        train_labels = torch.tensor(train_data[label_column].values)
        val_labels = torch.tensor(val_data[label_column].values)
        test_labels = torch.tensor(test_data[label_column].values)

        print(f"Train set: {len(train_data)} samples")
        print(f"Validation set: {len(val_data)} samples")
        print(f"Test set: {len(test_data)} samples")

        return (train_encodings['input_ids'], train_encodings['attention_mask'], train_labels,
                val_encodings['input_ids'], val_encodings['attention_mask'], val_labels,
                test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Only parameters whose ``requires_grad`` flag is truthy are counted.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

In [None]:
def get_pretrained_bert_model(model_name="bert-base-uncased", num_labels=3):
    """Load a pretrained checkpoint with a sequence-classification head.

    Args:
        model_name (str): Checkpoint identifier passed to ``from_pretrained``
            (e.g. "bert-base-uncased" or "bert-large-uncased").
        num_labels (int): Number of output classes for the classification head.

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``.
    """
    return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

In [None]:
# Sanity check: instantiate the classifier once and print how many trainable
# parameters it has.
# Note: main() builds its own model instance; the module-level `model` created
# here is not what main() trains.
model = get_pretrained_bert_model()
print(count_parameters(model))

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


109484547


In [None]:
def create_dataloader(input_ids, attention_mask, labels, batch_size, shuffle=True):
    """Wrap three aligned tensors in a ``TensorDataset`` and return a ``DataLoader``.

    Args:
        input_ids: Tensor of token ids; first dimension indexes samples.
        attention_mask: Tensor aligned with ``input_ids`` along the first dimension.
        labels: Tensor of targets aligned with ``input_ids`` along the first dimension.
        batch_size (int): Number of samples per batch.
        shuffle (bool): Whether to reshuffle the samples every epoch. Defaults
            to True (the previous hard-coded behaviour); pass False for
            validation/test loaders where a stable order is preferable.

    Returns:
        DataLoader: Yields ``(input_ids, attention_mask, labels)`` batches.
    """
    dataset = TensorDataset(input_ids, attention_mask, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
def train_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Every batch is expected to unpack into (input_ids, attention_mask, labels).
    The model is called with ``labels`` so that it returns its own loss, which
    is back-propagated before a single optimizer step.

    Returns:
        The sum of per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    progress = tqdm(dataloader, desc="Training", leave=False)
    for batch in progress:
        # Move the three batch tensors onto the target device.
        ids, mask, targets = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()

        # The forward pass computes the loss internally because labels are supplied.
        step_loss = model(input_ids=ids, attention_mask=mask, labels=targets).loss
        step_loss.backward()
        optimizer.step()

        running_loss += step_loss.item()

    return running_loss / len(dataloader)


def eval_model(model, dataloader, device):
    """Evaluate ``model`` over ``dataloader`` with gradients disabled.

    Every batch is expected to unpack into (input_ids, attention_mask, labels).

    Returns:
        tuple: (average loss per batch, accuracy over all samples,
        list of predicted class ids, list of true labels).
    """
    model.eval()

    loss_sum = 0
    n_correct = 0
    n_seen = 0
    predictions, targets = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            ids, mask, y_true = (tensor.to(device) for tensor in batch)

            # Labels are supplied so the model also reports its loss.
            result = model(input_ids=ids, attention_mask=mask, labels=y_true)
            loss_sum += result.loss.item()

            # The highest-scoring logit along the last axis is the predicted class.
            y_pred = result.logits.argmax(dim=-1)

            n_correct += (y_pred == y_true).sum().item()
            n_seen += y_true.size(0)

            predictions.extend(y_pred.cpu().numpy())
            targets.extend(y_true.cpu().numpy())

    return loss_sum / len(dataloader), n_correct / n_seen, predictions, targets

In [None]:
def main():
    """Fine-tune BERT on the financial-sentiment dataset and report test metrics.

    Steps: load the train/valid/test splits, clean and tokenize them, train for
    a fixed number of epochs while checkpointing the weights with the best
    validation accuracy, then reload that checkpoint and print loss, accuracy,
    a classification report and a confusion matrix for the test split.

    Side effects: seeds PyTorch's global RNG, downloads the dataset and
    checkpoint if they are not cached, and writes ``best_bert.pt`` to the
    current working directory.
    """
    start_time = time.time()

    # Seed PyTorch so classifier-head initialisation and batch shuffling are
    # repeatable (best effort; some GPU kernels remain non-deterministic).
    torch.manual_seed(42)

    # 1) Load dataset splits and convert them to pandas DataFrames
    dataset_name = "sjyuxyz/financial-sentiment-analysis"
    train_data = pd.DataFrame(load_dataset(dataset_name, split="train"))
    valid_data = pd.DataFrame(load_dataset(dataset_name, split="valid"))
    test_data = pd.DataFrame(load_dataset(dataset_name, split="test"))

    # 2) Clean + tokenize.
    # preprocess_dataset unpacks its list as (train, validation, test). The
    # order matters: passing the test split in the validation slot would select
    # the checkpoint on test data and report metrics on the validation split.
    preprocessor = TextPreprocessor(max_len=50)
    (train_input_ids, train_attention_mask, train_labels,
     val_input_ids, val_attention_mask, val_labels,
     test_input_ids, test_attention_mask, test_labels) = preprocessor.preprocess_dataset(
        [train_data, valid_data, test_data], text_column='text', label_column='label'
    )

    # 3) Build data loaders and the model
    batch_size = 64

    train_loader = create_dataloader(train_input_ids, train_attention_mask, train_labels, batch_size)
    val_loader = create_dataloader(val_input_ids, val_attention_mask, val_labels, batch_size)
    test_loader = create_dataloader(test_input_ids, test_attention_mask, test_labels, batch_size)

    model = get_pretrained_bert_model()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # 4) Define an optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

    # 5) Training loop
    num_epochs = 8
    checkpoint_path = "best_bert.pt"
    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint; otherwise the reload in step 6 could find no file.
    best_val_accuracy = float("-inf")

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")

        train_loss = train_epoch(model, train_loader, optimizer, device)
        val_loss, val_accuracy, _, _ = eval_model(model, val_loader, device)

        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val   Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

        # If validation improves, save the best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), checkpoint_path)
            print("Model saved.")

    # 6) Evaluate on Test set
    print("\nEvaluating on Test Set with best model...")
    # map_location keeps the reload working even if the checkpoint was written
    # from a different device than the one currently in use.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    test_loss, test_accuracy, test_preds, test_labels_ = eval_model(model, test_loader, device)
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

    # Show classification report & confusion matrix
    print("\nClassification Report:")
    print(classification_report(test_labels_, test_preds, digits=4))

    print("Confusion Matrix:")
    print(confusion_matrix(test_labels_, test_preds))

    # Print # params
    print(f"\nTotal trainable params: {count_parameters(model)}")

    end_time = time.time()
    elapsed_time = end_time - start_time  # In seconds

    print(f"\nTotal Time Elapsed: {elapsed_time:.2f} seconds ({elapsed_time / 60:.2f} minutes)")


In [None]:
# Run the full pipeline: load data, fine-tune, and print test-set metrics.
main()

README.md:   0%|          | 0.00/582 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.58M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/828k [00:00<?, ?B/s]

valid-00000-of-00001.parquet:   0%|          | 0.00/813k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/80029 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10004 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/10004 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Cleaning text data...



If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  text = BeautifulSoup(text, "html.parser").get_text()


Encoding labels...
Cleaning text data...
Encoding labels...
Cleaning text data...
Encoding labels...
Tokenizing and encoding text data...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train set: 80029 samples
Validation set: 10004 samples
Test set: 10004 samples

Epoch 1/8




  Train Loss: 0.4636
  Val   Loss: 0.3011, Val Accuracy: 0.8834
Model saved.

Epoch 2/8




  Train Loss: 0.2237
  Val   Loss: 0.2725, Val Accuracy: 0.9050
Model saved.

Epoch 3/8




  Train Loss: 0.1284
  Val   Loss: 0.2995, Val Accuracy: 0.9056
Model saved.

Epoch 4/8




  Train Loss: 0.0732
  Val   Loss: 0.3582, Val Accuracy: 0.9133
Model saved.

Epoch 5/8




  Train Loss: 0.0495
  Val   Loss: 0.3985, Val Accuracy: 0.9038

Epoch 6/8




  Train Loss: 0.0351
  Val   Loss: 0.4141, Val Accuracy: 0.9072

Epoch 7/8




  Train Loss: 0.0281
  Val   Loss: 0.4180, Val Accuracy: 0.9096

Epoch 8/8




  Train Loss: 0.0241
  Val   Loss: 0.4532, Val Accuracy: 0.9113

Evaluating on Test Set with best model...


                                                             

Test Loss: 0.3439, Test Accuracy: 0.9148

Classification Report:
              precision    recall  f1-score   support

           0     0.9054    0.8687    0.8867      2094
           1     0.9189    0.9426    0.9306      5038
           2     0.9140    0.8997    0.9068      2872

    accuracy                         0.9148     10004
   macro avg     0.9128    0.9037    0.9080     10004
weighted avg     0.9147    0.9148    0.9146     10004

Confusion Matrix:
[[1819  198   77]
 [ 123 4749  166]
 [  67  221 2584]]

Total trainable params: 109484547

Total Time Elapsed: 1289.52 seconds (21.49 minutes)


