In [None]:
!pip install huggingface_hub



In [None]:
# Interactive Hugging Face login: prompts for an access token and asks whether
# to also store it as a git credential (see the captured output below).
!hf auth login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: read).
The token `ReadToken1` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenti

## Data preparation

In [50]:
import pandas as pd

# The training shard is read directly from the Hugging Face Hub via the hf:// path.
# A Hub login (e.g. `huggingface-cli login`) may be needed to access this dataset.
dataset_uri = "hf://datasets/mitulshah/transaction-categorization/default/train/0000.parquet"
df = pd.read_parquet(dataset_uri)

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: e47aa15a-212b-461c-91da-76051c2ab9fe)')' thrown while requesting GET https://huggingface.co/datasets/mitulshah/transaction-categorization/resolve/main/default/train/0000.parquet
Retrying in 1s [Retry 1/5].


In [51]:
# Work on a new frame without the "country" and "currency" columns; `df` itself is left untouched.
df_required = df.drop(columns=["country", "currency"])

In [52]:
# Give the text and label columns the names used by the rest of the notebook.
column_names = {"transaction_description": "purpose_text", "category": "transaction_type"}
df_required = df_required.rename(columns=column_names)

In [53]:
# Keep only the first 10,000 rows. The explicit .copy() makes the subset an
# independent frame: the cells below assign new values into its columns, and
# doing that on a plain slice of the larger frame can raise pandas'
# SettingWithCopyWarning.
# NOTE(review): this takes the head of the file, not a random sample — confirm
# the rows are not ordered by category.
df_required = df_required.iloc[:10000].copy()

In [54]:
import re

# Noise removed from every description, expressed as one alternation:
#   \(.*?\)            -> a parenthesised span, matched non-greedily
#   online             -> the literal substring "online" (case-sensitive, no word boundary)
#   [\',!\?#\-&@%\^]   -> any single character from this punctuation set
pattern = r'\(.*?\)|online|[\',!\?#\-&@%\^]'

# First delete the noise, then squeeze every run of whitespace (\s+, one or
# more characters) down to a single space and trim both ends.
purpose_clean = df_required['purpose_text'].str.replace(pattern, '', regex=True)
purpose_clean = purpose_clean.str.replace(r'\s+', ' ', regex=True).str.strip()
df_required['purpose_text'] = purpose_clean


In [55]:
# Strip digit runs from every description whose category is not 'Income'
# (rows labelled 'Income' keep their digits).
pattern = r'[0-9]+'
non_income = df_required['transaction_type'] != 'Income'

# Removing a number from the middle or the end of a description leaves a
# doubled or trailing space behind, so whitespace is normalised again for the
# rows that were touched.
df_required.loc[non_income, 'purpose_text'] = (
    df_required.loc[non_income, 'purpose_text']
    .str.replace(pattern, '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [56]:
from sklearn.preprocessing import LabelEncoder

# Learn the category -> integer mapping from the label column, then store the
# integer codes next to the original labels. `label_encoder` is kept so the
# number of classes (and the mapping itself) is available later.
label_encoder = LabelEncoder().fit(df_required['transaction_type'])
df_required['transaction_type_encoded'] = label_encoder.transform(df_required['transaction_type'])

In [57]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
purpose_texts = df_required["purpose_text"].values
type_codes = df_required["transaction_type_encoded"].values

train_texts, val_texts, train_labels, val_labels = train_test_split(
    purpose_texts,
    type_codes,
    test_size=0.2,
    random_state=42,
)

## Creating a dataset

In [58]:
from transformers import BertTokenizer, BertForSequenceClassification

# Tokenizer loaded from the "bert-base-uncased" checkpoint.
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

In [59]:
from torch.utils.data import Dataset, DataLoader
import torch

class ClassificationDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Each item is a dict with the keys "input_ids", "attention_mask" and
    "label"; the label is returned as a torch.long tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        """Keep references to the inputs; nothing is tokenized up front.

        Args:
            texts: indexable collection of texts.
            labels: indexable collection of labels, parallel to ``texts``.
            tokenizer: callable invoked as ``tokenizer(text, **kwargs)``.
            max_len: value passed to the tokenizer as ``max_length``.
        """
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it together with its label."""
        text, label = self.texts[idx], self.labels[idx]

        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # squeeze(0) drops the leading axis when it has size 1 (assumed to be
        # the batch axis that return_tensors="pt" adds for a single text).
        item = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        item["label"] = torch.tensor(label, dtype=torch.long)
        return item

# Sequence length handed to the tokenizer and number of samples per batch.
max_len = 64
batch_size = 16

# One dataset per split, both sharing the same tokenizer and sequence length.
train_dataset = ClassificationDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_len=max_len
)
val_dataset = ClassificationDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_len=max_len
)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

## Model training

In [60]:
from torch.optim import AdamW
from tqdm import tqdm

# Seed torch's global RNG before any stochastic step. The classification head
# is newly (randomly) initialised — see the notice this cell prints — and the
# shuffled training loader draws its order from the same generator, so without
# a seed every run trains a different model and reports different metrics.
# 42 matches the random_state used for the train/validation split.
# NOTE(review): some GPU kernels may still be non-deterministic; this removes
# the seed-dependent variation only.
torch.manual_seed(42)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BERT with a classification head sized to the number of encoded categories.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                 num_labels=len(label_encoder.classes_))
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [61]:
# Fine-tune for a fixed number of passes over the training loader, printing
# the mean batch loss at the end of each pass.
epochs = 2
for epoch in range(epochs):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, leave=True)

    for batch in loop:
        # Move the three tensors of the batch onto the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ("input_ids", "attention_mask", "label")
        )

        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        batch_loss = loss.item()
        total_loss += batch_loss

        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=batch_loss)

    mean_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch} Loss: {mean_loss}")

Epoch 0: 100%|██████████| 500/500 [01:31<00:00,  5.46it/s, loss=0.0171]


Epoch 0 Loss: 0.5469582361131906


Epoch 1: 100%|██████████| 500/500 [01:35<00:00,  5.25it/s, loss=0.00763]

Epoch 1 Loss: 0.051863547022454444





In [62]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import torch

# Score the fine-tuned model on the validation split.
model.eval()
all_predictions, all_labels = [], []

# No gradients are needed for inference.
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ("input_ids", "attention_mask", "label")
        )

        # No labels are passed here: only the logits are used.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = outputs.logits.argmax(dim=1)

        # Collect everything on the CPU so scikit-learn can consume it.
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_predictions)

# 'weighted' averages the per-class scores weighted by each class's support.
precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels, all_predictions, average='weighted'
)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

Accuracy:  0.9855
Precision: 0.9860
Recall:    0.9855
F1 Score:  0.9855
