In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import nltk
import numpy as np
from transformers import AutoTokenizer
from seqeval.metrics import classification_report, f1_score , precision_score, recall_score, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
PAD_TOKEN = '[PAD]'

In [3]:
# Function to label tokens
def label_tokens(sms, amount, store, balance, date, time, max_len=50):
    tokens = nltk.word_tokenize(sms)
    labels = []
    for token in tokens:
        if token in [':', '/', '.'] or token.isdigit():
            labels.append((token, 'O'))
        elif token in str(amount):
            labels.append((token, 'B-AMOUNT'))
        elif token in store.split():
            labels.append((token, 'B-STORE'))
        elif token in balance:
            labels.append((token, 'B-BALANCE'))
        elif token in date:
            labels.append((token, 'B-DATE'))
        elif token in time:
            labels.append((token, 'B-TIME'))
        else:
            labels.append((token, 'O'))
    # check if a token repeats, use B-I-O tagging

    for i in range(len(labels)):
        if i != 0 and labels[i][1] != 'O' : 
            if labels[i-1][1] == "O" : 
                continue 
            elif labels[i-1][1].split("-")[1] != labels[i][1].split("-")[1]:
                continue 
            elif labels[i-1][1].split("-")[1] == labels[i][1].split("-")[1]:
                labels[i] = (labels[i][0], labels[i][1].replace("B-", "I-"))

    # pad the labels
    if len(labels) < max_len:
        pad_length = max_len - len(labels)
        labels.extend([(PAD_TOKEN, 'O')] * pad_length)
    else : 
        raise ValueError('The length of the tokens is greater than the max_len')
    return labels

In [4]:
data_df = pd.read_csv("data/synthetic_sms.csv")
data_df

Unnamed: 0,sms,type,card_number,amount,store,account,balance,date,time
0,Your card ending in ****7242 was debited QAR 6...,temp0,****7242,681.29,9A8ts5Zaq b v,***370022,9565.07,04/10/20,13:55
1,Alert: QAR 443.09 was spent at XiTi U dvQPm85u...,temp1,****5617,443.09,XiTi U dvQPm85ujTBTHA,***122717,1408.17,03/08/23,13:34
2,"Transaction Notice: QAR 1,106.03 was debited f...",temp2,****5583,1106.03,Svjec67ZhS,***233856,6771.83,23/12/20,19:33
3,Your card ending in ****7854 was charged QAR 1...,temp3,****7854,1232.12,pyFX1fTK-qzV2Q1uD,***184486,3512.70,06/04/20,07:11
4,"QAR 1,366.30 was debited from your account end...",temp4,****8680,1366.30,LQHiAX3EYn0,***511829,5843.34,27/06/20,05:37
...,...,...,...,...,...,...,...,...,...
6195,Update: Purchase of 790.54 at hZjYt aU0wHdM9mX...,temp26,****7563,790.54,hZjYt aU0wHdM9mX6VfmOe3ag,***939242,133.89,28/12/22,06:08
6196,Bank Alert: 170.13 debited from account ***350...,temp27,****5217,170.13,N x ur8JK0p2Tz p6stzt,***350969,1303.09,26/10/21,14:56
6197,Notification: Card ****7105 transaction at vOh...,temp28,****7105,535.83,vOhqbfURJeM9R6EgoiZMN,***119815,7610.56,11/12/23,21:30
6198,"FYI: A 1,473.24 purchase at bn P2w u FXGXGdf6C...",temp29,****8299,1473.24,bn P2w u FXGXGdf6C8,***862604,2803.16,29/08/22,16:55


In [5]:
# Apply labeling function
ner_data = []
for index, row in data_df.iterrows():
    labeled_tokens = label_tokens(row['sms'],  row['amount'], row['store'], row['balance'], row['date'], row['time'], max_len=65)
    ner_data.append(labeled_tokens)  # Add a blank line to separate sentences
ner_data = np.array(ner_data)

In [6]:
# from tokenizers import Tokenizer, models, trainers, pre_tokenizers

# # Initialize a tokenizer with a WordPiece model
# tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
# tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# # Define a trainer with a smaller vocab size
# trainer = trainers.WordPieceTrainer(
#     vocab_size=5000, 
#     special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
# )

# # Convert your DataFrame column to a list of strings
# sms_texts = data_df['sms'].tolist()

# # Use train_from_iterator to train on an iterator of texts
# tokenizer.train_from_iterator(sms_texts, trainer=trainer, length=len(sms_texts))

# tokenizer.save("tokenizer.json")

In [6]:
def tokenize_data(tokenizer, ner_data, label2id, IGNORE_INDEX, max_length=120): 
    bert_tokens = []
    for sample in ner_data:
        tokens = sample[:,0]
        labels = sample[:,1]
        encoding = tokenizer(list(tokens),
                            is_split_into_words=True,
                            return_offsets_mapping=True,
                            padding='max_length',
                            truncation=True,
                            max_length=max_length)

        # encoding = tokenizer.encode(list(tokens),
        #                      is_split_into_words=True)
        # The tokenizer returns a list that maps each tokenized subword to its original word index.
        word_ids = encoding.word_ids()

        # fix  [labels[idx] for idx in word_ids if idx is not None else "[pad]"] 
        labels = [ labels[idx] if idx is not None else IGNORE_INDEX for idx in word_ids]

        for i in range(len(labels)):
            if i != 0 and labels[i] != 'O' and labels[i] != IGNORE_INDEX: 
                if labels[i-1] == "O" : 
                    continue 
                elif labels[i-1].split("-")[1] != labels[i].split("-")[1]:
                    continue 
                elif labels[i-1].split("-")[1] == labels[i].split("-")[1]:
                    labels[i] =  labels[i].replace("B-", "I-")

        labels = [label2id[label] if label != IGNORE_INDEX else IGNORE_INDEX for label in labels]
        
        encoding["labels"] = labels 

        bert_tokens.append([encoding["input_ids"] , encoding["labels"]])

    return np.array(bert_tokens)

In [7]:
tokenizer_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [8]:
IGNORE_INDEX = 99
label2id = {
    "O": 0,
    "B-AMOUNT": 1,
    "I-AMOUNT": 2,
    "B-STORE": 3,
    "I-STORE": 4,
    "B-BALANCE": 5,
    "I-BALANCE": 6,
    "B-DATE": 7,
    "I-DATE": 8,
    "B-TIME": 9,
    "I-TIME": 10,
    PAD_TOKEN : IGNORE_INDEX  # We use -100 for tokens we want to ignore in the loss (e.g. padding or subword pieces)
}
id2label = {v: k for k, v in label2id.items()}

In [9]:
bert_tokens = tokenize_data(tokenizer, ner_data, label2id, IGNORE_INDEX, max_length=250)

In [10]:
# ---------------------------
# Define a simple NER Dataset
# ---------------------------
class NerDataset(Dataset):
    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        x = self.tokens[idx]
        y = self.labels[idx]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)


In [11]:

# ---------------------------
# Define the LightningModule with an LSTM
# ---------------------------
class NerModel(pl.LightningModule):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, num_tags, lr=1e-3, l1=1e-4, dropout=0.3, att_heads=4):
        super().__init__()
        self.lr = lr
        self.l1 = l1 
        self.dropout_rate = dropout
        self.att_heads = att_heads
        self.save_hyperparameters()
        
        # Embedding layer; padding_idx is set to IGNORE_INDEX
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=IGNORE_INDEX)
        
        # LSTM layer; bidirectional for richer context
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, 
                            batch_first=True, bidirectional=True)
        
        self.attention = nn.MultiheadAttention(embed_dim=hidden_dim*2, num_heads=att_heads, dropout=self.dropout_rate, batch_first=True)
        
        self.dropout = nn.Dropout(self.dropout_rate)
        # Fully connected layer maps attention output to number of tag classes
        self.fc = nn.Linear(hidden_dim * 2, num_tags)
        
        # Loss function: ignore padding index (IGNORE_INDEX)
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)

    def forward(self, x):
        # x shape: [batch, seq_len]
        x = self.embedding(x)              # [batch, seq_len, embedding_dim]
        x, _ = self.lstm(x)                # [batch, seq_len, hidden_dim*2]
                
        # Apply self-attention: query, key, and value are all x
        attn_output, attn_weights = self.attention(x, x, x)
        attn_output = self.dropout(attn_output)
        
        logits = self.fc(attn_output)      # [batch, seq_len, num_tags]
        return logits

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        # Flatten logits and targets for computing loss
        loss = self.loss_fn(logits.view(-1, logits.shape[-1]), y.view(-1))
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = self.loss_fn(logits.view(-1, logits.shape[-1]), y.view(-1))
        self.log('val_loss', loss, prog_bar=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.l1)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss'
            }
        }




In [12]:
def predictions(model, dataloader, id2label):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            x, y = batch  # x: token ids, y: label ids (with -100 for padding/subwords)
            logits = model(x)
            preds = torch.argmax(logits, dim=-1)  # shape: [batch, seq_len]
            
            # Convert predictions and true labels to lists, ignoring -100 tokens.
            preds = preds.cpu().numpy()
            y = y.cpu().numpy()
            
            for pred_seq, true_seq in zip(preds, y):
                pred_labels = []
                true_labels = []
                for p, t in zip(pred_seq, true_seq):
                    if t != IGNORE_INDEX:  # Ignore subwords/padding
                        pred_labels.append(id2label.get(p, "O"))
                        true_labels.append(id2label.get(t, "O"))
                    else : 
                        pred_labels.append(PAD_TOKEN)
                        true_labels.append(PAD_TOKEN)
                all_preds.append(pred_labels)
                all_labels.append(true_labels)
    
    return all_preds, all_labels

In [13]:
from sklearn.model_selection import GroupKFold, train_test_split

gkf = GroupKFold(n_splits=5)

folds_res = []
for train_val_idx, test_idx in gkf.split(bert_tokens, groups=data_df['type']):
    
    train_idx , val_idx = train_test_split(train_val_idx, test_size=0.2, random_state=41)

    train_tokens = bert_tokens[train_idx]
    val_tokens = bert_tokens[val_idx]
    test_tokens = bert_tokens[test_idx]

    # ---------------------------
    train_ds = NerDataset(train_tokens[:,0, : ], train_tokens[:,1, :])
    val_ds = NerDataset(val_tokens[:,0, : ], val_tokens[:,1, :])
    test_ds = NerDataset(test_tokens[:,0, : ], test_tokens[:,1, :])

    train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=64)
    test_loader = DataLoader(test_ds, batch_size=64)

    # Instantiate the model
    model = NerModel(
        vocab_size=tokenizer.vocab_size,
        embedding_dim=512,
        hidden_dim=128,
        num_layers=1,
        num_tags=len(label2id),
        lr=1e-3, 
        l1 = 0, 
        dropout=0.3 ,
        att_heads=2
    )


    # Initialize a PyTorch Lightning trainer
    trainer = pl.Trainer(max_epochs=10, log_every_n_steps=1, devices=[0], 
                         callbacks=[pl.callbacks.EarlyStopping(monitor='val_loss', patience=3 , mode='min'), \
                                    pl.callbacks.ModelCheckpoint(monitor='val_loss', save_top_k=1, mode='min')])
    trainer.fit(model, train_dataloaders=train_loader , val_dataloaders=val_loader)

    train_pred , train_true = predictions(model, train_loader, id2label)
    val_pred , val_true = predictions(model, val_loader, id2label)
    test_pred , test_true = predictions(model, test_loader, id2label)
    
    # Print a classification report with entity-level metrics
    print("Train:")
    print(classification_report(train_true, train_pred))
    print("Validation:")
    print(classification_report(val_true, val_pred))
    print("Test:")
    print(classification_report(test_true, test_pred))

    # trian 
    f1_train = f1_score(train_true, train_pred , average='macro')
    precision_train = precision_score(train_true, train_pred , average='macro')
    recall_train = recall_score(train_true, train_pred , average='macro')
    accuracy_train = accuracy_score(train_true, train_pred)

    # val
    f1_val = f1_score(val_true, val_pred , average='macro')
    precision_val = precision_score(val_true, val_pred , average='macro')
    recall_val = recall_score(val_true, val_pred , average='macro')
    accuracy_val = accuracy_score(val_true, val_pred)

    # test
    f1_test = f1_score(test_true, test_pred , average='macro')
    precision_test = precision_score(test_true, test_pred , average='macro')
    recall_test = recall_score(test_true, test_pred , average='macro')
    accuracy_test = accuracy_score(test_true, test_pred)
    
    folds_res.append({
        "f1_train": f1_train,
        "precision_train": precision_train,
        "recall_train": recall_train,
        "accuracy_train": accuracy_train,
        "f1_val": f1_val,
        "precision_val": precision_val,
        "recall_val": recall_val,
        "accuracy_val": accuracy_val,
        "f1_test": f1_test,
        "precision_test": precision_test,
        "recall_test": recall_test,
        "accuracy_test": accuracy_test
    })

folds_res = pd.DataFrame(folds_res)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\s7seg\.conda\envs\mlops\Lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\Users\s7seg\.conda\envs\mlops\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


                                                                           

c:\Users\s7seg\.conda\envs\mlops\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 8: 100%|██████████| 60/60 [00:02<00:00, 26.53it/s, v_num=0, train_loss=0.00099, val_loss=0.00849] 
Train:




              precision    recall  f1-score   support

      AMOUNT       0.98      0.97      0.98      3977
     BALANCE       0.98      1.00      0.99      4145
        DATE       1.00      1.00      1.00      3840
        PAD]       1.00      1.00      1.00      7680
       STORE       1.00      1.00      1.00      3879
        TIME       1.00      1.00      1.00      3840

   micro avg       0.99      1.00      0.99     27361
   macro avg       0.99      0.99      0.99     27361
weighted avg       0.99      1.00      0.99     27361

Validation:
              precision    recall  f1-score   support

      AMOUNT       0.99      0.97      0.98       999
     BALANCE       0.97      1.00      0.99      1045
        DATE       1.00      1.00      1.00       960
        PAD]       1.00      1.00      1.00      1920
       STORE       0.94      0.95      0.95       972
        TIME       1.00      1.00      1.00       960

   micro avg       0.99      0.99      0.99      6856
   macro av

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | embedding | Embedding          | 14.8 M | train
1 | lstm      | LSTM               | 657 K  | train
2 | attention | MultiheadAttention | 263 K  | train
3 | dropout   | Dropout            | 0      | train
4 | fc        | Linear             | 3.1 K  | train
5 | loss_fn   | CrossEntropyLoss   | 0      | train
---------------------------------------------------------
15.8 M    Trainable params
0         Non-trainable params
15.8 M    Total params
63.078    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 9: 100%|██████████| 63/63 [00:02<00:00, 27.41it/s, v_num=1, train_loss=0.000603, val_loss=0.00538]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 63/63 [00:02<00:00, 27.29it/s, v_num=1, train_loss=0.000603, val_loss=0.00538]
Train:
              precision    recall  f1-score   support

      AMOUNT       0.99      1.00      1.00      4143
     BALANCE       1.00      1.00      1.00      4309
        DATE       1.00      1.00      1.00      4000
        PAD]       1.00      1.00      1.00      8000
       STORE       0.99      0.99      0.99      4046
        TIME       1.00      1.00      1.00      4000

   micro avg       1.00      1.00      1.00     28498
   macro avg       1.00      1.00      1.00     28498
weighted avg       1.00      1.00      1.00     28498

Validation:
              precision    recall  f1-score   support

      AMOUNT       1.00      1.00      1.00      1041
     BALANCE       1.00      1.00      1.00      1067
        DATE       1.00      1.00      1.00      1000
        PAD]       1.00      1.00      1.00      2000
       STORE       0.94      0.96      0.95      1005
        

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | embedding | Embedding          | 14.8 M | train
1 | lstm      | LSTM               | 657 K  | train
2 | attention | MultiheadAttention | 263 K  | train
3 | dropout   | Dropout            | 0      | train
4 | fc        | Linear             | 3.1 K  | train
5 | loss_fn   | CrossEntropyLoss   | 0      | train
---------------------------------------------------------
15.8 M    Trainable params
0         Non-trainable params
15.8 M    Total params
63.078    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 9: 100%|██████████| 63/63 [00:02<00:00, 26.39it/s, v_num=2, train_loss=0.00419, val_loss=0.00687] 

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 63/63 [00:02<00:00, 26.36it/s, v_num=2, train_loss=0.00419, val_loss=0.00687]
Train:
              precision    recall  f1-score   support

      AMOUNT       1.00      1.00      1.00      4191
     BALANCE       1.00      1.00      1.00      4397
        DATE       1.00      1.00      1.00      4000
        PAD]       1.00      1.00      1.00      8000
       STORE       0.97      0.99      0.98      4042
        TIME       1.00      1.00      1.00      4000

   micro avg       1.00      1.00      1.00     28630
   macro avg       0.99      1.00      1.00     28630
weighted avg       1.00      1.00      1.00     28630

Validation:
              precision    recall  f1-score   support

      AMOUNT       1.00      1.00      1.00      1056
     BALANCE       0.99      1.00      1.00      1103
        DATE       1.00      1.00      1.00      1000
        PAD]       1.00      1.00      1.00      2000
       STORE       0.93      0.94      0.94      1009
        T

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | embedding | Embedding          | 14.8 M | train
1 | lstm      | LSTM               | 657 K  | train
2 | attention | MultiheadAttention | 263 K  | train
3 | dropout   | Dropout            | 0      | train
4 | fc        | Linear             | 3.1 K  | train
5 | loss_fn   | CrossEntropyLoss   | 0      | train
---------------------------------------------------------
15.8 M    Trainable params
0         Non-trainable params
15.8 M    Total params
63.078    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 7: 100%|██████████| 63/63 [00:02<00:00, 27.14it/s, v_num=3, train_loss=0.0077, val_loss=0.00748] 
Train:
              precision    recall  f1-score   support

      AMOUNT       1.00      0.98      0.99      4195
     BALANCE       0.97      1.00      0.99      4399
        DATE       1.00      1.00      1.00      4000
        PAD]       1.00      1.00      1.00      8000
       STORE       0.98      0.99      0.99      4046
        TIME       1.00      1.00      1.00      4000

   micro avg       0.99      1.00      0.99     28640
   macro avg       0.99      1.00      0.99     28640
weighted avg       0.99      1.00      0.99     28640

Validation:
              precision    recall  f1-score   support

      AMOUNT       1.00      0.98      0.99      1052
     BALANCE       0.97      1.00      0.99      1101
        DATE       1.00      1.00      1.00      1000
        PAD]       1.00      1.00      1.00      2000
       STORE       0.94      0.95      0.95      1010
        T

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | embedding | Embedding          | 14.8 M | train
1 | lstm      | LSTM               | 657 K  | train
2 | attention | MultiheadAttention | 263 K  | train
3 | dropout   | Dropout            | 0      | train
4 | fc        | Linear             | 3.1 K  | train
5 | loss_fn   | CrossEntropyLoss   | 0      | train
---------------------------------------------------------
15.8 M    Trainable params
0         Non-trainable params
15.8 M    Total params
63.078    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 8: 100%|██████████| 63/63 [00:02<00:00, 26.55it/s, v_num=4, train_loss=0.00158, val_loss=0.00829] 
Train:
              precision    recall  f1-score   support

      AMOUNT       0.96      1.00      0.98      4106
     BALANCE       1.00      0.96      0.98      4187
        DATE       1.00      1.00      1.00      4000
        PAD]       1.00      1.00      1.00      8000
       STORE       1.00      1.00      1.00      4052
        TIME       1.00      1.00      1.00      4000

   micro avg       0.99      0.99      0.99     28345
   macro avg       0.99      0.99      0.99     28345
weighted avg       0.99      0.99      0.99     28345

Validation:
              precision    recall  f1-score   support

      AMOUNT       0.95      1.00      0.98      1028
     BALANCE       1.00      0.96      0.98      1047
        DATE       1.00      1.00      1.00      1000
        PAD]       1.00      1.00      1.00      2000
       STORE       0.96      0.96      0.96      1007
        

In [15]:
folds_res.mean()

f1_train           0.995015
precision_train    0.993890
recall_train       0.996232
accuracy_train     0.999839
f1_val             0.987596
precision_val      0.986586
recall_val         0.988719
accuracy_val       0.999528
f1_test            0.879156
precision_test     0.841174
recall_test        0.934788
accuracy_test      0.992627
dtype: float64

In [16]:
for train_val_idx, test_idx in gkf.split(bert_tokens, groups=data_df['type']):
    print(data_df['type'][test_idx].unique())


['temp0' 'temp9' 'temp12' 'temp17' 'temp22' 'temp26' 'temp30']
['temp4' 'temp8' 'temp13' 'temp18' 'temp21' 'temp27']
['temp1' 'temp5' 'temp14' 'temp19' 'temp23' 'temp28']
['temp2' 'temp6' 'temp10' 'temp15' 'temp24' 'temp29']
['temp3' 'temp7' 'temp11' 'temp16' 'temp20' 'temp25']


# Full model training 

In [17]:
from sklearn.model_selection import train_test_split

train_val_idx, test_idx = train_test_split(range(len(data_df)), test_size=0.2, random_state=41)
train_idx , val_idx = train_test_split(train_val_idx, test_size=0.2, random_state=41)

train_tokens = bert_tokens[train_idx]
val_tokens = bert_tokens[val_idx]
test_tokens = bert_tokens[test_idx]

# ---------------------------

train_ds = NerDataset(train_tokens[:,0, : ], train_tokens[:,1, :])
val_ds = NerDataset(val_tokens[:,0, : ], val_tokens[:,1, :])
test_ds = NerDataset(test_tokens[:,0, : ], test_tokens[:,1, :])


train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)
test_loader = DataLoader(test_ds, batch_size=64)




In [18]:
pl.seed_everything(42)

# Instantiate the model
model = NerModel(
    vocab_size=tokenizer.vocab_size,
    embedding_dim=512,
    hidden_dim=128,
    num_layers=1,
    num_tags=len(label2id),
    lr=1e-3, 
    l1 = 0, 
    dropout=0.3 ,
    att_heads=2
)

# Initialize a PyTorch Lightning trainer
trainer = pl.Trainer(max_epochs=30, log_every_n_steps=1, devices=[0], 
                     callbacks=[pl.callbacks.EarlyStopping(monitor='val_loss', patience=4, mode='min'), \
                                pl.callbacks.ModelCheckpoint(monitor='val_loss', save_top_k=1, mode='min')])
trainer.fit(model, train_dataloaders=train_loader , val_dataloaders=val_loader)

train_pred , train_true = predictions(model, train_loader, id2label)
val_pred , val_true = predictions(model, val_loader, id2label)
test_pred , test_true = predictions(model, test_loader, id2label)

# Print a classification report with entity-level metrics
print("Train:")
print(classification_report(train_true, train_pred))
print("Validation:")
print(classification_report(val_true, val_pred))
print("Test:")
print(classification_report(test_true, test_pred))


Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | embedding | Embedding          | 14.8 M | train
1 | lstm      | LSTM               | 657 K  | train
2 | attention | MultiheadAttention | 263 K  | train
3 | dropout   | Dropout            | 0      | train
4 | fc        | Linear             | 3.1 K  | train
5 | loss_fn   | CrossEntropyLoss   | 0      | train
---------------------------------------------------------
15.8 M    Trainable params
0         Non-trainable params
15.8 M    Total params
63.078    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 166.63it/s]

c:\Users\s7seg\.conda\envs\mlops\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


                                                                            

c:\Users\s7seg\.conda\envs\mlops\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 10: 100%|██████████| 62/62 [00:02<00:00, 26.16it/s, v_num=5, train_loss=0.000918, val_loss=0.00581]
Train:




              precision    recall  f1-score   support

      AMOUNT       1.00      1.00      1.00      4133
     BALANCE       1.00      1.00      1.00      4288
        DATE       1.00      1.00      1.00      3968
        PAD]       1.00      1.00      1.00      7936
       STORE       1.00      1.00      1.00      4010
        TIME       1.00      1.00      1.00      3968

   micro avg       1.00      1.00      1.00     28303
   macro avg       1.00      1.00      1.00     28303
weighted avg       1.00      1.00      1.00     28303

Validation:
              precision    recall  f1-score   support

      AMOUNT       1.00      1.00      1.00      1030
     BALANCE       1.00      1.00      1.00      1075
        DATE       1.00      1.00      1.00       992
        PAD]       1.00      1.00      1.00      1984
       STORE       0.94      0.95      0.95      1006
        TIME       1.00      1.00      1.00       992

   micro avg       0.99      0.99      0.99      7079
   macro av

In [19]:
def decode(token, label, tokenizer, print_res = True): 
    sample_label = label
    sample_token = token
    amount_t = sample_token[np.where((sample_label == 'B-AMOUNT')  | (sample_label == 'I-AMOUNT'))[0]]
    store_t = sample_token[np.where((sample_label == 'B-STORE')  | (sample_label == 'I-STORE'))[0]]
    balance_t = sample_token[np.where((sample_label == 'B-BALANCE')  | (sample_label == 'I-BALANCE'))[0]]
    date_t = sample_token[np.where((sample_label == 'B-DATE')  | (sample_label == 'I-DATE'))[0]]
    time_t = sample_token[np.where((sample_label == 'B-TIME')  | (sample_label == 'I-TIME'))[0]]

    message = tokenizer.decode(sample_token)
    amount = tokenizer.decode(amount_t)
    store = tokenizer.decode(store_t)
    balance = tokenizer.decode(balance_t)
    date = tokenizer.decode(date_t)
    time = tokenizer.decode(time_t)

    if print_res : 
        print(f"Message: {message}")
        print(f"Amount: {amount}")
        print(f"Store: {store}")
        print(f"Balance: {balance}")
        print(f"Date: {date}")
        print(f"Time: {time}")
    return message, amount, store, balance, date, time


In [20]:
idxs = np.random.choice(range(len(test_pred)), 10)
for idx in idxs : 
    _ = decode(test_tokens[idx][0], np.array(test_pred[idx]), tokenizer)
    print("\n")

Message: [CLS] Debit Notification : QAR 1, 498. 13 was withdrawn from your account ( ending * * * 627768 ) via card * * * * 5943 for a transaction at qhxPQ 4HVwH on 28 / 11 / 23 at 15 : 12. Your updated balance is QAR 1, 657. 66. Please review your account activity for any discrepancies. [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [22]:
# load true data 
real_data_df = pd.read_csv("data/rand_data.csv")
real_data_df

Unnamed: 0,sms,card_number,amount,store,account,balance,date,time
0,"Debit transaction of QAR 1,298.13 from Card **...",****4953,1298.13,SMAISMA CAFE DOHQA,***704063,830.52,1/1/2024,13:43
1,Debit transaction of QAR 413.48 from Card ****...,****9448,413.48,Qatar Charity doha,***663320,4188.75,1/2/2024,2:25
2,Debit transaction of QAR 723.33 from Card ****...,****9858,723.33,MONOPRIX HYPERMARKET DOHDOHA,***459249,5390.72,1/5/2024,20:11
3,Debit transaction of QAR 493.88 from Card ****...,****8611,493.88,GOOGLE *Forest Focus f,***682551,3835.47,1/6/2024,9:12
4,Debit transaction of QAR 884.19 from Card ****...,****5724,884.19,AL MAHA AUTO SPARE PARDOHA,***303376,4988.11,1/6/2024,21:31
...,...,...,...,...,...,...,...,...
359,"Debit transaction of QAR 1,429.01 from Card **...",****2219,1429.01,MINISTER OF FOREIGN AFDOHA,***511450,4335.05,12/27/2024,5:11
360,Debit transaction of QAR 138.83 from Card ****...,****1223,138.83,SANDWHICH POORI KARA D,***418630,8602.99,12/28/2024,6:00
361,Debit transaction of QAR 138.97 from Card ****...,****8807,138.97,ORANGE MINI HYPERMARKEDOHA,***696471,9385.77,12/29/2024,10:14
362,Debit transaction of QAR 61.75 from Card ****9...,****9099,61.75,WOQOD - UMM EBAIRIYA SDRD,***329114,4186.89,12/30/2024,24:14:00


In [23]:
real_data_df['sms'] = real_data_df['sms'].apply(lambda x : x.replace("\n", " "))

In [24]:
real_ner_data = []
for index, row in real_data_df.iterrows():
    labeled_tokens = label_tokens(row['sms'],  row['amount'], row['store'], row['balance'], row['date'], row['time'], max_len=50)
    real_ner_data.append(labeled_tokens)  # Add a blank line to separate sentences
real_ner_data = np.array(real_ner_data)

real_tokens = tokenize_data(tokenizer, real_ner_data, label2id, IGNORE_INDEX, max_length=150)

In [23]:
real_ds = NerDataset(real_tokens[:,0, : ], real_tokens[:,1, :])
real_loader = DataLoader(real_ds, batch_size=64)

real_pred , real_true = predictions(model, real_loader, id2label)

In [24]:
print(classification_report(real_true, real_pred))



              precision    recall  f1-score   support

      AMOUNT       0.74      0.94      0.83       634
     BALANCE       0.79      0.84      0.81       634
        DATE       0.96      0.98      0.97       634
        PAD]       1.00      1.00      1.00      1268
       STORE       0.39      0.70      0.50       646
        TIME       0.58      1.00      0.73       634

   micro avg       0.73      0.92      0.81      4450
   macro avg       0.74      0.91      0.81      4450
weighted avg       0.78      0.92      0.83      4450



In [25]:
acc = accuracy_score(real_true, real_pred)
f1 = f1_score(real_true, real_pred , average='macro')
precision = precision_score(real_true, real_pred , average='macro')
recall = recall_score(real_true, real_pred , average='macro')

print(f"Accuracy: {acc}")
print(f"F1: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")


Accuracy: 0.9847634069400631
F1: 0.806764705908598
Precision: 0.7424728328741431
Recall: 0.9086809387543827


In [26]:
real_pred = np.array(real_pred)
real_true = np.array(real_true)

In [27]:
idxs = np.random.choice(range(len(real_pred)), 10)
for idx in idxs : 
    _ = decode(real_tokens[idx][0], real_pred[idx], tokenizer)
    print("\n")

Message: [CLS] Debit transaction of QAR 2. 00 from Card * * * 2 at ORANGE MINI HYPERMARKEDOHA from Current Account 01 * * * 390038 New Balance : QAR 284. 59 If you suspect this transaction call 44192022 30 / 07 / 24 12 : 18 [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Amount: 2. 00
Store: ORANGE MINI HYPERMARKEDOHA
Balance: 284. 59
Date: 30 / 07 / 24
Time: 12 : 18


Message: [CLS] Debit transaction of QAR 21. 00 from Card * * * 2 at WOQOD - BU FASSELA SIDA D from Current Account 01 * * * 390038 New Balance : QAR 3, 316. 12 If you suspect this transaction call 44192