In [9]:
!pip install evaluate



In [10]:
import pandas as pd
from collections import Counter
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, AdamW
from transformers.modeling_outputs import SequenceClassifierOutput
from PIL import Image
from tqdm import tqdm
import evaluate
import ast
import sklearn

In [11]:
TRAIN_CSV = "/kaggle/input/trocr-dataset/train_data.csv"
VAL_CSV = "/kaggle/input/trocr-dataset/validation_data.csv"
TR_IMAGE_DIR = "/kaggle/input/trocr-dataset/Dataset/Dataset/Train/"
VAL_IMAGE_DIR = "/kaggle/input/trocr-dataset/Dataset/Dataset/Validation/"
PROCESSOR_PATH = "microsoft/trocr-base-handwritten"
BATCH_SIZE = 15
NUM_CLASSES = 4
IMG_SIZE = 384

In [12]:
train_df = pd.read_csv(TRAIN_CSV)
val_df = pd.read_csv(VAL_CSV)

print("Unique values for Train: " + str({**Counter(train_df.text)}))
print("Unique values for Validation: " + str({**Counter(val_df.text)}))

Unique values for Train: {'Belkart': 216, 'Mastercard': 213, 'Mir Belkart': 201, 'Mir': 202, 'Visa Mastercard Mir': 226, 'Visa': 213, 'Visa Mastercard Belkart': 200}
Unique values for Validation: {'Visa Mastercard Mir': 101, 'Visa Mastercard': 100, 'Visa Mastercard Belkart': 200}


In [13]:
train_df.loc[:, "labels"] = train_df.labels.apply(ast.literal_eval)
train_df.loc[:, "labels"] = train_df.labels.apply(np.float32)

val_df.loc[:, "labels"] = val_df.labels.apply(ast.literal_eval)
val_df.loc[:, "labels"] = val_df.labels.apply(np.float32)

In [14]:
class Loader(Dataset):
    def __init__(self, data_dir, df, processor, max_length=20):
        super().__init__()
        self.data_dir = data_dir
        self.df = df
        self.max_length = max_length
        self.processor = processor

    
    def __len__(self):
        return len(self.df)


    def __getitem__(self, index):
        file_name = self.df["file_name"][index]
        text = self.df["text"][index]
        labels = self.df["labels"][index]
        image = Image.open(self.data_dir + file_name).convert("RGB")
        image =image.resize((IMG_SIZE,IMG_SIZE))
        pixel_values = np.array(image) / 255.0
        #pixel_values = self.processor(image, return_tensors="pt").pixel_values
        text = self.processor.tokenizer(text,
                                          padding="max_length",
                                          max_length=self.max_length).input_ids

        text = [token if token != self.processor.tokenizer.pad_token_id else -100 for token in text]

        
        return {"pixel_values": torch.tensor(pixel_values).permute(2,0,1), 
                "text": torch.tensor(text),
                "labels": torch.tensor(labels)}

In [15]:
processor = TrOCRProcessor.from_pretrained(PROCESSOR_PATH)

train_dataset = Loader(data_dir=TR_IMAGE_DIR, df=train_df,
                       processor=processor)

val_dataset = Loader(data_dir=VAL_IMAGE_DIR, df=val_df,
                      processor=processor)



In [16]:
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

eval_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [17]:
TrOCR_model = VisionEncoderDecoderModel.from_pretrained(PROCESSOR_PATH)

TrOCR_model.config.encoder.image_size = IMG_SIZE
TrOCR_model.config.encoder.encoder_stride = 16
TrOCR_model.config.encoder.patch_size = 16
TrOCR_model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
TrOCR_model.config.pad_token_id = processor.tokenizer.pad_token_id
TrOCR_model.config.vocab_size = TrOCR_model.config.decoder.vocab_size
TrOCR_model.config.eos_token_id = processor.tokenizer.sep_token_id

#TrOCR_model.config.max_length = 64
#TrOCR_model.config.early_stopping = True
#TrOCR_model.config.no_repeat_ngram_size = 3
#TrOCR_model.config.num_beams = 4

config.json:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

VisionEncoderDecoderModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
class CustomModel(nn.Module):
    def __init__(self, model, num_labels):
        super(CustomModel,self).__init__()
        self.num_labels = num_labels
        self.model = model
        self.classifier = nn.Linear(model.config.decoder.hidden_size,num_labels)

        
    def forward(self, pixel_values, text=None, labels=None):
        
        if text is not None:
            outputs = self.model(pixel_values=pixel_values,
                                 labels = text,
                                 output_hidden_states=True)
        else:
            batch_size = pixel_values.shape[0]
            decoder_input_ids = torch.full(
                (batch_size, 1),
                self.model.config.decoder_start_token_id,
                dtype=torch.long).to(device)
            
            outputs = self.model(pixel_values=pixel_values,
                                 decoder_input_ids=decoder_input_ids,
                                 output_hidden_states=True)

        hidden_states = outputs.decoder_hidden_states[-1]
        logits = self.classifier(hidden_states[:, 0, :].view(-1,self.model.config.decoder.hidden_size))
        

        loss=None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))

    
        return SequenceClassifierOutput(loss=loss, logits=logits)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model=CustomModel(TrOCR_model,NUM_CLASSES).to(device)

In [12]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
f1 = evaluate.load("f1", trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [13]:
preds = []

for epoch in range(5):
    model.train()
    for batch in tqdm(train_dataloader):
        for k,v in batch.items():
            batch[k] = v.to(device)
            
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
    model.eval()
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            for k,v in batch.items():
                batch[k] = v.to(device)
         
            outputs = model(pixel_values=batch["pixel_values"],
                            labels=batch["labels"])
        
            logits = outputs.logits
            activation = torch.sigmoid(logits)
            predictions = (activation >= 0.5).float()
            preds.append(predictions)
            for indx in range(activation.shape[0]):
                f1.add_batch(predictions=predictions[indx],
                             references=batch["labels"][indx])

    
    print("Training Loss: " + str(loss.item()))     
    print(f1.compute(average='micro'))

100%|██████████| 99/99 [01:52<00:00,  1.14s/it]
100%|██████████| 27/27 [00:17<00:00,  1.51it/s]


Training Loss: 0.0340915322303772
{'f1': 0.7493765586034913}


100%|██████████| 99/99 [01:51<00:00,  1.13s/it]
100%|██████████| 27/27 [00:16<00:00,  1.68it/s]


Training Loss: 0.0027564875781536102
{'f1': 0.8123441396508728}


100%|██████████| 99/99 [01:51<00:00,  1.13s/it]
100%|██████████| 27/27 [00:16<00:00,  1.68it/s]


Training Loss: 0.00010170799941988662
{'f1': 0.6870324189526185}


100%|██████████| 99/99 [01:51<00:00,  1.13s/it]
100%|██████████| 27/27 [00:16<00:00,  1.67it/s]


Training Loss: 0.00036052113864570856
{'f1': 0.6870324189526185}


100%|██████████| 99/99 [01:51<00:00,  1.13s/it]
100%|██████████| 27/27 [00:16<00:00,  1.67it/s]

Training Loss: 0.00019609223818406463
{'f1': 0.6870324189526185}





In [72]:
pred = torch.cat(preds, dim=0)
true = torch.tensor(val_df["labels"], dtype=torch.float32)

print(sklearn.metrics.classification_report(
    true.cpu().numpy(),
    pred[-true.shape[0]:].cpu().numpy(),
    target_names=["Mastercard", "Visa", "Belkart", "Mir"]
))

              precision    recall  f1-score   support

  Mastercard       1.00      1.00      1.00       401
        Visa       1.00      1.00      1.00       401
     Belkart       0.00      0.00      0.00       200
         Mir       0.34      1.00      0.50       101

   micro avg       0.82      0.82      0.82      1103
   macro avg       0.58      0.75      0.63      1103
weighted avg       0.76      0.82      0.77      1103
 samples avg       0.83      0.83      0.82      1103



  _warn_prf(average, modifier, msg_start, len(result))


In [48]:
""" state = {

    'state_dict': model.state_dict(),

    'optimizer': optimizer.state_dict(),

}

torch.save(state, "D:\\Projects\\Priorbank\\Payment-logos\\For testing\\model_state.pt") """