In [1]:
%pip install -U bitsandbytes peft accelerate trl datasets wandb huggingface_hub

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting trl
  Downloading trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Collecting wandb
  Downloading wandb-0.17.5-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.24.2-py3-none-any.whl.metadata (13 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.5-py3-none-any.whl.metadata (8.2 kB)
Collecting docstring-parser>=0.16 (from tyro>=0.5.11->trl)
  Downloading docstring_parser-0.16-py3-none-any.whl.metadata (3.0 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.

In [2]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HF_TOKEN")

from huggingface_hub import login
login(token=secret_value_0)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import os
import json
import ast
import numpy as np
import pandas as pd
from pathlib import Path
from datasets import Dataset

# Data Processing 

In [5]:
def read_bbox_and_words(path: Path):
    bbox_and_words_list = []

    with open(path, 'r', errors='ignore') as f:
        for line in f.read().splitlines():
            if len(line) == 0:
                continue
            split_lines = line.split(",")
            bbox = np.array(split_lines[0:8], dtype=np.int32)
            text = ",".join(split_lines[8:])
            bbox_and_words_list.append([path.stem, *bbox, text])

    dataframe = pd.DataFrame(bbox_and_words_list, columns=['filename', 'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'line'])
    dataframe = dataframe.drop(columns=['x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3'])

    return dataframe

def read_label_file(file_path):
#     print(file_path)
    with open(file_path, 'r') as f:
        try:
            return ast.literal_eval(f.read())
        except:
            print('error in ', file_path)

def prepare_sroie_example(dataframe, label):
    # Combine all text lines
    full_text = ' '.join(dataframe['line'])

    return {
        "instruction": "Extract the following information from the given ocr text: company, date, address, total.",
        "input": full_text,
        "output": f"Company: {label.get('company', 'N/A')}\nDate: {label.get('date', 'N/A')}\nAddress: {label.get('address', 'N/A')}\nTotal: {label.get('total', 'N/A')}"
    }

def load_sroie_data(ocr_dir, label_dir):
#     print(ocr_dir)
    examples = []
    for filename in os.listdir(ocr_dir):
        if filename.endswith('.txt'):
            ocr_path = Path(ocr_dir) / filename
            label_path = Path(label_dir) / filename
#             print(label_path)
            if label_path.exists():
#                 print(ocr_path)
                dataframe = read_bbox_and_words(ocr_path)
                label = read_label_file(label_path)
                examples.append(prepare_sroie_example(dataframe, label))

    return examples

def create_prompt(sample):
    bos_token = "<s>"
    eos_token = "</s>"

    instruction = sample["instruction"]
    input_text = sample["input"]
    output = sample["output"]

    full_prompt = f"""{bos_token}### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{output}{eos_token}"""

    return full_prompt

# Load the data
sroie_folder_path = Path('/kaggle/input/SROIE2019')
train_data = load_sroie_data(sroie_folder_path / 'train/box/',
                             sroie_folder_path / 'train/entities/')
test_data = load_sroie_data(sroie_folder_path / 'test/box/',
                             sroie_folder_path / 'test/entities/')


import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.cuda.set_device(device)

with torch.cuda.device(device):
    train_set = Dataset.from_list(train_data)
    test_set = Dataset.from_list(train_data)

## Parameter Counting

In [4]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

def count_parameters(model):
    return sum(p.numel() for p in model.parameters())

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

# Load the model without quantization
model_fp32 = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
print("Parameters in FP32 model:")
print_trainable_parameters(model_fp32)
print(f"Total parameters: {count_parameters(model_fp32)}")

# Load the model with 4-bit quantization
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

model_nf4 = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    quantization_config=nf4_config,
)

print("\nParameters in 4-bit quantized model:")
print_trainable_parameters(model_nf4)
print(f"Total parameters: {count_parameters(model_nf4)}")

# After applying LoRA
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.2,
    r=32,
    bias="none",
    task_type="CAUSAL_LM"
)

model_nf4_lora = prepare_model_for_kbit_training(model_nf4)
model_nf4_lora = get_peft_model(model_nf4_lora, peft_config)

print("\nParameters in 4-bit quantized model with LoRA:")
print_trainable_parameters(model_nf4_lora)
print(f"Total parameters: {count_parameters(model_nf4_lora)}")

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Parameters in FP32 model:
trainable params: 7241732096 || all params: 7241732096 || trainable%: 100.0
Total parameters: 7241732096


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Parameters in 4-bit quantized model:
trainable params: 262410240 || all params: 3752071168 || trainable%: 6.993743675173274
Total parameters: 3752071168

Parameters in 4-bit quantized model with LoRA:
trainable params: 13631488 || all params: 3765702656 || trainable%: 0.36199055648434075
Total parameters: 3765702656


# Model Quantization

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    device_map='auto',
    quantization_config=nf4_config,
    use_cache=False
)

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Testing Before model FineTuning

In [7]:
def generate_response(prompt, model):
    encoded_input = tokenizer(prompt,  return_tensors="pt", add_special_tokens=True)
    model_inputs = encoded_input.to('cuda')

    generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)

    decoded_output = tokenizer.batch_decode(generated_ids)
    print(decoded_output)
    return decoded_output[0].replace(prompt, "")

In [8]:
test_set[0]

{'instruction': 'Extract the following information from the given ocr text: company, date, address, total.',
 'input': 'MYDIN TRI SHAAS SDN BHD (728515-M) MYDIN MART SRI MUDA 4-20, JALAN RIA 25/62 TAMAN SRI MUDA SEKSYEN 25, 40400 SHAH ALAM SELANGOR TEL : 03-51217970 GST ID: 000429166592 RECEIPT TERMINAL: 195-5505POS011 TRANS #: 322563 DATE : 10/08/2017 03:26:21PM CASHIER : CH107004 MSM - ALMIDAH PARTNER E/LOPE 4.5INX9.5IN W4292 EA .S 9555023304662 2 2.60 5.20 PLASTIC BAG RM0.20 EA .S 2300000017984 1 0.20 0.20 TEN Q E/LOPE 15INX10IN TQ-S1015 EA .S 9555023307724 2 2.50 5.00 UMOE S/NT 100MX75M, N/GRN EA .S 9555495401722 1 3.00 3.00 UMOE S/NT 100MX75M, N/YLW EA .S 9555495400985 1 3.00 3.00 UMOE S/NT 50MX38M, N/YLW EA .S 9555495400961 1 2.50 2.50 UMOE S/N 75MX50M, N/PINK EA .S 9555495401814 1 2.50 2.50 ITEM COUNT 9 TOTAL 21.40 0.00 21.40 ROUNDING ADJUSTMENT TOTAL ATTER ROUNDING CASH 50.00 CHANGE 28.60 GST RATE S = 6% Z = 0% AMT EXCL GST (RM) 20.19 0.00 TAX(RM) 1.21 0.00 POINTS MISSED: 20 JO

In [None]:
instruction = test_set[0]['instruction']
input_text = test_set[0]['input']
pre_finetune_output = generate_response(f"""### Instruction:
{instruction}

### Input:
{input_text}

### Response:""", model)

['<s> ### Instruction:\nExtract the following information from the given ocr text: company, date, address, total.\n\n### Input:\nTHE TOAST F&B SDN BHD (965752-T) LOT 2110&2111 JALAN PERMAS UTARA BANDAR BARU PERMAS 81750 JOHOR BAHRU JOHOR TEL: 07-3886880 GST ID NO: 002069884928 TAX INVOICE PS8418042783 08/04/2018 S/P: POS1 LOC: PMS WALK IN TABLE: 39 ITEM QTY U.PRICE DISC AMOUNT 404 TEH (ICE) 1 3.50 0.00 3.50 S 220 KAMPUNG 1 10.00 0.00 10.00 S 101 BUTTER KAYA 1 2.50 0.00 2.50 S 440 SUGARCA 1 2.90 .00 2.90 S TOTAL ITEMS: 4 SUB TOTAL: 18.90 LESS DISCOUNT: 0.00 ROUND: 0.00 TOTAL DUE (GST INC): 18.90 PAID: 50.00 CHANGE: 31.10 S:GST(6%) 17.83 1.07 PLEASE COME AGAIN! 08/04/2018 03:09:12 PM (ICE) CASH\n\n### Response:\nTOAST F&B SDN BHD, 08/04/2018, 965752-T, JALAN PERMAS UTARA BANDAR BARU PERMAS, JOHOR BAHRU, JOHOR, 40.40</s>']


# LoRA Configuartion and Model Training

In [9]:
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.2,
    r=32,
    bias="none",
    task_type="CAUSAL_LM"
)

peft_model = prepare_model_for_kbit_training(model)
peft_model = get_peft_model(model, peft_config)
peft_model = peft_model.to(device)

In [10]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./mistral-instruct-7b-finetuned-sroie",
    max_steps=120,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="steps",
    eval_steps=20,
    learning_rate=2e-4,
    lr_scheduler_type='constant',
    fp16=True,
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    no_cuda=False,
    dataloader_pin_memory=False
)

from trl import SFTTrainer

max_seq_length = 2048

trainer = SFTTrainer(
    model=peft_model,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=create_prompt,
    args=args,
    train_dataset=train_set,
    eval_dataset=test_set
)

2024-07-25 10:56:56.787293: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-25 10:56:56.800346: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-25 10:56:56.975469: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize

wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc





Step,Training Loss,Validation Loss
20,1.4458,1.331295
40,1.1796,1.114339





Step,Training Loss,Validation Loss
20,1.4458,1.331295
40,1.1796,1.114339
60,1.0555,0.973302
80,0.9514,0.882919
100,0.8599,0.815042
120,0.857,0.766791





TrainOutput(global_step=120, training_loss=1.0859841267267862, metrics={'train_runtime': 5832.7056, 'train_samples_per_second': 0.082, 'train_steps_per_second': 0.021, 'total_flos': 4.202078159241216e+16, 'train_loss': 1.0859841267267862, 'epoch': 2.909090909090909})

In [None]:
trainer.save_model("mistral-instruct-7b-finetuned-sroie-2")

In [None]:
import shutil
shutil.make_archive('finetuned_mistral_latest', 'zip', '/content/mistral-instruct-7b-finetuned-sroie-2')

'/content/finetuned_mistral_latest.zip'

In [None]:
i = 2
instruction = test_set[i]['instruction']
input_text = test_set[i]['input']
output = test_set[i]['output']
pre_finetune_output = generate_response(f"""### Instruction:
{instruction}

### Input:
{input_text}
""", peft_model)

['<s> ### Instruction:\nExtract the following information from the given ocr text: company, date, address, total.\n\n### Input:\nAEON CO. (M) BHD (126926-H) 3RD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMAN MALURI CHERAS, 55100 KUALA LUMPUR GST ID : 002017394688 SHOPPING HOURS SUN-THU: 1000 HRS - 2230 HRS FRI-SAT : 1000 HRS - 2300 HRS 1X 000007996511 75.00SR AMBROSIAL GREEK SUB-TOTAL 75.00 TOTAL SALES INCL GST 75.00 TOTAL AFTER ADJ INCL GST 75.00 CASH 100.00 ITEM COUNT 1 CHANGE AMT 25.00 INVOICE NO: 2018030610100080498 GST SUMMARY AMOUNT TAX SR @ 6% 70.75 4.25 TOTAL 70.75 4.25 06/03/2018 20:01 1010 008 00B0498 0305582 PJ PIRYALATHA AEON PERMAS JAYA TEL 1-300-80-AEON (2366) THANK YOU FOR YOUR PATRONAGE PLEASE COME AGAIN\n\n### Response:\nCompany: AEON CO. (M) BHD\nDate: 06/03/2018\nAddress: 3RD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMAN MALURI CHERAS, 55100 KUALA LUMPUR\nTotal: 75.00</s>']


In [None]:
print(pre_finetune_output)

<s> 

### Response:

Company: AEON CO. (M) BHD

Date: 06/03/2018

Address: 3RD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMAN MALURI CHERAS, 55100 KUALA LUMPUR

Total: 75.00</s>


In [None]:
from huggingface_hub import notebook_login

notebook_login()
trainer.push_to_hub("krishnapal2308/mistral-instruct-7b-finetuned-sroie")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

adapter_model.safetensors:   0%|          | 0.00/54.5M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

events.out.tfevents.1721885197.2d6e176ea1f9.525.0:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/krishnapal2308/mistral-instruct-7b-finetuned-sroie/commit/c1ccae45c742039bb24f91817fb011c3acc776cc', commit_message='krishnapal2308/mistral-instruct-7b-finetuned-sroie', commit_description='', oid='c1ccae45c742039bb24f91817fb011c3acc776cc', pr_url=None, pr_revision=None, pr_num=None)

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch

# Quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# # Load the base model with quantization
base_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    quantization_config=bnb_config,
    device_map="auto"
)

# Load the PEFT configuration and model
peft_model_id = "krishnapal2308/mistral-instruct-7b-finetuned-sroie"
config = PeftConfig.from_pretrained(peft_model_id)
model = PeftModel.from_pretrained(base_model, peft_model_id)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/54.5M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

In [4]:
def generate_response(prompt, model, tokenizer, max_new_tokens=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).split("### Response:")[-1].strip()

prompt = """### Instruction:
Extract the following information from the given OCR text: company, date, address, total.

### Input:
SATS PREMIER LOUNGE SINGAPORE CHANGI AIRPORT TERMINAL 2 DEPARTURE TRANSIT LOUNGE NORTH LEVEL 3 SINGAPORE 819643 TEL: 65822188 TAX INVOICE DATE : 20 APR 2018 TIME: 05:24 PM INV# : 2018042000032950 ITEM AMOUNT ENTRY 1 WALK-IN 64.20 SUB-TOTAL 64.20 GST 7% 4.20 ROUNDING ADJ 0.00 TOTAL 68.40 Goods Sold Are Not Returnable. This is a computer generated receipt. No signature is required.

### Response:
"""

response = generate_response(prompt, model, tokenizer)
print(response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
2024-07-25 10:25:28.223508: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-25 10:25:28.223635: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-25 10:25:28.351135: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Company: SATS PREMIER LOUNGE SINGAPORE CHANGI AIRPORT
Date: 20 APR 2018
Address: NORTH LEVEL 3 SINGAPORE 819643
Total: 68.40
