In [None]:
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
!pip install bitsandbytes

In [None]:
!pip install unsloth_zoo

In [None]:
!pip install pdf2image

In [None]:
import torch
# Report which accelerators PyTorch can see. The memory-stats and inference
# cells further down call torch.cuda.* and move tensors to "cuda", so CUDA
# availability is reported next to the MPS check.
print(f"MPS available: {torch.backends.mps.is_available()}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"PyTorch version: {torch.__version__}")

In [6]:
import json
import os
from datetime import datetime

import boto3
import numpy as np
import pandas as pd
from pdf2image import convert_from_path
from PIL import Image

# AWS credentials are read from the environment so that no secret is stored in
# the notebook. Keep these as plain assignments: a trailing comma after the
# value would turn it into a 1-tuple, which is not a valid credential.
# When a variable is unset the value is None and boto3 falls back to its
# default credential resolution (shared config files, role credentials, ...).
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

# NOTE(review): the S3 helpers in this notebook call boto3.client('s3')
# directly, i.e. they use boto3's default session rather than this object.
boto3_session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# 1. Create Dataset:

In [7]:
def list_s3_objects(bucket_name, prefix=''):
    """
    List the keys of every object stored under a prefix in an S3 bucket.

    The listing goes through the ``list_objects_v2`` paginator so that results
    spanning more than one response page (S3 caps a single response at 1000
    keys) are returned in full instead of being silently truncated.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Only keys starting with this prefix are returned. The default
            '' lists the whole bucket.

    Returns:
        A list of object keys in the order S3 returns them; an empty list when
        no object matches.
    """
    s3_client = boto3.client('s3')
    paginator = s3_client.get_paginator('list_objects_v2')

    keys = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # A page without matching objects carries no 'Contents' entry.
        keys.extend(obj['Key'] for obj in page.get('Contents', []))
    return keys

In [8]:
def download_from_s3(s3_path, local_path, bucket_name='YOUR-BUCKET-NAME'):
    """
    Download a single object from S3 to a local file.

    Args:
        s3_path: Key of the object inside the bucket.
        local_path: Destination file path. Its parent directory is created
            when it does not exist yet.
        bucket_name: Bucket to download from. Defaults to the placeholder
            bucket name used throughout this notebook.

    Raises:
        Exception: Any error raised by the S3 client is printed and re-raised
            unchanged.
    """
    # download_file writes straight to local_path and does not create
    # intermediate directories, so make sure the target folder exists.
    target_dir = os.path.dirname(local_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    try:
        s3_client = boto3.client('s3')
        s3_client.download_file(bucket_name, s3_path, local_path)
    except Exception as e:
        print(f"Error downloading from S3: {e}")
        raise

In [9]:
def load_json(bucket_name, file_key):
    """
    Read a JSON file stored on S3 into a pandas DataFrame.

    Keys ending in '.jsonl' or '.ndjson' are parsed as line-delimited JSON
    (one record per line); any other key is parsed as a regular JSON document.

    Args:
        bucket_name: Name of the S3 bucket that holds the file.
        file_key: Key of the JSON file inside the bucket.

    Returns:
        The DataFrame produced by ``pd.read_json``.
    """
    s3_path = f's3://{bucket_name}/{file_key}'

    is_line_delimited = file_key.endswith(('.jsonl', '.ndjson'))
    if is_line_delimited:
        return pd.read_json(s3_path, lines=True)

    # Regular JSON document: handed to pandas as-is.
    return pd.read_json(s3_path)

In [10]:
def convert_pdf_to_single_image(pdf_path, dpi=80):
    """
    Render a PDF and stack all of its pages into one tall image.

    Args:
        pdf_path: Path of the PDF file on the local disk.
        dpi: Rendering resolution handed to ``convert_from_path``. The default
            of 80 keeps the previous fixed behaviour.

    Returns:
        None when the PDF yields no pages; the page image itself when there is
        exactly one page; otherwise a new image as wide as the widest page and
        as tall as the sum of all page heights, with pages pasted top to bottom
        and aligned to the left edge.
    """
    # One image per page, in page order.
    pages = convert_from_path(pdf_path, dpi=dpi)

    if not pages:
        return None

    if len(pages) == 1:
        return pages[0]

    canvas_width = max(page.width for page in pages)
    canvas_height = sum(page.height for page in pages)

    # NOTE(review): the stitched canvas is grayscale ('L'), whereas a
    # single-page PDF is returned exactly as convert_from_path produced it, so
    # the two paths may differ in colour mode - confirm this is intended.
    combined_image = Image.new('L', (canvas_width, canvas_height), 'white')

    # Paste each page directly below the previous one.
    y_offset = 0
    for page in pages:
        combined_image.paste(page, (0, y_offset))
        y_offset += page.height

    return combined_image

In [11]:
# Prompt text placed in the user turn of every training conversation.
instruction = "You are an expert of extracting fines and tolls informations from invoices."


def convert_to_conversation(sample):
    """
    Wrap one dataset sample in a two-turn chat record.

    The user turn carries the module-level ``instruction`` text followed by
    ``sample["image"]``; the assistant turn carries ``sample["caption"]`` as
    its text.

    Args:
        sample: Mapping with at least the keys "image" and "caption".

    Returns:
        A dict of the form {"messages": [user_turn, assistant_turn]}.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [
            {"type": "text", "text": sample["caption"]},
        ],
    }
    return {"messages": [user_turn, assistant_turn]}

In [None]:
# Location of the labels file on S3. load_json() takes the bucket name first
# and the object key second. Replace the placeholders before running.
PATH = 'YOUR-BUCKET-NAME'            # S3 bucket that holds the labels file
FILE_NAME = 'YOUR-LABELS-FILE.json'  # object key; .jsonl/.ndjson is read as line-delimited JSON

df = load_json(PATH, FILE_NAME)
df.head()

In [None]:
columns = ['license_plate', 'event_date', 'amount']


def format_row(row):
    """
    Render one label row as a single-quoted, dict-like caption string.

    The result looks like
    ``{'license_plate':'...', 'event_date':'...', 'amount':'12.50'}``.
    It uses single quotes, so it is not strict JSON. ``row['amount']`` is
    formatted with two decimals and therefore must be numeric.

    Args:
        row: Mapping (for example a DataFrame row) with the keys
            'license_plate', 'event_date' and 'amount'.

    Returns:
        The caption string for that row.
    """
    pairs = (
        f"'license_plate':'{row['license_plate']}'",
        f"'event_date':'{row['event_date']}'",
        f"'amount':'{row['amount']:.2f}'",
    )
    body = ', '.join(pairs)
    return '{' + body + '}'

# Add a 'caption' column: format_row is applied to every row (axis=1) and
# returns that row's caption string. This mutates df in place.
df['caption'] = df.apply(format_row, axis=1)

# Preview the frame with the new column.
df.head()

In [22]:
# Collect the caption strings of all rows that share a 'pdf_path' into one
# list per PDF; 'pdf_path' comes back as a regular column.
captions_per_pdf = df.groupby('pdf_path')['caption'].agg(list)
grouped_df = captions_per_pdf.reset_index()

In [None]:
# Caption list of the first grouped PDF (row label 0, column 'caption').
grouped_df.loc[0, 'caption']

In [24]:
# Keys of the objects stored under the PDF prefix of the bucket.
document_list = list_s3_objects(
    bucket_name='YOUR-BUCKET-NAME',
    prefix='LOCAL-PDF-FOLDER',
)

In [None]:
# First four keys, as a quick sanity check of the listing.
document_list[:4]

In [None]:
# Parallel accumulators: index i of each list describes the same document.
main_images = []
main_image_ids = []
main_image_caption = []

# The loop downloads into ./pdfs and saves previews into ./images, so both
# folders must exist before the first iteration.
os.makedirs('./pdfs', exist_ok=True)
os.makedirs('./images', exist_ok=True)

for file in document_list:
    pdf_name = file.split('/')[-1]
    if not pdf_name:
        # Keys ending in '/' (S3 "folder" placeholders) have no file name.
        continue

    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] INFO: Processing File: {file}")

    local_pdf_path = f"./pdfs/{pdf_name}"
    download_from_s3(file, local_pdf_path)
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] INFO: Local copy with Success: {pdf_name}")

    try:
        images = convert_pdf_to_single_image(local_pdf_path)
    except Exception as exc:
        # Best effort: one unreadable PDF must not abort the whole run, but the
        # failure is reported instead of being silently swallowed.
        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] WARNING: Could not convert {pdf_name}: {exc}")
        images = None

    if images is None:
        continue

    # Labels are matched on the bare file name stored in grouped_df.pdf_path.
    matching_captions = grouped_df.loc[grouped_df.pdf_path == pdf_name, 'caption']
    if matching_captions.empty:
        # Without labels there is no target text to train on.
        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] WARNING: No labels found for {pdf_name}, skipping")
        continue

    # The caption becomes the assistant's *text* turn, so it has to be a plain
    # string: join the per-row caption strings into one bracketed list.
    caption = '[' + ', '.join(matching_captions.iloc[0]) + ']'

    main_images.append(images)
    main_image_ids.append(pdf_name)
    main_image_caption.append(caption)

    # Keep a JPEG copy of the stitched image for visual inspection.
    images.save(f'./images/agg_{pdf_name}' + '.jpg', 'JPEG')

In [None]:
# Display the first image collected by the loop above as a visual sanity check.
main_images[0]

In [29]:
# One record per document. The three source lists are index-aligned, so zip()
# yields (image, file name, caption) triples in the order they were collected.
dataset = [
    {'image': image, 'image_id': image_id, 'caption': caption}
    for image, image_id, caption in zip(main_images, main_image_ids, main_image_caption)
]

In [None]:
# Turn every record into the {"messages": [...]} chat format, then show the
# first converted record.
converted_dataset = list(map(convert_to_conversation, dataset))
converted_dataset[0]

# 2. Training Model:

In [None]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# Reference catalogue of checkpoint names. Nothing in the visible notebook
# reads `fourbit_models`; it only documents the available options.
# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit", # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",

    "unsloth/Pixtral-12B-2409-bnb-4bit",              # Pixtral fits in 16GB!
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",         # Pixtral base model

    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",          # Qwen2 VL support
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",

    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",      # Any Llava variant works!
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# Load the checkpoint to fine-tune. The name passed here has no "-bnb-4bit"
# suffix (unlike the entries listed above); 4-bit loading is requested through
# load_in_4bit instead.
# The second return value is named `tokenizer`, but the test cell later calls
# it with an image as well as text.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

In [None]:
# Rebind `model` to the result of get_peft_model, configured for LoRA with
# r=16 and lora_alpha=16. Vision layers are excluded
# (finetune_vision_layers=False); language layers, attention modules and MLP
# modules are included.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)

In [21]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

# Put the model into training mode before the trainer is built.
FastVisionModel.for_training(model) # Enable for training!

# Supervised fine-tuning setup. `converted_dataset` is the plain Python list of
# {"messages": [...]} records built earlier, so dataset preparation is skipped
# (skip_prepare_dataset=True) and batching is delegated to
# UnslothVisionDataCollator.
# The run is capped at max_steps=100, with a per-device batch of 2 and 4
# gradient-accumulation steps (8 samples per optimizer step per device).
# Exactly one of fp16/bf16 is enabled: bf16 when is_bf16_supported() is true,
# fp16 otherwise.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = converted_dataset,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 100,
        # num_train_epochs = 1, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",     # For Weights and Biases

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 1024,
    ),
)

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
%%time
# Run the fine-tuning loop. `trainer_stats.metrics['train_runtime']` is read by
# the final-stats cell.
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

# Test:

In [None]:
# Qualitative check: switch the model to inference mode and stream the
# generated answer for one image taken from the training data.
FastVisionModel.for_inference(model) # Enable for inference!

# NOTE(review): index 2 is hard-coded, so `dataset` needs at least three
# samples; this is also a training sample, not held-out data.
image = dataset[2]["image"]

# Re-assigns the module-level `instruction` with the same text that
# convert_to_conversation used for the training prompts.
instruction = "You are an expert of extracting fines and tolls informations from invoices."

# NOTE(review): here the image entry precedes the text entry, whereas the
# training conversations put the text entry first - confirm this is intended.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
# The object named `tokenizer` is called with both the image and the templated
# text; the resulting tensors are moved to the CUDA device.
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# Stream tokens to the cell output as they are generated, omitting the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
# NOTE(review): temperature/min_p are passed without do_sample=True - confirm
# whether sampling is actually active with this configuration.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 618,
                   use_cache = True, temperature = 0.1, min_p = 0.1)

In [None]:
# Write the model and the tokenizer to the local "lora_model" directory.
# The commented-out lines show the equivalent upload calls.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving