In [None]:
!git clone https://huggingface.co/cxfajar197/urdu-ocr

Cloning into 'urdu-ocr'...
remote: Enumerating objects: 49, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 49 (delta 17), reused 0 (delta 0), pack-reused 9 (from 1)[K
Unpacking objects: 100% (49/49), 583.43 KiB | 2.78 MiB/s, done.


In [None]:
!pip install wandb -q



In [None]:
!pip install transformers datasets pandas openpyxl pillow



# Set up Google Colab and mount Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoProcessor, AutoModelForVision2Seq, Trainer, TrainingArguments
from PIL import Image
import torch
import wandb

wandb.init(project="Urdu_OCR")

# Log in to your W&B account
# wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mamjadkudsi7[0m ([33mamjadkudsi7-university-of-new-haven[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

CUDA available: True
CUDA device: Tesla T4


# Load and preprocess the dataset

In [9]:
# Load the annotations
annotations_path = "/content/drive/MyDrive/Annotations.xlsx"
df = pd.read_excel(annotations_path, dtype=str)  # Force all columns to be read as strings
print(df.head())

   test/076_022_18.jpg  \
0  test/076_022_16.jpg   
1  test/076_022_15.jpg   
2  test/076_022_14.jpg   
3  test/076_022_13.jpg   
4  test/076_022_12.jpg   

  اندراج و تحریر شرعاً صرف مستحب اور پسندیدہ ہے وہ واجب نہیں کہ کسی شرعی  
0  اور ہندوستان کی مسلم ریاستوں میں اس پر کم و بی...                      
1  بلکہ اس کے نزدیک وہ پسندیدہ اور مستحسن بھی ہے،...                      
2  جہاں تک پہلی صورت کاتعلق ہے، شریعت اسلامیہ کی ...                      
3  ہے، اس لیے اس کولازمی طورپر جاری کردیاجائے، اس...                      
4  مقدمات سے نجات مل سکتی ہے، نکاح کے ثبوت اورک د...                      


In [4]:
# Set the base path for images
base_image_path = "/content/drive/MyDrive/Test/"  # Make sure this path is correct

# Function to load and preprocess images
def load_image(image_path):
    # Add error handling to check if file exists
    if not os.path.exists(image_path):
        print(f"Warning: Image not found at {image_path}. Skipping this example.")
        return None  # or raise an exception if you want to stop execution

    image = Image.open(image_path).convert("RGB")
    return image

def create_dataset(df):
    dataset = Dataset.from_pandas(df)

    def process_example(examples):
        # Access the image filename using the correct column name (e.g., 'image_path')
        # Assuming the image filename is in the first column (index 0)
        image_paths = [os.path.join(base_image_path, os.path.basename(example))
                      for example in examples[df.columns[0]]]  # Access the column values directly
        # Use os.path.basename to safely extract filename
        # Access column value directly using the column name as the key

        # Convert examples (list of dictionaries) to a pandas DataFrame for easier access by column name
        examples_df = pd.DataFrame(examples)

        images = [load_image(image_path) for image_path in image_paths]  # List of loaded images

        # Add images to examples in the batch (using the DataFrame for access)
        # Ensure that the length of 'images' matches the length of 'examples_df'
        # Handle the case where all images are None
        if all(img is None for img in images):  # Check if all images are None
            print("Warning: All images in this batch are missing or invalid. Returning an empty dataset.")
            return {}
        else:
            examples_df['image'] = [img for img in images if img is not None]

        # Filter out examples where the image loading failed
        examples_df = examples_df[examples_df['image'].notna()]
        # Check if the DataFrame is empty after dropping rows with missing images
        if examples_df.empty:
            print("Warning: All images in this batch are missing or invalid. Returning an empty dataset.")
            return {}  # Return an empty dictionary to prevent errors

        # Convert back to a list of dictionaries for compatibility with datasets.map
        examples = examples_df.to_dict('records')

        return examples

    # Apply the process_example function to the dataset
    dataset = dataset.map(process_example, batched=True, batch_size=2) # Reduced batch size

    # Check if the dataset is empty
    if dataset.num_rows == 0:
        print("Error: The dataset is empty after processing. Please check your image paths and file names.")
        return None # Return None to indicate an error

    return dataset

# Create train and validation datasets
train_df, val_df = df.iloc[:7500], df.iloc[7500:]


# Load the model and processor

In [5]:
model_name = "cxfajar197/urdu-ocr"
processor = AutoProcessor.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForVision2Seq.from_pretrained(model_name).to(device)

Config of the encoder: <class 'transformers.models.deit.modeling_deit.DeiTModel'> is overwritten by shared encoder config: DeiTConfig {
  "_name_or_path": "facebook/deit-base-distilled-patch16-384",
  "architectures": [
    "DeiTForImageClassificationWithTeacher"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "tench, Tinca tinca",
    "1": "goldfish, Carassius auratus",
    "2": "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias",
    "3": "tiger shark, Galeocerdo cuvieri",
    "4": "hammerhead, hammerhead shark",
    "5": "electric ray, crampfish, numbfish, torpedo",
    "6": "stingray",
    "7": "cock",
    "8": "hen",
    "9": "ostrich, Struthio camelus",
    "10": "brambling, Fringilla montifringilla",
    "11": "goldfinch, Carduelis carduelis",
    "12": "house finch, linnet, Carpodacus mexicanus",
    "13": "junco, snowbird",


# Prepare the datasets

In [6]:
# Define preprocess_data function here
def preprocess_data(examples):

    # Ensure images are of type PIL.Image.Image and convert to RGB
    # Instead of examples['image'], use examples[df.columns[0]] to access the image paths
    images = [load_image(os.path.join(base_image_path, os.path.basename(image_path))).convert("RGB")
              if isinstance(load_image(os.path.join(base_image_path, os.path.basename(image_path))), Image.Image)
              else load_image(os.path.join(base_image_path, os.path.basename(image_path)))
              for image_path in examples[df.columns[0]]]

    # Filter out None values from the images list before processing
    images = [image for image in images if image is not None]
    if not images:  # If images is empty after filtering
        print("Warning: All images in this batch are missing or invalid. Returning an empty dataset.")
        return {}  # Return an empty dictionary to prevent errors

    # Process images to get pixel values using the processor individually
    pixel_values = [processor(images=image, return_tensors="pt").pixel_values for image in images]

    # Stack the pixel values into a single tensor
    pixel_values = torch.stack(pixel_values, dim=0)

    # Get the text column name
    text_column_name = df.columns[1]

    # Tokenize each text individually, handling potential None or empty strings
    labels = []
    for text in examples[text_column_name]:
        if text is not None and text != "":  # Check if text is valid
            labels.append(processor.tokenizer(text, padding="max_length", truncation=True).input_ids)
        else:
            # Handle missing or invalid text data, e.g., replace with a special token
            labels.append(processor.tokenizer("[MISSING]", padding="max_length", truncation=True).input_ids)

    # Now use the processed image data (pixel_values) and text in the final dictionary
    final_inputs = {
        df.columns[0]: examples[df.columns[0]], # Original image path column
        df.columns[1]: examples[df.columns[1]], # Original text column
        "pixel_values": pixel_values.squeeze(1),  # Use 'pixel_values' key, remove extra dimension
        "labels": labels,  # Assign the tokenized labels
    }
    return final_inputs

# Create train and validation datasets
train_df, val_df = df.iloc[:7500], df.iloc[7500:]
train_dataset = create_dataset(train_df)
val_dataset = create_dataset(val_df)

# Check if train_dataset is None before proceeding
if train_dataset is None:
    print("Error: train_dataset is None. Check your image paths and data.")
    # Handle the error appropriately, e.g., exit the script or raise an exception
else:
    # Proceed with preprocessing if train_dataset is valid
    # Remove remove_columns to keep the necessary columns
    train_dataset = train_dataset.map(preprocess_data, batched=True)  # remove_columns removed

# Similar check for val_dataset
if val_dataset is None:
    print("Error: val_dataset is None. Check your image paths and data.")
    # Handle the error appropriately
else:
    # Remove remove_columns to keep the necessary columns
    val_dataset = val_dataset.map(preprocess_data, batched=True)  # remove_columns removed

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Map:   0%|          | 0/840 [00:00<?, ? examples/s]



Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Map:   0%|          | 0/840 [00:00<?, ? examples/s]



In [10]:
print("Train dataset features:", train_dataset.features)
print("Validation dataset features:", val_dataset.features)

Train dataset features: {'test/076_022_18.jpg': Value(dtype='string', id=None), 'اندراج و تحریر شرعاً صرف مستحب اور پسندیدہ ہے وہ واجب نہیں کہ کسی شرعی': Value(dtype='string', id=None)}
Validation dataset features: {'test/076_022_18.jpg': Value(dtype='string', id=None), 'اندراج و تحریر شرعاً صرف مستحب اور پسندیدہ ہے وہ واجب نہیں کہ کسی شرعی': Value(dtype='string', id=None)}


# Set up training arguments and trainer

In [7]:
# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=8,  # Increase batch size if possible
    per_device_eval_batch_size=8,
    num_train_epochs=5,  # Increase epochs if necessary
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
    report_to="wandb",
    logging_dir="./logs",  # Add logging directory
    ignore_data_skip=True,
    label_names = ["labels"],  # Add 'labels' here. You may need to add other columns as needed
    # Remove unused columns explicitly
    remove_unused_columns=False # This line will keep all columns
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor,
)

  trainer = Trainer(


# Fine-tune the model

In [8]:
trainer.train()



TypeError: can only join an iterable

# Save the fine-tuned model

In [None]:
model.save_pretrained("/content/drive/MyDrive/fine_tuned_urdu_ocr_model")
processor.save_pretrained("/content/drive/MyDrive/fine_tuned_urdu_ocr_model")

# Test the fine-tuned model

In [None]:
def test_model(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=100)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return generated_text

# Test with a sample image
sample_image_path = "/content/drive/MyDrive/Test/076_022_15.jpg"
result = test_model(sample_image_path)
print("Generated Text:", result)