In [1]:
import os
import zipfile
from tqdm import tqdm
from PIL import Image, ImageOps

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
zip_path = '/content/drive/MyDrive/Image_Dataset.zip'
extract_path = '/content/mendeley_images'
output_local_folder = "/content/images_processed"

if not os.path.exists(extract_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"Archivos descomprimidos en: {extract_path}")

Archivos descomprimidos en: /content/mendeley_images


In [7]:
if not os.path.exists(output_local_folder):
    os.makedirs(output_local_folder)
    print(f"Created directory: {output_local_folder}")

Created directory: /content/images_processed


In [3]:
def process_pharmacy_images(input_root, output_folder, target_size=470):
    processed_count = 0

    # os.walk traverses all subdirectories automatically
    for root, dirs, files in os.walk(input_root):
        # Filter for images
        valid_files = [f for f in files if f.lower().endswith((".jpg", ".png", ".jpeg"))]

        if not valid_files:
            continue

        print(f"Processing {len(valid_files)} images found in {root}...")

        for filename in tqdm(valid_files):
            try:
                img_path = os.path.join(root, filename)
                img = Image.open(img_path).convert("RGB")

                # Dynamic Padding to make it square
                w, h = img.size
                if w > h:
                    # Landscape: pad top/bottom
                    pad_total = w - h
                    padding = (0, pad_total // 2, 0, pad_total - (pad_total // 2))
                else:
                    # Portrait: pad left/right
                    pad_total = h - w
                    padding = (pad_total // 2, 0, pad_total - (pad_total // 2), 0)

                # Add padding to make it square
                temp_img = ImageOps.expand(img, padding, fill='white')

                # Resize to target (e.g., 470x470)
                final_img = temp_img.resize((target_size, target_size), Image.Resampling.LANCZOS)

                # Save
                final_img.save(os.path.join(output_folder, f"padded_{filename}"))
                processed_count += 1
            except Exception as e:
                print(f"Error processing {filename}: {e}")

    print(f"\n✅ Total de imágenes procesadas: {processed_count}")

In [8]:
process_pharmacy_images(extract_path, output_local_folder)

Processing 14474 images found in /content/mendeley_images/Image_Dataset...


100%|██████████| 14474/14474 [00:45<00:00, 317.33it/s]


✅ Total de imágenes procesadas: 14474





In [9]:
import shutil

zip_output_name = '/content/drive/MyDrive/images_processed'

shutil.make_archive(zip_output_name, 'zip', output_local_folder)

print(f"✅ Carpeta comprimida y guardada en Drive como: {zip_output_name}.zip")

✅ Carpeta comprimida y guardada en Drive como: /content/drive/MyDrive/images_processed.zip


In [4]:
!pip install easyocr

Installing collected packages: python-bidi, pyclipper, ninja, easyocr
Successfully installed easyocr-1.7.2 ninja-1.13.0 pyclipper-1.4.0 python-bidi-0.6.7


In [5]:
import easyocr
import json
import os
from tqdm import tqdm

In [10]:
# Inicializamos el lector para español e inglés
reader = easyocr.Reader(['es', 'en'])
ocr_results = []
output_folder = "/content/images_processed"



Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet

In [12]:
# List files and process them with a progress bar
files = [f for f in os.listdir(output_folder) if f.endswith(('.jpg', '.png', '.jpeg'))]

for filename in tqdm(files):
    img_path = os.path.join(output_folder, filename)

    # Read text from the image
    results = reader.readtext(img_path)

    # Filter by confidence > 0.7 to ensure training quality
    detected_text = " ".join([res[1] for res in results if res[2] > 0.7])

    if detected_text.strip():
        # Create the structure for the Unsloth Master Dataset
        ocr_results.append({
            "instruction": "Which medication is shown in the image?",
            "context": "", # Empty for the visual track
            "answer": f"The image shows a package of {detected_text.strip()}.",
            "image": img_path
        })

100%|██████████| 14474/14474 [15:25<00:00, 15.64it/s]


In [13]:
# Save the JSONL file
output_jsonl = "pipeline_ocr.jsonl"
with open(output_jsonl, "w", encoding="utf-8") as f:
    for item in ocr_results:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"\n✅ Image pipeline completed: {len(ocr_results)} examples saved to {output_jsonl}")


✅ Image pipeline completed: 5000 examples saved to pipeline_ocr.jsonl


In [14]:
import pandas as pd

# Load the JSONL file into a DataFrame
df_img = pd.read_json(output_jsonl, lines=True)

print(f"✅ Total visual examples: {len(df_img)}")
print("\nDetected columns:", df_img.columns.tolist())

✅ Total de ejemplos visuales: 5000

Columnas detectadas: ['instruction', 'context', 'answer', 'image']


Unnamed: 0,instruction,context,answer,image
0,¿Qué medicamento se muestra en la imagen?,,En la imagen se observa un envase de Dapsone.,/content/images_processed/padded_d5_aug1.jpg
1,¿Qué medicamento se muestra en la imagen?,,En la imagen se observa un envase de BBODEB.,/content/images_processed/padded_p32_aug3.jpg
2,¿Qué medicamento se muestra en la imagen?,,En la imagen se observa un envase de Ozomet-VG1.,/content/images_processed/padded_o27.jpg
3,¿Qué medicamento se muestra en la imagen?,,En la imagen se observa un envase de 2.,/content/images_processed/padded_o83_aug4.jpg
4,¿Qué medicamento se muestra en la imagen?,,En la imagen se observa un envase de Cyclobenz...,/content/images_processed/padded_2_aug1.jpg


In [15]:
display(df_img.head())

Unnamed: 0,instruction,context,answer,image
0,¿Qué medicamento se muestra en la imagen?,,En la imagen se observa un envase de Dapsone.,/content/images_processed/padded_d5_aug1.jpg
1,¿Qué medicamento se muestra en la imagen?,,En la imagen se observa un envase de BBODEB.,/content/images_processed/padded_p32_aug3.jpg
2,¿Qué medicamento se muestra en la imagen?,,En la imagen se observa un envase de Ozomet-VG1.,/content/images_processed/padded_o27.jpg
3,¿Qué medicamento se muestra en la imagen?,,En la imagen se observa un envase de 2.,/content/images_processed/padded_o83_aug4.jpg
4,¿Qué medicamento se muestra en la imagen?,,En la imagen se observa un envase de Cyclobenz...,/content/images_processed/padded_2_aug1.jpg
