In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Summarise the input directory instead of walking it for nothing: the original
# loop body was only `pass`, and printing every path would flood the output for
# an image dataset, so report a single file count.
input_file_count = 0
for dirname, _, filenames in os.walk('/kaggle/input'):
    input_file_count += len(filenames)
print(f"Found {input_file_count} files under /kaggle/input")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
# --- 1. INSTALL DEPENDENCIES ---
# Kaggle comes with many libraries, but we need specific versions for Qwen2-VL
print("Installing libraries...")
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q accelerate qwen-vl-utils json_repair

import os
import zipfile
import json
import torch
import pandas as pd
import re
from tqdm import tqdm
from json_repair import repair_json
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# --- 2. DATA SETUP ---
# In Kaggle, input data is usually at /kaggle/input/{competition_name}
# Since the input directory is Read-Only, we usually unzip to /kaggle/working/

# NOTE: Check the specific path in your "Input" tab.
# It might be a zip file or already unzipped folders.
# This logic assumes it is a ZIP file based on your previous code.

INPUT_DIR = '/kaggle/input/poli-meme-decode-cuet-cse-fest'
WORKING_DIR = '/kaggle/working/unzipped'

# Pick the FIRST .zip found under INPUT_DIR and stop walking immediately.
# (A plain `break` inside the nested loops only leaves the inner loop, so the
# walk would continue and a later directory's zip would overwrite the result.)
ZIP_FILE = next(
    (
        os.path.join(root, file)
        for root, dirs, files in os.walk(INPUT_DIR)
        for file in files
        if file.endswith(".zip")
    ),
    None,
)

if ZIP_FILE:
    print(f"Found zip file: {ZIP_FILE}")
    if not os.path.exists(WORKING_DIR):
        # The input directory is read-only, so unpack into the working directory.
        print(f"Extracting to {WORKING_DIR}...")
        os.makedirs(WORKING_DIR, exist_ok=True)
        with zipfile.ZipFile(ZIP_FILE, 'r') as zip_ref:
            zip_ref.extractall(WORKING_DIR)
        print("Extraction complete.")
    else:
        # NOTE(review): an interrupted earlier extraction also leaves this directory
        # behind; delete it manually if files appear to be missing.
        print("Directory already exists, skipping extraction.")
else:
    # Fallback: no archive present, so read the data straight from the input directory.
    print("No zip file found. Assuming data is uncompressed in Input.")
    WORKING_DIR = INPUT_DIR



Installing libraries...
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
No zip file found. Assuming data is uncompressed in Input.


In [5]:
# Locate the image folder and the label CSV inside the dataset root.
# Both live under <WORKING_DIR>/PoliMemeDecode/Train; change 'PoliMemeDecode'
# if the archive unpacks to a differently named top-level folder.
train_root = os.path.join(WORKING_DIR, 'PoliMemeDecode', 'Train')
IMG_DIR = os.path.join(train_root, 'Image')
TEST_CSV = os.path.join(train_root, 'Train.csv')

print("Image Directory: {}".format(IMG_DIR))
print("CSV Path: {}".format(TEST_CSV))

Image Directory: /kaggle/input/poli-meme-decode-cuet-cse-fest/PoliMemeDecode/Train/Image
CSV Path: /kaggle/input/poli-meme-decode-cuet-cse-fest/PoliMemeDecode/Train/Train.csv


In [6]:
# for collab

from google.colab import files

# Bind the result to a name: a bare `files.upload()` as the last expression makes
# the notebook echo the returned {filename: bytes} mapping, which writes the
# contents of kaggle.json (the API key) into the saved cell output.
uploaded_files = files.upload()
print(f"Uploaded: {list(uploaded_files)}")

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [7]:
! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/

!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c poli-meme-decode-cuet-cse-fest

Downloading poli-meme-decode-cuet-cse-fest.zip to /content
 77% 280M/364M [00:00<00:00, 825MB/s] 
100% 364M/364M [00:00<00:00, 671MB/s]


In [8]:
import zipfile
import os

zip_file_path = '/content/poli-meme-decode-cuet-cse-fest.zip'
extraction_path = '/content/unzipped'

# Create the extraction directory if it doesn't exist
os.makedirs(extraction_path, exist_ok=True)

# Unzip the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_path)

print(f"Successfully unzipped {zip_file_path} to {extraction_path}")

# List the contents of the extraction path to verify
print(f"Contents of {extraction_path}:")
!ls {extraction_path}

Successfully unzipped /content/poli-meme-decode-cuet-cse-fest.zip to /content/unzipped
Contents of /content/unzipped:
PoliMemeDecode


In [9]:
# --- 1. INSTALL UPDATED LIBRARIES (Crucial for Qwen2.5) ---
# Qwen2.5-VL requires the very latest transformers and utils
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q accelerate bitsandbytes
!pip install -q --upgrade qwen-vl-utils

import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info

# --- 2. CONFIGURE QUANTIZATION ---
# 4-bit NF4 weights with double quantization; matmuls are computed in bfloat16.
# (The original author's note: this squeezes the 7B model into ~6GB VRAM.)
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# --- 3. LOAD MODEL (Colab Optimized) ---
print("Loading VLM Model (Qwen2.5-VL-7B-Instruct) with 4-bit quantization...")
model_name = "Qwen/Qwen2.5-VL-7B-Instruct"

# Keyword arguments for from_pretrained, gathered in one place for readability.
load_kwargs = {
    "device_map": "auto",
    "quantization_config": bnb_config,
    "torch_dtype": torch.bfloat16,
    "low_cpu_mem_usage": True,   # flagged by the original author as critical on Colab
    "offload_buffers": True,     # flagged by the original author as helping when VRAM is tight
}
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_name, **load_kwargs)

# The processor is loaded from the same checkpoint name as the model.
processor = AutoProcessor.from_pretrained(model_name)

print("Model loaded successfully on GPU (4-bit)!")




  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hLoading VLM Model (Qwen2.5-VL-7B-Instruct) with 4-bit quantization...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

Unrecognized keys in `rope_parameters` for 'rope_type'='default': {'mrope_section'}


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/729 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

Model loaded successfully on GPU (4-bit)!


In [13]:
import gc
from PIL import Image

# --- 4. IMPROVED PARSING & GENERATION LOGIC ---

def resize_image_if_needed(image_path, max_size=1024):
    """
    Return a path to a version of the image whose sides are at most ``max_size`` px.

    Images already within the limit are returned unchanged (the original path).
    Larger images are downscaled with ``Image.thumbnail`` (aspect ratio preserved)
    and written as JPEG to a fixed temporary file, so the original data is never
    modified. The temporary file is overwritten on every call.

    Args:
        image_path: Path of the image file to inspect.
        max_size: Maximum allowed width and height in pixels.

    Returns:
        The path to hand to the model (original or temporary file), or ``None``
        if the image could not be opened, converted or saved.
    """
    temp_path = "/content/temp_processing/temp_processing.jpg"
    try:
        with Image.open(image_path) as img:
            # Nothing to do when both sides already fit.
            if img.width <= max_size and img.height <= max_size:
                return image_path

            img.thumbnail((max_size, max_size))  # in-place, maintains aspect ratio

            # JPEG cannot store alpha or palette modes (e.g. RGBA/P from PNG memes);
            # without this conversion the save raises and the image is skipped.
            resized = img if img.mode in ("RGB", "L") else img.convert("RGB")

            # The temp folder does not exist on a fresh runtime; create it on demand.
            os.makedirs(os.path.dirname(temp_path), exist_ok=True)
            resized.save(temp_path, format="JPEG")
            return temp_path
    except Exception as e:
        # Best-effort: report and let the caller record the failure for this image.
        print(f"Image open error: {e}")
        return None


# --- 4. PARSING LOGIC (Helper Functions) ---

# def parse_robust_json(raw_text):
#     """
#     Attempts to parse JSON from the model output.
#     """
#     cleaned_text = raw_text.replace("```json", "").replace("```", "").strip()

#     data = {
#         "detected_text_blocks": [],
#         "final_caption": "",
#         "explanation": ""
#     }

#     # Strategy A: Smart Repair
#     try:
#         parsed = json.loads(repair_json(cleaned_text))
#         if isinstance(parsed, dict):
#             data.update(parsed)
#             return data
#     except Exception:
#         pass

#     # Strategy B: Regex Extraction
#     caption_match = re.search(r'"final_caption":\s*"(.*?)"', cleaned_text, re.DOTALL)
#     if caption_match:
#         data["final_caption"] = caption_match.group(1)

#     expl_match = re.search(r'"explanation":\s*"(.*?)"', cleaned_text, re.DOTALL)
#     if expl_match:
#         data["explanation"] = expl_match.group(1)

#     blocks_match = re.search(r'"detected_text_blocks":\s*(\[.*?\])', cleaned_text, re.DOTALL)
#     if blocks_match:
#         try:
#             blocks_list = json.loads(repair_json(blocks_match.group(1)))
#             if isinstance(blocks_list, list):
#                 data["detected_text_blocks"] = blocks_list
#         except:
#             pass

#     return data

def parse_robust_json(raw_text):
    """
    Extract the ``detected_text`` list from the model's reply.

    The reply is expected to be a JSON object of the form
    ``{"detected_text": ["...", ...]}``, possibly wrapped in Markdown code fences
    or surrounded by extra prose. Parsing is attempted in three steps:

    1. Parse the whole (fence-stripped) reply as JSON.
    2. Locate ``"detected_text": [`` and decode the JSON array that starts there,
       ignoring whatever follows it.
    3. Take the text up to the first ``]`` and drop a trailing comma before it.

    Args:
        raw_text: The raw string returned by the model.

    Returns:
        A dict with two keys: ``"detected_text"`` (the parsed value, or ``[]`` if
        nothing could be parsed) and ``"raw_output"`` (the fence-stripped reply).
    """
    # Clean code blocks if present
    cleaned_text = raw_text.replace("```json", "").replace("```", "").strip()

    data = {
        "detected_text": [],
        "raw_output": cleaned_text,  # keep the raw text for debugging
    }

    # Step 1: the happy path, where the reply is exactly one JSON object.
    try:
        parsed = json.loads(cleaned_text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        parsed = None
    if isinstance(parsed, dict) and "detected_text" in parsed:
        data["detected_text"] = parsed["detected_text"]
        return data

    # Steps 2 and 3 both start from the opening bracket of the list.
    key_match = re.search(r'"detected_text":\s*\[', cleaned_text)
    if key_match is None:
        return data
    list_start = key_match.end() - 1  # index of the '['

    # Step 2: decode a well-formed array even if prose follows it or one of the
    # strings itself contains ']'.
    try:
        value, _ = json.JSONDecoder().raw_decode(cleaned_text, list_start)
        data["detected_text"] = value
        return data
    except ValueError:
        pass

    # Step 3: lenient fallback. Cut at the first ']' and remove a trailing comma,
    # including one followed by whitespace/newlines (common in pretty-printed output).
    list_match = re.compile(r'\[.*?\]', re.DOTALL).match(cleaned_text, list_start)
    if list_match:
        list_str = re.sub(r',\s*\]', ']', list_match.group(0))
        try:
            data["detected_text"] = json.loads(list_str)
        except ValueError:
            pass  # leave the empty default; raw_output still holds the reply

    return data

def generate_vlm_response(image_path, model, processor):
    """
    Ask the vision-language model to transcribe every text block in a meme image.

    The image is first downscaled (if needed) via ``resize_image_if_needed``, then
    sent together with an OCR prompt through the processor's chat template.

    Args:
        image_path: Path of the image file to transcribe.
        model: The loaded generation model (must provide ``generate``).
        processor: The matching processor (chat template, tensor preparation, decoding).

    Returns:
        The model's reply as a stripped string (expected to be the JSON object the
        prompt asks for). On any failure a string starting with ``"Error"`` is
        returned instead of raising, so callers can test ``startswith("Error")``.
    """
    # 1. Downscale oversized images first to keep GPU memory usage bounded.
    processed_path = resize_image_if_needed(image_path, max_size=1024)

    if not processed_path:
        return "Error: Could not process image file because no processed_path"

    if not os.path.exists(processed_path):
        return "Error: Could not process image file because no processed_path 2"

    prompt_text = """
    Task: Perform a granular OCR scan of this image.
    You are given a meme. You will extract EACH AND EVERY BLOCK of the VISIBLE TEXTS in this image.

    Rules:
    1. Scan the image from Top-Left to Bottom-Right.
    2. Identify EVERY SEPERATE BLOCK of text.
    3. SEPARATION RULE: If two pieces of text are visually separated, treat them as DIFFERENT list items. Do not merge them.
    4. **CRITICAL:** Memes can have Bangla and/or English words mixed. If the text is Bangla, write in BANGLA SCRIPT. If English, write in English. Do NOT transcript to the other language.
    5. Transcribe the text EXACTLY as written.
    6. Do NOT output duplicate lines.

    OUTPUT FORMAT:
    Return a SINGLE JSON object containing a list of strings.
    {
      "detected_text": [
        "Top header text",
        "Middle meme caption",
        "Another meme text",
        "Bottom punchline text"
      ]
    }
    """

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": processed_path},
                {"type": "text", "text": prompt_text},
            ],
        }
    ]

    # Pre-bind the tensor-holding names so the `finally` block can always release them.
    inputs = None
    generated_ids = None
    new_token_ids = None
    try:
        text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = processor(
            text=[text_input],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda")

        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                # Low-temperature nucleus sampling: close to greedy, but still
                # stochastic, so repeated runs may differ slightly.
                do_sample=True,
                temperature=0.2,
                top_p=0.95,
            )

        # generate() returns prompt + completion. Drop the prompt tokens before
        # decoding so the reply never has to be recovered by splitting the decoded
        # text on "assistant\n" (which breaks if the OCR text contains that string).
        new_token_ids = [
            output_ids[len(prompt_ids):]
            for prompt_ids, output_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text_list = processor.batch_decode(
            new_token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        # Safe extraction
        if output_text_list:
            final_text = output_text_list[0]
        else:
            final_text = "Error: No text generated slkdjfioeu"

        return final_text.strip()

    except Exception as e:
        return f"Error: {str(e)}"

    finally:
        # --- MEMORY CLEANUP --- (runs on success and on failure)
        # Drop the tensor references first so empty_cache() can actually free them.
        inputs = None
        generated_ids = None
        new_token_ids = None
        torch.cuda.empty_cache()
        gc.collect()




# # --- 5. EXECUTION ---

# # Verify CSV exists
# if not os.path.exists(TEST_CSV):
#     print(f"ERROR: CSV file not found at {TEST_CSV}")
#     # Try to find it recursively if path is wrong
#     for root, dirs, files in os.walk(WORKING_DIR):
#         if "Test.csv" in files:
#             TEST_CSV = os.path.join(root, "Test.csv")
#             print(f"Found CSV at: {TEST_CSV}")
#             break

# test_df = pd.read_csv(TEST_CSV)
# results = []
# subset = test_df.head(10).copy() # Processing first 10 for testing

# print("Processing...")
# tqdm.pandas()

# for index, row in tqdm(subset.iterrows(), total=subset.shape[0]):
#     # Fix image name if it has extra spaces or path issues
#     img_name = str(row['Image_name']).strip()
#     full_img_path = os.path.join(IMG_DIR, img_name)

#     res = generate_vlm_response(full_img_path, model, processor)

#     if res and not res.startswith("Error"):
#         parsed = parse_robust_json(res)
#         results.append({
#             "Image_name": img_name,
#             "final_caption": parsed.get("final_caption", ""),
#             "explanation": parsed.get("explanation", ""),
#             "debug_blocks": json.dumps(parsed.get("detected_text_blocks", []), ensure_ascii=False)
#         })
#     else:
#         results.append({
#             "Image_name": img_name,
#             "final_caption": "",
#             "explanation": "Image Load Error or Model Failure",
#             "debug_blocks": "[]"
#         })

# # --- 6. DISPLAY & SAVE ---
# result_df = pd.DataFrame(results)
# pd.set_option('display.max_colwidth', None)
# display(result_df)

# # Save results to output directory so you can download them
# result_df.to_csv('/kaggle/working/submission_test.csv', index=False)
# print("Saved results to /kaggle/working/submission_test.csv")

In [14]:
# --- EXECUTION ---
# Point IMG_DIR / TEST_CSV at the archive extracted under /content/unzipped,
# replacing any values assigned to these names by earlier cells.
# Note that TEST_CSV refers to Train.csv (the training split), despite its name.
IMG_DIR = '/content/unzipped/PoliMemeDecode/Train/Image' # Update if needed
TEST_CSV = '/content/unzipped/PoliMemeDecode/Train/Train.csv'
test_df = pd.read_csv(TEST_CSV)

In [16]:
# --- 5. EXECUTION WITH CHECKPOINTING ---

import os
import pandas as pd
from tqdm import tqdm
import torch
import gc
from google.colab import drive # Import drive

# Load Data (done first so the configuration below does not depend on a
# `test_df` left over from another cell)
test_df = pd.read_csv(TEST_CSV)

# ================= CONFIGURATION =================
START_INDEX = 2300          # First row (inclusive) to process in this run
END_INDEX = len(test_df)    # Row to stop at (exclusive); len(test_df) means "to the end"
SAVE_INTERVAL = 5           # Write a checkpoint CSV every SAVE_INTERVAL processed rows

# --- Google Drive Setup ---
drive.mount('/content/drive')
drive_output_dir = '/content/drive/MyDrive/Colab_Output'
os.makedirs(drive_output_dir, exist_ok=True) # Ensure the directory exists
# =================================================

# Create the specific subset using .iloc
# Ensure END_INDEX doesn't exceed dataframe length
real_end = min(END_INDEX, len(test_df))
subset = test_df.iloc[START_INDEX:real_end]

# Define output filename based on range so you don't overwrite other runs
output_file = os.path.join(drive_output_dir, f'results_{START_INDEX}_to_{real_end}.csv')

print(f"Processing images from index {START_INDEX} to {real_end}...")
print(f"Saving checkpoint every {SAVE_INTERVAL} images to: {output_file}")

results = []

# {lowercased filename: actual filename}, built on the first miss only, so the
# image directory is listed at most once instead of once per missing file.
lowercase_name_map = None

# We use enumerate to keep track of how many we've processed in THIS specific run
for count, (index, row) in enumerate(tqdm(subset.iterrows(), total=subset.shape[0])):

    img_name = str(row['Image_name']).strip()
    full_img_path = os.path.join(IMG_DIR, img_name)

    # --- Path Handling ---
    if not os.path.exists(full_img_path):
        # Case insensitive fallback search
        if lowercase_name_map is None:
            lowercase_name_map = {}
            for f in os.listdir(IMG_DIR):
                # setdefault keeps the first listed name when two differ only by case
                lowercase_name_map.setdefault(f.lower(), f)
        actual_name = lowercase_name_map.get(img_name.lower())
        full_img_path = os.path.join(IMG_DIR, actual_name) if actual_name else None

    # --- Generation ---
    if full_img_path is None:
        detected_text = ["Error: File Not Found"]
    else:
        try:
            res = generate_vlm_response(full_img_path, model, processor)

            if res and not res.startswith("Error"):
                parsed = parse_robust_json(res)
                detected_text = parsed.get("detected_text", ["Error: Parse Failure"])
            else:
                detected_text = [f"FAILED: {res}"]

        except Exception as e:
            # Catch unforeseen errors so loop doesn't break
            detected_text = [f"CRASH: {str(e)}"]

    # Exactly one result row per input row, whatever the outcome.
    results.append({
        "Image_name": img_name,
        "detected_text": detected_text
    })

    # --- CHECKPOINT SAVING ---
    # Save if we hit the interval OR if it's the very last item. This now also runs
    # for rows whose file was missing (they used to `continue` past this step).
    if (count + 1) % SAVE_INTERVAL == 0 or (count + 1) == len(subset):
        temp_df = pd.DataFrame(results)
        temp_df.to_csv(output_file, index=False)

    # --- MEMORY CLEANUP ---
    if count % 10 == 0:
        torch.cuda.empty_cache()
        gc.collect()

print("Processing complete.")
final_df = pd.DataFrame(results)

# Save the final results to Google Drive
final_df.to_csv(output_file, index=False)
print(f"Final results saved to Google Drive at: {output_file}")

# Display the content of the saved CSV
print("Content of the saved CSV:")
display(pd.read_csv(output_file))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processing images from index 2300 to 2860...
Saving checkpoint every 5 images to: /content/drive/MyDrive/Colab_Output/results_2300_to_2860.csv


100%|██████████| 560/560 [3:14:11<00:00, 20.81s/it]

Processing complete.
Final results saved to Google Drive at: /content/drive/MyDrive/Colab_Output/results_2300_to_2860.csv
Content of the saved CSV:





Unnamed: 0,Image_name,detected_text
0,train2332.jpg,"['দোস্ত মুভি দেখতে', 'যাবি কালকে?', 'F FONNE',..."
1,train2333.jpg,"['@Bengali_thug_life', 'তুমি চায়ের চিনি দাও ন..."
2,train2334.jpg,['ঈদুল ফিতরের নামাজের আগে কিছু খাওয়া ও মিষ্টি...
3,train2335.jpg,"['*পরীক্ষার হলে', 'আমার খাতা', 'ফাস্টবেঞ্চারের..."
4,train2336.jpg,"['১০ বছর বয়সী আমি ঘুম শেষে জানতে পারি', 'আমাক..."
...,...,...
555,train2894.jpg,"[""When it's midnight and you are walking on th..."
556,train2895.jpg,"['লোডশেডিংয়ের উপকারীতা কি??', 'FB/FAKIBAJIBD'..."
557,train2896.jpg,"['দাদা যাবি না?', 'তোরা যা.....']"
558,train2898.jpg,"['*ব্রিটিশদের জুতোর অস্তিত্ব আছে*', 'মহান বিপ্..."


In [17]:
# Final step after the long-running job above.
# NOTE(review): presumably this disconnects and releases the Colab runtime so it
# stops consuming compute — confirm against the google.colab documentation. Any
# files not already saved elsewhere (e.g. to Drive) would be lost at this point.
from google.colab import runtime
runtime.unassign()