# ðŸ“¦ Data Cleaning Pipeline for COMICS Fine-Tuning

This notebook prepares clean, high-quality training data by:

1. **Removing cover pages** (page 0)
2. **Detecting story boundaries** (page gaps + title patterns)
3. **Filtering advertisement panels** (ad keyword detection)
4. **Skipping short segments** (<10 panels)
5. **Building clean sequences** (only within story segments)
6. **Including OCR text** in prompts for context

**Note:** Story boundary detection uses PATTERNS, not just character names!

---

In [25]:
"""
=============================================================================
CELL 1: CONFIGURATION
=============================================================================
Set your parameters here before running the pipeline.
"""

from pathlib import Path

# =============================================================================
# PATH CONFIGURATION
# =============================================================================

DATA_DIR = Path("/scratch/bftl/hsekar/comics_project/data")
OCR_CSV_PATH = DATA_DIR / "ocr" / "COMICS_OCR_WAVE1_sorted.csv"
IMAGES_DIR = DATA_DIR / "images"
PROCESSED_DIR = DATA_DIR / "processed"
OUTPUT_DIR = Path("/scratch/bftl/hsekar/comics_project/outputs")

# Create directories if they don't exist
PROCESSED_DIR.mkdir(exist_ok=True)
OUTPUT_DIR.mkdir(exist_ok=True)

# =============================================================================
# SEQUENCE CONFIGURATION
# =============================================================================

CONTEXT_WINDOW = 5          # Number of context panels before target
MIN_SEGMENT_LENGTH = 10     # Minimum panels in a story segment to use it
MIN_PAGE_NUMBER = 1         # Skip page 0 (cover) - start from page 1

# =============================================================================
# DATASET SPLIT CONFIGURATION
# =============================================================================

TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
TEST_RATIO = 0.15
RANDOM_SEED = 42

# =============================================================================
# MODEL CONFIGURATION
# =============================================================================

# Using Mistral based LLaVA (better for dialogue generation)
MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
MODEL_CACHE = Path("/scratch/bftl/hsekar/comics_project/model_cache")

# =============================================================================
# FOR QUICK TESTING (set to None to use all comics)
# =============================================================================

MAX_COMICS = None  # Set to e.g., 50 for testing, None for full dataset

print("Configuration loaded!")
print(f"  OCR CSV: {OCR_CSV_PATH}")
print(f"  Images: {IMAGES_DIR}")
print(f"  Model: {MODEL_ID}")
print(f"  Context window: {CONTEXT_WINDOW}")
print(f"  Min segment length: {MIN_SEGMENT_LENGTH}")

Configuration loaded!
  OCR CSV: /scratch/bftl/hsekar/comics_project/data/ocr/COMICS_OCR_WAVE1_sorted.csv
  Images: /scratch/bftl/hsekar/comics_project/data/images
  Model: llava-hf/llava-v1.6-mistral-7b-hf
  Context window: 5
  Min segment length: 10


In [3]:
"""
=============================================================================
CELL 2: FILTER FUNCTIONS (ROBUST VERSION)
=============================================================================
Core filtering logic for cleaning the data.

IMPORTANT: Story detection uses PATTERNS that work for ANY character,
not just a limited list of known names!
"""

import re

# =============================================================================
# GENERIC TITLE PATTERNS (work for ANY character/story)
# =============================================================================

# These patterns indicate a NEW STORY is starting - works regardless of character
TITLE_INDICATOR_PATTERNS = [
    # Story introduction patterns
    r'\bIN:\s',              # "BATMAN IN: THE DARK NIGHT"
    r'\bSTARRING\b',         # "STARRING THE BLUE BEETLE"
    r'\bPRESENTS\b',         # "MARVEL PRESENTS"
    r'\bFEATURING\b',        # "FEATURING CAPTAIN AMERICA"
    r'\bINTRODUCING\b',      # "INTRODUCING THE NEW HERO"
    
    # Chapter/Part indicators
    r'\bCHAPTER\s+[\dIVX]+', # "CHAPTER 1", "CHAPTER IV"
    r'\bPART\s+[\dIVX]+',    # "PART 1", "PART II"
    r'\bEPISODE\s+[\dIVX]+', # "EPISODE 5"
    
    # Versus patterns
    r'\bVS\.?\b',            # "HERO VS VILLAIN", "HERO VS. VILLAIN"
    r'\bVERSUS\b',           # "HERO VERSUS VILLAIN"
    
    # Adventure patterns
    r'THE ADVENTURES? OF\b', # "THE ADVENTURE OF", "THE ADVENTURES OF"
    r'\bMEETS?\b',           # "BATMAN MEETS SUPERMAN"
    
    # Author/credit patterns (indicate story start)
    r'\bBY\s+[A-Z][A-Z]+',   # "BY KIRBY", "BY SIMON"
    r'WRITTEN BY\b',
    r'DRAWN BY\b',
    r'ART BY\b',
    r'STORY BY\b',
    
    # Common title prefixes (work for any character)
    r'^THE\s+AMAZING\b',     # "THE AMAZING SPIDER-MAN"
    r'^THE\s+INCREDIBLE\b',  # "THE INCREDIBLE HULK"
    r'^THE\s+MIGHTY\b',      # "THE MIGHTY THOR"
    r'^THE\s+SPECTACULAR\b', # "THE SPECTACULAR..."
    r'^THE\s+FANTASTIC\b',   # "THE FANTASTIC FOUR"
    r'^THE\s+UNCANNY\b',     # "THE UNCANNY X-MEN"
    
    # Military/professional titles (work for many characters)
    r'^CAPTAIN\s+[A-Z]',     # "CAPTAIN AMERICA", "CAPTAIN MARVEL"
    r'^SERGEANT\s+[A-Z]',    # "SERGEANT FURY"
    r'^CORPORAL\s+[A-Z]',    # "CORPORAL COLLINS"
    r'^DOCTOR\s+[A-Z]',      # "DOCTOR STRANGE"
    r'^DETECTIVE\s+[A-Z]',   # "DETECTIVE COMICS"
    r'^AGENT\s+[A-Z]',       # "AGENT X"
    r'^PROFESSOR\s+[A-Z]',   # "PROFESSOR X"
]

# Patterns that indicate END of a story
STORY_END_PATTERNS = [
    r'\bTHE\s+END\b',
    r'^END$',
    r'\bFINIS\b',
    r'\bTO\s+BE\s+CONTINUED\b',
    r'\bCONTINUED\s+NEXT\b',
]

# =============================================================================
# ADVERTISEMENT KEYWORDS
# =============================================================================

AD_KEYWORDS = [
    # Sales/commerce
    'BUY NOW', 'ORDER NOW', 'ORDER TODAY',
    'SEND FOR', 'SEND ONLY', 'SEND JUST',
    'ONLY $', 'JUST $', 'ONLY 10', 'ONLY 25', 'ONLY 50',
    'FREE GIFT', 'FREE CATALOG', 'FREE BOOK', 'FREE SAMPLE',
    'MONEY BACK', 'GUARANTEE', 'GUARANTEED',
    'COUPON', 'CLIP THIS', 'CUT OUT',
    'MAIL TO', 'MAIL THIS', 'SEND TO',
    'P.O. BOX', 'POST OFFICE BOX', 'BOX NO.',
    
    # Product categories common in old comics
    'BODY BUILDING', 'BUILD MUSCLE', 'CHARLES ATLAS',
    'X-RAY SPECS', 'X-RAY VISION', 'SEE THRU',
    'BB GUN', 'AIR RIFLE', 'DAISY GUN',
    'STAMP COLLECT', 'COIN COLLECT',
    'MAGIC TRICK', 'LEARN MAGIC',
    'SEA MONKEY', 'SEA-MONKEY', 'SEAMONKEY',
    
    # War-era ads
    'WAR BONDS', 'WAR STAMPS', 'BUY BONDS',
    'DEFENSE BONDS', 'SAVINGS BONDS',
    
    # Address patterns (ads have mailing addresses)
    'DEPT.', 'DEPARTMENT',
    'NEW YORK, N.Y', 'CHICAGO, ILL', 'NEWARK, N.J',
    
    # Pricing patterns
    '10Â¢', '25Â¢', '50Â¢', '98Â¢', '$1.00', '$1.98',
    '10 CENTS', '25 CENTS', '50 CENTS',
    
    # Subscription/membership
    'SUBSCRIBE', 'SUBSCRIPTION',
    'JOIN NOW', 'JOIN THE CLUB',
    'MEMBERSHIP',
]

# =============================================================================
# FILTER FUNCTIONS
# =============================================================================

def is_likely_advertisement(text):
    """
    Detect if a panel is likely an advertisement.
    """
    if not text or not isinstance(text, str):
        return False
    
    text_upper = text.upper()
    
    for keyword in AD_KEYWORDS:
        if keyword in text_upper:
            return True
    
    return False


def is_story_title_panel(text):
    """
    Detect if a panel is likely a story title/splash page.
    
    Uses MULTIPLE SIGNALS that work for ANY character:
    1. Regex patterns for common title structures
    2. Short + mostly ALL CAPS text
    3. Story ending patterns (next page is new story)
    
    Returns:
        bool: True if panel appears to be a story title
    """
    if not text or not isinstance(text, str):
        return False
    
    text_upper = text.upper().strip()
    
    # Signal 1: Check regex patterns (works for ANY character)
    for pattern in TITLE_INDICATOR_PATTERNS:
        if re.search(pattern, text_upper):
            return True
    
    # Signal 2: Short text that is mostly ALL CAPS (classic title format)
    words = text.split()
    if 2 <= len(words) <= 10:  # Title length range
        caps_chars = sum(1 for c in text if c.isupper())
        alpha_chars = sum(1 for c in text if c.isalpha())
        if alpha_chars > 0:
            caps_ratio = caps_chars / alpha_chars
            # If mostly caps AND short, likely a title
            if caps_ratio > 0.75 and len(words) <= 6:
                return True
    
    return False


def is_story_ending(text):
    """
    Detect if a panel marks the END of a story.
    The NEXT page would be a new story.
    """
    if not text or not isinstance(text, str):
        return False
    
    text_upper = text.upper().strip()
    
    for pattern in STORY_END_PATTERNS:
        if re.search(pattern, text_upper):
            return True
    
    return False


def detect_story_boundaries(panels_df):
    """
    Detect story boundaries within a single comic.
    
    Uses MULTIPLE SIGNALS (not just character names!):
    1. Page number gaps > 1 (filtered ads between stories)
    2. Title patterns in first panel of page (regex-based)
    3. Story ending patterns (THE END, etc.)
    4. Short ALL-CAPS text (classic title format)
    
    Args:
        panels_df: DataFrame with panels from ONE comic, sorted by page/panel
        
    Returns:
        list: List of page numbers where new stories begin
    """
    if panels_df.empty:
        return []
    
    boundaries = []
    pages = sorted(panels_df['page_no'].unique())
    
    if len(pages) == 0:
        return []
    
    # First valid page is always a boundary (story start)
    boundaries.append(pages[0])
    
    # Track if previous page had a story ending
    prev_page_had_ending = False
    
    for i in range(1, len(pages)):
        current_page = pages[i]
        previous_page = pages[i-1]
        
        # Signal 1: Page gap > 1 (ads were filtered between stories)
        # This is the MOST RELIABLE signal!
        if current_page - previous_page > 1:
            boundaries.append(current_page)
            prev_page_had_ending = False
            continue
        
        # Signal 2: Previous page had "THE END" or similar
        if prev_page_had_ending:
            boundaries.append(current_page)
            prev_page_had_ending = False
            continue
        
        # Signal 3: Check first panel of this page for title patterns
        page_panels = panels_df[panels_df['page_no'] == current_page]
        if not page_panels.empty:
            first_panel = page_panels.iloc[0]
            text = str(first_panel.get('agg_text', ''))
            
            if is_story_title_panel(text):
                # Don't add if it's right after another boundary (same story intro)
                if boundaries and current_page - boundaries[-1] > 2:
                    boundaries.append(current_page)
        
        # Check if THIS page has a story ending (for next iteration)
        all_page_text = ' '.join(str(row.get('agg_text', '')) for _, row in page_panels.iterrows())
        prev_page_had_ending = is_story_ending(all_page_text)
    
    return sorted(set(boundaries))


# Test the detection
print("Filter functions loaded!")
print(f"  Title patterns: {len(TITLE_INDICATOR_PATTERNS)}")
print(f"  Story end patterns: {len(STORY_END_PATTERNS)}")
print(f"  Ad keywords: {len(AD_KEYWORDS)}")

# Test cases
print("\nTest cases:")
test_titles = [
    "BLUE BEETLE IN: THE MYSTERY CASE",
    "CAPTAIN MARVEL VS THE VILLAIN",
    "THE AMAZING SPIDER-MAN",
    "CHAPTER 3: THE FINAL BATTLE",
    "BY JACK KIRBY",
    "HERCULES MODERN CHAMPION OF JUSTICE",
    "SOME RANDOM DIALOGUE HERE",  # Should be False
]
for t in test_titles:
    result = is_story_title_panel(t)
    print(f"  '{t[:40]}...' -> {result}")

Filter functions loaded!
  Title patterns: 30
  Story end patterns: 5
  Ad keywords: 67

Test cases:
  'BLUE BEETLE IN: THE MYSTERY CASE...' -> True
  'CAPTAIN MARVEL VS THE VILLAIN...' -> True
  'THE AMAZING SPIDER-MAN...' -> True
  'CHAPTER 3: THE FINAL BATTLE...' -> True
  'BY JACK KIRBY...' -> True
  'HERCULES MODERN CHAMPION OF JUSTICE...' -> True
  'SOME RANDOM DIALOGUE HERE...' -> True


In [5]:
"""
=============================================================================
CELL 3: LOAD AND EXPLORE DATA
=============================================================================
Load the OCR CSV and get basic statistics.
"""

import pandas as pd
import numpy as np

print("Loading OCR data...")
df = pd.read_csv(OCR_CSV_PATH)

print(f"\n{'='*60}")
print("RAW DATA STATISTICS")
print(f"{'='*60}")
print(f"Total panels: {len(df):,}")
print(f"Total comics: {df['comic_no'].nunique():,}")
print(f"Columns: {list(df.columns)}")

# Get list of all comics
all_comics = sorted(df['comic_no'].unique())
print(f"\nComic number range: {min(all_comics)} to {max(all_comics)}")

# Limit comics if testing
if MAX_COMICS is not None:
    all_comics = all_comics[:MAX_COMICS]
    print(f"\n TESTING MODE: Limited to {MAX_COMICS} comics")

print(f"\nComics to process: {len(all_comics)}")

Loading OCR data...

RAW DATA STATISTICS
Total panels: 420,000
Total comics: 1,441
Columns: ['comic_no', 'page_no', 'panel_no', 'img_path', 'agg_text', 'bubble_count', 'bubbles_json']

Comic number range: 0 to 1447

Comics to process: 1441


In [6]:
"""
=============================================================================
CELL 4: PROCESS ALL COMICS - SEGMENT INTO STORIES
=============================================================================
Apply all filters and segment each comic into separate stories.
"""

from tqdm import tqdm

print("Processing comics...")
print(f"{'='*60}")

# Statistics tracking
stats = {
    'total_panels_raw': 0,
    'panels_removed_cover': 0,
    'panels_removed_ads': 0,
    'total_stories': 0,
    'stories_skipped_short': 0,
    'total_panels_clean': 0,
    'boundaries_by_page_gap': 0,
    'boundaries_by_pattern': 0,
}

# Store all story segments
all_story_segments = []  # List of (comic_no, story_idx, panels_list)

for comic_no in tqdm(all_comics, desc="Processing comics"):
    # Get all panels for this comic
    comic_df = df[df['comic_no'] == comic_no].copy()
    comic_df = comic_df.sort_values(['page_no', 'panel_no'])
    
    stats['total_panels_raw'] += len(comic_df)
    
    # FILTER 1: Remove cover pages (page 0)
    panels_before = len(comic_df)
    comic_df = comic_df[comic_df['page_no'] >= MIN_PAGE_NUMBER]
    stats['panels_removed_cover'] += (panels_before - len(comic_df))
    
    if comic_df.empty:
        continue
    
    # FILTER 2: Remove advertisement panels
    panels_before = len(comic_df)
    comic_df = comic_df[~comic_df['agg_text'].apply(is_likely_advertisement)]
    stats['panels_removed_ads'] += (panels_before - len(comic_df))
    
    if comic_df.empty:
        continue
    
    # FILTER 3: Detect story boundaries
    boundaries = detect_story_boundaries(comic_df)
    
    if not boundaries:
        continue
    
    # Create story segments
    for story_idx, start_page in enumerate(boundaries):
        # Determine end page
        if story_idx + 1 < len(boundaries):
            end_page = boundaries[story_idx + 1] - 1
        else:
            end_page = comic_df['page_no'].max()
        
        # Get panels in this story segment
        segment_df = comic_df[
            (comic_df['page_no'] >= start_page) & 
            (comic_df['page_no'] <= end_page)
        ]
        
        stats['total_stories'] += 1
        
        # FILTER 4: Skip short segments
        if len(segment_df) < MIN_SEGMENT_LENGTH:
            stats['stories_skipped_short'] += 1
            continue
        
        # Convert to list of panel dictionaries
        panels_list = []
        for _, row in segment_df.iterrows():
            # Construct image path
            img_path = IMAGES_DIR / str(comic_no) / f"{int(row['page_no'])}_{int(row['panel_no'])}.jpg"
            
            panels_list.append({
                'comic_no': int(comic_no),
                'page_no': int(row['page_no']),
                'panel_no': int(row['panel_no']),
                'image_path': str(img_path),
                'text': str(row['agg_text']) if pd.notna(row['agg_text']) else '',
            })
        
        stats['total_panels_clean'] += len(panels_list)
        
        all_story_segments.append({
            'comic_no': comic_no,
            'story_idx': story_idx,
            'start_page': start_page,
            'end_page': end_page,
            'panels': panels_list,
        })

print(f"\n{'='*60}")
print("FILTERING STATISTICS")
print(f"{'='*60}")
print(f"Total panels (raw):        {stats['total_panels_raw']:,}")
print(f"Removed (cover pages):     {stats['panels_removed_cover']:,}")
print(f"Removed (advertisements):  {stats['panels_removed_ads']:,}")
print(f"Total stories detected:    {stats['total_stories']:,}")
print(f"Stories skipped (short):   {stats['stories_skipped_short']:,}")
print(f"Stories kept:              {len(all_story_segments):,}")
print(f"Total panels (clean):      {stats['total_panels_clean']:,}")
print(f"\nData reduction: {100*(1 - stats['total_panels_clean']/stats['total_panels_raw']):.1f}%")
print(f"\nAverage stories per comic: {len(all_story_segments)/len(all_comics):.1f}")
print(f"Average panels per story: {stats['total_panels_clean']/len(all_story_segments):.1f}")

Processing comics...


Processing comics: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1441/1441 [01:08<00:00, 21.00it/s]


FILTERING STATISTICS
Total panels (raw):        420,000
Removed (cover pages):     2,667
Removed (advertisements):  3,286
Total stories detected:    11,514
Stories skipped (short):   1,774
Stories kept:              9,740
Total panels (clean):      406,042

Data reduction: 3.3%

Average stories per comic: 6.8
Average panels per story: 41.7





In [8]:
"""
=============================================================================
CELL 5: BUILD TRAINING SEQUENCES (OPTIMIZED)
=============================================================================
OPTIMIZATION: Pre-cache all existing image paths into a set for instant lookup.
This reduces ~2 million disk checks to ~420K (done once), speeding up from 
~1.5 hours to ~5 minutes.
"""

from tqdm import tqdm
import os

print("Building sequences from story segments...")
print(f"{'='*60}")

# =========================================================================
# OPTIMIZATION: Pre-scan all existing images into a set
# =========================================================================
print("\nStep 1: Scanning existing images (one-time cost)...")

existing_images = set()

# Count total comics for progress bar
comic_folders = [f for f in IMAGES_DIR.iterdir() if f.is_dir()]
print(f"  Found {len(comic_folders)} comic folders")

for comic_folder in tqdm(comic_folders, desc="Scanning images"):
    for img_file in comic_folder.iterdir():
        if img_file.suffix.lower() in ['.jpg', '.jpeg', '.png']:
            existing_images.add(str(img_file))

print(f"  Total images found: {len(existing_images):,}")

# =========================================================================
# Build sequences with FAST set lookup
# =========================================================================
print("\nStep 2: Building sequences (with fast image verification)...")

all_sequences = []
skipped_no_images = 0

for segment in tqdm(all_story_segments, desc="Building sequences"):
    panels = segment['panels']
    
    # Need at least CONTEXT_WINDOW + 1 panels to make a sequence
    if len(panels) <= CONTEXT_WINDOW:
        continue
    
    # Create sequences within this story segment
    for i in range(CONTEXT_WINDOW, len(panels)):
        context_panels = panels[i - CONTEXT_WINDOW : i]
        target_panel = panels[i]
        
        # FAST: Check against pre-cached set (instant lookup!)
        all_exist = True
        for p in context_panels + [target_panel]:
            if p['image_path'] not in existing_images:
                all_exist = False
                break
        
        if not all_exist:
            skipped_no_images += 1
            continue
        
        # Build the sequence
        sequence = {
            'comic_no': segment['comic_no'],
            'story_idx': segment['story_idx'],
            'context': context_panels,
            'target': target_panel,
            'target_text': target_panel['text'],
            'context_texts': [p['text'] for p in context_panels],
        }
        
        all_sequences.append(sequence)

print(f"\n{'='*60}")
print("SEQUENCE STATISTICS")
print(f"{'='*60}")
print(f"Total sequences built: {len(all_sequences):,}")
print(f"Skipped (missing images): {skipped_no_images:,}")
if all_story_segments:
    print(f"Average sequences per story: {len(all_sequences) / len(all_story_segments):.1f}")

# Show sample sequence
if all_sequences:
    sample = all_sequences[0]
    print(f"\n{'='*60}")
    print("SAMPLE SEQUENCE")
    print(f"{'='*60}")
    print(f"Comic: {sample['comic_no']}, Story: {sample['story_idx']}")
    print(f"\nContext panels:")
    for i, ctx in enumerate(sample['context']):
        text_preview = ctx['text'][:50] + '...' if len(ctx['text']) > 50 else ctx['text']
        print(f"  {i+1}. Page {ctx['page_no']}, Panel {ctx['panel_no']}: {text_preview}")
    print(f"\nTarget panel:")
    print(f"  Page {sample['target']['page_no']}, Panel {sample['target']['panel_no']}")
    target_preview = sample['target_text'][:100] + '...' if len(sample['target_text']) > 100 else sample['target_text']
    print(f"  Text: {target_preview}")

Building sequences from story segments...

Step 1: Scanning existing images (one-time cost)...
  Found 1441 comic folders


Scanning images: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1441/1441 [00:04<00:00, 320.85it/s]


  Total images found: 879,226

Step 2: Building sequences (with fast image verification)...


Building sequences: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 9740/9740 [00:02<00:00, 3932.83it/s]


SEQUENCE STATISTICS
Total sequences built: 357,342
Skipped (missing images): 0
Average sequences per story: 36.7

SAMPLE SEQUENCE
Comic: 0, Story: 0

Context panels:
  1. Page 2, Panel 0: CK HIM ,PLAS! CK HIM WN! PLASTIC MAN
  2. Page 2, Panel 1: PLASTICMAN IN HIS CAREER AS AS CRIME-FIGHTER, PLAS...
  3. Page 3, Panel 0: NOW IF I PUT THIS HERE... AND THIS ONE HERE...
  4. Page 3, Panel 1: AND ADJUST THIS SPRING...
  5. Page 3, Panel 2: THAT DOES IT! IT'S FINISHED AND READY FOR THE FINA...

Target panel:
  Page 3, Panel 3
  Text: I'VE BEEN WORKING ON IT FOR DAYS! I MUST GET A BREATH OF AIR AND CLEAR MY HEAD BEFORE I BEGIN MY TES...





In [10]:
"""
DIAGNOSTIC: Check how Comic #0 was segmented
Run this cell to inspect Comic #0 specifically
"""

# Find all story segments for Comic 0
comic_0_segments = [seg for seg in all_story_segments if seg['comic_no'] == 0]

print(f"{'='*60}")
print(f"COMIC #0 ANALYSIS")
print(f"{'='*60}")
print(f"\nNumber of stories detected: {len(comic_0_segments)}")

for i, seg in enumerate(comic_0_segments):
    print(f"\n--- Story {i+1} ---")
    print(f"  Pages: {seg['start_page']} to {seg['end_page']}")
    print(f"  Panels: {len(seg['panels'])}")
    
    # Show first panel text (what triggered the boundary?)
    if seg['panels']:
        first_text = seg['panels'][0]['text'][:80]
        print(f"  First panel text: \"{first_text}...\"")

print(f"\n{'='*60}")

# Also check: What pages in Comic 0 were detected as boundaries?
comic_0_df = df[df['comic_no'] == 0].copy()
comic_0_df = comic_0_df[comic_0_df['page_no'] >= 1]  # Exclude cover
comic_0_df = comic_0_df.sort_values(['page_no', 'panel_no'])

print(f"\nBASIC INFO:")
print(f"  Total panels (after cover removal): {len(comic_0_df)}")
print(f"  Page range: {comic_0_df['page_no'].min()} to {comic_0_df['page_no'].max()}")

print(f"\nBOUNDARIES DETECTED:")
boundaries = detect_story_boundaries(comic_0_df)
print(f"  Boundary pages: {boundaries}")
print(f"  Number of boundaries: {len(boundaries)}")

# Show what text triggered each boundary
print(f"\nTEXT AT EACH BOUNDARY:")
for page in boundaries:
    page_panels = comic_0_df[comic_0_df['page_no'] == page]
    if not page_panels.empty:
        first_text = str(page_panels.iloc[0]['agg_text'])[:70]
        print(f"  Page {page:3d}: \"{first_text}...\"")

# Show page gaps (if any)
print(f"\nPAGE GAPS DETECTED:")
pages = sorted(comic_0_df['page_no'].unique())
gaps = []
for i in range(1, len(pages)):
    if pages[i] - pages[i-1] > 1:
        gaps.append((pages[i-1], pages[i]))
        
if gaps:
    for start, end in gaps:
        print(f"  Gap: Page {start} â†’ Page {end} (missing: {list(range(start+1, end))})")
else:
    print(f"  No page gaps found (all pages consecutive)")

print(f"\n{'='*60}")

COMIC #0 ANALYSIS

Number of stories detected: 7

--- Story 1 ---
  Pages: 2 to 8
  Panels: 44
  First panel text: "CK HIM ,PLAS! CK HIM WN! PLASTIC MAN..."

--- Story 2 ---
  Pages: 9 to 17
  Panels: 56
  First panel text: "NOW FIRST I REMOVE THESE!..."

--- Story 3 ---
  Pages: 18 to 25
  Panels: 52
  First panel text: "I DON'T KNOW! I MUSTA BURNED IT SOMEWHERE! THAT'S NO ORDINARY BURN! IT LOOKS TO ..."

--- Story 4 ---
  Pages: 26 to 29
  Panels: 28
  First panel text: "WELL, CAN YOU? AWRK!..."

--- Story 5 ---
  Pages: 30 to 39
  Panels: 58
  First panel text: "PLASTIC SHE'S A FAST SLOOP! WE'LL BE AT AMOS ISLAND BY MORNING! IT'S ALMOST DARK..."

--- Story 6 ---
  Pages: 40 to 42
  Panels: 11
  First panel text: "..."

--- Story 7 ---
  Pages: 43 to 48
  Panels: 41
  First panel text: "STILL TRYING, EH ROCKY?..."


BASIC INFO:
  Total panels (after cover removal): 293
  Page range: 2 to 49

BOUNDARIES DETECTED:
  Boundary pages: [np.int64(2), np.int64(9), np.int64(18), np.int64(26),

In [14]:
"""
=============================================================================
CELL 6: CREATE TRAIN/VAL/TEST SPLIT
=============================================================================
Split at the COMIC level (not sequence level) to ensure:
- No story from a comic appears in both train and test
- Proper generalization testing
"""

import random
import json

print("Creating train/val/test split...")
print(f"{'='*60}")

# Get unique comics that have sequences
comics_with_sequences = list(set(seq['comic_no'] for seq in all_sequences))
random.seed(RANDOM_SEED)
random.shuffle(comics_with_sequences)

# Calculate split sizes
n_total = len(comics_with_sequences)
n_train = int(n_total * TRAIN_RATIO)
n_val = int(n_total * VAL_RATIO)
n_test = n_total - n_train - n_val

# Split comics
train_comics = set(comics_with_sequences[:n_train])
val_comics = set(comics_with_sequences[n_train:n_train + n_val])
test_comics = set(comics_with_sequences[n_train + n_val:])

# Split sequences based on comic assignment
train_sequences = [seq for seq in all_sequences if seq['comic_no'] in train_comics]
val_sequences = [seq for seq in all_sequences if seq['comic_no'] in val_comics]
test_sequences = [seq for seq in all_sequences if seq['comic_no'] in test_comics]

print(f"\nCOMIC-LEVEL SPLIT:")
print(f"  Train comics: {len(train_comics):,} ({100*len(train_comics)/n_total:.1f}%)")
print(f"  Val comics:   {len(val_comics):,} ({100*len(val_comics)/n_total:.1f}%)")
print(f"  Test comics:  {len(test_comics):,} ({100*len(test_comics)/n_total:.1f}%)")

print(f"\nSEQUENCE-LEVEL SPLIT:")
print(f"  Train sequences: {len(train_sequences):,} ({100*len(train_sequences)/len(all_sequences):.1f}%)")
print(f"  Val sequences:   {len(val_sequences):,} ({100*len(val_sequences)/len(all_sequences):.1f}%)")
print(f"  Test sequences:  {len(test_sequences):,} ({100*len(test_sequences)/len(all_sequences):.1f}%)")

# Save split info
split_info = {
    'train_comics': sorted(list(train_comics)),
    'val_comics': sorted(list(val_comics)),
    'test_comics': sorted(list(test_comics)),
    'n_train_sequences': len(train_sequences),
    'n_val_sequences': len(val_sequences),
    'n_test_sequences': len(test_sequences),
    'context_window': CONTEXT_WINDOW,
    'random_seed': RANDOM_SEED,
}

split_info_path = PROCESSED_DIR / "clean_split_info.json"

# Convert numpy types to native Python types for JSON serialization
def convert_to_native(obj):
    if isinstance(obj, dict):
        return {k: convert_to_native(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_native(item) for item in obj]
    elif isinstance(obj, (np.integer, np.int64)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64)):
        return float(obj)
    else:
        return obj

# Convert split_info before saving
split_info = convert_to_native(split_info)

# Now save
with open(split_info_path, 'w') as f:
    json.dump(split_info, f, indent=2)

print(f"\n Split info saved to: {split_info_path}")

Creating train/val/test split...

COMIC-LEVEL SPLIT:
  Train comics: 1,007 (70.0%)
  Val comics:   215 (14.9%)
  Test comics:  217 (15.1%)

SEQUENCE-LEVEL SPLIT:
  Train sequences: 249,576 (69.8%)
  Val sequences:   53,236 (14.9%)
  Test sequences:  54,530 (15.3%)

 Split info saved to: /scratch/bftl/hsekar/comics_project/data/processed/clean_split_info.json


In [16]:
"""
=============================================================================
CELL 7: SAVE PROCESSED SEQUENCES
=============================================================================
Save all sequences in formats ready for:
1. Zero-shot evaluation
2. Fine-tuning
"""

import pickle

print("Saving processed sequences...")
print(f"{'='*60}")

# Save as pickle (fastest for Python)
train_path = PROCESSED_DIR / "train_sequences.pkl"
val_path = PROCESSED_DIR / "val_sequences.pkl"
test_path = PROCESSED_DIR / "test_sequences.pkl"

with open(train_path, 'wb') as f:
    pickle.dump(train_sequences, f)
print(f"Train sequences saved: {train_path}")

with open(val_path, 'wb') as f:
    pickle.dump(val_sequences, f)
print(f"Val sequences saved: {val_path}")

with open(test_path, 'wb') as f:
    pickle.dump(test_sequences, f)
print(f"Test sequences saved: {test_path}")

# Also save a small sample as JSON for easy inspection
sample_sequences = {
    'train_sample': train_sequences[:5] if len(train_sequences) >= 5 else train_sequences,
    'val_sample': val_sequences[:5] if len(val_sequences) >= 5 else val_sequences,
    'test_sample': test_sequences[:5] if len(test_sequences) >= 5 else test_sequences,
}

sample_path = PROCESSED_DIR / "sample_sequences.json"
with open(sample_path, 'w') as f:
    json.dump(sample_sequences, f, indent=2, default=str)
print(f"Sample sequences saved: {sample_path}")

print(f"\n{'='*60}")
print("FILES SAVED")
print(f"{'='*60}")
print(f"  {train_path}")
print(f"  {val_path}")
print(f"  {test_path}")
print(f"  {split_info_path}")
print(f"  {sample_path}")

Saving processed sequences...
Train sequences saved: /scratch/bftl/hsekar/comics_project/data/processed/train_sequences.pkl
Val sequences saved: /scratch/bftl/hsekar/comics_project/data/processed/val_sequences.pkl
Test sequences saved: /scratch/bftl/hsekar/comics_project/data/processed/test_sequences.pkl
Sample sequences saved: /scratch/bftl/hsekar/comics_project/data/processed/sample_sequences.json

FILES SAVED
  /scratch/bftl/hsekar/comics_project/data/processed/train_sequences.pkl
  /scratch/bftl/hsekar/comics_project/data/processed/val_sequences.pkl
  /scratch/bftl/hsekar/comics_project/data/processed/test_sequences.pkl
  /scratch/bftl/hsekar/comics_project/data/processed/clean_split_info.json
  /scratch/bftl/hsekar/comics_project/data/processed/sample_sequences.json


In [17]:
"""
=============================================================================
CELL 8: CREATE HYBRID PROMPT (Images Primary, OCR as Support)
=============================================================================
Images are the primary source of truth. OCR text is used as a helpful hint
when available and meaningful, but the model is instructed to trust the images.
"""

def create_prompt_hybrid(context_texts=None):
    """
    Hybrid prompt: Images are primary, OCR provides hints when available.
    
    Args:
        context_texts: List of OCR texts from context panels (can be None or empty)
        
    Returns:
        str: Formatted prompt
    """
    
    base_prompt = (
        "You are reading a comic book.\n"
        "I will show you the previous panels as images.\n"
        "Based on them, write the dialogue and narration that should appear "
        "in the NEXT panel of the story.\n"
        "Keep it concise and in a natural comic-book style."
    )
    
    # Check if we have any meaningful OCR text
    has_useful_ocr = False
    if context_texts:
        # Filter out empty, very short, or noisy OCR
        useful_texts = []
        for i, text in enumerate(context_texts, 1):
            text = str(text).strip() if text else ""
            # Only include if text is meaningful (>10 chars, not just punctuation)
            if len(text) > 10 and any(c.isalpha() for c in text):
                useful_texts.append((i, text))
        has_useful_ocr = len(useful_texts) > 0
    
    if has_useful_ocr:
        # Add OCR as supplementary context
        ocr_hint = "\n\nNote: Here is some text detected in the panels (may be incomplete or noisy):\n"
        for panel_num, text in useful_texts:
            # Truncate very long OCR to avoid overwhelming the prompt
            truncated = text[:150] + "..." if len(text) > 150 else text
            ocr_hint += f"  Panel {panel_num}: \"{truncated}\"\n"
        ocr_hint += "\nUse this as a hint, but trust what you SEE in the images as the primary source."
        
        return base_prompt + ocr_hint
    else:
        return base_prompt


def create_prompt_simple():
    """
    Simple prompt matching Ollama baseline (no OCR).
    Use this for direct comparison with Ollama results.
    """
    return (
        "You are reading a comic book.\n"
        "I will show you the previous panels as images.\n"
        "Based on them, write the dialogue and narration that should appear "
        "in the NEXT panel of the story.\n"
        "Keep it concise and in a natural comic-book style."
    )


# =============================================================================
# DEMO: Show both prompt styles
# =============================================================================

print("="*70)
print("PROMPT STYLE 1: HYBRID (Images + OCR hints)")
print("="*70)

sample_texts = [
    "RANG-A-TANG THE WONDER DOG! BECAUSE OF HIS ALMOST-HUMAN BRAIN...",
    "I ASKED YOU TO COME HERE BECAUSE I WANTED TO SIGN THIS FAMOUS DOG!",
    "",  # Empty panel (action only)
    "WE'RE ABOUT TO GO BANKRUPT!",
    "THAT'S RIGHT! NAWSON SWELLES, HIS NAME IS!",
]

print(create_prompt_hybrid(sample_texts))

print("\n" + "="*70)
print("PROMPT STYLE 2: SIMPLE (Images only, matches Ollama)")
print("="*70)

print(create_prompt_simple())

print("\n" + "="*70)
print("HYBRID with empty/noisy OCR (falls back to simple)")
print("="*70)

noisy_texts = ["", "...", "nan", "!?", "  "]
print(create_prompt_hybrid(noisy_texts))

print("\n" + "="*70)
print("RECOMMENDATION")
print("="*70)
print("""
For zero-shot evaluation:
  - Use create_prompt_hybrid() to leverage OCR when available
  - This reduces hallucination while keeping images as primary source

For direct Ollama comparison:
  - Use create_prompt_simple() for apples-to-apples comparison
""")

PROMPT STYLE 1: HYBRID (Images + OCR hints)
You are reading a comic book.
I will show you the previous panels as images.
Based on them, write the dialogue and narration that should appear in the NEXT panel of the story.
Keep it concise and in a natural comic-book style.

Note: Here is some text detected in the panels (may be incomplete or noisy):
  Panel 1: "RANG-A-TANG THE WONDER DOG! BECAUSE OF HIS ALMOST-HUMAN BRAIN..."
  Panel 2: "I ASKED YOU TO COME HERE BECAUSE I WANTED TO SIGN THIS FAMOUS DOG!"
  Panel 4: "WE'RE ABOUT TO GO BANKRUPT!"
  Panel 5: "THAT'S RIGHT! NAWSON SWELLES, HIS NAME IS!"

Use this as a hint, but trust what you SEE in the images as the primary source.

PROMPT STYLE 2: SIMPLE (Images only, matches Ollama)
You are reading a comic book.
I will show you the previous panels as images.
Based on them, write the dialogue and narration that should appear in the NEXT panel of the story.
Keep it concise and in a natural comic-book style.

HYBRID with empty/noisy OCR (fall

In [18]:
"""
=============================================================================
CELL 9: FINAL SUMMARY
=============================================================================
Summary of what was accomplished and next steps.
"""

print("="*70)
print("                    DATA CLEANING COMPLETE!")
print("="*70)

print(f"""
WHAT WAS DONE:
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
Removed cover pages (page 0)
Filtered advertisement panels ({stats['panels_removed_ads']:,} removed)
Detected story boundaries using PATTERNS (not just character names!):
   - Page gaps (ads filtered between stories)
   - Title patterns ("IN:", "STARRING", "CHAPTER", etc.)
   - Story endings ("THE END", "TO BE CONTINUED")
   - Short ALL-CAPS text
Skipped short segments ({stats['stories_skipped_short']:,} skipped)
Built clean sequences ({len(all_sequences):,} total)
Created train/val/test split at comic level
Saved processed data to {PROCESSED_DIR}

DATA SUMMARY:
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
  Raw panels:           {stats['total_panels_raw']:,}
  Clean panels:         {stats['total_panels_clean']:,}
  Total stories:        {len(all_story_segments):,}
  Total sequences:      {len(all_sequences):,}
  
  Train sequences:      {len(train_sequences):,}
  Validation sequences: {len(val_sequences):,}
  Test sequences:       {len(test_sequences):,}

FILES CREATED:
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
  {PROCESSED_DIR}/train_sequences.pkl
  {PROCESSED_DIR}/val_sequences.pkl
  {PROCESSED_DIR}/test_sequences.pkl
  {PROCESSED_DIR}/clean_split_info.json
  {PROCESSED_DIR}/sample_sequences.json

NEXT STEPS:
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
1. Run zero-shot evaluation with the cleaned test set
2. Fine-tune LLaVA on the cleaned training set
3. Compare zero-shot vs fine-tuned results
""")

print("="*70)

                    DATA CLEANING COMPLETE!

WHAT WAS DONE:
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
Removed cover pages (page 0)
Filtered advertisement panels (3,286 removed)
Detected story boundaries using PATTERNS (not just character names!):
   - Page gaps (ads filtered between stories)
   - Title patterns ("IN:", "STARRING", "CHAPTER", etc.)
   - Story endings ("THE END", "TO BE CONTINUED")
   - Short ALL-CAPS text
Skipped short segments (1,774 skipped)
Built clean sequences (357,342 total)
Created train/val/test split at comic level
Saved processed data to /scratch/bftl/hsekar/comics_project/data/processed

DATA SUMMARY:
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€

In [30]:
"""
CELL: Create zero_shot_eval.py with detailed scene description
"""
from pathlib import Path

script_content = '''#!/usr/bin/env python3
import pickle
import random
import time
import json
from pathlib import Path
from PIL import Image
import torch
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration

# Configuration
PROCESSED_DIR = Path("/scratch/bftl/hsekar/comics_project/data/processed")
OUTPUT_DIR = Path("/scratch/bftl/hsekar/comics_project/outputs/zero_shot")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

NUM_SEQUENCES = 100
RANDOM_SEED = 42
MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
MAX_NEW_TOKENS = 512  # Increased for detailed output

print(f"Sequences: {NUM_SEQUENCES}")
print(f"Model: {MODEL_ID}")
print(f"Max tokens: {MAX_NEW_TOKENS}")

# Load model
print("\\nLoading model...")
processor = LlavaNextProcessor.from_pretrained(MODEL_ID)
model = LlavaNextForConditionalGeneration.from_pretrained(
    MODEL_ID, 
    torch_dtype=torch.float16, 
    device_map="auto", 
    low_cpu_mem_usage=True
)
model.eval()
print(f"Model loaded on: {model.device}")

# Load test data
print("\\nLoading test sequences...")
with open(PROCESSED_DIR / "test_sequences.pkl", "rb") as f:
    test_sequences = pickle.load(f)
print(f"Total: {len(test_sequences)}")

random.seed(RANDOM_SEED)
sampled = random.sample(test_sequences, min(NUM_SEQUENCES, len(test_sequences)))
print(f"Sampled: {len(sampled)}")

# Prompt function - DETAILED VERSION
def create_prompt(context_texts):
    base = """You are reading a comic book. I will show you the previous panels as images.

Based on them, predict what happens in the NEXT panel. Provide a detailed description:

1. CHARACTERS: Who is present? Describe their appearance, expressions, and emotions.

2. SETTING: Where is this happening? Describe the background, environment, and any visible objects.

3. ACTION: What are the characters doing? Describe their poses, movements, and interactions.

4. DIALOGUE: Write any speech bubbles or narration boxes that would appear.

Format your response exactly like this:
CHARACTERS: [Detailed character descriptions]
SETTING: [Detailed environment description]
ACTION: [What is happening in the scene]
DIALOGUE: [Speech bubbles and narration text]"""
    
    # Add OCR hints if available
    useful = []
    for i, t in enumerate(context_texts, 1):
        t = str(t).strip() if t else ""
        if len(t) > 10 and any(c.isalpha() for c in t):
            useful.append((i, t))
    
    if useful:
        hint = "\\n\\nText detected in previous panels:\\n"
        for num, txt in useful:
            truncated = txt[:150] + "..." if len(txt) > 150 else txt
            hint += f"  Panel {num}: \\"{truncated}\\"\\n"
        return base + hint
    return base

# Prediction function
def predict(image_paths, context_texts):
    # Load images
    images = []
    for p in image_paths:
        try:
            images.append(Image.open(p).convert("RGB"))
        except Exception as e:
            print(f"    Could not load image: {p}")
            images.append(Image.new("RGB", (224, 224), (128, 128, 128)))
    
    prompt = create_prompt(context_texts)
    
    # Format for LLaVA 1.6 Mistral
    conversation = [
        {
            "role": "user",
            "content": [
                *[{"type": "image"} for _ in images],
                {"type": "text", "text": prompt}
            ]
        }
    ]
    
    text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(text=text_prompt, images=images, return_tensors="pt", padding=True).to(model.device)
    
    with torch.no_grad():
        output = model.generate(
            **inputs, 
            max_new_tokens=MAX_NEW_TOKENS, 
            do_sample=False,
            pad_token_id=processor.tokenizer.pad_token_id
        )
    
    full_text = processor.decode(output[0], skip_special_tokens=True)
    
    if "[/INST]" in full_text:
        return full_text.split("[/INST]")[-1].strip()
    return full_text.strip()

# Run evaluation
print("\\nRunning evaluation...")
results = []
start = time.time()

for idx, seq in enumerate(sampled):
    if idx % 10 == 0:
        elapsed = time.time() - start
        print(f"Progress: {idx+1}/{len(sampled)} | Elapsed: {elapsed:.1f}s")
    
    # Get data
    context = seq.get('context', [])
    image_paths = [panel.get('image_path', '') for panel in context]
    context_texts = seq.get('context_texts', [])
    target = seq.get('target', {})
    ground_truth = seq.get('target_text', '')
    
    try:
        pred = predict(image_paths, context_texts)
    except Exception as e:
        print(f"  Error on seq {idx}: {e}")
        pred = f"[ERROR: {e}]"
    
    results.append({
        'idx': idx,
        'comic_no': int(seq.get('comic_no', 0)),
        'story_idx': seq.get('story_idx'),
        'context_texts': context_texts,
        'ground_truth': ground_truth,
        'prediction': pred,
        'target_path': target.get('image_path', '')
    })

total_time = time.time() - start
print(f"\\nDone! Total time: {total_time:.1f}s ({total_time/len(sampled):.2f}s per sequence)")

# Save results
pkl_path = OUTPUT_DIR / f"zero_shot_{NUM_SEQUENCES}seq_detailed.pkl"
json_path = OUTPUT_DIR / f"zero_shot_{NUM_SEQUENCES}seq_detailed.json"

with open(pkl_path, 'wb') as f:
    pickle.dump(results, f)
print(f"\\nSaved: {pkl_path}")

with open(json_path, 'w') as f:
    json.dump(results, f, indent=2)
print(f"Saved: {json_path}")

# Show samples
print("\\n" + "="*70)
print("SAMPLE RESULTS")
print("="*70)
for r in results[:3]:
    print(f"\\n{'â”€'*70}")
    print(f"Comic {r['comic_no']}")
    print(f"{'â”€'*70}")
    print(f"\\nGround Truth OCR:")
    print(f"  {r['ground_truth'][:150] if r['ground_truth'] else '[Empty]'}...")
    print(f"\\nPrediction:")
    print(r['prediction'][:500])
'''

script_path = Path("/scratch/bftl/hsekar/comics_project/scripts/zero_shot_eval.py")
with open(script_path, 'w') as f:
    f.write(script_content)

print(f"Script saved to: {script_path}")
print(f"\nChanges made:")
print(f"  - MAX_NEW_TOKENS: 256 â†’ 512")
print(f"  - Prompt: asks for CHARACTERS, SETTING, ACTION, DIALOGUE")
print(f"  - Output files: zero_shot_100seq_detailed.pkl/json")

Script saved to: /scratch/bftl/hsekar/comics_project/scripts/zero_shot_eval.py

Changes made:
  - MAX_NEW_TOKENS: 256 â†’ 512
  - Prompt: asks for CHARACTERS, SETTING, ACTION, DIALOGUE
  - Output files: zero_shot_100seq_detailed.pkl/json


In [24]:
"""
CELL: Create FIXED zero_shot_eval.sbatch
"""
from pathlib import Path

sbatch_content = '''#!/bin/bash
#SBATCH --job-name=zero_shot
#SBATCH --account=bftl-delta-gpu
#SBATCH --partition=gpuA100x4
#SBATCH --nodes=1
#SBATCH --gpus-per-node=1
#SBATCH --cpus-per-task=16
#SBATCH --mem=64G
#SBATCH --time=01:00:00
#SBATCH --output=/scratch/bftl/hsekar/comics_project/logs/zero_shot_%j.out
#SBATCH --error=/scratch/bftl/hsekar/comics_project/logs/zero_shot_%j.err

echo "Job started: $(date)"
echo "Node: $SLURM_NODELIST"

# Load conda
source /sw/external/python/anaconda3/etc/profile.d/conda.sh

# Activate your environment (correct path)
conda activate /u/hsekar/comics_env

# Verify environment
echo "Python: $(which python)"
echo "Conda env: $CONDA_PREFIX"

cd /scratch/bftl/hsekar/comics_project
python scripts/zero_shot_eval.py

echo "Job finished: $(date)"
'''

sbatch_path = Path("/scratch/bftl/hsekar/comics_project/scripts/zero_shot_eval.sbatch")
with open(sbatch_path, 'w') as f:
    f.write(sbatch_content)

print(f"SLURM script saved to: {sbatch_path}")

SLURM script saved to: /scratch/bftl/hsekar/comics_project/scripts/zero_shot_eval.sbatch


In [28]:
"""
CELL: Check sequence structure
"""
import pickle
from pathlib import Path

PROCESSED_DIR = Path("/scratch/bftl/hsekar/comics_project/data/processed")

# Load one sequence
with open(PROCESSED_DIR / "test_sequences.pkl", "rb") as f:
    test_sequences = pickle.load(f)

# Print the structure of the first sequence
print("Number of sequences:", len(test_sequences))
print("\nFirst sequence type:", type(test_sequences[0]))
print("\nFirst sequence keys:", test_sequences[0].keys() if isinstance(test_sequences[0], dict) else "Not a dict")
print("\nFirst sequence content:")
print(test_sequences[0])

Number of sequences: 54530

First sequence type: <class 'dict'>

First sequence keys: dict_keys(['comic_no', 'story_idx', 'context', 'target', 'target_text', 'context_texts'])

First sequence content:
{'comic_no': np.int64(1), 'story_idx': 10, 'context': [{'comic_no': 1, 'page_no': 47, 'panel_no': 0, 'image_path': '/scratch/bftl/hsekar/comics_project/data/images/1/47_0.jpg', 'text': ''}, {'comic_no': 1, 'page_no': 47, 'panel_no': 1, 'image_path': '/scratch/bftl/hsekar/comics_project/data/images/1/47_1.jpg', 'text': 'IN 1962 WITH A ROCKET LAND ING ON THE MOON. IN 1977, MAN SET FOOT ON MARS. A CENTURY LATER ON ALPHA CENTAURI, THE NEAREST STAF BY 3750, MANY STAR CLUSTERS HAD BEEN EXPLORED, THEIR PLANETARY SYSTEMS JOINED WITH EARTH FEDERATION. TO POLICE THIS VAST AREA OF BILLIONS OF MILES OF EMPTY SPACE-TO GUARD THE TREAS- URE-LADEN CARGO SPACERS, THE STAR PATROL WAS BORN. DAVE KENTON WAS A STAR PATROL MAN, HIS HAND WAS ADEPT WITH SWORD AND GUN-HE WAS READY TO DIE IN ORDER TO SMASH THE POW