In [None]:
# Install necessary libs
# %pip install tdqm imagehash  

Collecting tdqm
  Using cached tdqm-0.0.1.tar.gz (1.4 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting imagehash
  Using cached ImageHash-4.3.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting PyWavelets (from imagehash)
  Downloading pywavelets-1.9.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (7.6 kB)
Downloading ImageHash-4.3.2-py2.py3-none-any.whl (296 kB)
Downloading pywavelets-1.9.0-cp313-cp313-macosx_11_0_arm64.whl (4.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hBuilding wheels for collected packages: tdqm
  Building wheel for tdqm (pyproject.toml) ... [?25ldone
[?25h  Created wheel for tdqm: filename=tdqm-0.0.1-py3-none-any.whl size=1384 sha256=162b57d29e5492114375a45a81fb3f602bdfcc5f75cc97250bb283bab652a141
  Stored in directory: /Users/huy.

In [2]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import os

# Google Library
#from google import genai
# import google.generativeai as genai
# from google.genai import types

import sys
from pydantic import BaseModel
import mimetypes
import json
from timeit import default_timer as timer
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import pipeline
from scipy.special import softmax
import torch
import csv
import urllib.request
import imagehash
import open_clip
import faiss
from PIL import Image

In [4]:
# Loading the meme jsonl file
with open('data/merged_parsed_results.jsonl', 'r') as f:
    meme_data = [json.loads(line) for line in f]

# Loading meme metadata
meme_metadata = pd.read_parquet('data/meme_submissions.zst.parquet')

In [5]:
print('Number of memes instancesloaded:', len(meme_data))

# Number of non-memes
i = 0
for meme in meme_data:
    try:
        if meme['data']['template'] not in ["NON_MEME", "NO_MEME"]:
            i += 1
    except:
        print(meme['key'])
print('Number of normal memes:', i)

# Filtering out non-memes
meme_data = [meme for meme in meme_data if isinstance(meme['data'], dict)]
meme_data = [meme for meme in meme_data if meme['data'].get('template') not in ["NON_MEME", "NO_MEME"]]
print('Number of memes after filtering NON_MEME:', len(meme_data))

Number of memes instancesloaded: 172573
meme_submissions_1107342
meme_submissions_1044848
Number of normal memes: 171793
Number of memes after filtering NON_MEME: 171793


In [6]:
# === Turn meme_data from an array of jsons into a Dataframe for better visuals ===
from pandas import json_normalize
meme_data_df = pd.DataFrame(meme_data)
meme_data_df = json_normalize(
    meme_data,
    sep="_",
    meta=["key"],
    record_path=None
)

# Remove the leading "_data"
meme_data_df.columns = meme_data_df.columns.str.replace("data_", "", regex=False)
meme_data_df.head()

Unnamed: 0,key,template,global_context_description,local_context_user_texts,local_context_text_meaning,local_context_instance_specific_image_description,global_context_keywords,local_context_keywords,local_context_global_context_keywords,local_context_local_context_keywords,local_context_made with mematic,local_context_template,local_context_made_with_mematic,local_context_template_modification,local_context_template_text,local_context_watermark,local_context_title,local_context_meme_template_overlay,global_context_thought
0,meme_submissions_1343519,NO_TEMPLATE,"A cat with a loading symbol on its forehead, l...",[Hitler when he saw a blue-eyed Jew],The meme humorously depicts Hitler's supposed ...,,"[cat, loading symbol, confusion, distress, meme]","[Hitler, blue-eyed Jew, confusion, ideology, l...",,,,,,,,,,,
1,meme_submissions_134352,I fear no man. But that thing... it scares me.,A three-panel meme format. The first panel sho...,[],The meme humorously depicts a character who cl...,The second panel contains an image of several ...,"[fear, unscared, scared, meme format, character]","[bouncy balls, marbles, fear, irrational fear,...",,,,,,,,,,,
2,meme_submissions_1343524,NO_TEMPLATE,A comparison meme showing two fictional creatu...,[],The meme highlights the similarities between t...,,"[comparison, creatures, minecraft, stranger th...","[warden, demogorgon, comparison, similarities,...",,,,,,,,,,,
3,meme_submissions_1343526,"Homer Simpson ""Something so stupid""",A four-panel meme format featuring Homer Simps...,"[Increase carbon filtering, produce more wind ...",The meme criticizes the perceived ineffectiven...,,"[Homer Simpson, The Simpsons, stupid action, s...","[carbon filtering, windmills, stupid, species,...",,,,,,,,,,,
4,meme_submissions_134353,NO_TEMPLATE,The meme shows a comparison between a house co...,[My house coat in the morning vs my house coat...,The meme humorously exaggerates the difference...,The image is split into two parts. The top tex...,"[house coat, morning, night, comparison, humor...","[house coat, morning, 3am, monster, creepy, co...",,,,,,,,,,,


## More than half of annotated templates are `NO_TEMPLATE`

### Plan
1. **Manually inspect sample images**
   - Review a subset of images from the dataset to validate labeling quality.

2. **Build a reference set from labeled templates**
   - Use images with an existing template label.
   - Compute **CLIP embeddings** and **pHash**.
   - Store results in a separate reference table.

3. **Match `NO_TEMPLATE` images**
   - Compute CLIP embeddings and pHash for each `NO_TEMPLATE` image.
   - Retrieve top candidate matches from the reference set.

4. **Apply decision rules**
   - If `phash_dist <= 6`, assign that template (**high confidence**).
   - Else if `clip_cos >= 0.86` and `(top1 - top2) >= 0.03`, assign that template.
   - Else, keep `NO_TEMPLATE`.


In [7]:
no_template_df = meme_data_df[meme_data_df['template'] == 'NO_TEMPLATE']

# Ratio of memes that were considered NO_TEMPLATE to the entire data
ratio_no_template = no_template_df.shape[0] / meme_data_df.shape[0] * 100
print(f'"NO_TEMPLATE" data contains {ratio_no_template: .2f}% of the entire data after the filter')

# Plot the distribution of templates
import matplotlib.pyplot as plt
meme_data_df['template'].unique

"NO_TEMPLATE" data contains  53.53% of the entire data after the filter


<bound method Series.unique of 0                                            NO_TEMPLATE
1         I fear no man. But that thing... it scares me.
2                                            NO_TEMPLATE
3                    Homer Simpson "Something so stupid"
4                                            NO_TEMPLATE
                               ...                      
171788                                       NO_TEMPLATE
171789                                       NO_TEMPLATE
171790                    Look how they massacred my boy
171791                                Handsome Squidward
171792                      SpongeBob Ight Imma Head Out
Name: template, Length: 171793, dtype: object>

## Template Reassignment for `NO_TEMPLATE`
This section builds a reference set from labeled memes, computes CLIP + pHash features, and reassigns `NO_TEMPLATE` rows when similarity is strong.


In [None]:
# Check NO_TEMPLATE to see if it is actually templateless, if not, no problem, but if yes we can assign based on clustering/reference set
