In [2]:
from pathlib import Path

import numpy as np
import pandas as pd


In [3]:
# Load the three CSV exports written by earlier steps.
# low_memory=False is passed for the two text-side files only.

img_reassigned = pd.read_csv("data/reassigned_tpl_with_images.csv")
new_meme_data = pd.read_csv(
    "data/new_assigned_memes.csv",
    low_memory=False,
)
text_assigned = pd.read_csv(
    "data/text_reassigned_memes.csv",
    low_memory=False,
)


In [4]:
KEY = "key"

def preprocess_short(new_meme_data: pd.DataFrame,
                     text_df: pd.DataFrame,
                     img_df: pd.DataFrame) -> pd.DataFrame:
    """Left-join text- and image-matching scores onto ``new_meme_data``.

    The join column is ``KEY``; it is compared as ``str`` in all three frames.
    None of the input frames is modified: string-keyed copies are built
    internally, so callers keep their original key dtype.

    Parameters
    ----------
    new_meme_data : pd.DataFrame
        Base table; every row is kept (left join). Must contain ``KEY``.
    text_df : pd.DataFrame
        Must contain ``KEY``, ``text_sim_top1`` and ``text_margin``.
    img_df : pd.DataFrame
        Must contain ``KEY``, ``match_method``, ``confidence``,
        ``phash_dist``, ``clip_top1`` and ``clip_margin``.

    Returns
    -------
    pd.DataFrame
        ``new_meme_data`` plus ``text_sim_top1``, ``text_margin``,
        ``match_method``, ``img_confidence`` (renamed from ``confidence``),
        ``phash_dist``, ``clip_top1`` and ``clip_margin``. Rows without a
        match in a side table get NaN in that table's columns.

    Raises
    ------
    KeyError
        If any required column is missing from an input frame.
    """
    def _str_keyed(df: pd.DataFrame, columns=None) -> pd.DataFrame:
        # Build a new frame (optionally restricted to `columns`) whose KEY
        # column is str; `assign` never writes into the caller's frame.
        subset = df if columns is None else df[columns]
        return subset.assign(**{KEY: subset[KEY].astype(str)})

    # Keep only the needed columns; on duplicate keys the first row wins
    # (pandas' default for drop_duplicates).
    text_keep = (
        _str_keyed(text_df, [KEY, "text_sim_top1", "text_margin"])
        .drop_duplicates(KEY)
    )

    img_keep = (
        _str_keyed(
            img_df,
            [KEY, "match_method", "confidence", "phash_dist", "clip_top1", "clip_margin"],
        )
        .drop_duplicates(KEY)
        # "confidence" is too generic once merged next to the text scores.
        .rename(columns={"confidence": "img_confidence"})
    )

    # Merge both score tables into the newest dataset.
    return (
        _str_keyed(new_meme_data)
        .merge(text_keep, on=KEY, how="left")
        .merge(img_keep, on=KEY, how="left")
    )


In [5]:
# Enrich the newest dataset with the text/image matching scores, then preview it.
new_meme_data = preprocess_short(
    new_meme_data=new_meme_data,
    text_df=text_assigned,
    img_df=img_reassigned,
)
new_meme_data.head(5)

Unnamed: 0,key,template,global_context_description,local_context_user_texts,local_context_text_meaning,local_context_instance_specific_image_description,global_context_keywords,local_context_keywords,local_context_global_context_keywords,local_context_local_context_keywords,...,global_context_thought,text_template,img_template,text_sim_top1,text_margin,match_method,img_confidence,phash_dist,clip_top1,clip_margin
0,meme_submissions_1343519,NO_TEMPLATE,"A cat with a loading symbol on its forehead, l...",['Hitler when he saw a blue-eyed Jew'],The meme humorously depicts Hitler's supposed ...,,"['cat', 'loading symbol', 'confusion', 'distre...","['Hitler', 'blue-eyed Jew', 'confusion', 'ideo...",,,...,,NO_TEMPLATE,NO_TEMPLATE,0.924177,0.026697,none,0.610736,14.0,0.610736,0.011035
1,meme_submissions_134352,I fear no man. But that thing... it scares me.,A three-panel meme format. The first panel sho...,[],The meme humorously depicts a character who cl...,The second panel contains an image of several ...,"['fear', 'unscared', 'scared', 'meme format', ...","['bouncy balls', 'marbles', 'fear', 'irrationa...",,,...,,I fear no man. But that thing... it scares me.,I fear no man. But that thing... it scares me.,,,,,,,
2,meme_submissions_1343524,NO_TEMPLATE,A comparison meme showing two fictional creatu...,[],The meme highlights the similarities between t...,,"['comparison', 'creatures', 'minecraft', 'stra...","['warden', 'demogorgon', 'comparison', 'simila...",,,...,,NO_TEMPLATE,NO_TEMPLATE,0.653213,0.076674,none,0.55019,14.0,0.55019,0.002794
3,meme_submissions_1343526,"Homer Simpson ""Something so stupid""",A four-panel meme format featuring Homer Simps...,"['Increase carbon filtering', 'produce more wi...",The meme criticizes the perceived ineffectiven...,,"['Homer Simpson', 'The Simpsons', 'stupid acti...","['carbon filtering', 'windmills', 'stupid', 's...",,,...,,"Homer Simpson ""Something so stupid""","Homer Simpson ""Something so stupid""",,,,,,,
4,meme_submissions_134353,NO_TEMPLATE,The meme shows a comparison between a house co...,['My house coat in the morning vs my house coa...,The meme humorously exaggerates the difference...,The image is split into two parts. The top tex...,"['house coat', 'morning', 'night', 'comparison...","['house coat', 'morning', '3am', 'monster', 'c...",,,...,,NO_TEMPLATE,NO_TEMPLATE,0.61597,0.009624,,,,,


In [6]:
NO = "NO_TEMPLATE"

# Rows whose original label was "no template" ...
base_no = new_meme_data["template"].eq(NO)

# ... and which one of the two matchers now assigns to a real template.
new_by_text = base_no & new_meme_data["text_template"].ne(NO)
new_by_img = base_no & new_meme_data["img_template"].ne(NO)

# Three disjoint groups: proposed by text only, by image only, or by both.
text_only_new = new_meme_data.loc[new_by_text & ~new_by_img].copy()
img_only_new = new_meme_data.loc[new_by_img & ~new_by_text].copy()
both_new = new_meme_data.loc[new_by_img & new_by_text].copy()

# When both matchers fire, record whether they propose the same template.
both_new["agree"] = both_new["text_template"].eq(both_new["img_template"])

print("text_only_new:", len(text_only_new))
print("img_only_new:", len(img_only_new))
print("both_new:", len(both_new))
print(
    "both_new agree:", both_new["agree"].sum(),
    "| disagree:", (~both_new["agree"]).sum(),
)

text_only_new: 10894
img_only_new: 2096
both_new: 807
both_new agree: 347 | disagree: 460


In [None]:
# Build confidence bins for TEXT, based on top 1 sim score, and margin
def text_conf_bin(df):
    """Label each row with a text-match confidence tier.

    Reads ``text_sim_top1`` and ``text_margin`` from ``df`` and returns a
    NumPy array of labels, one per row. Tiers are checked from strictest to
    loosest and the first one satisfied wins; rows matching no tier
    (including rows with NaN scores) are labelled ``"low"``.
    """
    top1 = df["text_sim_top1"]
    margin = df["text_margin"]

    # (label, minimum top-1 similarity, minimum margin), strictest first.
    tiers = (
        ("high", 0.70, 0.06),      # very safe
        ("mid", 0.68, 0.04),       # safe
        ("low_plus", 0.64, 0.04),  # looser
    )

    labels = [name for name, _, _ in tiers]
    conditions = [
        (top1 >= min_sim) & (margin >= min_margin)
        for _, min_sim, min_margin in tiers
    ]
    return np.select(conditions, labels, default="low")

# Tag every frame that carries a text-based proposal with its confidence tier.
text_only_new["text_conf_bin"] = text_conf_bin(text_only_new)
both_new["text_conf_bin"] = text_conf_bin(both_new)

# Tier distribution among the text-only proposals.
text_only_new["text_conf_bin"].value_counts(dropna=False)


text_conf_bin
high    6176
mid     4718
Name: count, dtype: int64

In [8]:
# Build confidence bins for IMAGE, based on pHash Distance + CLIP Embeddings
def img_conf_bin(df):
    """Label each row with an image-match confidence tier.

    Uses ``phash_dist``, ``clip_top1`` and ``clip_margin``; a column that is
    absent from ``df`` is treated as all-NaN. Returns a NumPy array of
    labels, one per row. Conditions are checked in order and the first one
    satisfied wins; rows matching none are labelled ``"low"``.
    """
    def column_or_nan(name):
        # Fall back to an all-NaN series aligned to df when the column is missing.
        if name in df.columns:
            return df[name]
        return pd.Series(np.nan, index=df.index)

    phash = column_or_nan("phash_dist")
    clip_score = column_or_nan("clip_top1")
    clip_gap = column_or_nan("clip_margin")

    has_clip = clip_score.notna()

    # high precision duplicate-ish
    near_duplicate = phash.notna() & (phash <= 6)
    # strong CLIP match
    strong_clip = has_clip & (clip_score >= 0.86) & (clip_gap >= 0.03)
    # moderate CLIP match
    moderate_clip = has_clip & (clip_score >= 0.82) & (clip_gap >= 0.02)

    return np.select(
        [near_duplicate, strong_clip, moderate_clip],
        ["high_phash", "high_clip", "mid_clip"],
        default="low",
    )

# Tag every frame that carries an image-based proposal with its confidence tier.
img_only_new["img_conf_bin"] = img_conf_bin(img_only_new)
both_new["img_conf_bin"] = img_conf_bin(both_new)

# Tier distribution among the image-only proposals.
img_only_new["img_conf_bin"].value_counts(dropna=False)

img_conf_bin
high_phash    2034
high_clip       62
Name: count, dtype: int64

In [11]:
def strat_sample(df, n, group_col, seed=42):
    """Draw about ``n`` rows from ``df``, stratified by ``group_col``.

    Each distinct value of ``group_col`` (missing values form their own
    stratum) gets a share of ``n`` proportional to its size, rounded, with a
    floor of one row and a ceiling of the stratum size. Rows are drawn
    without replacement. If rounding makes the total exceed ``n`` the sample
    is randomly trimmed to ``n``; if it falls short, extra rows are drawn
    from the rows not yet selected.

    Rows are selected by position, so the function also works when
    ``df.index`` contains duplicate labels.

    Parameters
    ----------
    df : pd.DataFrame
        Population to sample from.
    n : int
        Requested sample size; must be non-negative.
    group_col : str
        Column defining the strata.
    seed : int, default 42
        Seed for both the per-stratum draws and the size correction.

    Returns
    -------
    pd.DataFrame
        ``min(n, len(df))`` rows of ``df`` with their original index labels.
        An empty ``df`` is returned as an empty copy.

    Raises
    ------
    ValueError
        If ``n`` is negative (and ``df`` is not empty).
    """
    if len(df) == 0:
        return df.copy()
    if n < 0:
        raise ValueError("n must be non-negative")
    if n == 0:
        return df.iloc[0:0].copy()

    rng = np.random.default_rng(seed)
    groups = df[group_col]
    counts = groups.value_counts(dropna=False)
    total = len(df)

    # proportional target per group (at least 1 if group exists)
    targets = (counts / total * n).round().astype(int).clip(lower=1)

    # don't request more than group size
    targets = np.minimum(targets, counts)

    missing_mask = groups.isna().to_numpy()
    picked = []
    for g, k in targets.items():
        # `== NaN` never matches, so the missing-value stratum needs isna().
        if pd.api.types.is_scalar(g) and pd.isna(g):
            member_mask = missing_mask
        else:
            member_mask = groups.eq(g).fillna(False).to_numpy(dtype=bool)
        positions = np.flatnonzero(member_mask)
        # sample without replacement
        picked.append(rng.choice(positions, size=int(k), replace=False))

    picked = np.concatenate(picked)
    out = df.iloc[picked]

    # if we overshot/undershot due to rounding, fix size
    if len(out) > n:
        out = out.sample(n=n, random_state=seed)
    elif len(out) < n:
        unused = np.ones(total, dtype=bool)
        unused[picked] = False
        remaining = df.iloc[np.flatnonzero(unused)]
        if len(remaining) > 0:
            add_n = min(n - len(out), len(remaining))
            out = pd.concat([out, remaining.sample(n=add_n, random_state=seed)], axis=0)

    return out


In [12]:
# Single-source proposals: stratify each by its own confidence tier.
sample_text_only = strat_sample(text_only_new, group_col="text_conf_bin", n=150)
sample_img_only = strat_sample(img_only_new, group_col="img_conf_bin", n=60)

# Split the rows proposed by both matchers on whether the two proposals coincide.
both_agree = both_new.loc[both_new["agree"]].copy()
both_disagree = both_new.loc[~both_new["agree"]].copy()

# sample more from disagreements (error-prone) than agreements (usually high precision)
# (grouping by img_conf_bin instead of text_conf_bin would also be possible here)
sample_both_agree = strat_sample(both_agree, group_col="text_conf_bin", n=50)
sample_both_disagree = strat_sample(both_disagree, group_col="text_conf_bin", n=100)

# Stack the four samples into one review table with a fresh 0..N-1 index.
review_df = pd.concat(
    [
        sample_text_only,
        sample_img_only,
        sample_both_agree,
        sample_both_disagree,
    ],
    axis=0,
    ignore_index=True,
)

print("Total to review:", len(review_df))
review_df.loc[
    :,
    [
        "key",
        "template",
        "text_template",
        "img_template",
        "text_sim_top1",
        "text_margin",
        "phash_dist",
        "clip_top1",
        "clip_margin",
    ],
].head()

Total to review: 360


Unnamed: 0,key,template,text_template,img_template,text_sim_top1,text_margin,phash_dist,clip_top1,clip_margin
0,meme_submissions_1091627,NO_TEMPLATE,Me: finds a meme,NO_TEMPLATE,0.878855,0.180001,,,
1,meme_submissions_1335611,NO_TEMPLATE,Brain playing scenarios,NO_TEMPLATE,0.7364,0.07754,14.0,0.664471,0.024181
2,meme_submissions_1088825,NO_TEMPLATE,I DIDN'T STEAL YOUR MEME *discovery noises*,NO_TEMPLATE,0.714807,0.062828,,,
3,meme_submissions_1231807,NO_TEMPLATE,"Monke see, Monke love",NO_TEMPLATE,0.787316,0.172893,16.0,0.596469,0.023777
4,meme_submissions_1231641,NO_TEMPLATE,Ice Cube Looking Confused,NO_TEMPLATE,0.772187,0.185112,16.0,0.580288,0.001176


In [13]:
# Columns shown to the annotator, in display order.
cols = [
    "key",
    "template",
    "text_template",
    "img_template",
    "text_sim_top1",
    "text_margin",
    "text_conf_bin",
    "match_method",
    "img_confidence",
    "phash_dist",
    "clip_top1",
    "clip_margin",
    "img_conf_bin",
]

# Columns absent from review_df are skipped rather than raising.
label_sheet = review_df[[c for c in cols if c in review_df.columns]].copy()
label_sheet["is_correct"] = ""       # fill 1 or 0 manually
label_sheet["true_template"] = ""    # optional

out_path = "annotations/template_label_sheet.csv"
# Create the output directory if needed so the export also works on a fresh checkout.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
label_sheet.to_csv(out_path, index=False)
print("Saved:", out_path)
label_sheet.head()

Saved: annotations/template_label_sheet.csv


Unnamed: 0,key,template,text_template,img_template,text_sim_top1,text_margin,text_conf_bin,match_method,img_confidence,phash_dist,clip_top1,clip_margin,img_conf_bin,is_correct,true_template
0,meme_submissions_1091627,NO_TEMPLATE,Me: finds a meme,NO_TEMPLATE,0.878855,0.180001,high,,,,,,,,
1,meme_submissions_1335611,NO_TEMPLATE,Brain playing scenarios,NO_TEMPLATE,0.7364,0.07754,high,none,0.664471,14.0,0.664471,0.024181,,,
2,meme_submissions_1088825,NO_TEMPLATE,I DIDN'T STEAL YOUR MEME *discovery noises*,NO_TEMPLATE,0.714807,0.062828,high,,,,,,,,
3,meme_submissions_1231807,NO_TEMPLATE,"Monke see, Monke love",NO_TEMPLATE,0.787316,0.172893,high,none,0.596469,16.0,0.596469,0.023777,,,
4,meme_submissions_1231641,NO_TEMPLATE,Ice Cube Looking Confused,NO_TEMPLATE,0.772187,0.185112,high,none,0.580288,16.0,0.580288,0.001176,,,
