**Human Annotation form creation:**

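- Randomly selects posts from the dataset to build the human annotation surveys
- Posts were de-duplicated and assigned a stable `Post_ID` via content hashing
- Stratified sampling: an equal quota is drawn from each of the 8 SBERT clusters, targeting ~50% Text+Image / ~50% Text-only and ~70% Twitter / ~30% Reddit; any shortfall is topped up from the remaining posts
- Output: two forms (A and B), plus a key file mapping each `Post_ID` to its cluster, modality and platform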

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os, re, hashlib
import pandas as pd
import numpy as np

# Paths:
ROOT = "/content/drive/MyDrive/msc_final_dataset"
SRC  = f"{ROOT}/final_with_polarity_sbert.csv"   # updated SBERT framework
OUT  = f"{ROOT}/annotation_forms"
os.makedirs(OUT, exist_ok=True)

df = pd.read_csv(SRC)

# Build Text_Combined (post text followed by OCR text, whitespace collapsed)
# unless the source file already provides the column.
if "Text_Combined" not in df.columns:
    df["Text"] = df["Text"].fillna("").astype(str)
    # DataFrame.get returns its default unchanged when the column is missing,
    # so df.get("OCR_Text", "") would yield a plain str and .fillna would raise
    # AttributeError. Check for the column explicitly instead.
    if "OCR_Text" in df.columns:
        df["OCR_Text"] = df["OCR_Text"].fillna("").astype(str)
    else:
        df["OCR_Text"] = ""
    df["Text_Combined"] = (df["Text"]+" "+df["OCR_Text"]).str.replace(r"\s+"," ", regex=True).str.strip()

# Stable Post_ID (use Text_Combined so OCR text is included)
def mk_id(row):
    """Return a 12-character hex ID derived from a post's content.

    The ID is the first 12 hex digits of the SHA-1 of
    ``<normalised Text_Combined>|<first Image_Reference entry>|<normalised Social_Media_Type>``.

    `row` only needs a ``.get(key)`` method (a pandas row or a dict).
    Missing / None / empty values contribute an empty string. NaN is truthy,
    so a NaN value contributes the literal text "nan".
    """
    def _squash(value):
        # Trim, lowercase, then collapse every whitespace run to one space.
        lowered = str(value or "").strip().lower()
        return re.sub(r"\s+"," ", lowered)

    # Image_Reference may hold several ';'-separated entries; only the first is hashed.
    raw_ref = row.get('Image_Reference')
    image_ref = str(raw_ref or '').split(';')[0]

    text_key = _squash(row.get('Text_Combined'))
    platform_key = _squash(row.get('Social_Media_Type'))
    base = f"{text_key}|{image_ref}|{platform_key}"

    digest = hashlib.sha1(base.encode("utf-8")).hexdigest()
    return digest[:12]

# Only derive Post_ID when the source file does not already carry one.
if "Post_ID" not in df.columns:
    df["Post_ID"] = df.apply(mk_id, axis=1)

# Normalise the columns used for stratification:
#   - platform names are title-cased so they compare equal to "Twitter" / "Reddit"
#   - modality is forced to str
#   - cluster labels become nullable integers (unparseable values turn into <NA>)
df = df.assign(
    Social_Media_Type=df["Social_Media_Type"].astype(str).str.title(),
    Modality_Type=df["Modality_Type"].astype(str),
    Cluster_SBERT_k8=pd.to_numeric(df["Cluster_SBERT_k8"], errors="coerce").astype("Int64"),
)

# Short text for survey: keep at most MAX_CHARS characters of the combined text.
MAX_CHARS = 500
df["Text_For_Survey"] = df["Text_Combined"].astype(str).str[:MAX_CHARS]

# Prefer direct URL
def pick_image_url(row):
    """Return the first Image_Reference entry if it looks like a web URL, else "".

    Image_Reference may hold several ';'-separated entries; only the first one
    is considered, with surrounding whitespace removed. Missing, None and NaN
    values all yield "" (NaN stringifies to "nan", which does not start with
    "http").
    """
    raw_ref = row.get("Image_Reference")
    first_entry = str(raw_ref or "").split(";")[0].strip()
    if first_entry.startswith("http"):
        return first_entry
    return ""
df["Image_URL"] = df.apply(pick_image_url, axis=1)

# Columns carried into the annotation forms: identifier and display fields
# first, then the fields used for stratification and auditing.
keep_cols = (
    ["Post_ID", "Text_For_Survey", "Emoji", "Image_URL"]
    + ["Social_Media_Type", "Subcommunity_Tag", "Modality_Type", "Cluster_SBERT_k8"]
)
df = df.loc[:, keep_cols].copy()
print("Rows ready for sampling:", len(df))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Rows ready for sampling: 5966


In [None]:
import pandas as pd, numpy as np, os, json

# Same Drive locations as defined in the first cell.
ROOT = "/content/drive/MyDrive/msc_final_dataset"
OUT  = f"{ROOT}/annotation_forms"
os.makedirs(OUT, exist_ok=True)

# Use df from previous cell (it must already contain Post_ID, Social_Media_Type,
# Modality_Type and Cluster_SBERT_k8).
# Target per form
PER_FORM   = 30     # items per form; change to 40 if you want
TARGET_TW  = 0.70   # ~70% Twitter / 30% Reddit
TARGET_TIMG= 0.50   # ~50% Text+Image / 50% Text-only
RNG        = 42     # integer seed passed as random_state to every DataFrame.sample call below

def strat_platform(g, n, p_tw=0.7):
    """Sample up to `n` rows of `g`, aiming for a `p_tw` share of Twitter rows.

    The Twitter quota is round(n * p_tw), capped by the Twitter rows available;
    Reddit fills the rest, capped by the Reddit rows available. If neither
    platform yields anything, `n` rows are sampled from `g` as a whole. Any
    remaining shortfall is filled from rows of `g` not yet chosen, whatever
    their platform. Every draw uses the module-level seed RNG.

    Returns a DataFrame with at most min(n, len(g)) rows that keeps `g`'s index labels.
    """
    twitter_rows = g[g.Social_Media_Type=="Twitter"]
    reddit_rows = g[g.Social_Media_Type=="Reddit"]

    quota_tw = min(len(twitter_rows), int(round(n*p_tw)))
    quota_rd = min(len(reddit_rows), n - quota_tw)

    parts = [
        rows.sample(quota, random_state=RNG)
        for rows, quota in ((twitter_rows, quota_tw), (reddit_rows, quota_rd))
        if quota > 0 and len(rows)
    ]
    if parts:
        chosen = pd.concat(parts)
    else:
        chosen = g.sample(min(n, len(g)), random_state=RNG)

    # Top up from whatever is left when the platform quotas could not be met.
    shortfall = n - len(chosen)
    leftover = len(g) - len(chosen)
    if shortfall > 0 and leftover > 0:
        extra = g.drop(chosen.index).sample(min(shortfall, leftover), random_state=RNG)
        chosen = pd.concat([chosen, extra])
    return chosen

def strat_mod_platform(g, n, p_textimg=0.5, p_tw=0.7):
    """Sample up to `n` rows of `g`, balanced by modality and then by platform.

    round(n * p_textimg) rows are requested from the "Text+Image" rows and the
    remainder from the "Text-only" rows; each request is delegated to
    strat_platform with the Twitter share `p_tw`. If the two parts together
    fall short of `n`, the gap is filled from rows of `g` not yet chosen,
    drawn with the module-level seed RNG.

    Returns a DataFrame that keeps `g`'s index labels.
    """
    quota_img = int(round(n*p_textimg))
    quota_txt = n - quota_img

    parts = [
        strat_platform(g[g.Modality_Type==label], quota, p_tw)
        for label, quota in (("Text+Image", quota_img), ("Text-only", quota_txt))
    ]
    combined = pd.concat(parts)

    # Fill any gap left by an under-populated modality.
    missing = n - len(combined)
    spare = len(g) - len(combined)
    if missing > 0 and spare > 0:
        filler = g.drop(combined.index).sample(min(missing, spare), random_state=RNG)
        combined = pd.concat([combined, filler])
    return combined

def make_form(df_all, used_ids=None):
    """Assemble one shuffled annotation form of up to PER_FORM posts.

    Each SBERT cluster contributes an equal quota (PER_FORM // number of
    clusters, at least 1), drawn with strat_mod_platform so the modality and
    platform targets (TARGET_TIMG, TARGET_TW) are applied inside the cluster.
    If the quotas add up to fewer than PER_FORM rows, the remainder is topped
    up from the posts still available, regardless of cluster.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Candidate posts with Post_ID, Cluster_SBERT_k8, Modality_Type and
        Social_Media_Type columns.
    used_ids : iterable of Post_ID, optional
        Posts that must not appear on this form, e.g. items already placed on
        another form. The argument is not modified.

    Returns
    -------
    pandas.DataFrame
        The selected rows in random order with a fresh 0..n-1 index.
    """
    # None instead of a mutable default; copy so the caller's set is never touched.
    excluded = set(used_ids) if used_ids is not None else set()
    available = df_all[~df_all.Post_ID.isin(excluded)]

    clusters = sorted(df_all.Cluster_SBERT_k8.dropna().astype(int).unique())
    if clusters:
        per_cluster = max(1, PER_FORM // len(clusters))
        buckets = []
        for c in clusters:
            g = available[available.Cluster_SBERT_k8==c]
            # Small clusters are taken whole; larger ones are sub-sampled.
            take = g if len(g)<=per_cluster else strat_mod_platform(g, per_cluster, TARGET_TIMG, TARGET_TW)
            buckets.append(take)
        form = pd.concat(buckets)
    else:
        # No usable cluster labels: fall back to a plain random draw rather than
        # dividing by zero when computing the per-cluster quota.
        form = available.sample(min(PER_FORM, len(available)), random_state=RNG)

    if len(form) < PER_FORM:
        # The top-up pool must exclude used_ids as well as this form's own picks;
        # otherwise a second form can re-draw posts already on the first one.
        taken = excluded | set(form.Post_ID)
        pool = df_all[~df_all.Post_ID.isin(taken)]
        topup = strat_mod_platform(pool, PER_FORM - len(form), TARGET_TIMG, TARGET_TW)
        form = pd.concat([form, topup])

    # Shuffle so items are not grouped by cluster on the form.
    form = form.sample(frac=1, random_state=RNG).reset_index(drop=True)
    return form

# Draw Form A first, then hand its Post_IDs to make_form as used_ids for Form B.
used = set()
formA = make_form(df, used)
used.update(formA.Post_ID)
formB = make_form(df, used)

# Write each form to its own CSV in the annotation_forms folder.
A_PATH = f"{OUT}/annotation_items_form_A.csv"
B_PATH = f"{OUT}/annotation_items_form_B.csv"
formA.to_csv(A_PATH, index=False)
formB.to_csv(B_PATH, index=False)

def audit(d):
    """Summarise the composition of a form.

    Returns a dict with the row count ("n") and value counts per platform,
    per modality and per cluster. Cluster counts include missing labels and
    are ordered by cluster label.
    """
    platform_counts = d["Social_Media_Type"].value_counts()
    modality_counts = d["Modality_Type"].value_counts()
    cluster_counts = d["Cluster_SBERT_k8"].value_counts(dropna=False).sort_index()

    summary = {"n": len(d)}
    summary["platform"] = platform_counts.to_dict()
    summary["modality"] = modality_counts.to_dict()
    summary["clusters"] = cluster_counts.to_dict()
    return summary

# Report where each form was written and how many items it holds.
print("Saved:")
for saved_path, saved_form in ((A_PATH, formA), (B_PATH, formB)):
    print(" -", saved_path, len(saved_form))
print("\nForm A audit:", audit(formA))
print("Form B audit:", audit(formB))

# Save a key to map Post_ID → cluster/platform/modality later.
# Only the first row per Post_ID is kept.
key_cols = ["Post_ID","Cluster_SBERT_k8","Modality_Type","Social_Media_Type"]
both_forms = pd.concat([formA, formB], ignore_index=True)
key = both_forms[key_cols].drop_duplicates("Post_ID")
KEY_PATH = f"{OUT}/annotation_key.csv"
key.to_csv(KEY_PATH, index=False)
print("Saved key →", KEY_PATH)

# Preview the first rows of each form (to paste into the survey form).
for preview_form in (formA, formB):
    display(preview_form.head(3))

Saved:
 - /content/drive/MyDrive/msc_final_dataset/annotation_forms/annotation_items_form_A.csv 30
 - /content/drive/MyDrive/msc_final_dataset/annotation_forms/annotation_items_form_B.csv 30

Form A audit: {'n': 30, 'platform': {'Twitter': 17, 'Reddit': 13}, 'modality': {'Text+Image': 18, 'Text-only': 12}, 'clusters': {np.int64(0): 3, np.int64(1): 6, np.int64(2): 3, np.int64(3): 4, np.int64(4): 4, np.int64(5): 3, np.int64(6): 4, np.int64(7): 3}}
Form B audit: {'n': 30, 'platform': {'Reddit': 16, 'Twitter': 14}, 'modality': {'Text+Image': 17, 'Text-only': 13}, 'clusters': {np.int64(0): 3, np.int64(1): 4, np.int64(2): 3, np.int64(3): 4, np.int64(4): 5, np.int64(5): 4, np.int64(6): 3, np.int64(7): 4}}
Saved key → /content/drive/MyDrive/msc_final_dataset/annotation_forms/annotation_key.csv


Unnamed: 0,Post_ID,Text_For_Survey,Emoji,Image_URL,Social_Media_Type,Subcommunity_Tag,Modality_Type,Cluster_SBERT_k8
0,7d7e2eb01692,I`m super hungry! Waiting for my biotch to fee...,,,Twitter,Twitter,Text-only,6
1,52f03d68f951,Luggage finally on the carousel after 40 mins ⏳,⏳,,Twitter,Twitter,Text+Image,5
2,13539713b96c,Oh man....my fiance just got off work and I st...,,,Twitter,Twitter,Text-only,7


Unnamed: 0,Post_ID,Text_For_Survey,Emoji,Image_URL,Social_Media_Type,Subcommunity_Tag,Modality_Type,Cluster_SBERT_k8
0,d3541f73a684,jolly good last night,,,Twitter,Twitter,Text-only,7
1,40872e111b18,Day-Trip Navigator What a nice Flight. T love ...,,https://preview.redd.it/b71votie352f1.jpeg?aut...,Reddit,wholesomememes,Text+Image,5
2,e90da7c2ba0d,just got home but about to leave again,,,Twitter,Twitter,Text-only,7
