In [None]:
# Mount Google Drive at /content/drive; every path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# INPUT: the combined dataset CSV (the "FIXED" revision) that the cleaning cells read.
IN_CSV = "/content/drive/MyDrive/combined_data_FIXED.csv"

# OUTPUTS: all artefacts are written under OUT_DIR.
OUT_DIR   = "/content/drive/MyDrive/final_dataset"
TRIMMED   = f"{OUT_DIR}/final_trimmed_FIXED.csv"     # after cleaning Reddit etc.
BALANCED  = f"{OUT_DIR}/final_balanced_FIXED.csv"    # after downsampling Twitter
STATS_TXT = f"{OUT_DIR}/dataset_stats_FIXED.txt"     # stats for Methods

import os
# Create the output folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(OUT_DIR, exist_ok=True)

In [None]:
!pip -q install pandas numpy

In [None]:
import pandas as pd, numpy as np
from urllib.parse import urlparse, parse_qs, unquote

# Load the combined dataset and report its size before any cleaning.
df = pd.read_csv(IN_CSV)
print("Rows before:", df.shape[0])

# Canonicalise the platform label (trimmed, Title Case) and build one boolean mask per platform.
df["Social_Media_Type"] = df["Social_Media_Type"].astype(str).str.strip().str.title()
is_twitter = df["Social_Media_Type"] == "Twitter"
is_reddit  = df["Social_Media_Type"] == "Reddit"

# Subcommunity_Tag: every Twitter row is tagged "Twitter"; Reddit rows whose tag is
# blank or a stringified missing value are tagged "Unknown".
df["Subcommunity_Tag"] = df["Subcommunity_Tag"].astype(str)
df.loc[is_twitter, "Subcommunity_Tag"] = "Twitter"
df.loc[
    is_reddit & df["Subcommunity_Tag"].isin(["", "nan", "NaN", "None"]),
    "Subcommunity_Tag",
] = "Unknown"

def unwrap_and_clean(cell: str) -> str:
    """Normalise a semicolon-separated list of image URLs.

    Steps:
      * HTML-unescape ``&amp;`` and split the cell on ``;``.
      * Replace ``reddit.com/media?url=<encoded>`` wrapper links with the
        image URL they wrap.
      * If both ``i.redd.it`` and ``preview.redd.it`` URLs are present, keep
        only the ``preview.redd.it`` ones.
      * De-duplicate (first occurrence wins) and keep at most three URLs.

    Args:
        cell: Raw cell value; anything that is not a non-blank string yields "".

    Returns:
        The cleaned URLs joined with ``;`` (possibly the empty string).
    """
    if not isinstance(cell, str) or not cell.strip():
        return ""

    # Unescape "&amp;" BEFORE splitting: the entity itself contains a ";".
    raw = cell.replace("&amp;", "&")

    out = []
    for part in raw.split(";"):
        part = part.strip()
        if not part:
            continue
        decoded = unquote(part).replace("&amp;", "&")
        if "reddit.com/media?url=" not in decoded:
            out.append(decoded)
            continue

        # Parse the wrapper while it is still percent-encoded. Decoding first would
        # expose the inner URL's own "&key=value" pairs to parse_qs, which would then
        # cut the wrapped URL off after its first query parameter.
        inner = []
        for candidate in (part, decoded):
            try:
                inner = parse_qs(urlparse(candidate).query).get("url", [])
            except ValueError:
                # Malformed URL: best effort, treat as "nothing wrapped".
                inner = []
            if inner:
                break
        for url in inner:
            if "://" not in url:
                # Inner URL was double-encoded; decode once more.
                url = unquote(url)
            out.append(url.replace("&amp;", "&"))

    has_i = any("i.redd.it" in u.lower() for u in out)
    has_prev = any("preview.redd.it" in u.lower() for u in out)
    if has_i and has_prev:
        # NOTE(review): this also discards URLs from any other host — confirm that is intended.
        out = [u for u in out if "preview.redd.it" in u.lower()]

    # Order-preserving de-duplication, then cap at three URLs.
    clean = [u for u in dict.fromkeys(out) if u]
    return ";".join(clean[:3])

# Clean the image URL column when present; cells left empty by the cleaner become NaN.
if "Image_Reference" in df.columns:
    df["Image_Reference"] = (
        df["Image_Reference"]
        .apply(unwrap_and_clean)
        .replace({"": np.nan})
    )

# Drop irrelevant Reddit rows
def text_useless(x: str) -> bool:
    """Return True when ``x`` carries no usable text.

    A value is useless when it is not a string, or when (trimmed and
    lower-cased) it is empty, a removed/deleted placeholder, or starts with an
    http(s) URL scheme.
    """
    if not isinstance(x, str):
        return True
    lowered = x.strip().lower()
    placeholders = ("", "removed", "deleted", "[removed]", "[deleted]")
    return lowered in placeholders or lowered.startswith(("http://", "https://"))

# A Reddit row is dropped only when it has nothing to offer: no image AND no usable text.
# text_useless() is True for missing text, so without the image condition every image
# post lacking usable text would be removed, although the modality step below expects
# such rows to survive as "Image-only".
drop_no_content = is_reddit & df["Text"].isna() & df["Image_Reference"].isna()
drop_useless    = is_reddit & df["Text"].apply(text_useless) & df["Image_Reference"].isna()
# d1 = rows with neither text nor image; d2 = rows with useless text and no image (includes d1).
d1 = int(drop_no_content.sum()); d2 = int(drop_useless.sum())
df = df[~(drop_no_content | drop_useless)].copy()

# Infer Modality_Type (Text-only / Image-only / Text+Image)
def infer_mod(row):
    """Return the modality label for one row.

    An existing ``Modality_Type`` value wins when it names a known modality;
    otherwise the label is derived from whether ``Text`` and
    ``Image_Reference`` hold non-blank strings. Returns ``None`` when the row
    has neither.
    """
    declared = row.get("Modality_Type") or ""
    label = declared.strip().lower() if isinstance(declared, str) else ""

    text = row.get("Text")
    image = row.get("Image_Reference")
    has_text = isinstance(text, str) and text.strip() != ""
    has_img = isinstance(image, str) and image.strip() != ""

    # 1) Trust an explicit label.
    if "text+image" in label:
        return "Text+Image"
    if label == "image" or "image-only" in label:
        return "Image-only"
    if label == "text" or "text-only" in label:
        return "Text-only"

    # 2) Otherwise infer from the content actually present.
    if has_img:
        return "Text+Image" if has_text else "Image-only"
    if has_text:
        return "Text-only"
    return None

# Label every remaining row with its modality.
df["Modality_Type"] = df.apply(infer_mod, axis=1)

# Cap Reddit at MAX_REDDIT rows (everything is kept when below the cap).
MAX_REDDIT = 2000
reddit  = df.loc[df["Social_Media_Type"].eq("Reddit")].copy()
twitter = df.loc[df["Social_Media_Type"].eq("Twitter")].copy()

kept_reddit = min(len(reddit), MAX_REDDIT)
if len(reddit) > MAX_REDDIT:
    # Fixed seed so the subsample is reproducible.
    reddit = reddit.sample(n=MAX_REDDIT, random_state=42)

# Twitter rows first, then Reddit, with a fresh 0..n-1 index.
trimmed = pd.concat([twitter, reddit], ignore_index=True)
trimmed.to_csv(TRIMMED, index=False)
print(f"Saved trimmed → {TRIMMED}")
print("Reddit removed (no content):", d1, "| removed (useless text):", d2)

Rows before: 28666
Saved trimmed → /content/drive/MyDrive/final_dataset/final_trimmed_FIXED.csv
Reddit removed (no content): 0 | removed (useless text): 0


In [None]:
import pandas as pd

MAX_TWITTER = 5000  # keep 5k

# Reload the trimmed dataset from disk and split it by platform.
trimmed = pd.read_csv(TRIMMED)
twitter = trimmed[trimmed["Social_Media_Type"]=="Twitter"].copy()
reddit  = trimmed[trimmed["Social_Media_Type"]=="Reddit"].copy()

# Downsample Twitter to at most MAX_TWITTER rows (fixed seed for reproducibility).
if len(twitter) > MAX_TWITTER:
    twitter = twitter.sample(MAX_TWITTER, random_state=42)

balanced = pd.concat([twitter, reddit], ignore_index=True)
balanced.to_csv(BALANCED, index=False)

print(f"Saved balanced → {BALANCED}")
print("\n=== Platform Counts ===")
print(balanced["Social_Media_Type"].value_counts())
print("\n=== Modality Counts ===")
print(balanced["Modality_Type"].value_counts(dropna=False))
print("\n=== Platform × Modality ===")
print(balanced.groupby(["Social_Media_Type","Modality_Type"]).size())

# Write stats file
lines = []
lines.append(f"Input: {IN_CSV}")
lines.append(f"Balanced rows saved: {len(balanced)}")
lines.append("\n=== Platform Counts ===")
lines.append(str(balanced["Social_Media_Type"].value_counts()))
lines.append("\n=== Modality Counts ===")
lines.append(str(balanced["Modality_Type"].value_counts(dropna=False)))
lines.append("\n=== Platform × Modality ===")
lines.append(str(balanced.groupby(['Social_Media_Type','Modality_Type']).size()))
if "Emoji" in balanced.columns:
    lines.append("\n=== Emoji coverage ===")
    # Blank out missing values first: astype(str) alone turns NaN into the text "nan",
    # which would make every row look like it has an emoji.
    lines.append(f"non-empty rows: {balanced['Emoji'].fillna('').astype(str).str.strip().ne('').sum()}")

report = "\n".join(lines)
# The report contains non-ASCII characters ("×"), so pin the encoding instead of
# relying on the platform default.
with open(STATS_TXT, "w", encoding="utf-8") as f:
    f.write(report)
print(f"\nStats saved → {STATS_TXT}")

Saved balanced → /content/drive/MyDrive/final_dataset/final_balanced_FIXED.csv

=== Platform Counts ===
Social_Media_Type
Twitter    5000
Reddit      966
Name: count, dtype: int64

=== Modality Counts ===
Modality_Type
Text-only     5475
Image-only     452
Text+Image      39
Name: count, dtype: int64

=== Platform × Modality ===
Social_Media_Type  Modality_Type
Reddit             Image-only        450
                   Text+Image         26
                   Text-only         490
Twitter            Image-only          2
                   Text+Image         13
                   Text-only        4985
dtype: int64

Stats saved → /content/drive/MyDrive/final_dataset/dataset_stats_FIXED.txt


In [None]:
import pandas as pd

# Sanity-check the file the balancing step just wrote. Read it through BALANCED so this
# cell cannot silently inspect a stale, differently named export.
df_bal = pd.read_csv(BALANCED)

print("Balanced dataset rows:", len(df_bal))
print("\n=== Platform Counts ===")
print(df_bal["Social_Media_Type"].value_counts())

print("\n=== Modality Counts ===")
print(df_bal["Modality_Type"].value_counts(dropna=False))

print("\nColumns:", df_bal.columns.tolist())
df_bal.head(3)

Balanced dataset rows: 5966

=== Platform Counts ===
Social_Media_Type
Twitter    5000
Reddit      966
Name: count, dtype: int64

=== Modality Counts ===
Modality_Type
Text-only     5475
Image-only     452
Text+Image      39
Name: count, dtype: int64

Columns: ['Text', 'Emotion_Label', 'Emoji', 'Image_Reference', 'Subcommunity_Tag', 'Modality_Type', 'Social_Media_Type', 'Source']


Unnamed: 0,Text,Emotion_Label,Emoji,Image_Reference,Subcommunity_Tag,Modality_Type,Social_Media_Type,Source
0,Can`t find any pictures !!!,,,,Twitter,Text-only,Twitter,
1,"RE: So, finally finished catching up with your...",,,,Twitter,Text-only,Twitter,
2,*hugs*,,,,Twitter,Text-only,Twitter,


--------

In [None]:
# Mount Google Drive (a no-op message is printed when it is already mounted).
from google.colab import drive
drive.mount('/content/drive')

# Inputs for the image-linking / OCR stage.
# NOTE(review): this path differs from the BALANCED file written earlier in the notebook
# and contains a doubled "/" — confirm this is the intended input.
BALANCED_CSV = "/content/drive/MyDrive//final_dataset_cleaned/final_balanced2.csv"
# Manifest read by the merge step; it is expected to provide url, local_path and status columns.
MANIFEST_CSV = "/content/drive/MyDrive/image_manifest.csv"
IMAGES_DIR   = "/content/drive/MyDrive/dataset_images"                    # folder with downloaded images

# Outputs of this stage.
OUT_DIR      = "/content/drive/MyDrive/final_dataset"
WITH_IMG_CSV = f"{OUT_DIR}/final_with_images1.csv"         # dataset + Image_Local_Path
WITH_OCR_CSV = f"{OUT_DIR}/final_with_images_ocr1.csv"     # + OCR_Text

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os, pandas as pd

import numpy as np

os.makedirs(OUT_DIR, exist_ok=True)

df = pd.read_csv(BALANCED_CSV)
# If the input already carries Image_Local_Path from an earlier run, drop it. Otherwise
# the merge below emits Image_Local_Path_x / Image_Local_Path_y instead of one clean column.
df = df.drop(columns=["Image_Local_Path"], errors="ignore")

if not os.path.exists(MANIFEST_CSV):
    raise FileNotFoundError("Missing MANIFEST_CSV. Make sure you ran the downloader and saved image_manifest.csv")

man = pd.read_csv(MANIFEST_CSV)
man = man.rename(columns={"url": "Image_Reference", "local_path": "Image_Local_Path"})
# Manifest rows without a URL must go: pandas matches NaN join keys to each other, so a
# single such row would attach itself to every dataset row that has no image.
man = man.dropna(subset=["Image_Reference"])
man = man.drop_duplicates("Image_Reference", keep="first")

# Left join: every dataset row is kept, with the manifest entry for its Image_Reference (if any).
merged = df.merge(man[["Image_Reference","Image_Local_Path","status"]], on="Image_Reference", how="left")

# keep only actual downloaded files for the path column
merged.loc[~merged["status"].eq("ok"), "Image_Local_Path"] = np.nan
merged = merged.drop(columns=["status"], errors="ignore")

merged.to_csv(WITH_IMG_CSV, index=False)
print("Saved:", WITH_IMG_CSV)
print("Rows:", len(merged))
print("Rows with local images:", merged["Image_Local_Path"].notna().sum())

Saved: /content/drive/MyDrive/final_dataset/final_with_images1.csv
Rows: 5966
Rows with local images: 0


In [None]:
# Install OCR stack
!apt -y install tesseract-ocr >/dev/null
!pip -q install pytesseract pillow tqdm

import pandas as pd, numpy as np, os
from PIL import Image
import pytesseract
from tqdm import tqdm

df = pd.read_csv(WITH_IMG_CSV)

# Add OCR_Text column if missing
if "OCR_Text" not in df.columns:
    df["OCR_Text"] = np.nan
# An all-NaN column (new, or read back from CSV) is float64. The OCR loop stores strings
# in it, so switch to object dtype now rather than relying on pandas' deprecated
# silent upcast on assignment.
df["OCR_Text"] = df["OCR_Text"].astype(object)

def ocr_one(local_paths: str) -> str:
    """OCR the first image of a semicolon-separated list of local paths.

    Args:
        local_paths: One or more file paths joined with ``;``. Only the first
            entry is read.

    Returns:
        The recognised text with all whitespace runs collapsed to single
        spaces. Returns "" when the input is not a non-blank string, when the
        first path does not exist, or when opening/recognising the image fails
        (best effort: one bad image must not abort the whole run).
    """
    if not isinstance(local_paths, str) or not local_paths.strip():
        return ""
    first = local_paths.split(";")[0].strip()
    if not os.path.exists(first):
        return ""
    try:
        # The context manager closes the file handle deterministically; convert()
        # returns an in-memory copy that stays usable afterwards.
        with Image.open(first) as handle:
            img = handle.convert("RGB")
        text = pytesseract.image_to_string(img)
    except Exception:
        return ""
    # light cleanup: collapse whitespace
    return " ".join(str(text).split())

# OCR only rows that have a local image and no OCR text yet, so the cell can be re-run
# without redoing finished work.
mask = df["Image_Local_Path"].notna()
rows_to_ocr = df[mask & (df["OCR_Text"].isna() | (df["OCR_Text"].astype(str).str.strip() == ""))].index

for i in tqdm(rows_to_ocr, desc="OCR images"):
    df.at[i, "OCR_Text"] = ocr_one(df.at[i, "Image_Local_Path"])

df.to_csv(WITH_OCR_CSV, index=False)
print("Saved with OCR:", WITH_OCR_CSV)
# ocr_one() returns "" on failure, so count rows with real text instead of non-null
# cells; otherwise failed OCR attempts would be reported as filled.
print("OCR filled rows:", df["OCR_Text"].fillna("").astype(str).str.strip().ne("").sum())





OCR images: 0it [00:00, ?it/s]

Saved with OCR: /content/drive/MyDrive/final_dataset/final_with_images_ocr1.csv
OCR filled rows: 0





In [None]:
import pandas as pd

# Reload the dataset from WITH_OCR_CSV; blank cells come back from the CSV as NaN.
df = pd.read_csv(WITH_OCR_CSV)

def combine_text(row):
    """Join a row's ``Text`` and ``OCR_Text`` into one string.

    Missing values (``None``, NaN, ``pd.NA``) count as empty. NaN is truthy, so
    a plain ``value or ""`` would let the literal text "nan" leak into the
    result.

    Returns:
        ``"<text>\\n\\n<ocr>"`` when both parts are non-empty, otherwise
        whichever part is non-empty, or "" when neither is.
    """
    parts = []
    for key in ("Text", "OCR_Text"):
        value = row.get(key)
        if isinstance(value, str):
            parts.append(value.strip())
        elif value is None or pd.isna(value):
            parts.append("")
        else:
            parts.append(str(value).strip())
    t, o = parts
    if t and o:
        return f"{t}\n\n{o}"
    return t or o

# Build the combined text column row by row.
df["Text_Combined"] = df.apply(combine_text, axis=1)

# Sanity metrics
print("Total rows:", df.shape[0])
print("Has local image:", df["Image_Local_Path"].notna().sum())
print(
    "Has any text (Text or OCR):",
    df["Text_Combined"].astype(str).str.strip().ne("").sum(),
)
print("\nBy modality:")
print(df["Modality_Type"].value_counts(dropna=False))

# Overwrite the OCR file in place so it now also carries Text_Combined.
df.to_csv(WITH_OCR_CSV, index=False)
print("Updated:", WITH_OCR_CSV)

Total rows: 5966
Has local image: 0
Has any text (Text or OCR): 5966

By modality:
Modality_Type
Text-only     5475
Image-only     452
Text+Image      39
Name: count, dtype: int64
Updated: /content/drive/MyDrive/final_dataset/final_with_images_ocr1.csv


In [None]:
import pandas as pd

df = pd.read_csv(WITH_OCR_CSV)
# Rows that have a downloaded image but no OCR text. Blank OCR cells come back from the
# CSV as NaN, so fill them first: astype(str) alone would turn them into "nan" and this
# filter would never match.
need_check = df[df["Image_Local_Path"].notna() & df["OCR_Text"].fillna("").astype(str).str.strip().eq("")]
check_path = f"{OUT_DIR}/ocr_needs_check.csv"
need_check.to_csv(check_path, index=False)
print("Saved list of image rows with empty OCR →", check_path, "| rows:", len(need_check))

Saved list of image rows with empty OCR → /content/drive/MyDrive/final_dataset/ocr_needs_check.csv | rows: 0


In [None]:
import pandas as pd

# Load NEW OCR dataset
# Read through WITH_OCR_CSV (the same location as the previously hard-coded path) so
# this check always inspects the file the OCR stage actually wrote.
df = pd.read_csv(WITH_OCR_CSV)

# Show shape and first 5 rows
print("Shape:", df.shape)
df.head()

Shape: (5966, 13)


Unnamed: 0,Text,Emotion_Label,Emoji,Image_Reference,Subcommunity_Tag,Modality_Type,Social_Media_Type,Source,Image_Local_Path_x,Image_Local_Path_y,Image_Local_Path,OCR_Text,Text_Combined
0,Can`t find any pictures !!!,,,,Twitter,Text-only,Twitter,,,,,,Can`t find any pictures !!!\n\nnan
1,"RE: So, finally finished catching up with your...",,,,Twitter,Text-only,Twitter,,,,,,"RE: So, finally finished catching up with your..."
2,*hugs*,,,,Twitter,Text-only,Twitter,,,,,,*hugs*\n\nnan
3,Feeling pretty good this morning! Lets hope it...,,,,Twitter,Text-only,Twitter,,,,,,Feeling pretty good this morning! Lets hope it...
4,i wish i had friends i could spend the night with,,,,Twitter,Text-only,Twitter,,,,,,i wish i had friends i could spend the night w...


In [None]:
# List the column names of the currently loaded frame.
print("Columns:", list(df.columns))

Columns: ['Text', 'Emotion_Label', 'Emoji', 'Image_Reference', 'Subcommunity_Tag', 'Modality_Type', 'Social_Media_Type', 'Source', 'Image_Local_Path', 'OCR_Text', 'Text_Combined']


In [None]:
# Show one reproducible example row that has a downloaded image, transposed for readability.
# When no row has a local image, keep the empty selection instead of letting .sample(1) raise.
sample = df[df["Image_Local_Path"].notna()]
if len(sample):
    sample = sample.sample(1, random_state=42)
sample.T

Unnamed: 0,5706
Text,Everyone deserves emotional support!
Emotion_Label,
Emoji,
Image_Reference,https://preview.redd.it/iy63g1g335vd1.png?auto...
Subcommunity_Tag,wholesomememes
Modality_Type,Image-only
Social_Media_Type,Reddit
Source,https://www.reddit.com/r/wholesomememes/commen...
Image_Local_Path,/content/drive/MyDrive/dataset_images/39c866b5...
OCR_Text,By a = a) support. ea ty a cas ' a ; You know ...
