In [1]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor
import os
from collections import Counter
import time

In [None]:
import pandas as pd
from collections import Counter

# Run locally (different from repository data folder)
# Change to train/valid/test accordingly
df = pd.read_csv("dataset/img_train_plaintext.txt", sep="\t", dtype={"id": str, "annotations": str})

# emoji_map: annotation ID (as a string) -> entry at that row position of emoji_map.csv.
# NOTE(review): presumably the unnamed first CSV column holds the emoji characters - confirm.
emoji_df = pd.read_csv("data/emoji_map.csv")
emoji_list = emoji_df["Unnamed: 0"].tolist()
emoji_map = {str(i): emoji for i, emoji in enumerate(emoji_list)}

# Count in how many rows each annotation ID occurs (an ID repeated inside one row counts once).
all_annotation_ids = []

for ann in df['annotations'].dropna():
    # Strip whitespace around each token so " 186" and "186" are the same ID,
    # matching how the later filtering cell tokenises the same column.
    ids = [a.strip() for a in ann.split(",")]
    unique_ids = set(ids)
    all_annotation_ids.extend(unique_ids)

counter = Counter(all_annotation_ids)

top_12 = counter.most_common(12)

print("Top 12 most frequent emojis (ignoring duplicates within rows):")
for ann_id, count in top_12:
    # Fall back to the bracketed raw ID when the ID has no entry in emoji_map.
    emoji = emoji_map.get(ann_id, f"[{ann_id}]")
    print(f"{emoji}: [{ann_id}]: {count}")

Top 12 most frequent emojis (ignoring duplicates within rows):
😍: [1392]: 70040
❤: [186]: 68010
😂: [1381]: 52175
💕: [1107]: 35765
😊: [1389]: 27146
😘: [1403]: 24352
🏼: [762]: 23162
🏻: [761]: 20895
✨: [174]: 20757
🔥: [1255]: 20025
💙: [1111]: 18375
💖: [1108]: 18081


Remove the skin-tone modifier code points (IDs 761-765), which are not standalone emojis

In [None]:
# Skin tone modifiers
skip_ids = set(map(str, range(761, 766)))

# Copy the per-ID row counts, leaving out the skipped IDs (original ordering is kept).
filtered_counter = Counter()
for ann_id, freq in counter.items():
    if ann_id not in skip_ids:
        filtered_counter[ann_id] = freq

top_12 = filtered_counter.most_common(12)

print("Top 12 most frequent emojis (ignoring weird modifiers):")
for ann_id, count in top_12:
    # Show the bracketed raw ID when emoji_map has no entry for it.
    emoji = emoji_map.get(ann_id, f"[{ann_id}]")
    print(f"{emoji}: {count}")

Top 12 most frequent emojis (ignoring weird modifiers):
😍: 70040
❤: 68010
😂: 52175
💕: 35765
😊: 27146
😘: 24352
✨: 20757
🔥: 20025
💙: 18375
💖: 18081
😭: 17104
👌: 16257


In [5]:
# Integer form of the annotation IDs in top_12, in the same (most frequent first) order.
top_12_ids = [int(entry[0]) for entry in top_12]
print(top_12_ids)

[1392, 186, 1381, 1107, 1389, 1403, 174, 1255, 1111, 1108, 1424, 883]


Create a new CSV that contains only the entries annotated with at least one of the top 12 emojis

In [None]:
# Change to train/valid/test accordingly
df = pd.read_csv("dataset/img_train_plaintext.txt", sep="\t", dtype={"id": str, "annotations": str})

# Gather every annotation ID once per row, then tally how many rows mention each ID.
all_annotation_ids = []
for ann in df['annotations'].dropna():
    ids = list(map(str.strip, ann.split(",")))
    unique_ids = set(ids)
    all_annotation_ids += unique_ids

counter = Counter(all_annotation_ids)

# IDs 761-765 are excluded from the ranking.
skip_ids = set(map(str, range(761, 766)))

# Start from a copy of the full tally and remove the skipped IDs from it.
filtered_counter = Counter(counter)
for skipped in skip_ids:
    filtered_counter.pop(skipped, None)

top_12 = filtered_counter.most_common(12)
# String IDs, most frequent first; used for membership tests against the raw annotation text.
top_12_ids = [entry[0] for entry in top_12]
print("Top 12 annotation IDs (after filtering):", top_12_ids)

def contains_top_annotation(ann_str):
    """Return True when a comma-separated annotation string mentions a top-12 ID.

    Each token is stripped of surrounding whitespace and compared against the
    module-level ``top_12_ids``. Missing values (as detected by ``pd.isna``)
    yield False.
    """
    if pd.isna(ann_str):
        return False
    for token in ann_str.split(","):
        if token.strip() in top_12_ids:
            return True
    return False

# Keep only the rows whose annotations mention at least one top-12 ID.
keep_mask = df['annotations'].apply(contains_top_annotation)
filtered_df = df[keep_mask]

# Change to train/valid/test accordingly
top12_output_path = "dataset/img_train_top12.csv"
filtered_df.to_csv(top12_output_path, sep="\t", index=False)

Top 12 annotation IDs (after filtering): ['1392', '186', '1381', '1107', '1389', '1403', '174', '1255', '1111', '1108', '1424', '883']


Download the images whose URLs are still accessible for the entries that contain a top 12 emoji, and keep only those entries.

In [None]:
# Wall-clock start of this cell; the elapsed seconds are printed at the end of the cell.
start = time.time()
# Load the top-12 CSV written by the previous step
# Change to train/valid/test accordingly
top12_input_path = "dataset/img_train_top12.csv"
df = pd.read_csv(top12_input_path, sep="\t", dtype={"id": str})

def fetch_image_data(row):
    """Download the image referenced by ``row['imgid']`` and attach it to a copy of the row.

    The returned copy gains two keys: ``image`` (the opened PIL image, or None)
    and ``has_image`` (True only when the download and open both succeeded).
    The request uses a 10 second timeout. Any exception raised while fetching
    or opening is swallowed on purpose so one bad URL does not stop the batch.
    The input ``row`` itself is not modified.
    """
    source_url = row['imgid']
    enriched = row.copy()
    image = None
    fetched = False
    try:
        reply = requests.get(source_url, timeout=10)
        reply.raise_for_status()
        image = Image.open(BytesIO(reply.content))
        fetched = True
    except Exception:
        # Best effort: unreachable or unreadable images are reported via has_image=False.
        pass
    enriched['image'] = image
    enriched['has_image'] = fetched
    return enriched

# os.cpu_count() returns None when the core count cannot be determined; fall back to 1
# so the multiplication below cannot raise TypeError.
max_workers = (os.cpu_count() or 1) * 10

# Fetch every row's image concurrently; executor.map keeps the results in input order.
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    results = list(executor.map(fetch_image_data, df.to_dict(orient="records")))

result_df = pd.DataFrame(results)

# Keep only the rows whose image was fetched, renumbered from 0.
result_df = result_df[result_df["has_image"]].reset_index(drop=True)
# Change to train/valid/test accordingly
# The in-memory image objects are dropped before writing; only the metadata goes to the CSV.
result_df.drop(columns=["image"]).to_csv("img_train.csv", index=False)

# Store individual images in this folder, due to large size, we do not push it to our repository
img_folder = "img_train"
# exist_ok avoids the check-then-create race and makes re-running the cell safe.
os.makedirs(img_folder, exist_ok=True)

def save_compressed_image(row):
    """Save the row's image as a compressed JPEG named after the row id.

    Rows without a fetched image (``has_image`` falsy or ``image`` None) are
    skipped. The file is written to ``<img_folder>/<id>.jpg`` with any leading
    apostrophes removed from the id, using JPEG quality 40 with ``optimize``
    enabled. Images whose mode the JPEG writer cannot store (for example RGBA
    or palette images) are converted to RGB first, so they are saved instead
    of failing. Errors during conversion or saving are printed, not raised.
    """
    if row["has_image"] and row["image"] is not None:
        file_id = row["id"].lstrip("'")
        file_path = os.path.join(img_folder, f"{file_id}.jpg")
        try:
            image = row["image"]
            # JPEG can only be written from L, RGB or CMYK images; convert everything else.
            if image.mode not in ("L", "RGB", "CMYK"):
                image = image.convert("RGB")
            image.save(file_path, format="JPEG", quality=40, optimize=True)
        except Exception as e:
            print(f"Error saving image for id {file_id}: {e}")

# Write the JPEGs concurrently; leaving the with-block waits for all submitted saves.
save_jobs = result_df.to_dict(orient="records")
with ThreadPoolExecutor(max_workers=max_workers) as pool:
    pool.map(save_compressed_image, save_jobs)

# Seconds elapsed since `start` was taken at the top of the cell.
end = time.time()
print(end-start)

Remap the annotation IDs of the top 12 emojis to class labels 0-11 (all other IDs are dropped)

In [None]:
# NOTE(review): the download cell writes "img_train.csv", while this cell reads
# "Train_AfterPreprocessing/output_valid_images.csv" - presumably the file is moved/renamed
# by hand between the two steps; confirm.
# annotations is read as str (as in the earlier cells) so every value can be split on ",".
df = pd.read_csv("Train_AfterPreprocessing/output_valid_images.csv", dtype={"annotations": str})
df = df.drop(columns=["has_image", "imgid"]) # To make it look cleaner

# top_12_ids holds ints after one earlier cell and strings after another, depending on which
# ran last. map_annotations looks IDs up as ints, so normalise to ints here; this also gives a
# numeric (not lexicographic) ordering. top_12_ids itself is left untouched so re-running the
# filtering cell's helper still sees the form it expects.
sorted_top_ids = sorted(int(cls_id) for cls_id in top_12_ids)

# Map top 12 id into value 0 to 11
id_to_label = {cls_id: idx for idx, cls_id in enumerate(sorted_top_ids)}

for cls_id, label in id_to_label.items():
    print(f"{cls_id}: {label}")

def map_annotations(annotation_str):
    """Translate a comma-separated string of annotation IDs into class labels.

    Every token is parsed as an int; IDs present in the module-level
    ``id_to_label`` are replaced by their label, all other IDs are dropped.
    The labels are returned as a comma-separated string in the original order
    (an empty string when no ID is known). A token that is not an integer
    raises ValueError.
    """
    labels = []
    for token in annotation_str.split(','):
        raw_id = int(token)
        if raw_id in id_to_label:
            labels.append(str(id_to_label[raw_id]))
    return ','.join(labels)

# Replace the raw annotation IDs with their class labels.
mapped_annotations = df['annotations'].apply(map_annotations)
df['annotations'] = mapped_annotations

# Change to train/valid/test accordingly
labelled_csv_path = "Train_AfterPreprocessing/img_train.csv"
df.to_csv(labelled_csv_path, index=False)