# init

In [26]:
# %pip install paddleocr face_recognition google-genai pypinyin json_repair

# # install one of below
# !pip install paddlepaddle    # to use cpu
# !pip install paddlepaddle-gpu    # to use Nvidia gpu

In [3]:
import os
import argparse
import json
import pickle
import pprint
import time
import datetime
import pandas as pd
import numpy as np

from json_repair import repair_json 
from zoneinfo import ZoneInfo
from tqdm.auto import tqdm
from getpass import getpass
from pathlib import Path
from typing import List, Optional, Tuple, Union
from PIL import Image, ImageDraw
from pypinyin import pinyin, Style
from paddleocr import PaddleOCR

import face_recognition

from google import genai
from google.genai import types

  from .autonotebook import tqdm as notebook_tqdm
  from pkg_resources import resource_filename


In [5]:
# Ask for the Gemini API key interactively so it is never written into the notebook.
GEMINI_API_KEY = getpass('api key: ')
# Shared client; call_gemini() uses it for every generate_content request.
client = genai.Client(api_key=GEMINI_API_KEY)

In [None]:
# dataset paths
# Directory that is scanned for meme images (.jpg / .jpeg / .png).
meme_dir_path = '../datasets/test_dir_real/'

# face recg params
# Pickled classifier loaded with pickle.load and passed to predict_names().
RECG_MODEL_PATH = "../models/trained_svm_model_real25.pkl"
# Minimum class probability for a face-recognition candidate to be kept.
RECG_THRESHOLD = 0.1

# ocr params
# Minimum recognition score for an OCR text line to be kept.
OCR_THRESHOLD = 0.9
# Directory holding one cached JSON file of OCR results per (image, threshold).
ocr_cache_path = "caches/ocr_cache/"

# llm params
# Directory receiving the timestamped .jsonl file of Gemini results.
output_dir_path = 'results'

# load data

In [7]:
def extract_id_names_from_path(meme_path: str):
    """
    Split a meme file name into its id and the politician name(s) it encodes.

    The file stem is read as ``<id>_<names>``; several names are separated
    by ``&``. The names are always returned as a list.
    e.g. input: './test/004_lai_qing_de.jpg', returns '004' and ['lai_qing_de'].
    """
    # Everything before the first underscore is the id, the rest is the name part.
    meme_num, _, name_part = Path(meme_path).stem.partition('_')

    # str.split returns a one-element list when no '&' is present.
    return meme_num, name_part.split('&')

In [8]:
# Collect every image file found directly inside the dataset directory.
# - sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
#   row order reproducible between runs.
# - lower(): accept upper-case extensions such as '.JPG' (call_gemini lower-cases
#   the suffix too).
# - os.path.join: works whether or not meme_dir_path ends with a separator.
entries = sorted(os.listdir(meme_dir_path))
meme_paths = [
    os.path.join(meme_dir_path, entry)
    for entry in entries
    if entry.lower().endswith(('.jpg', '.jpeg', '.png'))
]
print(len(meme_paths))

1


In [9]:
# Parse the id and ground-truth names out of every file name, then build one
# record per meme and wrap the records in a DataFrame.
parsed_names = [extract_id_names_from_path(path) for path in meme_paths]
data_lst = [
    {'meme_num': num, 'meme_names': names, 'meme_path': path}
    for path, (num, names) in zip(meme_paths, parsed_names)
]
data_df = pd.DataFrame(data_lst)
len(data_df)

1

In [10]:
# Preview the parsed ids, ground-truth names and paths.
data_df.head()

Unnamed: 0,meme_num,meme_names,meme_path
0,300,[lai_qing_de],datasets/quick_test/300_lai_qing_de.png


In [11]:
# Image paths that the face-recognition and OCR loops iterate over.
imgs_to_predict = data_df['meme_path']

# get candidate list w/ face recg model

In [12]:
def predict_names(model, image_path: str, prediction_threshold: float):
    """
    Run face recognition on one image and list the likely identities.

    Args:
        model: classifier exposing ``predict_proba`` and ``classes_``.
        image_path: path of the image to analyse.
        prediction_threshold: minimum (4-decimal rounded) class probability
            for an entry to be kept.

    Returns:
        A list of ``{"face_index", "name", "prob"}`` dicts. Faces are numbered
        from 1 and, within one face, entries are ordered by descending
        probability. The list is empty when no face is found or when any
        encoding contains NaN.
    """
    image = face_recognition.load_image_file(image_path)

    # find faces
    locations = face_recognition.face_locations(image)
    if not locations:
        return []

    # encode faces
    encodings = face_recognition.face_encodings(
        image, known_face_locations=locations, model="large"
    )
    if np.isnan(encodings).any():
        print(f"Warning: Skipping {image_path}. Encoding contains NaN.")
        return []

    # predict
    predictions = []
    for face_no, encoding in enumerate(encodings, start=1):
        class_probs = model.predict_proba([encoding])[0]

        kept = []
        for name, prob in zip(model.classes_, class_probs):
            rounded = float(f"{prob:.4f}")
            if rounded >= prediction_threshold:
                kept.append({"face_index": face_no, "name": name, "prob": rounded})

        # Stable sort: equal probabilities keep the classifier's class order.
        kept.sort(key=lambda item: item["prob"], reverse=True)
        predictions.extend(kept)
    return predictions

In [13]:
# load model
# NOTE(review): pickle.load can execute arbitrary code -- only load model files
# that come from a trusted source.
try:
    with open(RECG_MODEL_PATH, "rb") as model_file:
        model = pickle.load(model_file)
    print('Face recognition model loaded successfully!')
except Exception as e:
    print(f"Error loading recognition model: {e}")
    # Re-raise: carrying on would leave `model` undefined (or stale from an
    # earlier run of this cell) and only fail later with a confusing error.
    raise

Face recognition model loaded successfully!


In [14]:
# predict candidates
face_recg_preds = []
for img_path in tqdm(imgs_to_predict):
    pred_res = predict_names(model, img_path, RECG_THRESHOLD)
    # De-duplicate names while keeping first-seen order. A set() of strings has
    # a run-dependent order, which made the candidate list (and therefore the
    # prompt sent to Gemini) non-reproducible.
    candidates_lst = list(dict.fromkeys(p['name'] for p in pred_res))
    face_recg_preds.append({
        'meme_path': img_path,
        'candidates': candidates_lst,
        'cand_details': pred_res
    })

100%|██████████| 1/1 [00:00<00:00,  2.11it/s]


In [15]:
# join result back to data_df
face_recg_df = pd.json_normalize(face_recg_preds)
# Drop face-recognition columns left over from a previous run of this cell;
# otherwise re-running it yields candidates_x / candidates_y and later lookups
# of data['candidates'] fail.
data_df = pd.merge(
    left=face_recg_df,
    right=data_df.drop(columns=['candidates', 'cand_details'], errors='ignore'),
    on='meme_path',
    how='inner',
)
len(data_df)

1

In [16]:
# Preview the frame after the face-recognition columns were merged in.
data_df.head()

Unnamed: 0,meme_path,candidates,cand_details,meme_num,meme_names
0,datasets/quick_test/300_lai_qing_de.png,[lai_qing_de],"[{'face_index': 1, 'name': 'lai_qing_de', 'pro...",300,[lai_qing_de]


# get ocr text w/ paddleocr

In [17]:
# Lazily created PaddleOCR pipeline shared by every get_ocr_text() call.
_OCR_ENGINE = None


def _get_ocr_engine():
    """Return the shared PaddleOCR instance, creating it on first use."""
    global _OCR_ENGINE
    if _OCR_ENGINE is None:
        _OCR_ENGINE = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False
        )
    return _OCR_ENGINE


def get_ocr_text(img_path, threshold):
    """
    Run OCR on one image and keep the confident text lines.

    The PaddleOCR pipeline is built once and reused; constructing it for every
    image repeats the model creation on each call.

    Args:
        img_path: path of the image to read.
        threshold: minimum recognition score for a line to be kept.

    Returns:
        A list of ``{"text", "prob"}`` dicts in the order PaddleOCR returned
        them, with ``prob`` rounded to 4 decimals. Empty when OCR returns no
        result.
    """
    # ocr
    result = _get_ocr_engine().predict(input=img_path)
    if not result:
        return []
    rec_texts = result[0]['rec_texts']
    rec_scores = result[0]['rec_scores']

    # store res: filter on the raw score, store the rounded one
    return [
        {"text": text, "prob": float(f"{score:.4f}")}
        for text, score in zip(rec_texts, rec_scores)
        if score >= threshold
    ]

def get_ocr_text_cached(img_path, threshold, cache_dir):
    """
    Return OCR results for an image, reading them from a JSON cache when possible.

    The cache file name is the MD5 of "<resolved image path>|<threshold>", so
    every (image, threshold) pair has its own file inside ``cache_dir``.

    Args:
        img_path: path of the image to read.
        threshold: minimum score for a text line to be kept.
        cache_dir: directory for the cache files; created if missing.

    Returns:
        A list of ``{"text", "prob"}`` dicts.
    """
    # hashlib is not imported at the top of the notebook, so import it here.
    import hashlib

    cache_dir_path = Path(cache_dir)
    cache_dir_path.mkdir(parents=True, exist_ok=True)

    key_source = f"{Path(img_path).resolve()}|{threshold}"
    cache_name = hashlib.md5(key_source.encode("utf-8")).hexdigest()
    cache_file = cache_dir_path / f"{cache_name}.json"

    if cache_file.exists():
        with cache_file.open("r", encoding="utf-8") as cache_handle:
            cache_res = json.load(cache_handle)
        # This message used to be printed on a cache miss, right after running
        # OCR, which made the log claim a cache hit when there was none.
        print('Loaded OCR results from cache.')
        # filter with threshold
        return [
            {"text": pred['text'], "prob": pred['prob']}
            for pred in cache_res
            if pred['prob'] >= threshold
        ]

    # Cache miss: run OCR and store the result for next time.
    ocr_res = get_ocr_text(img_path, threshold)

    with cache_file.open("w", encoding="utf-8") as cache_handle:
        json.dump(ocr_res, cache_handle, ensure_ascii=False)

    return ocr_res

In [18]:
# get ocr texts
ocr_preds = []
for img_path in tqdm(imgs_to_predict):
    ocr_pred = get_ocr_text_cached(img_path, OCR_THRESHOLD, ocr_cache_path)
    # De-duplicate while keeping the order OCR returned the lines in. A set()
    # shuffles the lines, and their order goes into the prompt as meme_texts.
    ocr_texts = list(dict.fromkeys(p['text'] for p in ocr_pred))

    ocr_preds.append({
        'meme_path': img_path,
        'ocr_texts': ocr_texts,
        'ocr_details': ocr_pred
    })

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/Users/yoyo.wu.int/.paddlex/official_models/PP-OCRv5_server_det`.[0m
[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/Users/yoyo.wu.int/.paddlex/official_models/PP-OCRv5_server_rec`.[0m
100%|██████████| 1/1 [00:12<00:00, 12.08s/it]

Loaded OCR results from cache.





In [19]:
# join result back to data_df
ocr_df = pd.json_normalize(ocr_preds)
# Drop OCR columns left over from a previous run of this cell so that re-running
# it does not produce ocr_texts_x / ocr_texts_y duplicates.
data_df = pd.merge(
    left=data_df.drop(columns=['ocr_texts', 'ocr_details'], errors='ignore'),
    right=ocr_df,
    on='meme_path',
    how='inner',
)
len(data_df)

1

In [20]:
# Preview the frame after the OCR columns were merged in.
data_df.head()

Unnamed: 0,meme_path,candidates,cand_details,meme_num,meme_names,ocr_texts,ocr_details
0,datasets/quick_test/300_lai_qing_de.png,[lai_qing_de],"[{'face_index': 1, 'name': 'lai_qing_de', 'pro...",300,[lai_qing_de],"[早安！, 心想事成！]","[{'text': '早安！', 'prob': 0.987}, {'text': '心想事..."


# get final prediction and explanation w/ gemini

In [21]:
# Prompt template sent to Gemini together with the meme image.
# call_gemini fills it with PROMPT.format(names=..., texts=...); the doubled braces
# {{ }} around the example are format escapes that render as literal { }.
# NOTE(review): the nickname rule maps "柯P" to 柯文哲, who is not in politician_list,
# which conflicts with the closed-list rule -- confirm whether the example should change.
PROMPT = """
你是一位專精於臺灣政治迷因的分析專家，擅長解讀圖片中的視覺隱喻、政治反諷及時事梗。
請根據提供的圖片資訊、OCR 文字及參考名單，完成以下分析任務。

# 任務說明
1. **人物識別**：判斷迷因圖中是否出現（視覺人臉）或文字提及（OCR 內容）`politician_list` 中的政治人物。
2. **內容解讀**：以繁體中文撰寫一句話，精簡說明迷因內容（包含諷刺議題、政治背景或人物行為）。

# 輸入資料
- **meme_texts**（OCR 文字）：{texts}
- **possible_names**（參考線索）：{names} （註：此為人臉辨識模型的初步結果，準確度高但可能包含列表外的人物，請以此為重要線索並搭配 politician_list 過濾）
- **politician_list**（允許的候選名單）：
  [賴清德, 曹興誠, 柯建銘, 林智堅, 郭昱晴 (萬老師), 陳其邁, 邱議瑩, 
   王義川, 陳吉仲, 沈伯洋, 吳崢, 李進勇, 林楚茵, 呂建德, 林俊憲, 賴品妤, 
   吳思瑤, 李俊俋, 蔡英文, 吳靜怡, 黃捷, 蔡其昌, 吳沛憶, 劉世芳, 王定宇, 卓榮泰, 黃偉哲]

# 輸出規則 (嚴格遵守)

## 1. pred_names (List[str])
- **封閉選項**：結果**必須完全來自** `politician_list`。絕對不可自行創造、翻譯或使用列表以外的名字。
- **判斷邏輯**：
    - 請綜合考量 `possible_names` (人臉線索) 與 `meme_texts` (文字線索)。
    - 若 `possible_names` 中的名字也在 `politician_list` 中，請優先納入。
    - 若圖中人物有綽號（例如 OCR 出現「柯P」、「小英」），請自動對應回 `politician_list` 中的本名（如：柯文哲、蔡英文）。
- **空值處理**：若迷因中未出現或提及名單內的任何人物，請回傳空列表 `[]`（不要強行預測）。

## 2. reason (str)
- **單一句子**：必須是語意完整的一句話。
- **內容焦點**：請指出「誰」在「什麼議題」上被「如何描繪/諷刺」。
- **語言**：繁體中文。

## 3. 格式限制
- 僅輸出標準 JSON 格式，不要包含 Markdown 標記（如 ```json ... ```）或任何額外說明的文字。

# 輸出範例
{{
  "pred_names": ["林智堅", "蔡英文"],
  "reason": "此圖諷刺林智堅在論文案爭議中，獲得黨內大力的支持與背書。"
}}
"""

In [23]:
def call_gemini(candidates, ocr_texts, meme_img_bytes, data):
    """
    Ask Gemini which politicians a meme shows or mentions, and why.

    Args:
        candidates: names suggested by face recognition; inserted into the
            prompt as ``names``.
        ocr_texts: OCR text lines; inserted into the prompt as ``texts``.
        meme_img_bytes: raw bytes of the image file.
        data: row dict; only ``data['meme_path']`` is read, to choose the
            image MIME type.

    Returns:
        The dict parsed from the model's JSON reply, with ``pred_names``
        converted through ``get_pinyin``; ``{}`` when the reply has no text.

    Raises:
        ValueError: if the file is not a jpg, jpeg or png.
        json.JSONDecodeError: if the reply is not valid JSON.
    """
    # choose img file type first, so an unsupported file fails before any API call
    suffix = Path(data['meme_path']).suffix.lower()
    if suffix in ('.jpg', '.jpeg'):
        image_mime = 'image/jpeg'
    elif suffix == '.png':
        image_mime = 'image/png'
    else:
        # The original `raise('...')` raised a TypeError, because a plain string
        # is not an exception.
        raise ValueError('Error: Only supports jpg, jpeg, and png.')

    # format prompt
    prompt = PROMPT.format(
        names=candidates,
        texts=ocr_texts
    )

    # call gemini
    response = client.models.generate_content(
        model='gemini-2.5-flash',
        contents=[
            types.Part.from_bytes(
                data=meme_img_bytes,
                mime_type=image_mime,
            ),
            prompt
        ]
    )

    # process result
    # NOTE(review): response.text is presumably None when the reply carries no
    # text part -- treated as an empty reply here; confirm against the SDK docs.
    raw_text = response.text or ""
    cleaned_res = raw_text.replace("```json", "").replace("```", "").replace("\n", "")
    if not cleaned_res:
        return {}

    res = json.loads(cleaned_res)
    res['pred_names'] = [get_pinyin(name) for name in res['pred_names']]
    return res

def get_pinyin(chinese_text):
    """
    Convert Chinese text to pinyin joined by underscores.

    Each item returned by ``pinyin(..., style=Style.NORMAL)`` is concatenated
    into one chunk, and the chunks are joined with ``_``.
    """
    chunks = []
    for readings in pinyin(chinese_text, style=Style.NORMAL):
        chunks.append(''.join(readings))
    return '_'.join(chunks)

In [24]:
# Results are appended to a timestamped JSON-lines file, one row per meme, so a
# crash part-way through keeps everything written so far.
date_time = datetime.datetime.now(ZoneInfo('Asia/Taipei'))
timestamp = date_time.strftime("%Y-%m-%d_%H-%M-%S")
output_path = f'{output_dir_path}/{timestamp}.jsonl'
os.makedirs(output_dir_path, exist_ok=True)

# Successful calls since the last pause. This counter must live outside the
# loop: it used to be reset to 0 for every row, so the pause never happened.
sleep_count = 0

for index, row in tqdm(data_df.iterrows(), total=len(data_df), desc='Gemini prediction'):

    data = row.to_dict()
    try:
        # load meme img
        with open(data['meme_path'], 'rb') as f:
            meme_img_bytes = f.read()

        # candidates
        candidates = data['candidates']

        # ocr
        ocr_texts = data['ocr_texts']

        # simple throttle: pause 3 s after every 10 successful calls
        if sleep_count >= 10:
            time.sleep(3)
            sleep_count = 0

        # call gemini, retrying once on any failure
        for attempt in (1, 2):
            try:
                res = call_gemini(candidates, ocr_texts, meme_img_bytes, data)
                data['llm_pred_names'] = res['pred_names']
                data['llm_reason'] = res['reason']
                break
            except Exception as e:
                print(f'error{attempt}: {e}')
        else:
            # both attempts failed: skip this row without writing it
            continue

        # store result to cache (to prevent crash)
        with open(output_path, 'a') as f:
            f.write(json.dumps(data) + '\n')
        sleep_count += 1

    except Exception as e:
        print(f'error3: {e}')
        continue


Gemini prediction: 100%|██████████| 1/1 [00:14<00:00, 14.26s/it]


# final result

In [20]:
from IPython.display import Image, display

def display_df_image(df, index, path_column='meme_path'):
    """
    Displays the image for a specific row index.

    Args:
        df: DataFrame holding the image paths.
        index: row label passed to ``df.loc``.
        path_column: name of the column that stores the image path.

    Prints the path, then renders the image 300 px wide; prints an error
    message instead when the file does not exist.
    """
    # Imported here under an alias: the notebook binds the name `Image` twice
    # (PIL and IPython.display), so the bare name depends on which import cell
    # ran last.
    from IPython.display import Image as IPyImage, display

    image_path = df.loc[index, path_column]

    try:
        print(image_path)
        display(IPyImage(filename=image_path, width=300))
    except FileNotFoundError:
        print(f"Error: Image file not found at {image_path}")

def cal_acc(df, type_, filter_none=False):
    """
    Print how many ground-truth names one prediction source recovered.

    The score is ``matched names / total ground-truth names`` summed over all
    rows, where a row's matches are the intersection of its predicted name set
    with ``meme_names``.

    Args:
        df: DataFrame with a ``meme_names`` column plus the column(s) used by
            ``type_`` (``candidates``, ``cand_details`` or ``llm_pred_names``).
        type_: 'face_rec_hit' (every candidate counts), 'face_rec_top' (only
            the highest-probability candidate of each face) or 'llm'.
        filter_none: when True, rows whose prediction is empty or not a list
            are dropped before scoring and their count is printed.

    Returns:
        None; the result is printed.

    Raises:
        ValueError: if ``type_`` is not one of the three supported values.
    """
    # Validate first. Previously an invalid type_ combined with filter_none=True
    # reached the 'llm_pred_names' lookup before the check and could raise
    # KeyError instead of this ValueError.
    if type_ not in ('face_rec_hit', 'face_rec_top', 'llm'):
        raise ValueError(f"Invalid type_: {type_}")

    def _is_empty(value):
        # Anything that is not a list (e.g. NaN) counts as "no prediction".
        return len(value) == 0 if isinstance(value, list) else True

    def _top_name_per_face(details_list):
        """Return the best-scoring name of every face_index in one row."""
        if not isinstance(details_list, list) or len(details_list) == 0:
            return set()

        best_faces = {}  # face_index -> detail dict with the highest prob so far
        for item in details_list:
            f_idx = item.get('face_index')
            prob = item.get('prob', 0)
            # Strict '>' keeps the first entry when probabilities tie.
            if f_idx not in best_faces or prob > best_faces[f_idx]['prob']:
                best_faces[f_idx] = item
        return {v['name'] for v in best_faces.values()}

    # Work on a copy
    working_df = df.copy()
    none_count = 0

    # 1. Handle filtering
    if filter_none:
        source_col = 'llm_pred_names' if type_ == 'llm' else 'cand_details'
        empty_mask = working_df[source_col].apply(_is_empty)
        none_count = empty_mask.sum()
        working_df = working_df[~empty_mask]

    # 2. Extract predictions based on type
    if type_ == 'face_rec_hit':
        preds = working_df['candidates'].apply(set)
    elif type_ == 'face_rec_top':
        preds = working_df['cand_details'].apply(_top_name_per_face)
    else:  # 'llm'
        preds = working_df['llm_pred_names'].apply(
            lambda x: set(x) if isinstance(x, list) else set()
        )

    # 3. Calculate counts
    gts = working_df['meme_names'].apply(set)

    # TP count: predicted names that are also ground-truth names
    tp_count = sum(len(p.intersection(g)) for p, g in zip(preds, gts))

    # True count: total ground-truth labels
    true_count = working_df['meme_names'].apply(len).sum()

    # 4. Calculate accuracy
    acc = tp_count / true_count if true_count > 0 else 0.0

    print(f'acc: {tp_count}/{true_count} = {acc:.4f}')

    if filter_none and none_count > 0:
        print(f'none count: {none_count}')

In [1]:
# timestamp

In [5]:
# load result back
# Reads the JSON-lines file of one specific earlier run, one dict per line.
timestamp = '2025-11-28_19-40-31'
with open(f'{output_dir_path}/{timestamp}.jsonl', 'r') as f:
    res = [json.loads(line) for line in f]

res_df = pd.DataFrame(res)
len(res_df)

165

In [9]:
# Face recognition, counting only the highest-probability name of each detected face.
cal_acc(res_df, 'face_rec_top', filter_none=False)

acc: 120/167 = 0.7186


In [10]:
# Face recognition, counting every candidate name that passed the threshold.
cal_acc(res_df, 'face_rec_hit', filter_none=False)

acc: 134/167 = 0.8024


In [21]:
# Gemini predictions, scored over all rows.
cal_acc(res_df, 'llm', filter_none=False)

acc: 159/167 = 0.9521


In [22]:
# Gemini predictions, excluding rows where it returned no name at all.
cal_acc(res_df, 'llm', filter_none=True)

acc: 159/166 = 0.9578
none count: 1


In [None]:
# Manual tally of outcomes.
# NOTE(review): `wrong` was left without a value, which is a syntax error that
# stops "Run All". None is a placeholder until the real count is filled in.
none = 17
correct = 146
wrong = None

In [14]:
# pd.set_option('display.max_colwidth', None)
# Reorder the columns so ground truth and predictions sit next to each other.
inspect_columns = [
    'meme_num',
    'meme_names',
    'candidates',
    'llm_pred_names',
    'llm_reason',
    'meme_path',
    'ocr_texts',
    'cand_details',
    'ocr_details',
]
res_df_filtered = res_df[inspect_columns]

In [18]:
# Rows where the stored Gemini prediction is an empty name list.
res_df_filtered[res_df_filtered['llm_pred_names'].apply(lambda x: len(x) == 0)]

Unnamed: 0,meme_num,meme_names,candidates,llm_pred_names,llm_reason,meme_path,ocr_texts,cand_details,ocr_details
157,1,[qiu_yi_ying],[],[],此圖以醫生形象搭配「有病記得看醫生」的文字，諷刺政治場域中不理性的言行或決策，暗示其需要反思...,datasets/test_dir_real/001_qiu_yi_ying.jpeg,[有病記得看醫生],[],"[{'text': '有病記得看醫生', 'prob': 0.9729}]"


In [15]:
# for i in range(len(res_df)):
#     display_df_image(res_df, i)