Text is extracted from every meme via OCR and added to its entry; any images that can't be read are filtered out.

In [3]:
from os.path import isfile, join
import pandas as pd
from paddleocr import PaddleOCR
from PIL import Image
import cv2 as cv
import re
from google.cloud import vision
import json

def extract_text(image_path):
    """Run Google Cloud Vision OCR on an image and return its cleaned text.

    The image is first opened with OpenCV as a readability check, then its
    raw bytes are sent to the module-level Vision ``client``. Newlines in the
    detected text are replaced by spaces, after which tokens shaped like
    ``name.ext`` with a three-letter extension (presumably watermarks such as
    site names -- confirm against the data) and runs of non-ASCII characters
    are removed.

    Args:
        image_path: Path of the image file to process.

    Returns:
        The cleaned text, or ``None`` when the image cannot be read, the
        Vision request fails, or no text is detected. The result may be an
        empty string if the cleanup removes everything that was detected.
    """
    try:
        img = cv.imread(image_path)

        # cv.imread signals failure by returning None rather than raising,
        # so test for None before touching any array attribute.
        if img is None or not img.size:
            print(f'Error reading {image_path}')
            return None

        # Only the read needs the file handle; close it before the API call.
        with open(image_path, "rb") as image_file:
            content = image_file.read()

        image = vision.Image(content=content)
        response = client.text_detection(image=image)

        if response.error.message:
            # Keep the API's own message so the failure can be diagnosed.
            raise RuntimeError(response.error.message)

        texts = response.text_annotations
        # No annotations means no text was detected; indexing texts[0]
        # unconditionally would raise IndexError and misreport the image
        # as unreadable.
        text = texts[0].description.replace("\n", " ") if texts else ""

        if text == "":
            print(f'Excluding image not containing text {image_path}')
            return None

        # Strip "name.ext" tokens at either end, then anywhere in the text.
        text = re.sub(r"^\s*\S+\.[a-zA-Z]{3}\s*|\s*\S+\.[a-zA-Z]{3}\s*$", "", text)
        text = re.sub(r"\s*\S+\.[a-zA-Z]{3}\s*", " ", text)
        # Strip non-ASCII runs at either end, then anywhere in the text.
        text = re.sub(r"^\s*[^\x00-\x7F]+\s*|\s*[^\x00-\x7F]+\s*$", "", text)
        text = re.sub(r"\s*[^\x00-\x7F]+\s*", " ", text)
        return text
    except Exception as exc:
        # Best-effort boundary: one bad image must not abort the whole
        # DataFrame.apply run, but the reason is reported, not swallowed.
        print(f'Error reading {image_path}: {exc}')
        return None

def _to_pretty_json(frame):
    """Serialize a DataFrame as a list of records, indented by two spaces."""
    return json.dumps(json.loads(frame.to_json(orient='records')), indent=2)


# Vision client; extract_text looks it up as a module-level name.
client = vision.ImageAnnotatorClient()

df = pd.read_csv('OnToxMeme_dataset/OnToxMeme_annotations.csv')
image_folder = 'OnToxMeme_dataset/combined_images/'

# Keep only rows whose symbol_id occurs more than once in the annotations.
df_filtered = df[df['symbol_id'].map(df['symbol_id'].value_counts()) > 1].copy()
df_filtered['labels'] = [[2]] * len(df_filtered)
df_filtered.rename(columns={'meme_id': 'id'}, inplace = True)

# Drop rows whose image file is missing. The explicit bool cast keeps the mask
# a boolean indexer even when the frame is empty, and .copy() avoids assigning
# new columns to a slice (SettingWithCopyWarning).
has_image = df_filtered['id'].apply(lambda x: isfile(join(image_folder, f'{x}.png'))).astype(bool)
df_filtered = df_filtered[has_image].copy()
df_filtered['img'] = df_filtered['id'].apply(lambda x: f'{x}.png')
df_filtered['text'] = df_filtered['img'].apply(lambda x: extract_text(join(image_folder, x)))
# extract_text returns None for unreadable / text-free images; drop those rows.
df_filtered = df_filtered[df_filtered['text'].notnull()]
pretty_json = _to_pretty_json(df_filtered[['id', 'img', 'labels', 'text', 'symbol_id']])

with open('OnToxMeme_dataset/toxic_symbolism_entries.json', 'w') as f:
    f.write(pretty_json)

# The harmless and unethical entry files get the same treatment and are
# rewritten in place.
for entries_path in ("OnToxMeme_dataset/harmless_entries.json",
                     "OnToxMeme_dataset/unethical_entries.json"):
    df = pd.read_json(entries_path)
    df['text'] = df['img'].apply(lambda x: extract_text(join(image_folder, x)))
    # Filter out images that could not be read, as is done for the
    # toxic-symbolism entries above.
    df = df[df['text'].notnull()]
    # errors="ignore" keeps this re-runnable: after the first run the file
    # has already been rewritten without the caption column.
    df = df.drop(columns=["caption"], errors="ignore")
    pretty_json = _to_pretty_json(df)

    with open(entries_path, 'w') as f:
        f.write(pretty_json)

Error reading OnToxMeme_dataset/combined_images/120.png
Error reading OnToxMeme_dataset/combined_images/332.png




Error reading OnToxMeme_dataset/combined_images/285.png
Error reading OnToxMeme_dataset/combined_images/316.png
Error reading OnToxMeme_dataset/combined_images/61.png
Error reading OnToxMeme_dataset/combined_images/64.png




Error reading OnToxMeme_dataset/combined_images/29.png
Error reading OnToxMeme_dataset/combined_images/30.png
Error reading OnToxMeme_dataset/combined_images/32.png
Error reading OnToxMeme_dataset/combined_images/34.png
Error reading OnToxMeme_dataset/combined_images/503.png
Error reading OnToxMeme_dataset/combined_images/444.png


The three per-category JSON files are combined into a single JSON file (`combined_entries.json`).

In [4]:
import pandas as pd
import json

# Per-category entry files, all located in the dataset folder.
json_files = [
    'harmless_entries.json',
    'unethical_entries.json',
    'toxic_symbolism_entries.json',
]

# Load each file into its own frame, then stack them in list order.
frames = []
for file in json_files:
    frames.append(pd.read_json(f'OnToxMeme_dataset/{file}'))
combined_df = pd.concat(frames)

# Round-trip through the json module to get an indented list of records.
json_output = combined_df.to_json(orient='records')
records = json.loads(json_output)
pretty_json = json.dumps(records, indent=2)

with open('OnToxMeme_dataset/combined_entries.json', 'w') as f:
    f.write(pretty_json)