# Utility for creating a testing dataset by downloading memes and OCRing them with Google Vision API

In [2]:
# Custom API for a Reddit meme feed; per the author it returns around 2.5k of the newest memes from Reddit.
# DO NOT pass any params or query string.
# Use only the GET request method.

memeApiUri="https://meme-feed-api.vercel.app/api/getRedditMemes"

In [3]:
import os
import requests
import json
import time
import urllib.request
import pytesseract

# Create the memes dir if it doesn't exist.
# exist_ok=True makes this a no-op when the directory is already there, without a
# separate existence check that could race with another process.
memesFolder = "memes"
os.makedirs(memesFolder, exist_ok=True)

# Example API response (a JSON array of objects):
# [{
#   "title": "Every Single Time",
#   "author": "Abschori",
#   "createdAt": 1663673918,
#   "fetchedAt": 1663689431254,
#   "contentUrl": "https://i.redd.it/qkz1jrr630p91.gif",
#   "id": "xj6fj8",
#   "likes": 6026,
#   "nsfw": false,
#   "postLink": "https://www.reddit.com/r/dankmemes/comments/xj6fj8/every_single_time/",
#   "provider": "r",
#   "subreddit": "dankmemes"
# }, ...]

# Returns the file extension (including the leading dot) found in the path part of a URL
def _imageExtension(url):
    """Extract the extension from a URL's path, ignoring any query string or fragment.

    Returns an empty string when the path has no extension.
    """
    from urllib.parse import urlparse

    return os.path.splitext(urlparse(url).path)[1]


# Downloads up to `limit` (1000 by default) OCRable memes to the memes folder
def downloadMemes(limit=1000):
    """Fetch the meme feed and download text-based memes into `memesFolder`.

    NSFW posts and .gif images are skipped. The remaining memes are processed
    from most to least liked. Each image is downloaded and then passed to
    checkText(); images without text are deleted again. Downloading stops once
    the folder holds `limit` files.

    Args:
        limit: Maximum number of files allowed in the memes folder.

    Returns:
        Always 1.

    Raises:
        requests.RequestException: If the feed request fails, times out or
            answers with an HTTP error status.
    """
    # Get memes from API; the timeout keeps a stalled connection from hanging the cell
    response = requests.get(memeApiUri, timeout=30)
    response.raise_for_status()
    memes = response.json()

    # Filter out NSFW memes and .gifs (compared on the URL path, case-insensitively)
    memes = [
        meme
        for meme in memes
        if not meme["nsfw"]
        and _imageExtension(meme["contentUrl"]).lower() != ".gif"
    ]

    # sort by likes first, so we discard the bad ones
    memes.sort(key=lambda x: x["likes"], reverse=True)

    # Download the memes and discard the ones that are not text based
    for meme in memes:
        # max out at `limit` memes (>= so the folder never exceeds the limit)
        if len(os.listdir(memesFolder)) >= limit:
            break

        imageUrl = meme["contentUrl"]
        imagePath = f"{memesFolder}/{meme['id']}{_imageExtension(imageUrl)}"

        # skip files that were already downloaded in an earlier run
        if os.path.exists(imagePath):
            continue

        try:
            # download the image
            urllib.request.urlretrieve(imageUrl, imagePath)
            memesInFolder = len(os.listdir(memesFolder))
            print(f"Downloaded to {imagePath} {memesInFolder}/{limit}")
            # check if it contains text
            if not checkText(imagePath):
                print(f"No text in {imagePath} -> removing")
                os.remove(imagePath)
        except Exception as error:
            # Catch Exception rather than everything, so interrupting the kernel
            # still stops the loop, and report why this meme failed.
            print(f"Failed to download {meme['id']}: {error}")
            # Do not keep a partial download or an image that never passed the text check
            if os.path.exists(imagePath):
                os.remove(imagePath)

    return 1


# Reports whether OCR finds text in an image.
# Lets us drop memes that are not text based before they reach the Google Vision API, so fewer API calls are wasted.
# Tesseract is used because it seems to be faster than EasyOCR.
def checkText(imagePath):
    """Return True when Tesseract extracts more than 5 characters from the image.

    The raw OCR string is measured as-is, so whitespace counts toward the length.
    """
    extracted = pytesseract.image_to_string(imagePath)
    return len(extracted) > 5


# Download memes unless the folder already holds the full set.
# The threshold is the 1000-meme cap used by downloadMemes(); a higher value can
# never be reached, which would trigger a fresh API request on every run.
if len(os.listdir(memesFolder)) < 1000:
    print("Downloading memes...")
    downloadMemes()
else:
    print("Memes are already downloaded")

Downloading memes...
Downloaded to memes/xj4h3v.jpg
Downloaded to memes/xlxnn7.jpg
Downloaded to memes/xhnbyk.jpg
Downloaded to memes/xo5o30.jpg
Downloaded to memes/xli8di.jpg
Downloaded to memes/xnp16w.png
Downloaded to memes/xkgpsf.jpg
Downloaded to memes/xky9tp.jpg
Downloaded to memes/xo7gvr.jpg
Downloaded to memes/xm6mpf.jpg
Downloaded to memes/xnb8kp.jpg
Downloaded to memes/xhvtg9.jpg
Downloaded to memes/xhomrl.jpg
Downloaded to memes/xnm16o.jpg
Downloaded to memes/xoi08y.jpg
Downloaded to memes/xoa92i.jpg
Downloaded to memes/xewjk8.jpg
Downloaded to memes/xjxapx.jpg
Downloaded to memes/xoki7v.jpg
Downloaded to memes/xjg9ss.jpg
Downloaded to memes/xn4jza.png
Downloaded to memes/xh3ee5.png
Downloaded to memes/xo0l0t.jpg
Downloaded to memes/xk7na6.jpg
Downloaded to memes/xoa0yz.jpg
No text in memes/xoa0yz.jpg -> removing
Downloaded to memes/xp5zr8.jpg
Downloaded to memes/xk43c8.jpg
Downloaded to memes/xmaagj.jpg
Downloaded to memes/xi1p4r.jpg
No text in memes/xi1p4r.jpg -> removing


Now that the memes are downloaded, let's OCR them with Google Vision API to finalize the testing dataset.

In [4]:
import io
from google.cloud import vision

# Key file the Google Cloud client library reads for authentication
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "google_key.json"

# One log file per day. The builtin open() returns a file object with write()/close();
# os.open() returns a bare integer file descriptor that has neither method.
# Append mode keeps entries from earlier runs on the same day.
logFile = open(f"log-{time.strftime('%Y-%m-%d')}.txt", "a", encoding="utf-8")
logFile.write(f"Starting OCR at {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
# Flush so the start marker reaches disk even if a later step fails
logFile.flush()

# @see https://cloud.google.com/vision/docs/samples/vision-text-detection#vision_text_detection-python
def detect_text(path):
    """Run Google Vision text detection on a local image and print the results.

    Prints a 'Texts:' header, then each annotation's description followed by
    the vertices of its bounding polygon. Nothing is returned.

    Args:
        path: Location of the image file to send.

    Raises:
        Exception: If the response carries an error message; the check runs
            after the annotations have been printed.
    """
    vision_client = vision.ImageAnnotatorClient()

    # Read the raw image bytes and wrap them in the request image object
    with open(path, 'rb') as handle:
        payload = vision.Image(content=handle.read())

    result = vision_client.text_detection(image=payload)
    annotations = result.text_annotations
    print('Texts:')

    for annotation in annotations:
        print('\n"{}"'.format(annotation.description))

        corners = []
        for point in annotation.bounding_poly.vertices:
            corners.append('({},{})'.format(point.x, point.y))

        print('bounds: {}'.format(','.join(corners)))

    failure = result.error.message
    if not failure:
        return

    raise Exception(
        '{}\nFor more info on error messages, check: '
        'https://cloud.google.com/apis/design/errors'.format(failure))

# Try the OCR on a single meme; the finally clause closes the log even if the call raises
try:
    detect_text("memes/uctnxg.jpg")
finally:
    logFile.close()

Texts:

"Good Juice
Makes You Go
OUCH
Original template
with minimal edits
Work with what you see; don't edit the
imagery to force the template to say
what you want it to say
Unorthodox
misinterpretation of its
visual elements
He looks like he's taking a bite; how
can I work with that?
Context is shifted in a
strange new way
How about "invisible food"?
Humor is derived from
this new context
Oof ouch owie."
bounds: (109,5),(300,5),(300,661),(109,661)

"Good"
bounds: (121,5),(157,5),(157,18),(121,18)

"Juice"
bounds: (162,5),(198,5),(198,18),(162,18)

"Makes"
bounds: (109,25),(154,25),(154,36),(109,36)

"You"
bounds: (159,25),(184,25),(184,36),(159,36)

"Go"
bounds: (189,25),(209,25),(209,36),(189,36)

"OUCH"
bounds: (125,43),(192,42),(192,57),(125,58)

"Original"
bounds: (123,80),(184,80),(184,96),(123,96)

"template"
bounds: (189,80),(259,80),(259,96),(189,96)

"with"
bounds: (121,102),(154,102),(154,116),(121,116)

"minimal"
bounds: (161,102),(222,102),(222,116),(161,116)

"edits"
bou