In [6]:
import json
import os
import pickle

import gensim
import numpy as np
import pandas as pd
import requests
import string

from scipy.spatial import distance
from api.features import TextExtractor, ImageExtractor, SentenceVectorizer

from tqdm import tqdm

os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [10]:
db_filename = os.path.join('api', 'images', 'db.json')

db = {}
with open(db_filename, 'r') as f:
    db = json.load(f)
    db = db['_default']

In [11]:
df = pd.DataFrame(db).T
df = df.drop(columns=['thumbnail', 'created_utc', 'author', 'ups', 'downs'])
df.head(10)

Unnamed: 0,title,id,media
1,Num🅱er One,6tehbc,https://i.redd.it/7wgs4dkiihfz.png
2,Got ‘em,8bse8k,https://i.redd.it/65bzzioisir01.jpg
3,50-0,6we7gp,https://i.redd.it/19c4ggoz0ciz.png
4,Allow,7qbcqw,https://i.redd.it/qt5p8ozio0a01.png
5,*mild concern*,7szw80,https://i.redd.it/1cudu2jlgac01.jpg
6,I would kill myself again,7nxaqq,https://i.redd.it/cqtircwmtw701.png
7,Conspiracy,7ep89z,https://i.redd.it/9gx2gd66ehzz.jpg
8,Justin Timberlake,7vbgih,https://i.redd.it/qq501hb3yae01.jpg
9,Lit 🔥,6c13dg,https://i.redd.it/1zb3bomchdyy.jpg
10,Improvise. Adapt. Overcome.,74fa93,https://i.redd.it/ypelzg7uwzpz.jpg


In [2]:
# Donwload the full resolution images
images_path = os.path.join('api', 'images', 'memes', 'memes')
if not os.path.isdir(images_path):
    os.makedirs(images_path)

# row = df.iloc[0,:]

# name, ext = os.path.splitext(row['media'])
# print(name, ext)

# image_name = os.path.join(images_path, row['id']+ext)
# response = requests.get(row['media'])
# with open(image_name, 'wb') as f:
#     f.write(response.content)

In [3]:
te = TextExtractor()
ie = ImageExtractor()

In [9]:
def get_ocr_from_id(base_path, img_id, formats=['.png', '.jpg'], text_extractor=TextExtractor()):
    img_name = os.path.join(base_path, img_id)
    for ext in formats:
        if os.path.isfile(img_name+ext):
            img_name += ext
            break
    
    embedding, text = text_extractor.to_vec(filename=img_name, method='precise', to_numpy=True, return_text=True)
    return embedding, text

embedding, text = get_ocr_from_id(base_path=images_path, img_id=df['id'][0], text_extractor=te)
print(text)
print(embedding.shape)

['rleurope 1h ruvis u/batteries-included "Life Is Now": Stefan Karl Stefansson, LazyTown Actor And Internet Meme, Free Of Cancer ruv.is']
(768,)


In [10]:
df['title_embedding'] = df['title'].apply(lambda x: te.to_vec(text=x, to_numpy=True))

In [11]:
df['ocr_text'] = ''
df['ocr_embedding'] = df['title_embedding']
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    try:
        emb, text = get_ocr_from_id(base_path=images_path, img_id=df['id'][idx], text_extractor=te)
        df['ocr_embedding'][idx] = emb
        df['ocr_text'][idx] = text
    except:
        df['ocr_embedding'][idx] *= 0

df['fusion_text_embedding'] = df['ocr_embedding'] + df['title_embedding']

100%|██████████| 3226/3226 [22:07<00:00,  2.43it/s]


In [10]:
def get_image_vector_from_id(base_path, img_id, formats=['.png', '.jpg'], image_extractor=ImageExtractor()):
    img_name = os.path.join(base_path, img_id)
    for ext in formats:
        if os.path.isfile(img_name+ext):
            img_name += ext
            break
    
    embedding = image_extractor.to_vec(filename=img_name, to_numpy=True)
    return embedding

embedding = get_image_vector_from_id(base_path=images_path, img_id=df['id'][0], image_extractor=ie)
print(embedding.shape)

(512,)


In [14]:
df['img_embedding'] = ''
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    try:
        df['img_embedding'][idx] = get_image_vector_from_id(base_path=images_path, img_id=df['id'][idx], image_extractor=ie)
    except:
        df['img_embedding'][idx] = np.zeros([512])

100%|██████████| 3226/3226 [03:15<00:00, 16.53it/s]


In [25]:
# print("Loading pretrained embeddings")
# pretrained_emb_filename = os.path.join('api','pretrained', 'glove.6B.300d_converted.txt')
# model = gensim.models.KeyedVectors.load_word2vec_format(pretrained_emb_filename)
# print("Model initialized")

In [4]:
vocab_dict = { word : model[word] for word in model.index_to_key }

vocab_vectors_filename = os.path.join('api','pretrained', 'glove.6B.300d_dict.pickle')
with open(vocab_vectors_filename, 'wb') as f:
    pickle.dump(vocab_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
vocab_vectors_filename = os.path.join('api','pretrained', 'glove.6B.300d_dict.pickle')
with open(vocab_vectors_filename, 'rb') as f:
    vocab_dict = pickle.load(f)

In [8]:
sv = SentenceVectorizer(filename=vocab_vectors_filename)

In [38]:
df['title_glove'] = df['title'].apply(lambda x: sv.encode(x))

# df['ocr_glove'] = df['ocr_text'].apply(lambda x: sv.encode(x[0]))
for idx, row in df.iterrows():
    try:
        df['ocr_text'][idx] = row['ocr_text']
        df['ocr_glove'][idx] = sv.encode(row['ocr_text'])
    except:
        df['ocr_text'][idx] = ' '
        df['ocr_glove'][idx] = np.zeros([300])

# df['fusion_text'] = df.apply(lambda x: x['title'] + ' ' + x['ocr_text'][0])
# df['fusion_text_glove'] = df['fusion_text'].apply(lambda x: sv.encode(x))
for idx, row in df.iterrows():
    try:
        text = row['title'] + ' ' + row['ocr_text']
        df['fusion_text'] = text
        df['fusion_text_glove'][idx] = sv.encode(text)
    except:
        df['fusion_text'] = row['title']
        df['fusion_text_glove'][idx] = np.zeros([300])

df.head(10)

  vec /= np.sqrt(vec.dot(vec))


Unnamed: 0,title,id,media,ocr_text,img_embedding,title_glove,ocr_glove,fusion_text_glove,fusion_text
1,Num🅱er One,6tehbc,https://i.redd.it/7wgs4dkiihfz.png,r,"[0.47470057, 3.1894677, 1.359018, 0.09152906, ...","[-0.07199854341442075, 0.0773735593669768, -0....","[-0.058076682938511204, -0.09807876561138369, ...","[-0.08158695288201284, -0.039476244612579155, ...",Black Monopoly B
2,Got ‘em,8bse8k,https://i.redd.it/65bzzioisir01.jpg,W,"[0.002120382, 1.1061459, 1.149819, 1.5612589, ...","[0.012160748052920282, -0.03837562089045379, -...","[-0.08078028445705333, 0.03380954679347381, -0...","[-0.05483938303667584, 0.003650542385628623, -...",Black Monopoly B
3,50-0,6we7gp,https://i.redd.it/19c4ggoz0ciz.png,H,"[0.40099052, 1.3954234, 0.16197552, 2.6542144,...","[0.04570380842624913, -0.040621292753460665, -...","[-0.012468797458529766, 0.030708376336418684, ...","[0.026232055413328206, -0.009612910002507101, ...",Black Monopoly B
4,Allow,7qbcqw,https://i.redd.it/qt5p8ozio0a01.png,i,"[0.11146166, 0.35562772, 0.14314997, 0.0043752...","[-0.014267144310047065, -0.027631072981699447,...","[-0.019214471631711024, 0.02455295071924358, -...","[-0.020915344166204174, 0.002202026657488687, ...",Black Monopoly B
5,*mild concern*,7szw80,https://i.redd.it/1cudu2jlgac01.jpg,w,"[0.78119, 1.921681, 0.53955406, 2.4841037, 1.1...","[-0.04690530375468269, -0.055024448373724166, ...","[-0.08078028445705333, 0.03380954679347381, -0...","[-0.10683859773771934, -0.0006983161319171384,...",Black Monopoly B
6,I would kill myself again,7nxaqq,https://i.redd.it/cqtircwmtw701.png,2,"[0.3625772, 0.36763048, 0.7479718, 1.4109285, ...","[-0.013693546788499716, 0.0480856588712358, -0...","[-0.06418837450375463, 0.03480370320007013, -0...","[-0.02859157773495234, 0.05204849599213456, -0...",Black Monopoly B
7,Conspiracy,7ep89z,https://i.redd.it/9gx2gd66ehzz.jpg,M,"[0.26197675, 1.7856048, 0.6354927, 0.64990205,...","[0.0048729342408497175, -0.032625272503098414,...","[-0.01349829756600878, 0.06475009907870262, 0....","[-0.006161400462040778, 0.022730671727479265, ...",Black Monopoly B
8,Justin Timberlake,7vbgih,https://i.redd.it/qq501hb3yae01.jpg,Q,"[1.2621086, 0.64310294, 0.5641967, 1.0983516, ...","[0.08559007356763464, 0.02754062131381438, 0.0...","[0.06526505934091793, -0.009461739437502052, 0...","[0.10364010807695737, 0.018511376554727288, 0....",Black Monopoly B
9,Lit 🔥,6c13dg,https://i.redd.it/1zb3bomchdyy.jpg,"""","[1.354462, 1.3184816, 0.48951033, 0.7207344, 0...","[-0.02965367143258549, -0.05274808152269753, 0...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[-0.02965367143258549, -0.05274808152269753, 0...",Black Monopoly B
10,Improvise. Adapt. Overcome.,74fa93,https://i.redd.it/ypelzg7uwzpz.jpg,m,"[0.28811383, 0.16730404, 0.9405307, 1.7245754,...","[0.0013496857503932263, 0.008868756760071836, ...","[-0.01349829756600878, 0.06475009907870262, 0....","[-0.01349829756600878, 0.06475009907870262, 0....",Black Monopoly B


In [43]:
df = df.drop(columns=['title_embedding', 'ocr_embedding', 'fusion_text_embedding'])

In [44]:
# Save embeddings in binary format
index_name = os.path.join('api', 'images', 'index_4.df')
df.to_pickle(index_name)

df.head(10)

Unnamed: 0,title,id,media,ocr_text,img_embedding,title_glove,ocr_glove,fusion_text_glove
1,Num🅱er One,6tehbc,https://i.redd.it/7wgs4dkiihfz.png,"[rleurope 1h ruvis u/batteries-included ""Life ...","[0.47470057, 3.1894677, 1.359018, 0.09152906, ...","[-0.07199854341442075, 0.0773735593669768, -0....","[-0.03253846936098761, -0.021829422793741483, ...","[-0.04115979942813921, -0.007370811909021254, ..."
2,Got ‘em,8bse8k,https://i.redd.it/65bzzioisir01.jpg,[When Reddit is making fun of Mark Zuckerberg ...,"[0.002120382, 1.1061459, 1.149819, 1.5612589, ...","[0.012160748052920282, -0.03837562089045379, -...","[-0.043908957304537595, 0.0026559019682615413,...","[-0.039113712115749266, -0.002397845134246288,..."
3,50-0,6we7gp,https://i.redd.it/19c4ggoz0ciz.png,[How to make millions Damn wish I could read F...,"[0.40099052, 1.3954234, 0.16197552, 2.6542144,...","[0.04570380842624913, -0.040621292753460665, -...","[-0.052404932677483466, 0.024177834791048673, ...","[-0.047197070413993424, 0.018980333617313987, ..."
4,Allow,7qbcqw,https://i.redd.it/qt5p8ozio0a01.png,[is it norma to have a 15 inch penis Google Se...,"[0.11146166, 0.35562772, 0.14314997, 0.0043752...","[-0.014267144310047065, -0.027631072981699447,...","[-0.08672669096490747, -0.014195494539243032, ...","[-0.08250420629898565, -0.01593113608817573, -..."
5,*mild concern*,7szw80,https://i.redd.it/1cudu2jlgac01.jpg,[when re an npc and you see the player quicksa...,"[0.78119, 1.921681, 0.53955406, 2.4841037, 1.1...","[-0.04690530375468269, -0.055024448373724166, ...","[-0.014015999862818299, 0.05802771491249412, 0...","[-0.024548113590612593, 0.05073777657968644, -..."
6,I would kill myself again,7nxaqq,https://i.redd.it/cqtircwmtw701.png,[2 imagine killing urself just to end up in a ...,"[0.3625772, 0.36763048, 0.7479718, 1.4109285, ...","[-0.013693546788499716, 0.0480856588712358, -0...","[-0.04539344077337367, 0.0475500850589299, -0....","[-0.038474636610454034, 0.051813190364034506, ..."
7,Conspiracy,7ep89z,https://i.redd.it/9gx2gd66ehzz.jpg,[Microtransactions in games Micro transactions...,"[0.26197675, 1.7856048, 0.6354927, 0.64990205,...","[0.0048729342408497175, -0.032625272503098414,...","[-0.08507857516319223, 0.057100194241089704, 0...","[-0.07721234610241666, 0.04493450504624606, -0..."
8,Justin Timberlake,7vbgih,https://i.redd.it/qq501hb3yae01.jpg,[Q 9 who is justin timl Google who is justin t...,"[1.2621086, 0.64310294, 0.5641967, 1.0983516, ...","[0.08559007356763464, 0.02754062131381438, 0.0...","[-0.0037676744774116267, 0.028327919885476727,...","[0.011130662968484924, 0.0298554611781323, 0.0..."
9,Lit 🔥,6c13dg,https://i.redd.it/1zb3bomchdyy.jpg,"[""Excuse me where's the lobby?"" ""Idk but the l...","[1.354462, 1.3184816, 0.48951033, 0.7207344, 0...","[-0.02965367143258549, -0.05274808152269753, 0...","[-0.033030197021384475, 0.014560649002300828, ...","[-0.0519897545681474, 0.013238237013960784, 0...."
10,Improvise. Adapt. Overcome.,74fa93,https://i.redd.it/ypelzg7uwzpz.jpg,[meni You dont Know: how To Make when memes Ov...,"[0.28811383, 0.16730404, 0.9405307, 1.7245754,...","[0.0013496857503932263, 0.008868756760071836, ...","[-0.021123843749864778, -0.016384720426450602,...","[-0.021123843749864778, -0.016384720426450602,..."


In [13]:
index_name = os.path.join('api', 'images', 'index_4.df')
df = pd.read_pickle(index_name)
# df['ocr_text'] = df['ocr_text'].apply(lambda x: x[0])
df.head(10)

Unnamed: 0,title,id,media,ocr_text,img_embedding,title_glove,ocr_glove,fusion_text_glove
1,Num🅱er One,6tehbc,https://i.redd.it/7wgs4dkiihfz.png,"[rleurope 1h ruvis u/batteries-included ""Life ...","[0.47470057, 3.1894677, 1.359018, 0.09152906, ...","[-0.07199854341442075, 0.0773735593669768, -0....","[-0.03253846936098761, -0.021829422793741483, ...","[-0.04115979942813921, -0.007370811909021254, ..."
2,Got ‘em,8bse8k,https://i.redd.it/65bzzioisir01.jpg,[When Reddit is making fun of Mark Zuckerberg ...,"[0.002120382, 1.1061459, 1.149819, 1.5612589, ...","[0.012160748052920282, -0.03837562089045379, -...","[-0.043908957304537595, 0.0026559019682615413,...","[-0.039113712115749266, -0.002397845134246288,..."
3,50-0,6we7gp,https://i.redd.it/19c4ggoz0ciz.png,[How to make millions Damn wish I could read F...,"[0.40099052, 1.3954234, 0.16197552, 2.6542144,...","[0.04570380842624913, -0.040621292753460665, -...","[-0.052404932677483466, 0.024177834791048673, ...","[-0.047197070413993424, 0.018980333617313987, ..."
4,Allow,7qbcqw,https://i.redd.it/qt5p8ozio0a01.png,[is it norma to have a 15 inch penis Google Se...,"[0.11146166, 0.35562772, 0.14314997, 0.0043752...","[-0.014267144310047065, -0.027631072981699447,...","[-0.08672669096490747, -0.014195494539243032, ...","[-0.08250420629898565, -0.01593113608817573, -..."
5,*mild concern*,7szw80,https://i.redd.it/1cudu2jlgac01.jpg,[when re an npc and you see the player quicksa...,"[0.78119, 1.921681, 0.53955406, 2.4841037, 1.1...","[-0.04690530375468269, -0.055024448373724166, ...","[-0.014015999862818299, 0.05802771491249412, 0...","[-0.024548113590612593, 0.05073777657968644, -..."
6,I would kill myself again,7nxaqq,https://i.redd.it/cqtircwmtw701.png,[2 imagine killing urself just to end up in a ...,"[0.3625772, 0.36763048, 0.7479718, 1.4109285, ...","[-0.013693546788499716, 0.0480856588712358, -0...","[-0.04539344077337367, 0.0475500850589299, -0....","[-0.038474636610454034, 0.051813190364034506, ..."
7,Conspiracy,7ep89z,https://i.redd.it/9gx2gd66ehzz.jpg,[Microtransactions in games Micro transactions...,"[0.26197675, 1.7856048, 0.6354927, 0.64990205,...","[0.0048729342408497175, -0.032625272503098414,...","[-0.08507857516319223, 0.057100194241089704, 0...","[-0.07721234610241666, 0.04493450504624606, -0..."
8,Justin Timberlake,7vbgih,https://i.redd.it/qq501hb3yae01.jpg,[Q 9 who is justin timl Google who is justin t...,"[1.2621086, 0.64310294, 0.5641967, 1.0983516, ...","[0.08559007356763464, 0.02754062131381438, 0.0...","[-0.0037676744774116267, 0.028327919885476727,...","[0.011130662968484924, 0.0298554611781323, 0.0..."
9,Lit 🔥,6c13dg,https://i.redd.it/1zb3bomchdyy.jpg,"[""Excuse me where's the lobby?"" ""Idk but the l...","[1.354462, 1.3184816, 0.48951033, 0.7207344, 0...","[-0.02965367143258549, -0.05274808152269753, 0...","[-0.033030197021384475, 0.014560649002300828, ...","[-0.0519897545681474, 0.013238237013960784, 0...."
10,Improvise. Adapt. Overcome.,74fa93,https://i.redd.it/ypelzg7uwzpz.jpg,[meni You dont Know: how To Make when memes Ov...,"[0.28811383, 0.16730404, 0.9405307, 1.7245754,...","[0.0013496857503932263, 0.008868756760071836, ...","[-0.021123843749864778, -0.016384720426450602,...","[-0.021123843749864778, -0.016384720426450602,..."


In [39]:
query = 'who would win'
count = 10
query_embedding = sv.encode(query)

XA = np.expand_dims(query_embedding, axis=0)
print(XA.shape)

XB = [np.expand_dims(row, axis=0) for row in df['fusion_text_glove']]
XB = np.concatenate(XB, axis=0)
print(XB.shape)

scores = distance.cdist(XB, XA, metric='euclidean')
similarity_scores = [(i,s) for i,s in enumerate(scores)]

# Get Top K
similarity_scores = sorted(similarity_scores, key=lambda x: x[1])
similarity_scores = similarity_scores[0:count]

# print(similarity_scores[0][1].shape)
print(similarity_scores)

# df_results = df.copy()
# df_results['similarity'] = df_results['title_embedding'].apply(lambda x: distance.euclidean(query_embedding, x))
# df_results = df_results.sort_values(by='similarity', ascending=True)

# df_results = df_results.head(20)
# df_results.head(20)

# Retrieve the results (rows with corresponding indices)
df_results = df.iloc[[x[0] for x in similarity_scores]]
df_results.head(10)

(1, 300)
(3226, 300)
[(567, array([0.57720867])), (47, array([0.63592501])), (1494, array([0.66522023])), (1484, array([0.67241691])), (816, array([0.67504543])), (1003, array([0.69269142])), (925, array([0.69282552])), (5, array([0.69317641])), (950, array([0.69598756])), (17, array([0.69672299]))]


Unnamed: 0,title,id,media,ocr_text,img_embedding,title_glove,ocr_glove,fusion_text_glove,fusion_text
568,They wanted to make sure the next person knew ...,7yqr9s,https://i.redd.it/gzaropkzv8h01.jpg,S,"[0.90342027, 1.4105773, 1.1308568, 1.8778474, ...","[-0.039380610307763804, 0.04451268421339465, -...","[0.05291583722080206, 0.008006591078363783, -0...","[-0.038335754597997596, 0.04185425269971279, -...",Black Monopoly B
48,"If you outlaw emojis, then only outlaws will h...",7bdel6,https://i.redd.it/0ucizt92akwz.jpg,G,"[0.8733979, 1.4701654, 1.3850347, 2.7285836, 0...","[-0.043652082027957, 0.05656749190009431, -0.0...","[-0.054981830958605704, 0.05101286884328031, -...","[-0.053024919116297344, 0.06447448154270126, -...",Black Monopoly B
1495,AutoMod told me I needed a longer title so her...,7v08v2,https://i.redd.it/ao610n9ex0e01.jpg,W,"[0.0, 0.04075798, 0.6291934, 2.5670784, 0.0921...","[-0.04454200471795491, 0.01761353756650466, -0...","[-0.08078028445705333, 0.03380954679347381, -0...","[-0.06101810352326423, 0.022931072966784313, -...",Black Monopoly B
1485,It could go either way,7vvroe,https://i.redd.it/6qura7ksbse01.jpg,M,"[0.7703749, 1.3478632, 1.7184458, 1.3303148, 0...","[-0.02260652038032033, 0.030777697787893375, -...","[-0.01349829756600878, 0.06475009907870262, 0....","[-0.024513387664758107, 0.046948917190067906, ...",Black Monopoly B
817,Who Would Win?,7wq7q4,https://i.redd.it/sn9lct7wyhf01.jpg,W,"[0.96935314, 0.63193, 2.3049726, 1.694774, 1.0...","[-0.03377124555349856, 0.10392549163526993, -0...","[-0.08078028445705333, 0.03380954679347381, -0...","[-0.1038859429725868, 0.07591794150913247, -0....",Black Monopoly B
1004,Because her husband will come at night.,84mn8q,https://i.redd.it/bfggexjeoxl01.jpg,H,"[0.40093154, 1.377236, 0.5739363, 1.5687023, 0...","[-0.03713070536764561, 0.040140007097916465, -...","[-0.012468797458529766, 0.030708376336418684, ...","[-0.049947662531000594, 0.049677907238595255, ...",Black Monopoly B
926,It will be the best company ever,7mefns,https://i.redd.it/nbw2lxbm6g601.png,w,"[0.2976423, 0.5465578, 0.20421335, 0.14275393,...","[-0.02504020059189688, 0.07367622109522633, -0...","[-0.08078028445705333, 0.03380954679347381, -0...","[-0.04262449873680249, 0.07844468281178187, -0...",Black Monopoly B
6,I would kill myself again,7nxaqq,https://i.redd.it/cqtircwmtw701.png,2,"[0.3625772, 0.36763048, 0.7479718, 1.4109285, ...","[-0.013693546788499716, 0.0480856588712358, -0...","[-0.06418837450375463, 0.03480370320007013, -0...","[-0.02859157773495234, 0.05204849599213456, -0...",Black Monopoly B
951,they should make a movie together,72ritv,https://i.redd.it/r0y75q8xqeoz.jpg,T,"[1.4994001, 1.8011785, 0.6891692, 1.1128241, 0...","[-0.04594625724939806, 0.03967583619100774, -0...","[-0.030284860237509263, -0.05703427457108378, ...","[-0.049246449782043115, 0.021755488897424874, ...",Black Monopoly B
18,Now we know why this man became disloyal and d...,6xvwtf,https://i.imgur.com/5FnBYPZ.jpg,1,"[0.06091951, 1.4678813, 1.927265, 1.5361513, 0...","[-0.007854419870188091, 0.013179211552238917, ...","[-0.043685646831520965, 0.018671932267749543, ...","[-0.02820026422657229, 0.012174876254864199, 0...",Black Monopoly B
