In [1]:
import hashlib
import json
import os
import random
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from tqdm import tqdm
import shutil

In [2]:
with open("../templates/outputs/comparing.json", "r") as f:
    comparasions = json.load(f)

In [3]:
comparasions[random.randint(0, len(comparasions) - 1)]

[['Peter Griffin Shovel', 5038],
 ['Terminator fatality', 1585],
 36,
 {'simple_r': 36, 'token_sort_r': 36}]

In [4]:
def custom_r(data):
    return data[2]
    # compares = list(data[3].values())
    # return max(compares)


def compare_func(data: list, threshold: int) -> bool:
    r = custom_r(data)
    return r >= threshold


def filter_data(data: list, threshold: int) -> list:
    """
    Return a sublist with comparasion score >= threshold
    """
    f = partial(compare_func, threshold=threshold)
    return list(filter(f, data))

In [5]:
filtered = filter_data(comparasions, 86)
filtered.sort(key=lambda x: custom_r(x))
len(filtered)

863

In [8]:
merged = []  # list of (set of names, set of index)
for i in tqdm(range(len(filtered))):
    t1, t2, _, _ = filtered[i]

    if_existed = False
    for m in merged:
        if t1[0] in m[0] or t2[0] in m[0]:
            if_existed = True
            # add title
            m[0].add(t1[0])
            m[0].add(t2[0])

            # add index
            m[1].add(t1[1])
            m[1].add(t2[1])

    if not if_existed:
        merged.append((set([t1[0], t2[0]]), set([t1[1], t2[1]])))

100%|██████████| 863/863 [00:00<00:00, 46080.69it/s]


In [9]:
len(merged), merged

(483,
 [({'Walter white dying', 'walter white dies'}, {147, 249}),
  ({'Disappearing guy',
    'Dissapear',
    'Dissapearing',
    'Guy dissapearing',
    'disappear',
    'dissapearing guy',
    'guy disappearing'},
   {188, 295, 379, 1353, 1393, 2092, 2196}),
  ({'Man Sweating', 'Man Sweating Gif'}, {290, 433}),
  ({'Fast Talking', 'TALK FAST', 'Talking fast'}, {422, 731, 3690}),
  ({'Confused Dog GIF', 'Confused dog', 'confused dog'}, {9, 940, 5499}),
  ({'Discord dancing', 'dancing discord mods'}, {714, 1169}),
  ({'Bored Spongebob', 'SpongeBob bow'}, {566, 1224}),
  ({'Spider-Man Snap', 'Spiderman sad', 'sad spiderman'},
   {172, 278, 1650, 5608}),
  ({'Thanos: Fine Ill do it myself', 'fine ill do it myself'}, {1622, 1657}),
  ({'Fire writting', 'Writing on fire', 'writing in fire'}, {870, 2039, 5552}),
  ({'Baby Yoda Coffee', 'Baby Yoda Sips Coffee'}, {36, 2107}),
  ({'Peter Griffin Knee', 'Peter griffin dance', 'Peter griffin dancing'},
   {16, 2174, 2252, 2838}),
  ({'Crazy gi

In [10]:
with open("../templates/outputs/metadata_w_examples.json") as f:
    metadatas = json.load(f)

In [11]:
len(metadatas)

6389

In [12]:
merged[1], metadatas[0]

(({'Disappearing guy',
   'Dissapear',
   'Dissapearing',
   'Guy dissapearing',
   'disappear',
   'dissapearing guy',
   'guy disappearing'},
  {188, 295, 379, 1353, 1393, 2092, 2196}),
 {'href': '/memetemplate/407065720/YEAH',
  'title': 'YEAH',
  'html_file': '-memetemplate-407065720-YEAH',
  'src': 'i.imgflip.com/6qctx4.mp4',
  'src_type': 'video/mp4',
  'Template ID': '407065720',
  'Format': 'mp4',
  'Dimensions': '640x400 px',
  'Filesize': '471 KB',
  'examples': []})

In [13]:
directory = "../templates/outputs/template_examples"

print("Before:", len(metadatas))

delete_indexes = []
for merge in merged:
    indexes = list(merge[1])
    og_metadata = metadatas[indexes[0]]
    og_template_id = og_metadata["Template ID"]
    for i in range(len(indexes)):
        if i == 0:
            continue

        index = indexes[i]
        metadata = metadatas[index]
        delete_indexes.append(index)

        template_id = metadata["Template ID"]
        for example in metadata["examples"]:
            og_metadata["examples"].append(example)

            byte_string = example["title"].encode()
            md5_hash = hashlib.md5(byte_string).hexdigest()
            ext = example["url"].split("/")[-1].split(".")[-1]
            old_filename = f"{md5_hash}_{template_id}.{ext}"
            new_filename = f"{md5_hash}_{og_template_id}.{ext}"

            old_file_path = os.path.join(directory, old_filename)
            new_file_path = os.path.join(directory, new_filename)
            if os.path.exists(old_file_path):
                os.rename(old_file_path, new_file_path)
                print(f"File renamed from {old_file_path} to {new_file_path}")
            else:
                print("Not exist: ", old_filename)

        template_example = {
            "title": metadata["title"],
            "url": metadata["src"],
            "id": metadata["Template ID"],
        }
        ext = template_example["url"].split("/")[-1].split(".")[-1]
        byte_string = template_example["title"].encode()
        md5_hash = hashlib.md5(byte_string).hexdigest()
        new_filename = f"{md5_hash}_{og_template_id}.{ext}"
        og_metadata["examples"].append(template_example)

        source_file = (
            f'../templates/outputs/template_data/{og_metadata["html_file"]}.{ext}'
        )
        destination_file = f"{directory}/{new_filename}"
        shutil.copy2(source_file, destination_file)

        # copy template over folder

for i in sorted(delete_indexes, reverse=True):
    del metadatas[i]

print("After:", len(metadatas))

Before: 6389
File renamed from ../templates/outputs/template_examples/eeab04e0a690cca2ad9163dc72679582_331639373.gif to ../templates/outputs/template_examples/eeab04e0a690cca2ad9163dc72679582_283758184.gif
File renamed from ../templates/outputs/template_examples/dc3c77fde3ba5c8340154d60c01340e7_503936658.gif to ../templates/outputs/template_examples/dc3c77fde3ba5c8340154d60c01340e7_360155060.gif
File renamed from ../templates/outputs/template_examples/ab519c73fa1c8cb1f703d14146b6c011_319610859.gif to ../templates/outputs/template_examples/ab519c73fa1c8cb1f703d14146b6c011_360155060.gif
File renamed from ../templates/outputs/template_examples/a600e0f41220b5a20de74e1fed920b78_319610859.gif to ../templates/outputs/template_examples/a600e0f41220b5a20de74e1fed920b78_360155060.gif
File renamed from ../templates/outputs/template_examples/9f5d52c4d103102e99cc70699542b23b_319610859.gif to ../templates/outputs/template_examples/9f5d52c4d103102e99cc70699542b23b_360155060.gif
File renamed from ../t

In [14]:
with open("../templates/outputs/metadata_w_examples_merged.json", "w") as f:
    json.dump(metadatas, f, indent=6)

In [15]:
for merge in merged:
    # if "Bunny jumping into trash can" in merge[0]:
    print(merge[0])

{'walter white dies', 'Walter white dying'}
{'Dissapearing', 'disappear', 'Dissapear', 'Disappearing guy', 'Guy dissapearing', 'dissapearing guy', 'guy disappearing'}
{'Man Sweating', 'Man Sweating Gif'}
{'Fast Talking', 'Talking fast', 'TALK FAST'}
{'Confused dog', 'confused dog', 'Confused Dog GIF'}
{'dancing discord mods', 'Discord dancing'}
{'Bored Spongebob', 'SpongeBob bow'}
{'sad spiderman', 'Spiderman sad', 'Spider-Man Snap'}
{'fine ill do it myself', 'Thanos: Fine Ill do it myself'}
{'Writing on fire', 'writing in fire', 'Fire writting'}
{'Baby Yoda Coffee', 'Baby Yoda Sips Coffee'}
{'Peter griffin dance', 'Peter griffin dancing', 'Peter Griffin Knee'}
{'Crazy gif', 'Go crazy gif'}
{'Coffin Dance', 'Coffin dance', 'coffin dance', 'Coffin dance Gif'}
{'Peter griffin dance', 'Peter Griffin choking', 'Peter griffin dancing'}
{'is back on the menu', 'Back on the menu 2'}
{'Patrick bateman music', 'Patrick Bateman sigma'}
{'Cat fight', 'Fighting cat', 'Rat Fight'}
{'No god please n