In [None]:
import os
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [None]:
def get_template_links(index, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30):
    """Collect the template links listed on one page of the imgflip template index.

    Args:
        index: Page number appended to the ``memetemplates?page=`` URL.
        headers: HTTP headers sent with the request. The default dict is
            only read here, never modified.
        timeout: Seconds ``requests`` may wait for the server.

    Returns:
        A list with one link per matching anchor, where the first
        ``'memegenerator'`` in the href has been replaced by ``'meme'``.
        Hrefs without that marker are returned unchanged, anchors without
        an href are skipped, and the list is empty when no anchor matches.
    """
    meme_list_link = 'https://imgflip.com/memetemplates?page='
    page = requests.get(meme_list_link + str(index), headers=headers, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    # find all anchors with class "mt-caption l but"
    link_elements = soup.find_all('a', class_='mt-caption l but')
    swapped_links = []
    for link_element in link_elements:
        link = link_element.get('href')
        if link is None:
            continue
        # str.replace leaves the link untouched when the marker is absent;
        # slicing around str.find() == -1 would corrupt it instead.
        swapped_links.append(link.replace('memegenerator', 'meme', 1))

    return swapped_links

# Request a very high page index and print whatever links come back.
# NOTE(review): presumably checks that pages past the end yield an empty list — confirm.
print(get_template_links(10000))

In [None]:
import random
import time

# Browser-like User-Agent sent with every index-page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

meme_template_links = []
index = 1
last_page = False

# Walk the index page by page until a page comes back without links.
while True:
    print(f"Getting links from page {index}")
    # Pause for a random fraction of a second before each request.
    random_number = random.random()
    time.sleep(random_number)
    links = get_template_links(index, headers)
    meme_template_links.extend(links)
    index += 1
    if not links:
        last_page = True
        break


print(len(meme_template_links))

In [None]:
# Count the distinct links that were collected.
print(len(set(meme_template_links)))

In [None]:
# Preview the first few links instead of dumping the whole list.
print(meme_template_links[:10])

# Deduplicate, then sort: the iteration order of a set of strings can differ
# between interpreter runs, so sorting keeps the row order (and the winner of
# any slug collision below) reproducible.
meme_template_links = sorted(set(meme_template_links))

# Key every link by its last path segment.
meme_template_dict = {link.split('/')[-1]: link for link in meme_template_links}
# A count below len(meme_template_links) means several links share a last
# segment and only the last one in sorted order was kept.
print(len(meme_template_dict))

In [None]:
# Turn the name -> link mapping into a two-column table and persist it.
template_rows = list(meme_template_dict.items())
template_df = pd.DataFrame(template_rows, columns=['template_name', 'template_link'])
template_df.to_parquet('../data/meme_template_links.parquet')

In [None]:
def download_template_info(soup: "BeautifulSoup"):
    """Extract the alternative names ("aka" tags) of a template from its page.

    Args:
        soup: Parsed template page; only ``soup.find`` is called on it.

    Returns:
        ``None`` when the page has no ``div`` with class ``alt-names``.
        Otherwise the list of names found in that element, with a leading
        ``aka:`` label removed and surrounding whitespace stripped from
        every name; empty names are dropped.
    """
    tags_element = soup.find('div', class_='alt-names')
    if not tags_element:
        return None
    tags_string = tags_element.text.strip()
    if tags_string.startswith('aka:'):
        tags_string = tags_string[len('aka:'):]
    # Strip each piece so the blank that follows "aka:" (or any stray
    # whitespace in the markup) does not end up inside a tag.
    tags = [tag.strip() for tag in tags_string.split(', ')]
    return [tag for tag in tags if tag]

# links = get_template_links(2)
# print(links[0])
# full_link = 'https://imgflip.com' + links[0]
# print(full_link)

# page = requests.get(full_link)
# soup = BeautifulSoup(page.content, 'html.parser')
# print(download_template_info(soup))
    

In [None]:
import time

def download_template_gallery(template_name: str, link: str, dest_folder='../test-folder',
                              max_images: int = 100, timeout: float = 30):
    """Download the meme images shown on a template's gallery pages.

    Pages are requested one after another (``?page=1``, ``?page=2``, ...),
    each after a random pause of up to two seconds. Every ``img`` element
    with class ``base-img`` is saved as
    ``<dest_folder>/<template_name>/<template_name>_<n>.<extension>``.

    Args:
        template_name: Name of the sub-folder and prefix of every image id.
        link: Site-relative template link, appended to ``https://imgflip.com``.
        dest_folder: Root folder; the template sub-folder is created if missing.
        max_images: Upper bound on the number of images saved. It is checked
            per image, so a page that crosses the limit is cut short.
        timeout: Seconds each HTTP request may wait for the server.

    Returns:
        A tuple ``(meme_entries, tags)``. ``meme_entries`` is a dict of
        parallel lists under the keys ``id``, ``template_name``,
        ``image_link`` and ``path`` with one entry per saved image. ``tags``
        is the result of ``download_template_info`` for the first page.

    Raises:
        Exception: when a gallery page answers with a status other than 200.
    """
    # create destination folder if it does not exist
    template_folder = os.path.join(dest_folder, template_name)
    os.makedirs(template_folder, exist_ok=True)
    full_link = 'https://imgflip.com' + link
    meme_entries = {'id': [], 'template_name': [], 'image_link': [], 'path': []}
    tags = None

    index = 1
    img_counter = 0
    no_image_left = False
    while (not no_image_left) and (img_counter < max_images):
        print('Downloading page:', index)
        time.sleep(random.random() * 2)
        page = requests.get(full_link + '?page=' + str(index), timeout=timeout)
        if page.status_code != 200:
            raise Exception('Error:', page.status_code)
        soup = BeautifulSoup(page.content, 'html.parser')
        if index == 1:
            tags = download_template_info(soup)
        img_elements = soup.find_all('img', class_='base-img')
        if len(img_elements) == 0:
            no_image_left = True
        for img in img_elements:
            if img_counter >= max_images:
                break
            img_src = img.get('src')
            if not img_src:
                # Nothing to download for an <img> without a src attribute.
                continue
            img_response = requests.get('https:' + img_src, timeout=timeout)
            if img_response.status_code != 200:
                # Do not store an error page under an image file name.
                print('Skipping image:', img_src, 'status:', img_response.status_code)
                continue
            # Record the entry only once the image has been fetched, so the
            # four lists always stay the same length.
            img_counter += 1
            meme_id = template_name + '_' + str(img_counter)
            img_extension = img_src.split('.')[-1]
            path = os.path.join(template_folder, meme_id + '.' + img_extension)
            with open(path, 'wb') as handler:
                handler.write(img_response.content)
            meme_entries['id'].append(meme_id)
            meme_entries['template_name'].append(template_name)
            meme_entries['image_link'].append(img_src)
            meme_entries['path'].append(path)

        index += 1
    return meme_entries, tags

Remove any GIF templates that might have been added by mistake.

In [None]:
# Keep only the rows whose link does not contain 'gif-maker' and renumber them from 0.
is_gif_template = template_df['template_link'].str.contains('gif-maker')
template_df = template_df[~is_gif_template].reset_index(drop=True)
template_df

Scrape all the memes

In [None]:
import time
import random 

# Root folder passed to the download helpers below.
dest_folder = 'D:/Memes2024'

# create dict from template_df's template_name and template_link columns
meme_template_dict = template_df.set_index('template_name')['template_link'].to_dict()
# Not the last expression of the cell, so this line displays nothing.
meme_template_dict

def download_memes(template_link_dict, template_df, dest_folder, start_index=0):
    """Download the gallery of every template and record its tags in ``template_df``.

    Args:
        template_link_dict: Maps a template name to its site-relative link.
        template_df: Frame with a ``template_name`` column; for each
            downloaded template the ``tags`` cell of the matching rows is
            set to ``str(tags)``.
        dest_folder: Passed through to ``download_template_gallery``.
        start_index: Entries at positions below this value are skipped,
            e.g. to continue after an interrupted run.
    """
    numbered_items = enumerate(template_link_dict.items())
    for idx, (template_name, link) in tqdm(numbered_items, total=len(template_link_dict)):
        if idx < start_index:
            continue
        # Show the position too, so it can be reused as start_index later.
        print('Downloading: ', template_name, f'(index {idx})')
        _, tags = download_template_gallery(template_name, link, dest_folder)
        template_df.loc[template_df['template_name'] == template_name, 'tags'] = str(tags)

def download_specific_meme_with_new_name(link, template_df, dest_folder, new_template_name):
    """Download one template's gallery under a new name and rename it in ``template_df``.

    Args:
        link: Site-relative template link; also used to find the rows to rename.
        template_df: Frame with ``template_link`` and ``template_name`` columns.
        dest_folder: Passed through to ``download_template_gallery``.
        new_template_name: Name used for the download and written to the frame.
    """
    # The returned entries and tags are not used here.
    _entries, _tags = download_template_gallery(new_template_name, link, dest_folder)
    rows_with_link = template_df['template_link'] == link
    template_df.loc[rows_with_link, 'template_name'] = new_template_name

# Full scrape of every template, currently disabled:
# download_memes(meme_template_dict, template_df, dest_folder, start_index=0)
# Download the gallery behind one specific link and store it under the name 'donald-trump-2'.
download_specific_meme_with_new_name('/meme/39524143/donald-trump', template_df, dest_folder, 'donald-trump-2')



Delete any folders that are empty

In [None]:
import os

def find_empty_folders(directory):
    """Return every folder under ``directory`` (itself included) that holds neither files nor sub-folders."""
    return [
        current
        for current, subfolders, filenames in os.walk(directory)
        if not (subfolders or filenames)
    ]

# List the empty folders below the download root before deleting anything.
empty_folders = find_empty_folders('D:/Memes2024')
print(empty_folders)

def delete_empty_folders(empty_folders):
    """Remove every listed folder with ``os.rmdir``."""
    for folder_path in empty_folders:
        os.rmdir(folder_path)

# Remove the folders found above.
delete_empty_folders(empty_folders)

# Search again and print how many empty folders remain; a parent that held
# only empty sub-folders shows up in this second pass.
empty_folders = find_empty_folders('D:/Memes2024')
print(len(empty_folders))


Create a dataframe that keeps track of each meme, its template, and the file path of its image

In [3]:
from tqdm.notebook import tqdm
import pandas as pd
import os
root_folder = r"/home/hsdslab/murgi/Memes2024"

# Collect one record per image file and build the frame once at the end;
# concatenating inside the loop copies the growing frame for every file.
meme_records = []
template_folders = os.listdir(root_folder)
for folder in tqdm(template_folders, total=len(template_folders), disable=True):
    folder_path = os.path.join(root_folder, folder)
    if not os.path.isdir(folder_path):
        # A stray file next to the template folders cannot be listed.
        continue
    # Folder names are lowercased to form the template name.
    template_name = folder.lower()
    for file in os.listdir(folder_path):
        meme_records.append({
            'id': os.path.splitext(file)[0],
            'template_name': template_name,
            'path': os.path.join(folder_path, file),
        })

# Passing the columns keeps the three-column layout even when nothing was found.
all_meme_entries = pd.DataFrame(meme_records, columns=['id', 'template_name', 'path'])
all_meme_entries

Unnamed: 0,id,template_name,path
0,I-dont-want-to-play-with-you-anymore_75,i-dont-want-to-play-with-you-anymore,/home/hsdslab/murgi/Memes2024/I-dont-want-to-p...
1,I-dont-want-to-play-with-you-anymore_101,i-dont-want-to-play-with-you-anymore,/home/hsdslab/murgi/Memes2024/I-dont-want-to-p...
2,I-dont-want-to-play-with-you-anymore_37,i-dont-want-to-play-with-you-anymore,/home/hsdslab/murgi/Memes2024/I-dont-want-to-p...
3,I-dont-want-to-play-with-you-anymore_1,i-dont-want-to-play-with-you-anymore,/home/hsdslab/murgi/Memes2024/I-dont-want-to-p...
4,I-dont-want-to-play-with-you-anymore_39,i-dont-want-to-play-with-you-anymore,/home/hsdslab/murgi/Memes2024/I-dont-want-to-p...
...,...,...,...
124203,Hello-My-Name-Is_7,hello-my-name-is,/home/hsdslab/murgi/Memes2024/Hello-My-Name-Is...
124204,Hello-My-Name-Is_26,hello-my-name-is,/home/hsdslab/murgi/Memes2024/Hello-My-Name-Is...
124205,Hello-My-Name-Is_5,hello-my-name-is,/home/hsdslab/murgi/Memes2024/Hello-My-Name-Is...
124206,Hello-My-Name-Is_2,hello-my-name-is,/home/hsdslab/murgi/Memes2024/Hello-My-Name-Is...


Make sure the names in template_df match the names in all_meme_entries

In [None]:
# Lowercase the template names, as was done for all_meme_entries.
template_df.loc[:, 'template_name'] = template_df['template_name'].str.lower()
template_df

In [None]:
import numpy as np

# download_memes stores str(tags), so a missing tag list is the string 'None'; turn it into NaN.
template_df['tags'] = template_df['tags'].replace('None', np.nan)

In [None]:
# get all the template_name values that are duplicated
duplicated_template_names = template_df[template_df.duplicated('template_name')]['template_name'].values
duplicated_template_names

template_df[template_df['template_name'].isin(duplicated_template_names)].sort_values('template_name')

If multiple entries are present for a single template in template_df, then we can merge their tags with the following functions:

In [None]:
import ast

def merge_template_tags_different_names(df, template_names:list):
    """Give all rows of the listed templates one shared, merged tag list.

    Args:
        df: Frame with ``template_name`` and ``tags`` columns, modified in
            place. Each non-missing ``tags`` cell is parsed with
            ``ast.literal_eval`` and is expected to hold the string form of
            a list of hashable values (e.g. strings).
        template_names: Names whose rows are merged together.

    The merged list keeps tags in first-seen order without repeats and is
    written back as ``str(list)`` to every row of every listed template.
    Missing ``tags`` cells are ignored when collecting.
    """
    merged = {}
    for template_name in template_names:
        tag_strings = df.loc[df['template_name'] == template_name, 'tags'].dropna()
        for tag_string in tag_strings:
            # dict keys keep insertion order and drop repeated tags.
            merged.update(dict.fromkeys(ast.literal_eval(tag_string)))
    tags = list(merged)

    for template_name in template_names:
        df.loc[df['template_name'] == template_name, 'tags'] = str(tags)

def merge_template_tags(df,template_name):
    """Merge the tag lists of all rows that share ``template_name``.

    Args:
        df: Frame with ``template_name`` and ``tags`` columns, modified in
            place. Each non-missing ``tags`` cell is parsed with
            ``ast.literal_eval`` and is expected to hold the string form of
            a list of hashable values (e.g. strings).
        template_name: Name whose rows are merged.

    The merged list keeps tags in first-seen order without repeats and is
    written back as ``str(list)`` to every matching row, so calling the
    function again leaves the result unchanged.
    """
    rows = df['template_name'] == template_name
    merged = {}
    for tag_string in df.loc[rows, 'tags'].dropna():
        # dict keys keep insertion order and drop repeated tags.
        merged.update(dict.fromkeys(ast.literal_eval(tag_string)))
    df.loc[rows, 'tags'] = str(list(merged))

# duplicated_template_names can list a name once per extra row, so a template
# with three or more rows would be merged several times; dict.fromkeys keeps
# each name once, in its original order.
for template_name in dict.fromkeys(duplicated_template_names):
    merge_template_tags(template_df, template_name)


In [None]:
# Show the first six rows of the duplicated templates to inspect their merged tags.
template_df[template_df['template_name'].isin(duplicated_template_names)].sort_values('template_name').head(6)

In [None]:
# Give rows that share a lowercased name a distinct name, selecting each row by its link.
template_df.loc[template_df['template_link'] == '/meme/Hide-the-Pain-Harold', 'template_name'] = 'hide-the-pain-harold-og'
template_df.loc[template_df['template_link'] == '/meme/75105871/Hide-the-pain-harold', 'template_name'] = 'hide-the-pain-harold-thumbs-up'
template_df.loc[template_df['template_link'] == '/meme/Waiting-Skeleton', 'template_name'] = 'waiting-skeleton-og'

# Recompute the duplicated names after the renames and show the rows that still collide.
duplicated_template_names = template_df[template_df.duplicated('template_name')]['template_name'].values
template_df[template_df['template_name'].isin(duplicated_template_names)].sort_values('template_name')


Drop any duplicates from the list above

In [None]:
# Keep only the first row for every template name; the frame is modified in place.
template_df.drop_duplicates(subset='template_name', keep='first', inplace=True)

Delete the template records that no longer have any downloaded memes after the empty folders were removed

In [None]:
# Inspect the rows whose name contains 'hide-the-' (the templates renamed earlier).
template_df[template_df['template_name'].str.contains('hide-the-')]

In [None]:
# Template names (compared in lowercase) that have no entry in all_meme_entries.
difference = list(set(template_df['template_name'].str.lower()) - set(all_meme_entries['template_name'].str.lower()))
difference

# Filter on the lowercased names too, so the comparison matches the way
# `difference` was computed even if a name still contains capitals.
template_df = template_df[~template_df['template_name'].str.lower().isin(difference)]
template_df

In [None]:
# Both frames must now contain the same number of distinct template names.
assert len(all_meme_entries.template_name.unique()) == len(template_df.template_name.unique())

Save progress

In [7]:
# Persist both tables; the template table overwrites the file written after link collection.
template_df.to_parquet('../data/meme_template_links.parquet')
all_meme_entries.to_parquet('../data/meme_entries.parquet')

In [None]:
# Log in through the dagshub command-line client (IPython shell escape).
!dagshub login

import dagshub
# Read the auth token via dagshub; TOKEN is not used in the cells visible here.
TOKEN = dagshub.auth.get_token()

In [None]:
# NOTE(review): presumably the DagsHub account and repository for this project — confirm where they are used.
USER_NAME = 'levente-murgas'
REPO_NAME = 'meme-research-2024'


## Download blank template images

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup

# Reload the saved template table so this section can start from the file on disk.
template_df = pd.read_parquet('../data/meme_template_links.parquet')

# Map every template name to its site-relative link.
meme_template_dict = template_df.set_index('template_name')['template_link'].to_dict()
# Not the last expression of the cell, so this line displays nothing.
meme_template_dict


def download_blank_image(template_name: str, link: str, dest_folder='../data/test-folder',
                         timeout: float = 30):
    """Download the blank (caption-free) image of a template.

    The template page is fetched and the ``img`` that follows the ``div``
    with class ``meme-text-link`` is saved as
    ``<dest_folder>/<template_name>.<extension>``.

    Args:
        template_name: File name (without extension) of the saved image.
        link: Site-relative template link, appended to ``https://imgflip.com``.
        dest_folder: Target folder, created if missing.
        timeout: Seconds each HTTP request may wait for the server.

    Returns:
        ``[template_name]`` when no image could be saved, otherwise ``[]``.

    Raises:
        Exception: when the template page answers with a status other than 200.
    """
    from urllib.parse import urljoin

    missing_imgs = []
    # create destination folder if it does not exist
    os.makedirs(dest_folder, exist_ok=True)
    full_link = 'https://imgflip.com' + link
    page = requests.get(full_link, timeout=timeout)
    if page.status_code != 200:
        raise Exception('Error:', page.status_code)
    soup = BeautifulSoup(page.content, 'html.parser')
    blank_img_div = soup.find('div', class_='meme-text-link')

    try:
        blank_img_link = blank_img_div.find_next_sibling('img').get('src')
        img_extension = blank_img_link.split('.')[-1]
        # urljoin resolves site-relative ('/s/...'), protocol-relative
        # ('//host/...') and absolute sources against the page URL.
        img_response = requests.get(urljoin(full_link, blank_img_link), timeout=timeout)
        # A failed image request is reported as missing instead of being saved.
        img_response.raise_for_status()
        path = os.path.join(dest_folder, template_name + '.' + img_extension)
        with open(path, 'wb') as handler:
            handler.write(img_response.content)
    except Exception as error:
        # Still best-effort, but KeyboardInterrupt/SystemExit now propagate
        # and the reason for the miss is shown.
        print('No blank image found for:', template_name, '-', repr(error))
        missing_imgs.append(template_name)

    return missing_imgs

# Try every template and gather the names for which no blank image was saved.
missing_imgs = []
template_items = meme_template_dict.items()
for template_name, link in tqdm(template_items, total=len(meme_template_dict)):
    missing = download_blank_image(template_name, link, '../data/blank_images')
    missing_imgs.extend(missing)

print(missing_imgs)

In [None]:
blank_images = os.listdir('../data/blank_images')
# prefix each file name with the folder ...
blank_images = [os.path.join('../data/blank_images', img) for img in blank_images]
# ... and turn the relative paths into absolute ones
blank_images = [os.path.abspath(img) for img in blank_images]

# Files were saved as "<template_name>.<extension>", so only the last
# extension is removed; splitting on the first dot would cut names that
# contain a dot themselves.
blank_images_templates = [os.path.splitext(os.path.basename(img))[0] for img in blank_images]

# create df
blank_images_df = pd.DataFrame({'template_name': blank_images_templates, 'path': blank_images})
blank_images_df

In [None]:
# Drop a 'path' column left over from an earlier run (the saved parquet
# contains one); otherwise the merge produces path_x/path_y and the check
# below fails.
template_df = template_df.drop(columns='path', errors='ignore').merge(blank_images_df, on='template_name', how='left')
# Number of templates without a blank image.
template_df.path.isna().sum()

In [None]:
# Overwrite the saved template table, now including the blank-image paths.
template_df.to_parquet('../data/meme_template_links.parquet')

In [None]:
import pandas as pd
def download_captions_chart(template_name: str, link: str, dest_folder='../data/test-folder'):
    """Fetch a template page and look up the table inside its captions chart.

    NOTE(review): this function looks unfinished — ``missing_memes`` and
    ``table`` are assigned but never used, ``template_name`` is not read,
    and nothing is returned.

    Raises:
        Exception: when the page answers with a status other than 200.
    """
    missing_memes = []
    # create destination folder if it does not exist
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)
    full_link = 'https://imgflip.com' + link
    page = requests.get(full_link)
    if page.status_code != 200:
        raise Exception('Error:', page.status_code)
    soup = BeautifulSoup(page.content, 'html.parser')

    # NOTE(review): presumably fails with AttributeError when the page has no such div — confirm.
    captions_chart = soup.find('div', class_='meme-captions-chart')
    table = captions_chart.find('table')
# Create an empty dataframe to store the extracted data
extracted_data = pd.DataFrame(columns=['Date', 'Number'])

# Iterate over the rows of the existing dataframe
# NOTE(review): no cell in view gives template_df a 'date' or 'number' column,
# so these lookups presumably raise KeyError — confirm the intended source
# (possibly the captions-chart table looked up above).
for index, row in template_df.iterrows():
    # Extract the date and number from each row
    date = row['date']
    number = row['number']
    
    # Create a new row in the extracted_data dataframe
    extracted_data.loc[index] = [date, number]

# Print the extracted data
print(extracted_data)
    