In [2]:
import pandas as pd

# Read the template table and collect every distinct template name as a plain list.
template_df = pd.read_parquet('../../data/meme_template_links.parquet')
template_names = template_df['template_name'].unique().tolist()
print(f"Number of unique template names: {len(template_names)}")
print(type(template_names))

Number of unique template names: 1145
<class 'list'>


In [2]:
import pandas as pd
# Load the meme entries table (the rendered output shows the columns id, template_name, path).
meme_df = pd.read_parquet('../../data/meme_entries.parquet')
meme_df

Unnamed: 0,id,template_name,path
0,0-days-without-lenny-simpsons,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
1,0-days-without-Lenny-Simpsons_1,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
2,0-days-without-Lenny-Simpsons_10,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
3,0-days-without-Lenny-Simpsons_100,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
4,0-days-without-Lenny-Simpsons_101,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
...,...,...,...
124203,Zuckerberg_5,zuckerberg,D:/Memes2024/Zuckerberg\Zuckerberg_5.jpg
124204,Zuckerberg_6,zuckerberg,D:/Memes2024/Zuckerberg\Zuckerberg_6.jpg
124205,Zuckerberg_7,zuckerberg,D:/Memes2024/Zuckerberg\Zuckerberg_7.jpg
124206,Zuckerberg_8,zuckerberg,D:/Memes2024/Zuckerberg\Zuckerberg_8.jpg


In [None]:
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm.notebook import tqdm

# Pick the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and model come from the same pre-trained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT).to(device)
model.eval()  # evaluation mode: turns off dropout



# Function to encode a list of meme names into embeddings
def encode_template_names(template_names):
    """Embed each template name with BERT and return one vector per name.

    Args:
        template_names: list of strings; they are tokenised together as one
            padded batch.

    Returns:
        A tensor on ``device`` with one row per name: the average of the last
        hidden states over the real (non-padding) tokens of that name.
    """
    print(f"Encoding {len(template_names)} template names...")
    encoded_inputs = tokenizer(template_names, padding=True, truncation=True, return_tensors="pt")
    encoded_inputs = encoded_inputs.to(device)
    with torch.no_grad():
        outputs = model(**encoded_inputs)

    hidden_states = outputs.last_hidden_state

    # Names are padded to the longest one in the batch. A plain mean over the
    # sequence axis would mix the padding positions into every short name, so
    # weight each position by the attention mask (1 = real token, 0 = padding).
    token_mask = encoded_inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * token_mask).sum(dim=1)
    token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = summed / token_counts

    return embeddings

# Embed every template name, then compare all names pairwise.
embeddings = encode_template_names(template_names)

# scikit-learn works on numpy arrays, so bring the tensor back to the CPU first.
embeddings_np = embeddings.cpu().numpy()
similarity_matrix = cosine_similarity(embeddings_np)

# Pairs at or above this cosine similarity are treated as candidate duplicates.
similarity_threshold = 0.9  # You might need to adjust this based on your dataset

matching_meme_names = []

# Only the part of each row to the right of the diagonal is inspected, so every
# unordered pair is reported once, in (i, j) order with i < j.
for i in tqdm(range(len(template_names)), total=len(template_names), desc="Finding matching memes"):
    later_scores = similarity_matrix[i, i + 1:]
    for offset in np.flatnonzero(later_scores >= similarity_threshold):
        matching_meme_names.append((template_names[i], template_names[i + 1 + offset]))

# Print the matching meme names
for first_name, second_name in matching_meme_names:
    print(f"Matching memes: {first_name} and {second_name}")




In [None]:
import networkx as nx

# Build an undirected graph whose edges are the candidate duplicate pairs.
G = nx.Graph()
G.add_edges_from(matching_meme_names)

# Compute (not draw) the transitive closure, so templates linked through a
# chain of matches also become directly connected.
transitive_closure = nx.transitive_closure(G)

# Plot the closure. Exactly one figure is created; opening a second one before
# drawing would leave an empty figure in the cell output.
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
pos = nx.spring_layout(transitive_closure)
nx.draw(transitive_closure, pos=pos, with_labels=False, node_size=30, node_color='skyblue', edge_color='gray')
plt.show()

# Walk over every component in the graph
for component in nx.connected_components(transitive_closure):
    # connected_components yields sets, which have no append() and no stable
    # order. Track the entries in an insertion-ordered name -> blank-image-path
    # mapping instead, so merged entries can be added and listed consistently.
    blank_image_by_name = {
        template_name: template_df.loc[template_df['template_name'] == template_name, 'path'].values[0]
        for template_name in component
    }

    # Ask the user for input to determine whether any two templates should be merged
    while len(blank_image_by_name) > 1:
        print(f"Component: {list(blank_image_by_name)}")
        print("Blank Images:")
        for image in blank_image_by_name.values():
            print(image)
        merge_decision = input("Do you want to merge any two templates? (y/n): ").strip().lower()
        if merge_decision == 'n':
            break
        elif merge_decision == 'y':
            template1 = input("Enter the name of the first template to merge: ").strip()
            template2 = input("Enter the name of the second template to merge: ").strip()
            if template1 != template2 and template1 in blank_image_by_name and template2 in blank_image_by_name:
                # The merged entry has no row of its own in template_df, so it
                # inherits the blank image of the first template.
                merged_image = blank_image_by_name.pop(template1)
                blank_image_by_name.pop(template2)
                blank_image_by_name[f"{template1}-{template2}"] = merged_image
            else:
                print("Invalid template names. Please try again.")
        else:
            print("Invalid input. Please try again.")




In [None]:
import ast
import os
import shutil


def _parse_tags(raw_tags):
    """Return the tags held in one ``tags`` cell as a new list.

    Accepts the representations this notebook produces: ``None``/NaN (no tags),
    a stringified list such as ``"['a', 'b']"`` (also ``'None'``), or any
    list-like of tags (list, tuple, numpy array).
    """
    if raw_tags is None:
        return []
    if isinstance(raw_tags, float) and raw_tags != raw_tags:
        # NaN is how pandas marks a missing cell.
        return []
    if isinstance(raw_tags, str):
        text = raw_tags.strip()
        if not text:
            return []
        parsed = ast.literal_eval(text)
        if parsed is None:
            return []
        if isinstance(parsed, str):
            return [parsed]
        return list(parsed)
    return list(raw_tags)


def merge_templates(keep_template, merge_template, temp_template_df):
    """Fold ``merge_template`` into ``keep_template`` in the template table.

    The tags of both templates are combined (keep tags first, duplicates
    removed, order preserved), stored on the kept template as a stringified
    list, and the row(s) of the merged template are dropped.

    Args:
        keep_template: ``template_name`` of the template that survives.
        merge_template: ``template_name`` of the template to remove.
        temp_template_df: DataFrame with ``template_name`` and ``tags`` columns.

    Returns:
        A new DataFrame with the merge applied. The input frame itself is
        returned unchanged when ``merge_template`` is absent or equals
        ``keep_template``. The input frame is never modified.

    Raises:
        ValueError: if ``keep_template`` is not present in the table.
    """
    names = temp_template_df['template_name']
    keep_mask = names == keep_template
    merge_mask = names == merge_template

    # Merging a template into itself would delete it; a missing merge template
    # leaves nothing to do.
    if keep_template == merge_template or not merge_mask.any():
        return temp_template_df

    if not keep_mask.any():
        raise ValueError(f"Template to keep not found: {keep_template!r}")

    keep_tags = _parse_tags(temp_template_df.loc[keep_mask, 'tags'].values[0])
    merge_tags = _parse_tags(temp_template_df.loc[merge_mask, 'tags'].values[0])
    # dict.fromkeys de-duplicates while keeping first-seen order (deterministic,
    # unlike a round trip through set()).
    merged_tags = list(dict.fromkeys(keep_tags + merge_tags))

    # Work on a copy so the caller's frame is left untouched.
    merged_df = temp_template_df.loc[~merge_mask].copy()
    merged_df.loc[merged_df['template_name'] == keep_template, 'tags'] = str(merged_tags)

    return merged_df

def _template_folder(meme_df, template_name):
    """Return the folder of ``template_name``, taken from its first ``path`` entry."""
    paths = meme_df.loc[meme_df['template_name'] == template_name, 'path'].values
    if len(paths) == 0:
        raise ValueError(f"No meme entries found for template {template_name!r}")
    return os.path.dirname(paths[0])


def move_memes(keep_template, merge_template, meme_df):
    """Copy every meme file of ``merge_template`` into the folder of ``keep_template``.

    Files are copied (the source folder is left in place) and named
    ``{keep_template}-{n}{ext}``, where ``n`` starts at the number of files
    already in the keep folder plus one. Numbers that are already taken are
    skipped, so existing files are never overwritten.

    Args:
        keep_template: ``template_name`` whose folder receives the files.
        merge_template: ``template_name`` whose files are copied.
        meme_df: DataFrame with ``template_name`` and ``path`` columns; it is
            only read (to locate the two folders), not updated.

    Raises:
        ValueError: if either template has no rows in ``meme_df``.
    """
    keep_template_folder = _template_folder(meme_df, keep_template)
    merge_template_folder = _template_folder(meme_df, merge_template)

    # Copying a folder onto itself would duplicate every file.
    same_folder = (
        os.path.normcase(os.path.abspath(keep_template_folder))
        == os.path.normcase(os.path.abspath(merge_template_folder))
    )
    if same_folder:
        return

    next_id = len(os.listdir(keep_template_folder)) + 1

    # All files found on disk are copied (sorted for a reproducible numbering),
    # with one running counter across sub-folders.
    for root, dirs, files in os.walk(merge_template_folder):
        dirs.sort()
        for file in sorted(files):
            extension = os.path.splitext(file)[1]
            destination = os.path.join(keep_template_folder, f"{keep_template}-{next_id}{extension}")
            while os.path.exists(destination):
                next_id += 1
                destination = os.path.join(keep_template_folder, f"{keep_template}-{next_id}{extension}")
            # Copy straight to the final name so a same-named file in the keep
            # folder is never clobbered by an intermediate copy.
            shutil.copy2(os.path.join(root, file), destination)
            next_id += 1

# meme_df = pd.read_parquet('../data/meme_entries.parquet')
# meme_df




In [None]:
# Work on a copy so template_df stays unchanged while merges are being applied.
temp_template_df = template_df.copy()
temp_template_df

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
import os
from IPython.display import display, clear_output

# Walk over every component in the graph
# num_components = len(nx.connected_components(transitive_closure))
num_components = nx.number_connected_components(G)

# replace "None" in tags with empty list
temp_template_df['tags'] = temp_template_df['tags'].apply(lambda x: [] if x is None else x)

test_transitive_closure = nx.transitive_closure(G)

# Index of the first component to review; earlier components are skipped.
start_idx = 56

for idx, component in enumerate(nx.connected_components(test_transitive_closure)):
    if idx < start_idx:
        continue
    # Feedback for the reviewer. It is printed after the next clear_output(),
    # otherwise the message would be wiped before it can be read.
    notice = ""
    try:
        # Ask the user for input to determine whether any two templates should be merged
        while len(component) > 1:
            # clear cell output
            clear_output(wait=True)
            plt.close('all')
            print(f"Component {idx + 1} of {num_components}")
            if notice:
                print(notice)
                notice = ""

            nodes = list(component)
            for index, node in enumerate(nodes):
                print(f"{index}: {node}")

            print("*"*50)

            blank_images = [template_df.loc[template_df['template_name'] == template_name, 'path'].values[0] for template_name in nodes]

            # One figure per reference image, in the same order as the index list above.
            for i, image_path in enumerate(blank_images):
                plt.figure(i+1)
                # The context manager closes the image file once it has been drawn.
                with Image.open(image_path) as image:
                    plt.imshow(image)
                plt.axis('off')
                plt.title(os.path.splitext(os.path.basename(image_path))[0])
                plt.tight_layout()
                plt.show()

            merge_decision = input("Do you want to merge any two templates? (y/n): ").strip().lower()
            if merge_decision == 'n':
                break
            if merge_decision != 'y':
                notice = "Invalid input. Please try again."
                continue

            try:
                template1_idx = int(input("Enter the index of the to KEEP: "))
                template2_idx = int(input("Enter the index of the to MERGE: "))
            except ValueError:
                notice = "Indices must be whole numbers. Please try again."
                continue

            valid_indices = range(len(nodes))
            if template1_idx not in valid_indices or template2_idx not in valid_indices:
                notice = f"Indices must be between 0 and {len(nodes) - 1}. Please try again."
                continue
            if template1_idx == template2_idx:
                # Merging a template into itself would remove it entirely.
                notice = "KEEP and MERGE must be different templates. Please try again."
                continue

            keep_template = nodes[template1_idx]
            merge_template = nodes[template2_idx]

            # Copy the files first: if that fails, the template table is still untouched.
            move_memes(keep_template, merge_template, meme_df)
            temp_template_df = merge_templates(keep_template, merge_template, temp_template_df)

            component.remove(merge_template)
    except Exception:
        # Report where the session stopped so start_idx can be set for a resume.
        print("Stopped at component", idx)
        raise



In [None]:
# Promote the working copy, store the tags as their string form, and write the table.
template_df = temp_template_df.copy()

template_df.loc[:, 'tags'] = template_df['tags'].apply(str)
template_df.to_parquet('../data/meme_template_links.parquet')


In [None]:
# Show the rows whose template_link contains this URL fragment.
template_df[template_df['template_link'].str.contains('/meme/335274522/finally-you-can-make-your-own-meme-easy')]

In [None]:
# Template names to drop from the table (by their names, presumably blank
# canvases and generic placeholders rather than real meme templates).
templates_to_delete = [
    "blank-comic-panel-1x2",
    "blank-comic-panel-2x2",
    "blank-comic-panel-2x1",
    "blank-white-template",
    "blank-transparent-square",
    "free",
    "white-background",
    "transparent",
    "black-background",
    "make-your-own-meme",
]

# Keep every row whose name is not on the list, then persist the result.
is_unwanted = template_df['template_name'].isin(templates_to_delete)
template_df = template_df.loc[~is_unwanted]
template_df.to_parquet('../data/meme_template_links.parquet')

### More to merge:


In [None]:
# Fresh working copy for the manual merges below.
temp_template_df = template_df.copy()

In [None]:
# Store every tags cell as its string form.
temp_template_df.loc[:, 'tags'] = temp_template_df['tags'].apply(str)


In [None]:
# Manual merge of one hand-picked pair; names are lower-cased before use.
keep_template = "Sad-Pablo-Escobar".lower()
merge_template = "forever-alone".lower()

temp_template_df = merge_templates(keep_template, merge_template, temp_template_df)
move_memes(keep_template, merge_template, meme_df)

In [None]:
# Promote the working copy (with the manual merge applied) to template_df.
template_df = temp_template_df.copy()

### Clean up remaining folders:

In [None]:
meme_dir = 'D:/Memes2024'

template_folders = os.listdir(meme_dir)
# Set lookup instead of scanning the whole column for every folder.
known_template_names = set(template_df['template_name'].values)

# Number of folders actually deleted (reported at the end).
counter = 0
for folder in template_folders:
    folder_path = os.path.join(meme_dir, folder)
    # Only directories are template folders; rmtree would raise on a stray file.
    if not os.path.isdir(folder_path) or folder.lower() in known_template_names:
        continue
    # Folder has no matching template any more: ask before deleting it.
    print(f"{folder}")
    answer = input(f"Delete {folder}? (y/n): ")
    if answer.strip().lower() == 'y':
        shutil.rmtree(folder_path)
        counter += 1

print(f"Deleted {counter} folders")


In [None]:
# Persist the current template table.
template_df.to_parquet('../data/meme_template_links.parquet')

In [None]:
def rename_files_in_directory_to_dir_name(directory):
    """Rename every file under ``directory`` to ``{folder-name}-{index}{ext}``.

    For each folder visited by ``os.walk``, the lower-cased folder name is the
    prefix and files are numbered from 0 in sorted order; the extension is kept.
    Files that already carry their target name are left alone.

    Renaming happens in two passes (first to unique temporary names, then to
    the final names) so that a target name still occupied by a file waiting to
    be renamed is never overwritten.

    Args:
        directory: root folder to process (sub-folders are processed too).
    """
    import uuid  # local import: only needed for the temporary names

    for root, dirs, files in os.walk(directory):
        dirs.sort()
        # normpath drops a trailing separator, which would make basename empty.
        template_name = os.path.basename(os.path.normpath(root)).lower()

        planned = []
        for idx, file in enumerate(sorted(files)):
            new_file = f"{template_name}-{idx}{os.path.splitext(file)[1]}"
            if new_file != file:
                planned.append((file, new_file))

        # Pass 1: move everything that changes out of the way.
        token = uuid.uuid4().hex
        staged = []
        for position, (file, new_file) in enumerate(planned):
            temp_file = f".{token}-{position}.tmp"
            os.rename(os.path.join(root, file), os.path.join(root, temp_file))
            staged.append((file, temp_file, new_file))

        # Pass 2: every target name is free now.
        for file, temp_file, new_file in staged:
            print(f"Renaming {file} to {new_file}")
            os.rename(os.path.join(root, temp_file), os.path.join(root, new_file))

# Apply the renaming to a single template folder.
rename_files_in_directory_to_dir_name(r'D:\Memes2024\Forever-Alone-Happy')

## Finding duplicates using reference image embeddings this time

In [None]:
import os
import torch
from PIL import Image
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Pick the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CLIP model and its matching pre-processor come from the same checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT).to(device)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# Reference image path and name of every template, in matching row order.
image_paths = template_df['path'].tolist()
template_names = template_df['template_name'].tolist()

# Function to preprocess and encode a list of image paths
def encode_images(image_paths, batch_size=64):
    """Encode images with CLIP and return one feature vector per image.

    Images are loaded and encoded ``batch_size`` at a time, so memory use does
    not grow with the total number of images, and every image file is closed
    as soon as it has been converted to RGB.

    Args:
        image_paths: sequence of image file paths.
        batch_size: number of images encoded per forward pass.

    Returns:
        A tensor on ``device`` with one row per path, in input order.

    Raises:
        ValueError: if ``image_paths`` is empty or ``batch_size`` is below 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if len(image_paths) == 0:
        raise ValueError("image_paths is empty")

    feature_batches = []
    for start in range(0, len(image_paths), batch_size):
        images = []
        for path in image_paths[start:start + batch_size]:
            # convert() returns a new in-memory image, so the file can be closed.
            with Image.open(path) as image:
                images.append(image.convert("RGB"))
        inputs = processor(images=images, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        with torch.no_grad():
            feature_batches.append(model.get_image_features(**inputs))

    return torch.cat(feature_batches, dim=0)

# Embed every reference image, then compare all images pairwise.
image_embeddings = encode_images(image_paths)

# scikit-learn works on numpy arrays, so bring the tensor back to the CPU first.
image_embeddings_np = image_embeddings.cpu().numpy()
similarity_matrix = cosine_similarity(image_embeddings_np)

# Pairs at or above this cosine similarity are treated as candidate duplicates.
similarity_threshold = 0.8  # Adjust based on your dataset

matching_meme_names = []

# Only the part of each row to the right of the diagonal is inspected, so every
# unordered pair is reported once, in (i, j) order with i < j.
for i in tqdm(range(len(template_names)), total=len(template_names), desc="Finding matching memes"):
    later_scores = similarity_matrix[i, i + 1:]
    for offset in np.flatnonzero(later_scores >= similarity_threshold):
        matching_meme_names.append((template_names[i], template_names[i + 1 + offset]))

# Print the matching meme names
for first_name, second_name in matching_meme_names:
    print(f"Matching memes: {first_name} and {second_name}")

## Here we go again:


In [None]:
# Work on a copy so template_df stays unchanged while merges are being applied.
temp_template_df = template_df.copy()
temp_template_df

In [None]:
import networkx as nx

# Build an undirected graph with one node per template name.
G = nx.Graph()
# Every candidate duplicate pair becomes an edge.
G.add_edges_from(matching_meme_names)
# Compute (not draw) the transitive closure of the graph.
transitive_closure = nx.transitive_closure(G)
transitive_closure

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
import os
from IPython.display import display, clear_output

# Walk over every component in the graph
# num_components = len(nx.connected_components(transitive_closure))
num_components = nx.number_connected_components(G)

# replace "None" in tags with empty list
temp_template_df['tags'] = temp_template_df['tags'].apply(lambda x: [] if x is None else x)

test_transitive_closure = nx.transitive_closure(G)

# Index of the first component to review; earlier components are skipped.
start_idx = 0

for idx, component in enumerate(nx.connected_components(G)):
    if idx < start_idx:
        continue
    # Feedback for the reviewer. It is printed after the next clear_output(),
    # otherwise the message would be wiped before it can be read.
    notice = ""
    try:
        # Ask the user for input to determine whether any two templates should be merged
        while len(component) > 1:
            # clear cell output
            clear_output(wait=True)
            plt.close('all')
            print(f"Component {idx + 1} of {num_components}")
            if notice:
                print(notice)
                notice = ""

            nodes = list(component)
            for index, node in enumerate(nodes):
                print(f"{index}: {node}")

            print("*"*50)

            blank_images = [template_df.loc[template_df['template_name'] == template_name, 'path'].values[0] for template_name in nodes]

            # One figure per reference image; the bracketed number in the title
            # is the index to type at the prompts below.
            for i, image_path in enumerate(blank_images):
                plt.figure(i+1)
                # The context manager closes the image file once it has been drawn.
                with Image.open(image_path) as image:
                    plt.imshow(image)
                plt.axis('off')
                plt.title("["+str(i)+"]: " +os.path.splitext(os.path.basename(image_path))[0])
                plt.tight_layout()
                plt.show()

            merge_decision = input("Do you want to merge any two templates? (y/n): ").strip().lower()
            if merge_decision == 'n':
                break
            if merge_decision != 'y':
                notice = "Invalid input. Please try again."
                continue

            try:
                template1_idx = int(input("Enter the index of the to KEEP: "))
                template2_idx = int(input("Enter the index of the to MERGE: "))
            except ValueError:
                notice = "Indices must be whole numbers. Please try again."
                continue

            valid_indices = range(len(nodes))
            if template1_idx not in valid_indices or template2_idx not in valid_indices:
                notice = f"Indices must be between 0 and {len(nodes) - 1}. Please try again."
                continue
            if template1_idx == template2_idx:
                # Merging a template into itself would remove it entirely.
                notice = "KEEP and MERGE must be different templates. Please try again."
                continue

            keep_template = nodes[template1_idx]
            merge_template = nodes[template2_idx]

            # Copy the files first: if that fails, the template table is still untouched.
            move_memes(keep_template, merge_template, meme_df)
            temp_template_df = merge_templates(keep_template, merge_template, temp_template_df)

            component.remove(merge_template)
    except Exception:
        # Report where the session stopped so start_idx can be set for a resume.
        print("Stopped at component", idx)
        raise



In [None]:
# Promote the working copy, store the tags as their string form, and write the table.
template_df = temp_template_df.copy()

template_df.loc[:, 'tags'] = template_df['tags'].apply(str)
template_df.to_parquet('../data/meme_template_links.parquet')

In [None]:
# Further template names to drop from the table.
templates_to_delete = [
    "black-box-meme",
    "blank-meme-template",
    "blank-black",
]

# Keep every row whose name is not on the list, persist, and display the result.
is_unwanted = template_df['template_name'].isin(templates_to_delete)
template_df = template_df.loc[~is_unwanted]
template_df.to_parquet('../data/meme_template_links.parquet')
template_df

In [None]:
meme_dir = 'D:/Memes2024'

template_folders = os.listdir(meme_dir)
# Set lookup instead of scanning the whole column for every folder.
known_template_names = set(template_df['template_name'].values)

# Number of folders actually deleted (reported at the end).
counter = 0
for folder in template_folders:
    folder_path = os.path.join(meme_dir, folder)
    # Only directories are template folders; rmtree would raise on a stray file.
    if os.path.isdir(folder_path) and folder.lower() not in known_template_names:
        # Deletes without asking: the folder has no matching template any more.
        # answer = input(f"Delete {folder}? (y/n): ")
        shutil.rmtree(folder_path)
        counter += 1

print(f"Deleted {counter} folders")


In [None]:
# Derive the reference image name (file name without extension) from each path.
# assign() builds a new frame, so this also works when template_df is a
# filtered view of an earlier frame (no SettingWithCopyWarning).
template_df = template_df.assign(
    ref_image_name=template_df['path'].apply(lambda x: os.path.splitext(os.path.basename(x))[0])
)
template_df

In [None]:
# Delete every blank image whose name (without extension) is no longer the
# reference image of any template.
blank_images = os.listdir('../data/blank_images')
curr_ref_images = list(template_df['ref_image_name'].values)

counter = 0
orphaned_images = [
    image for image in blank_images
    if os.path.splitext(image)[0] not in curr_ref_images
]
for image in orphaned_images:
    os.remove(os.path.join('../data/blank_images', image))


## Drop templates with very few memes:

In [None]:
# Reload the meme entries table from disk.
# NOTE(review): this cell reads from '../data/', while the cells at the top and
# bottom of the notebook use '../../data/' for the same file name; confirm
# which working directory is intended.
meme_df = pd.read_parquet('../data/meme_entries.parquet')
meme_df

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Number of memes per template, indexed by template name.
value_counts = meme_df['template_name'].value_counts()

# Boolean mask over the templates: True where a template has fewer than 30 memes.
less_than_30 = value_counts < 30

# One bin per integer count, with edges at the half-integers so each bar is
# centred on its count.
max_count = value_counts.max()
bins = np.arange(0, max_count + 2) - 0.5

fig, ax = plt.subplots(figsize=(10, 6))

# All templates first; the small templates are then drawn on top in red, so
# only the bars at 30 or more stay blue.
ax.hist(value_counts, bins=bins, color='skyblue', label='30 or more memes')
ax.hist(value_counts[less_than_30], bins=bins, color='red', label='Less than 30 memes')

ax.set_xlabel('Number of memes')
ax.set_ylabel('Number of templates')
ax.set_title('Number of memes per template')
ax.set_yscale('log')  # Use a logarithmic scale for the y-axis
ax.legend()

plt.show()


In [None]:
# Show the rows whose template name contains 'was-a-girl'.
temp_template_df[temp_template_df['template_name'].str.contains('was-a-girl')]

In [None]:
# Names of the templates that have fewer than 30 memes.
less_than_30_templates = list(value_counts[less_than_30].index)

# Plot the reference images of the templates with less than 30 memes
for template_name in less_than_30_templates:
    ref_image_paths = template_df.loc[template_df['template_name'] == template_name, 'path'].values
    if len(ref_image_paths) == 0:
        # meme_df may still list templates that were already removed from
        # template_df; there is no reference image to show for those.
        print(f"No reference image for {template_name}; skipping")
        continue
    # The context manager closes the image file once it has been drawn.
    with Image.open(ref_image_paths[0]) as ref_image:
        plt.imshow(ref_image)
    plt.axis('off')
    plt.title(template_name)
    plt.show()

In [243]:
# Drop the templates with fewer than 30 memes and persist the filtered table.
template_df = template_df[~template_df['template_name'].isin(less_than_30_templates)]
template_df.to_parquet('../data/meme_template_links.parquet')

  if _pandas_api.is_sparse(col):


In [244]:
meme_dir = 'D:/Memes2024'

template_folders = os.listdir(meme_dir)
# Set lookup instead of scanning the whole column for every folder.
known_template_names = set(template_df['template_name'].values)

# Number of folders actually deleted (reported at the end).
counter = 0
for folder in template_folders:
    folder_path = os.path.join(meme_dir, folder)
    # Only directories are template folders; rmtree would raise on a stray file.
    if os.path.isdir(folder_path) and folder.lower() not in known_template_names:
        # Deletes without asking: the folder has no matching template any more.
        # answer = input(f"Delete {folder}? (y/n): ")
        shutil.rmtree(folder_path)
        counter += 1

print(f"Deleted {counter} folders")


Deleted 355 folders


In [245]:
# Delete every blank image whose name (without extension) is no longer the
# reference image of any template.
blank_images = os.listdir('../data/blank_images')
curr_ref_images = list(template_df['ref_image_name'].values)

counter = 0
orphaned_images = [
    image for image in blank_images
    if os.path.splitext(image)[0] not in curr_ref_images
]
for image in orphaned_images:
    os.remove(os.path.join('../data/blank_images', image))


Copy each blank template to its corresponding template folder.


In [23]:
# Display the current meme entries table.
meme_df

Unnamed: 0,id,template_name,path
0,0-days-without-Lenny-Simpsons_1,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
1,0-days-without-Lenny-Simpsons_10,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
2,0-days-without-Lenny-Simpsons_100,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
3,0-days-without-Lenny-Simpsons_101,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
4,0-days-without-Lenny-Simpsons_102,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
...,...,...,...
123058,Zuckerberg_5,zuckerberg,D:/Memes2024/Zuckerberg\Zuckerberg_5.jpg
123059,Zuckerberg_6,zuckerberg,D:/Memes2024/Zuckerberg\Zuckerberg_6.jpg
123060,Zuckerberg_7,zuckerberg,D:/Memes2024/Zuckerberg\Zuckerberg_7.jpg
123061,Zuckerberg_8,zuckerberg,D:/Memes2024/Zuckerberg\Zuckerberg_8.jpg


In [22]:
import os
import shutil
import pandas as pd

# Folder of every meme file, derived from its path.
temp_meme_df = meme_df.assign(template_folder=meme_df['path'].map(os.path.dirname))

# One folder per template. Deduplicating on template_name alone keeps the
# lookup index unique; Series.map fails on a lookup with repeated index values.
template_name_template_folder = temp_meme_df[['template_name', 'template_folder']].drop_duplicates(subset='template_name')
folder_by_template = template_name_template_folder.set_index('template_name')['template_folder']

# Attach the folder to each template; templates without memes get NaN.
temp_template_df = template_df.assign(template_folder=template_df['template_name'].map(folder_by_template))
temp_template_df


def copy_ref_image_to_template_folder(row):
    """Copy the reference image of one template row into its template folder.

    Args:
        row: mapping (e.g. a DataFrame row) with ``path`` (the reference image
            file) and ``template_folder`` (the destination directory).

    Rows without a usable destination are skipped with a message: a missing
    value (NaN/None) cannot be copied to, and for a path that is not an
    existing directory ``shutil.copy`` would create a *file* with that name.
    """
    source_path = row['path']
    destination_folder = row['template_folder']

    if not isinstance(destination_folder, (str, os.PathLike)) or not os.path.isdir(destination_folder):
        print(f"Skipping {source_path}: no template folder at {destination_folder!r}")
        return

    # Copy the file to the destination folder
    shutil.copy(source_path, destination_folder)

# Run the copy once per template row; the function returns None, hence the column of None in the output.
temp_template_df.apply(copy_ref_image_to_template_folder, axis=1)

1       None
2       None
3       None
5       None
6       None
        ... 
1881    None
1882    None
1883    None
1884    None
1885    None
Length: 1145, dtype: object

In [24]:
from tqdm.notebook import tqdm

root_folder = "D:/Memes2024/"

# Collect one record per file and build the DataFrame once at the end;
# concatenating a one-row frame per file is quadratic in the number of files.
records = []

template_folders = os.listdir(root_folder)
for folder in tqdm(template_folders, total=len(template_folders), disable=True):
    folder_path = os.path.join(root_folder, folder)
    if not os.path.isdir(folder_path):
        # Stray files directly under the root are not template folders.
        continue
    # The template name is the lower-cased folder name.
    template_name = folder.lower()
    for file in os.listdir(folder_path):
        records.append({
            'id': os.path.splitext(file)[0],  # file name without extension
            'template_name': template_name,
            'path': os.path.join(root_folder, folder, file),
        })

all_meme_entries = pd.DataFrame(records, columns=['id', 'template_name', 'path'])
all_meme_entries

Unnamed: 0,id,template_name,path
0,0-days-without-lenny-simpsons,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
1,0-days-without-Lenny-Simpsons_1,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
2,0-days-without-Lenny-Simpsons_10,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
3,0-days-without-Lenny-Simpsons_100,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
4,0-days-without-Lenny-Simpsons_101,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons\0-d...
...,...,...,...
124203,Zuckerberg_5,zuckerberg,D:/Memes2024/Zuckerberg\Zuckerberg_5.jpg
124204,Zuckerberg_6,zuckerberg,D:/Memes2024/Zuckerberg\Zuckerberg_6.jpg
124205,Zuckerberg_7,zuckerberg,D:/Memes2024/Zuckerberg\Zuckerberg_7.jpg
124206,Zuckerberg_8,zuckerberg,D:/Memes2024/Zuckerberg\Zuckerberg_8.jpg


In [26]:
# Persist the rebuilt meme entries table.
all_meme_entries.to_parquet('../../data/meme_entries.parquet')

In [3]:
import pandas as pd
# Read the saved table back and count its path entries.
files_df = pd.read_parquet('../../data/meme_entries.parquet')
files = files_df.loc[:,'path'].tolist()
len(files)

124208