In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
def create_df_from_txt_file(path, template_marker="Memes2023_splitted"):
    """Load a tab-separated ``path<TAB>phash`` text file into a DataFrame.

    Every non-blank line must hold exactly one tab separating an image path
    from its hash. Backslashes in the path are replaced by forward slashes and
    surrounding whitespace is stripped from the hash. If the same (normalised)
    path occurs more than once, it keeps the position of its first occurrence
    and the hash of its last one. Blank lines are skipped.

    Parameters
    ----------
    path : str or path-like
        Text file to read.
    template_marker : str, optional
        Substring that marks paths whose parent directory name is used as the
        template. Paths that do not contain it get ``None`` as template.

    Returns
    -------
    pandas.DataFrame
        Columns ``path``, ``phash`` and ``template`` with a default integer
        index.

    Raises
    ------
    ValueError
        If a non-blank line does not consist of exactly two tab-separated
        fields; the message identifies the offending line.
    """
    hash_by_path = {}
    with open(path, "r") as f:
        for line_no, line in enumerate(f, start=1):
            # A blank line (typically a trailing one) carries no record.
            if not line.strip():
                continue
            fields = line.split("\t")
            if len(fields) != 2:
                raise ValueError(
                    f"{path}, line {line_no}: expected 'path<TAB>phash', got {line!r}"
                )
            img_path, img_hash = fields
            hash_by_path[img_path.replace("\\", "/")] = img_hash.strip()

    def _template_of(img_path):
        # The template is the directory that directly contains the image file.
        parts = img_path.split("/")
        if template_marker in img_path and len(parts) >= 2:
            return parts[-2]
        return None

    df = pd.DataFrame(list(hash_by_path.items()), columns=["path", "phash"])
    df["template"] = df["path"].apply(_template_of)
    return df

def write_back_df_to_txt(df, outpath):
    """Write ``df`` to ``outpath`` as one ``path<TAB>phash`` line per row.

    Rows are written in the frame's current order. The file is created or
    overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``path`` and ``phash``.
    outpath : str or path-like
        Destination text file.

    Raises
    ------
    KeyError
        If ``df`` lacks the ``path`` or ``phash`` column.
    """
    # Pair the two columns positionally rather than looking every row up by
    # its index label: a label lookup returns a Series when labels repeat,
    # which would write the Series repr into the file, and it is also slow.
    records = zip(df["path"], df["phash"])
    with open(outpath, "w") as f:
        f.writelines(f"{img_path}\t{img_hash}\n" for img_path, img_hash in records)

In [3]:
# Load the meme entries table and preview its first ten rows.
meme_entries_path = "data/meme_entries.parquet"
meme_df = pd.read_parquet(meme_entries_path)
meme_df.head(10)

Unnamed: 0,id,template_name,path,phash
0,0-days-without-lenny-simpsons,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons/0-d...,0fe9b236e884fc38
1,0-days-without-Lenny-Simpsons_23,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons/0-d...,07e5ba2ee806fe30
2,0-days-without-Lenny-Simpsons_1,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons/0-d...,1fed3226e804fe38
3,0-days-without-Lenny-Simpsons_24,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons/0-d...,07e1b83ee886fa38
4,0-days-without-Lenny-Simpsons_10,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons/0-d...,17e5b22ee806fe30
5,0-days-without-Lenny-Simpsons_25,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons/0-d...,17e1ba2ee806fa38
6,0-days-without-Lenny-Simpsons_27,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons/0-d...,0fedb236e806fa30
7,0-days-without-Lenny-Simpsons_26,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons/0-d...,17edb226e884fe30
8,0-days-without-Lenny-Simpsons_28,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons/0-d...,0fe9b236e886f829
9,0-days-without-Lenny-Simpsons_3,0-days-without-lenny-simpsons,D:/Memes2024/0-days-without-Lenny-Simpsons/0-d...,0fedb236e806fa30


In [6]:
# Hold out 20% of the rows as the test part, stratified by template_name.
train_df, test_df = train_test_split(
    meme_df,
    test_size=0.2,
    random_state=42,
    stratify=meme_df["template_name"],
)

# Persist each part as a path/phash text file.
train_outpath = "./data/phashes/imgflip_cluster_phashes.txt"
test_outpath = "./data/phashes/imgflip_annotate_phashes.txt"
for split_df, split_outpath in ((train_df, train_outpath), (test_df, test_outpath)):
    write_back_df_to_txt(split_df, split_outpath)

In [9]:
# Input hash lists: one file per social-media platform plus the imgflip
# training split.
phash_dir = "./data/phashes"
social_media_dir = phash_dir + "/social-media"
facebook_file = social_media_dir + "/facebook_phashes.txt"
reddit_file = social_media_dir + "/reddit_phashes.txt"
twitter_file = social_media_dir + "/twitter_phashes.txt"
imgflip_cluster_file = train_outpath

# Output file that collects all of the inputs above.
merged_cluster_file = phash_dir + "/imgflip_plus_sm_to_cluster.txt"

In [10]:

def add_to_clustering(file_to_add, clustering_file):
    """Append every line of ``file_to_add`` to the end of ``clustering_file``.

    The number of lines read is printed before anything is written. The
    target file is opened in append mode, so it is created when missing and
    its existing content is kept.

    Parameters
    ----------
    file_to_add : str or path-like
        Text file whose lines are copied.
    clustering_file : str or path-like
        Text file that receives the lines.
    """
    with open(file_to_add, "r") as add_f:
        lines = add_f.readlines()
    print("Lines to add: ", len(lines))

    # Only the final line can lack a newline. Terminate it so that whatever is
    # appended next starts on its own line instead of being glued onto it.
    if lines and not lines[-1].endswith("\n"):
        lines[-1] += "\n"

    with open(clustering_file, "a") as f:
        f.writelines(lines)
            
# add_to_clustering only ever appends, so start from an empty merged file;
# otherwise re-running this cell would add every source a second time.
with open(merged_cluster_file, "w"):
    pass

for source_file in (imgflip_cluster_file, facebook_file, reddit_file, twitter_file):
    add_to_clustering(source_file, merged_cluster_file)

Lines to add:  99360
Lines to add:  236357
Lines to add:  955654
Lines to add:  174339


In [11]:
# Parse the merged path/phash file into a DataFrame and display it.
to_cluster_df = create_df_from_txt_file(merged_cluster_file)
to_cluster_df


Unnamed: 0,path,phash,template
0,D:/Memes2024/UNO-Draw-25-Cards/UNO-Draw-25-Car...,93c9dc8d240aea7b,
1,D:/Memes2024/Peter-parker-reading-a-book/Peter...,c57a6ca563962a65,
2,D:/Memes2024/Daring-today-arent-we-squidward/D...,713230b47efa8c33,
3,D:/Memes2024/Sidious-Error/Sidious-Error_63.jpg,99c96721abc18b9b,
4,D:/Memes2024/They-took-our-jobs-stance-South-P...,ab3245cc9cd96635,
...,...,...,...
1465705,D:/Murgi/Twitter2023/2020_6-36655.jpg,bb839b1a2598837b,
1465706,D:/Murgi/Twitter2023/2019_6-28933.jpg,e73199e7618c7096,
1465707,D:/Murgi/Twitter2023/2020_6-6090.jpg,d78c9adb2da44525,
1465708,D:/Murgi/Twitter2023/2020_6-27653.jpg,f98a0ef5626e6641,
