Saving the per-class image counts of the train and validation sets to CSV files
---------------------------------------------------------------------------

In [None]:
train_folder = 'D:/Memes2023_splitted/finetuning/train/'
val_folder = 'D:/Memes2023_splitted/finetuning/val/'

import os
from tqdm import tqdm
import pandas as pd


def count_files_per_class(split_folder):
    """Return {class_name: number_of_entries} for each class sub-folder of split_folder.

    Only sub-directories are treated as classes. A stray file sitting directly
    in split_folder is skipped, because calling os.listdir on it would raise.
    """
    class_names = [
        name for name in os.listdir(split_folder)
        if os.path.isdir(os.path.join(split_folder, name))
    ]
    return {
        name: len(os.listdir(os.path.join(split_folder, name)))
        for name in tqdm(class_names)
    }


def save_counts_csv(file_counts, csv_path):
    """Write a {class: count} mapping to csv_path with the columns Class and Count."""
    # Naming the columns explicitly keeps the header intact even for an empty mapping.
    counts_df = pd.DataFrame(list(file_counts.items()), columns=['Class', 'Count'])
    counts_df.to_csv(csv_path, index=False)


train_file_counts = count_files_per_class(train_folder)
val_file_counts = count_files_per_class(val_folder)

# Both CSVs are written to the current working directory.
save_counts_csv(train_file_counts, 'train_file_counts.csv')
save_counts_csv(val_file_counts, 'val_file_counts.csv')


Reading basic statistics of the dataset
----------------------------------------

In [None]:
import numpy as np

# ---- train split ----
print('Train data:')
train_values = list(train_file_counts.values())

# smallest class
min_value = min(train_values)
print(f'Minimum value of train file counts: {min_value}')

# largest class
max_value = max(train_values)
print(f'Maximum value of train file counts: {max_value}')

# average number of files per class
mean = sum(train_values) / len(train_file_counts)
print(f'Mean of train file counts: {mean}')

# middle value of the class sizes
median = np.median(train_values)
print(f'Median of train file counts: {median}')

# 75 % of the classes are at most this large
percentile_75 = np.percentile(train_values, 75)
print(f'75th percentile of train file counts: {percentile_75}')

# ---- val split (same five statistics) ----
print('Val data:')
val_values = list(val_file_counts.values())

min_value = min(val_values)
print(f'Minimum value of val file counts: {min_value}')

max_value = max(val_values)
print(f'Maximum value of val file counts: {max_value}')

mean = sum(val_values) / len(val_file_counts)
print(f'Mean of val file counts: {mean}')

median = np.median(val_values)
print(f'Median of val file counts: {median}')

percentile_75 = np.percentile(val_values, 75)
print(f'75th percentile of val file counts: {percentile_75}')




Plot the distribution of the number of images per class in the dataset
---------------------------------------------------------------------

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# How many classes the "largest" and "smallest" charts show.
N_CLASSES_SHOWN = 50


def load_class_counts(csv_path):
    """Read a CSV with the columns Class and Count and return it as {class: count}."""
    counts_df = pd.read_csv(csv_path)
    return {row['Class']: row['Count'] for row in counts_df.to_dict(orient='records')}


def plot_class_counts(class_counts):
    """Show a bar chart with one bar per class, in the iteration order of class_counts."""
    plt.figure(figsize=(20, 10))
    plt.bar(list(class_counts.keys()), list(class_counts.values()))
    plt.xticks(rotation=90)
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.title('Number of images per class')
    plt.show()


train_file_counts = load_class_counts('C:/Users/Murgi/Documents/GitHub/meme_research/outputs/train_file_counts.csv')
val_file_counts = load_class_counts('C:/Users/Murgi/Documents/GitHub/meme_research/outputs/val_file_counts.csv')

# Add train and val counts per class. A class that exists in only one split
# counts as 0 in the other, instead of raising KeyError or being dropped.
all_classes = list(train_file_counts) + [c for c in val_file_counts if c not in train_file_counts]
combined_file_counts = {
    c: train_file_counts.get(c, 0) + val_file_counts.get(c, 0) for c in all_classes
}
# Sort ascending by count, so the largest classes end up last.
combined_file_counts = dict(sorted(combined_file_counts.items(), key=lambda item: item[1]))

#Sum of all values in combined_file_counts
# sum_of_values = sum(combined_file_counts.values())
# print(f'We have {sum_of_values} images in total.')

sorted_items = list(combined_file_counts.items())

# The N_CLASSES_SHOWN classes with the most images
plot_class_counts(dict(sorted_items[-N_CLASSES_SHOWN:]))

# The N_CLASSES_SHOWN classes with the fewest images
plot_class_counts(dict(sorted_items[:N_CLASSES_SHOWN]))

# All classes
plot_class_counts(combined_file_counts)


Check the dataset for corrupted images so that they can be replaced
---------------------------------------------------------------

In [None]:
import os
from PIL import Image

# Directories where the images are stored
train_folder = 'D:/Memes2023_splitted/finetuning/train/'
val_folder = 'D:/Memes2023_splitted/finetuning/val/'

# Only files with these extensions are checked (note: '.gif' is not included).
image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp')

corrupt_files = []

for folder in [train_folder, val_folder]:
    for subdir, dirs, files in os.walk(folder):
        for file in files:
            if not file.lower().endswith(image_extensions):
                continue
            file_path = os.path.join(subdir, file)
            try:
                # The context manager closes the file handle after verification;
                # verify() checks integrity without decoding the pixel data.
                with Image.open(file_path) as img:
                    img.verify()
            except Exception:
                # verify() may raise various exception types for broken files.
                # Catching Exception keeps one unexpected type from aborting
                # the scan and losing the list collected so far.
                corrupt_files.append(file_path)
                print('Bad file:', file_path)  # print out the names of corrupt files

# Save the corrupt files to a text file, one path per line
with open('corrupt_files.txt', 'w') as f:
    for file in corrupt_files:
        f.write(file + '\n')

Replace corrupted images with their repaired versions
-----------------------------------------------------

In [None]:
import os
from PIL import Image

# Directory that holds the repaired images
folder = 'C:/Users/Murgi/Downloads/repaired-val/repaired-val'

# Repaired files that still fail verification end up here.
corrupt_files = []
for subdir, dirs, files in os.walk(folder):
    for file in files:
        if not file.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
            continue
        file_path = os.path.join(subdir, file)
        try:
            # Close the handle right after verification instead of leaking it.
            with Image.open(file_path) as img:
                img.verify()  # verify that it is, in fact, an image
        except Exception:
            # Any failure to open or verify marks the file as still broken.
            corrupt_files.append(file_path)
            print('Bad file:', file_path)  # print out the names of corrupt files

print(corrupt_files)

In [None]:
import shutil
train_folder_path = "D:/Memes2023_splitted/finetuning/train"
val_folder_path = "D:/Memes2023_splitted/finetuning/val"

# Copy every entry of `folder` into val/<class>/, where <class> is the part of
# the file name before the first underscore. An existing file is overwritten.
for file in os.listdir(folder):
    source_path = os.path.join(folder, file)
    cls = file.partition('_')[0]
    class_folder_path = os.path.join(val_folder_path, cls)
    target_path = os.path.join(class_folder_path, file)
    shutil.copy(source_path, target_path)
    print('File copied to', target_path)

Convert the dataset to HDF5 format
----------------------------------

In [None]:
import os
import h5py
from PIL import Image
import numpy as np
from tqdm import tqdm
from PIL import ImageFile
# Let Pillow load image files whose data ends early instead of raising.
# NOTE(review): with this enabled, truncated images presumably get stored in
# the HDF5 file instead of being listed in `corrupted` — confirm this is intended.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Destination HDF5 file (opened in 'w' mode by create_hdf5_file).
hdf5_file_path = 'D:/Memes2023/dataset.hdf5'
# Directory tree that create_hdf5_file walks for images.
root = "D:/Memes2023_splitted/finetuning"
# Module-level list that create_hdf5_file appends failing file paths to.
corrupted = []

def create_hdf5_file(root, hdf5_file_path):
    """Store every image found under root in a new HDF5 file.

    The file gets one group per class (the name of the directory that directly
    contains the image) and one dataset per image, named after the image file
    and holding its pixel array. Images from different directories that share
    a class name land in the same group.

    Files that cannot be read or stored are printed and their paths are
    appended to the module-level list `corrupted`.

    Args:
        root: Directory tree to walk.
        hdf5_file_path: Output file; it is opened with mode 'w', so an existing
            file is replaced.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')
    # Create an HDF5 file to store the dataset
    with h5py.File(hdf5_file_path, 'w') as f:
        # os.walk yields one item per directory, so a total given as a file
        # count would make the progress bar misleading; none is passed.
        for subdir, dirs, files in tqdm(os.walk(root), unit='dir'):
            for file in files:
                # Ensure the file is an image
                if not file.lower().endswith(image_extensions):
                    continue
                file_path = os.path.join(subdir, file)
                try:
                    # Get the class name from the file path
                    class_name = os.path.basename(os.path.dirname(file_path))
                    # Decode the image, then release the file handle right away
                    with Image.open(file_path) as img:
                        img_arr = np.array(img)
                    # Open the group of this class, creating it on first use
                    class_group = f.require_group(class_name)
                    # Save the image to the group
                    class_group.create_dataset(file, data=img_arr)
                except Exception as e:
                    # Catching Exception (not a bare except) keeps Ctrl-C working.
                    # Printing the error shows whether the image was unreadable
                    # or the dataset could not be created (e.g. duplicate name).
                    print("Corrupted file: ", file_path, repr(e))
                    corrupted.append(file_path)


create_hdf5_file(root, hdf5_file_path)

# Persist the paths collected in `corrupted`, one per line.
with open('corrupted.txt', 'w') as f:
    f.writelines("%s\n" % item for item in corrupted)

Verify the HDF5 dataset
-----------------------

Check the number of classes and images in the dataset

In [None]:
import h5py

hdf5_file_path = 'D:/Memes2023/dataset.hdf5'

# Top-level keys are the classes; the length of each class group is its image count.
with h5py.File(hdf5_file_path, 'r') as f:
    class_names = list(f.keys())
    num_classes = len(class_names)
    num_images = 0
    for class_name in class_names:
        num_images += len(f[class_name])
    print(f'Number of classes: {num_classes}')
    print(f'Number of images: {num_images}')

Inspecting the dataset

In [None]:
import random  # was used below without ever being imported

import h5py
import matplotlib.pyplot as plt

# hdf5_file_path is expected to be defined by the verification cell above.
with h5py.File(hdf5_file_path, 'r') as f:
    # Choose a random class
    class_name = random.choice(list(f.keys()))
    class_group = f[class_name]
    # Choose a random image from the class
    image_name = random.choice(list(class_group.keys()))
    # [()] reads the whole dataset into memory as an array
    img_data = class_group[image_name][()]
    # Display the image
    plt.imshow(img_data)
    plt.title(f'Class: {class_name}, Image: {image_name}')
    plt.show()


Caching the paths to the images in our dataset
---------------------------------------------

In [None]:
import os
import pickle
from tqdm import tqdm 

# First time only: create and save the list of image paths.
# Pickle files that will hold the cached (path, label) lists:
img_train_path_file, img_val_path_file = (
    'C:/Users/Murgi/Documents/GitHub/meme_research/outputs/cache/image_train_paths.pkl',
    'C:/Users/Murgi/Documents/GitHub/meme_research/outputs/cache/image_val_paths.pkl',
)
# Dataset directories that are walked to build those lists:
img_train_dir, img_val_dir = (
    'D:/Memes2023_splitted/finetuning/train',
    'D:/Memes2023_splitted/finetuning/val',
)


def cache_image_paths(img_dir, cache_path):
    """Walk img_dir and pickle a list of (file_path, label) pairs to cache_path.

    The label of a file is the name of the directory that directly contains it.
    Every file found is recorded; there is no filtering by extension.

    Args:
        img_dir: Root directory to walk.
        cache_path: Destination pickle file (overwritten if it exists).
    """
    print(f'Caching image paths in {img_dir}')
    img_label_pairs = []
    # os.walk yields once per directory and that number is not known up front,
    # so no hard-coded total is passed to tqdm.
    for root, dirs, files in tqdm(os.walk(img_dir), unit='dir'):
        label = os.path.basename(root)
        img_label_pairs.extend((os.path.join(root, fname), label) for fname in files)
    with open(cache_path, 'wb') as f:
        pickle.dump(img_label_pairs, f)
    print(f'Cached {len(img_label_pairs)} paths to {cache_path}')

# Build one (path, label) cache per split; each call walks the whole split directory.
cache_image_paths(img_train_dir, img_train_path_file)
cache_image_paths(img_val_dir, img_val_path_file)

In [2]:
import pickle
from PIL import Image
img_train_path_file = 'C:/Users/Murgi/Documents/GitHub/meme_research/outputs/cache/image_train_paths.pkl'

# Load the cached (path, label) pairs. The context manager closes the file
# right after unpickling instead of leaving the handle open.
with open(img_train_path_file, 'rb') as cache_fh:
    cache = pickle.load(cache_fh)
print(type(cache))
filenames = [x[0] for x in cache]
print(filenames[0])

# Swap the first three '/'-separated parts of one cached path (the local
# dataset root) for the storage location and try to open the result.
file = filenames[0]
parts = file.split('/')
to_replace = '/'.join(parts[:3])
replace_with = '../storage/kym-datasets/Memes2023_splitted_resized/finetuning'
new_file = file.replace(to_replace, replace_with)
print(new_file)
#Try to open the image
img = Image.open(new_file)
img.show()

<class 'list'>
D:/Memes2023_splitted/finetuning/train\%E2%9D%84%EF%B8%8F-u-so-icy-ima-glacier-boy\%E2%9D%84%EF%B8%8F-u-so-icy-ima-glacier-boy_0.png
../storage/kym-datasets/Memes2023_splitted_resized/finetuning/train\%E2%9D%84%EF%B8%8F-u-so-icy-ima-glacier-boy\%E2%9D%84%EF%B8%8F-u-so-icy-ima-glacier-boy_0.png


In [17]:
import pickle
from PIL import Image
from tqdm import tqdm
# Existing caches that hold the local dataset paths:
img_train_path_file, img_val_path_file = (
    'C:/Users/Murgi/Documents/GitHub/meme_research/outputs/cache/image_train_paths.pkl',
    'C:/Users/Murgi/Documents/GitHub/meme_research/outputs/cache/image_val_paths.pkl',
)

# Caches to be written with the rewritten paths:
train_rel_path_file, val_rel_path_file = (
    'C:/Users/Murgi/Documents/GitHub/meme_research/outputs/cache/image_train_paths_rel.pkl',
    'C:/Users/Murgi/Documents/GitHub/meme_research/outputs/cache/image_val_paths_rel.pkl',
)

def create_cache_with_rel_path(img_train_path_file, new_cache_file,
                               new_root='/storage/kym-datasets/Memes2023_splitted_resized/finetuning'):
    """Rewrite the dataset root of every cached path and save the result as a new cache.

    The source cache is a pickled list of (path, label) entries. For each
    path, backslashes are turned into forward slashes, the first three path
    components (the local dataset root, e.g. 'D:/Memes2023_splitted/finetuning')
    are dropped, and new_root is put in front of the remainder. Labels are
    copied unchanged.

    Args:
        img_train_path_file: Pickle file with the original (path, label) list.
        new_cache_file: Pickle file to write the rewritten list to.
        new_root: Directory that replaces the first three path components.

    Note: pickle.load runs arbitrary code from the file; only use it on caches
    produced by this notebook.
    """
    with open(img_train_path_file, 'rb') as f:
        cache = pickle.load(f)

    root_prefix = new_root.rstrip('/')
    new_cache = []
    for t in tqdm(cache):
        # Normalise separators before splitting, so a path that uses only
        # backslashes is still cut after its third component.
        parts = t[0].replace('\\', '/').split('/')
        # Rebuild from the components rather than str.replace, so a later
        # repeat of the root text inside the path is left alone.
        new_file = '/'.join([root_prefix] + parts[3:])
        new_cache.append((new_file, t[1]))

    with open(new_cache_file, 'wb') as f:
        pickle.dump(new_cache, f)
    print(f'Cached {len(cache)} paths to {new_cache_file}')

# Rewrite both caches; each call reads the first pickle and writes the second.
create_cache_with_rel_path(img_train_path_file, train_rel_path_file)
create_cache_with_rel_path(img_val_path_file, val_rel_path_file)

100%|██████████| 319466/319466 [00:00<00:00, 399424.18it/s]


Cached 319466 paths to C:/Users/Murgi/Documents/GitHub/meme_research/outputs/cache/image_train_paths_rel.pkl


100%|██████████| 91205/91205 [00:00<00:00, 332171.36it/s]


Cached 91205 paths to C:/Users/Murgi/Documents/GitHub/meme_research/outputs/cache/image_val_paths_rel.pkl


In [19]:
import pickle
img_train_path_file = 'C:/Users/Murgi/Documents/GitHub/meme_research/outputs/cache/image_train_paths_rel.pkl'

# Print the first entry of the rewritten cache; the context manager closes
# the pickle file after loading.
with open(img_train_path_file, 'rb') as cache_fh:
    cache = pickle.load(cache_fh)
print(type(cache))
print(cache[0])

<class 'list'>
('/storage/kym-datasets/Memes2023_splitted_resized/finetuning/train/%E2%9D%84%EF%B8%8F-u-so-icy-ima-glacier-boy/%E2%9D%84%EF%B8%8F-u-so-icy-ima-glacier-boy_0.png', '%E2%9D%84%EF%B8%8F-u-so-icy-ima-glacier-boy')


In [None]:
import pandas as pd

accuracies = pd.read_csv('C:/Users/Murgi/Documents/GitHub/meme_research/outputs/plots/class_accuracies.csv')
train_counts = pd.read_csv('C:/Users/Murgi/Documents/GitHub/meme_research/outputs/train_file_counts.csv')
val_counts = pd.read_csv('C:/Users/Murgi/Documents/GitHub/meme_research/outputs/val_file_counts.csv')

# Inner join on the class name. pandas renames the clashing Count columns to
# Count_x (train) and Count_y (val).
counts = pd.merge(train_counts, val_counts, on='Class')
# Sum the counts of train and val
counts['Total'] = counts['Count_x'] + counts['Count_y']

# The assignment below matches rows by position (row i of counts -> row i of
# accuracies). Refuse to continue when the tables differ in length, because
# the result would silently contain NaN or misplaced counts.
# NOTE(review): equal length does not prove the same class order — confirm
# that class_accuracies.csv lists classes in the same order as the count CSVs.
if len(accuracies) != len(counts):
    raise ValueError(
        f'class_accuracies.csv has {len(accuracies)} rows but the merged counts '
        f'have {len(counts)}; cannot attach counts by position.'
    )
accuracies['Count'] = counts['Total']

# Save the accuracies with counts
accuracies.to_csv('C:/Users/Murgi/Documents/GitHub/meme_research/outputs/plots/class_accuracies_with_counts_train_val_splitted.csv', index=False)

# Last expression of the cell, so the preview is actually displayed.
accuracies.head(5)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load your data
df = pd.read_csv('C:/Users/Murgi/Documents/GitHub/meme_research/outputs/plots/class_accuracies_with_counts.csv')
# Rename the columns positionally; this requires the CSV to have exactly three columns.
df.columns = ['class', 'accuracy', 'count']
# Create the scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(df['count'], df['accuracy'])
plt.xlabel('Number of Samples')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. Number of Samples per Class (Train/Val)')

# # Optionally, add a trendline
# z = np.polyfit(df['count'], df['accuracy'], 1)
# p = np.poly1d(z)
# plt.plot(df['count'], p(df['count']), "r--")
# What is the number of points on the plot?
print(f'Number of points: {len(df)}')

# Save the plot BEFORE showing it: once plt.show() has run, the current
# figure is finished and savefig would write an empty image.
plt.savefig('C:/Users/Murgi/Documents/GitHub/meme_research/outputs/plots/accuracy_vs_count_train_val_splitted.png')
plt.show()


Compress dataset with LZ4 algorithm
-----------------------------------

In [None]:
import os
import lz4framed

def compress_directory(directory_path, output_file_path):
    """Feed every file under directory_path through one lz4framed compressor
    and write the result to output_file_path.

    NOTE(review): the calls below (update with a second argument, flush, end
    returning the data) could not be checked against the lz4framed API from
    this file — verify them against the installed version before relying on
    this function.
    """
    # NOTE(review): output_file_path is opened here and a second time at the
    # end of the function (again in "wb" mode, which truncates). This first
    # handle is never closed explicitly.
    compressor = lz4framed.Compressor(open(output_file_path, "wb"))

    for root, dirs, files in os.walk(directory_path):
        # Path of the current directory relative to the directory being compressed
        relative_path = os.path.relpath(root, directory_path)
        for file in files:
            file_path = os.path.join(root, file)
            relative_file_path = os.path.join(relative_path, file)

            # Stream the file content in 8 KiB chunks
            with open(file_path, "rb") as f:
                while True:
                    data = f.read(8192)
                    if not data:
                        break
                    compressor.update(data, False)
            
            compressor.flush()
            # NOTE(review): the relative path is fed in AFTER the file's bytes,
            # with no length or delimiter, so file boundaries presumably cannot
            # be recovered on decompression — confirm the intended format.
            compressor.update(relative_file_path.encode(), False)
    
    compressed_data = compressor.end()
    with open(output_file_path, "wb") as f:
        f.write(compressed_data)



# Usage example: compress the pHash test images into a file in the current
# working directory.
directory_path = 'C:/Users/Murgi/Documents/GitHub/meme_research/src/pHash/test_images'
output_file = 'compressed_file.lz4'

compress_directory(directory_path, output_file)
print(f'Compressed {directory_path} to {output_file}')

Resize the images (to 380x380 pixels by default) for better storage utilization
---------------------------

Test image conversion and deconversion

In [None]:
import os
import PIL
import torchvision.transforms as transforms
import numpy as np
# `import PIL` alone does not load the Image submodule; import it explicitly
# so this cell does not depend on another import having loaded it.
import PIL.Image

test_image = 'C:/Users/Murgi/Documents/GitHub/meme_research/src/pHash/test_images/1468988563469.jpg'

# Load the image
img = PIL.Image.open(test_image)
img.show()
# Force three channels, so the (224, 224, 3) reshape below also holds for
# grayscale, palette or RGBA inputs, then resize.
img = img.convert('RGB').resize((224, 224))
# Convert to numpy array
img = np.array(img)
# Reshape to (224, 224, 3); for an RGB array of that size this only checks the shape
img = img.reshape((224, 224, 3))
# Convert back to PIL image
img = PIL.Image.fromarray(img)
# Show the image
img.show()


In [None]:
import os
import numpy as np
import torchvision.transforms as transforms
from PIL import Image
from PIL import ImageFile
# Let Pillow load image files whose data ends early instead of raising, so
# such files are still resized by resize_and_save_images.
ImageFile.LOAD_TRUNCATED_IMAGES = True

def resize_and_save_images(original_dir, new_dir, size=(380, 380)):
    """Resize every file under original_dir and save it under new_dir.

    The folder structure and file names are preserved. Each file is opened
    with PIL, converted to RGB, passed through torchvision's Resize(size) and
    saved under its original name. Files that cannot be processed (including
    non-image files) are reported with a printed message and skipped.

    Args:
        original_dir: Root of the directory tree to read.
        new_dir: Root of the directory tree to write.
        size: Target size handed to transforms.Resize.
    """
    # Define the image transformation
    resize_transform = transforms.Resize(size)

    # Walk through the original directory
    for subdir, dirs, files in os.walk(original_dir):
        if not files:
            continue  # nothing to save here, so no output directory is created

        # Prepare the output subdirectory once per directory (not once per
        # file), preserving the folder structure.
        rel_path = os.path.relpath(subdir, original_dir)
        outdir = os.path.join(new_dir, rel_path)
        os.makedirs(outdir, exist_ok=True)

        for file in files:
            input_path = os.path.join(subdir, file)
            try:
                # convert() returns a new, fully loaded image, so the source
                # file handle can be closed as soon as the block ends.
                with Image.open(input_path) as img:
                    resized_img = resize_transform(img.convert('RGB'))

                # Save the resized image to the new directory
                resized_img.save(os.path.join(outdir, file))
            except Exception as e:
                print(f"Unable to process file {input_path}. Error: {e}")

# Use the function: resize both datasets with the default size, writing each
# to a sibling "*_resized" directory.
print('Started resizing images')
resize_and_save_images('D:/Memes2023_splitted', 'D:/Memes2023_splitted_resized')
print('Done with kym dataset')
resize_and_save_images('D:/Memes2022Final2', 'D:/Memes2022Final2_resized')
print('Done with reddit dataset')


In [None]:
import os
from PIL import ImageFile
# Let Pillow load image files whose data ends early instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Resized datasets whose images are captioned
kym_memes = "D:/Memes2023_splitted_resized/finetuning"
reddit_memes = "D:/Memes2022Final2_resized"

# Text files passed to caption_dataset as output paths, one per dataset
kym_captions = 'C:/Users/Murgi/Documents/GitHub/meme_research/outputs/captions/kym_captions.txt'
reddit_captions = 'C:/Users/Murgi/Documents/GitHub/meme_research/outputs/captions/reddit_captions.txt'

def caption_dataset(dataset_path, output_path):
    """Caption every file under dataset_path and write the captions to output_path.

    One line is written per captioned image: "<image path>\\t<caption>".
    Files that cannot be opened or captioned are reported and skipped.

    Relies on the globals `vis_processors`, `model` and `device`, which are not
    defined in the visible part of this notebook and must already exist in the
    kernel.

    Args:
        dataset_path: Directory tree to walk for images.
        output_path: Text file to write; an existing file is overwritten.
    """
    # Imported locally because this cell only imports ImageFile from PIL.
    from PIL import Image

    # utf-8 so that captions with non-ASCII characters can always be written.
    with open(output_path, "w", encoding="utf-8") as f:
        for root, dirs, files in os.walk(dataset_path):
            for file in files:
                img_path = os.path.join(root, file)
                try:
                    with Image.open(img_path) as raw_image:
                        image = raw_image.convert("RGB")
                    image = vis_processors["eval"](image).unsqueeze(0).to(device)
                    caption = " ".join(model.generate({"image": image}))
                except Exception as e:
                    # Exception (not a bare except) keeps Ctrl-C working during
                    # a long run; the error text tells why the file was skipped.
                    print("Error captioning image: " + img_path)
                    print(f"  {type(e).__name__}: {e}")
                    continue
                # The write used to be commented out, which left the freshly
                # truncated output file empty after captioning everything.
                f.write(img_path + "\t" + caption + "\n")
                print("Captioned image: " + img_path)

# Caption both datasets; each call opens its output file in write mode.
print('Captioning kym dataset...')
caption_dataset(kym_memes, kym_captions)
print('Captioning reddit dataset...')
caption_dataset(reddit_memes, reddit_captions)

In [1]:
import pickle
# Rewritten validation cache: a pickled list of (path, label) tuples
pkl = "C:/Users/Murgi/Documents/GitHub/meme_research/outputs/cache/image_val_paths_rel.pkl"

# pickle.load can execute code embedded in the file; only load caches created locally.
with open(pkl, "rb") as f:
    data = pickle.load(f)

# Show the first entry as a quick sanity check
print(data[0])

('/storage/kym-datasets/Memes2023_splitted_resized/finetuning/val/%E2%9D%84%EF%B8%8F-u-so-icy-ima-glacier-boy/%E2%9D%84%EF%B8%8F-u-so-icy-ima-glacier-boy_1.jpg', '%E2%9D%84%EF%B8%8F-u-so-icy-ima-glacier-boy')


In [2]:
import pandas as pd

# Model predictions; the path is relative to the notebook's working directory.
df = pd.read_csv('../model_predictions_alexnet.csv')
# Preview the first rows (last expression of the cell, so it is displayed)
df.head()

Unnamed: 0,image_path,predicted_class,probability
0,D:/Memes2022Final2_resized/2014.01.01_14.jpg,ancient-aliens,0.137078
1,D:/Memes2022Final2_resized/2014.01.01_15.jpg,ascii-normal-heart-rate,0.036212
2,D:/Memes2022Final2_resized/2014.01.01_17.gif,2019-hong-kong-anti-extradition-bill-protests,0.093999
3,D:/Memes2022Final2_resized/2014.01.01_2.jpg,2013-nsa-surveillance-scandal,0.111166
4,D:/Memes2022Final2_resized/2014.01.01_3.jpg,2010-wikipedia-fundraising-campaign,0.214086


In [3]:
# Pull the three prediction columns out of df as separate numpy arrays
images_done, assigned_labels, confidence_scores = (
    df[column].to_numpy()
    for column in ('image_path', 'predicted_class', 'probability')
)

In [7]:
# Shape of the image-path array (one entry per row of df)
print(images_done.shape)

(955593,)


In [10]:
import numpy as np
# Number of distinct predicted classes
print(np.unique(assigned_labels).size)

1352


In [8]:
import json

# Group the predictions by assigned label and dump them as a JSON list
print(images_done.shape)

# label -> {"cluster_name": label, "images": {image path: confidence}}
results = {}

for path, label, confidence in zip(images_done, assigned_labels, confidence_scores):
    # Fetch the cluster of this label, creating it the first time the label is seen
    cluster = results.setdefault(label, {"cluster_name": label, "images": {}})
    cluster["images"][path] = float(confidence)  # numpy float -> Python float for json

# Only the cluster records are written; the label is kept inside each record
clusters = list(results.values())
with open('../outputs/clusters/jsons/model_results.json', 'w') as f:
    json.dump(clusters, f, indent=4)

(955593,)
