In [3]:
import glob
from cuml import HDBSCAN
import numpy as np
from collections import Counter
import os
import json

In [None]:
 # Scratch cell: builds a three-element cudf Series and prints it.
 # NOTE(review): presumably a quick check that the GPU dataframe stack is
 # importable before clustering - confirm, and consider moving the import
 # into the imports cell or deleting this cell from the final notebook.
 import cudf
 print(cudf.Series([1, 2, 3]))

In [4]:
# Directories that load_embeddings() searches for '*.npy' files.
# NOTE(review): both names point to the same directory, so embedding mode
# would load identical files as the labeled and the unlabeled set - confirm
# that this is intended.
labeled_embedding_output_dir = "D:/embeddings"
unlabeled_embedding_output_dir = "D:/embeddings"

# Tab-separated listings (image path, then hex pHash) read by load_phases().
# NOTE(review): absolute, machine-specific paths; the run recorded in this
# notebook ended with FileNotFoundError for the labeled listing.
labeled_phases_files = "C:/Users/Murgi/Documents/GitHub/meme_research/src/pHash/labeled_phashes.txt"
unlabeled_phases_files = "C:/Users/Murgi/Documents/GitHub/meme_research/src/pHash/unlabeled_phashes.txt"

In [5]:
# Input selector for the loading step below: True reads the pHash listings via
# load_phases(); False reads the .npy embedding files via load_embeddings().
phash = True

def load_embeddings(embedding_output_dir):
    """Load every ``*.npy`` file of a directory into one concatenated array.

    The class name of a file is the second ``_``-separated token of its base
    name (for example ``img_cat_0.npy`` yields ``cat``).

    Args:
        embedding_output_dir: Directory that directly contains the ``.npy``
            files.

    Returns:
        tuple: ``(embeddings, class_names)``. ``embeddings`` is the
        ``np.concatenate`` of all loaded arrays, with files visited in sorted
        path order; ``class_names`` is a NumPy array holding one entry per
        file.

    Raises:
        ValueError: If the directory contains no ``.npy`` file.
        IndexError: If a file's base name contains no underscore.
    """
    # Sort so the row order is reproducible; glob itself gives no ordering guarantee.
    files = sorted(glob.glob(os.path.join(embedding_output_dir, '*.npy')))
    if not files:
        # np.concatenate([]) would fail with a message that hides the real cause.
        raise ValueError(
            f"No .npy embedding files found in {embedding_output_dir!r}"
        )

    class_names = []
    embeddings_list = []
    for file in files:
        # basename understands the platform's separators; splitting on '\\'
        # left the directory part in place for any path without backslashes
        # and then picked the class name out of a directory name.
        filename = os.path.basename(file)
        class_names.append(filename.split('_')[1])
        embeddings_list.append(np.load(file))

    return np.concatenate(embeddings_list), np.array(class_names)

def load_phases(phases_files):
    """Read a tab-separated pHash listing.

    Every non-blank line must hold an image path and a hexadecimal pHash
    separated by a single tab. The class name of an image is the name of the
    directory that directly contains it.

    Args:
        phases_files: Path of the text file to read.

    Returns:
        tuple: three NumPy arrays ``(phashes, class_names, image_paths)`` with
        one entry per non-blank line, in file order. ``image_paths`` holds the
        paths exactly as written in the file.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
        IndexError: If an image path has no parent directory component.
    """
    phashes = []
    image_paths = []
    class_names = []
    with open(phases_files, 'r') as f:
        for line in f:
            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            path, hash_hex = record.split('\t')
            # Treat both '/' and '\\' as separators so listings written on
            # Windows parse the same way everywhere; the parent directory is
            # the class for every path, with or without a "finetuning" part.
            parts = os.path.normpath(path.replace('\\', '/')).split(os.sep)
            class_names.append(parts[-2])
            image_paths.append(path)
            phashes.append(hash_hex)
    return np.array(phashes), np.array(class_names), np.array(image_paths)

def phash_to_bin(phashes):
    """Expand hexadecimal pHash strings into rows of 0/1 integers.

    Each hash is parsed as base 16, rendered in binary and left-padded with
    zeros to at least 64 digits; every digit becomes one integer entry. The
    result is a NumPy array built from one such row per input hash.
    """
    bit_rows = []
    for hex_digest in phashes:
        bit_string = bin(int(hex_digest, 16))[2:].zfill(64)
        bit_rows.append([int(digit) for digit in bit_string])
    return np.array(bit_rows)

# Load the labeled and unlabeled feature matrices according to the `phash` flag.
if not phash:
    # load_embeddings returns exactly (embeddings, class_names); unpacking
    # three values here raised ValueError as soon as this branch was used.
    labeled_embeddings, labeled_templates = load_embeddings(labeled_embedding_output_dir)
    # Show the shape only; printing the full matrix floods the cell output.
    print(labeled_embeddings.shape)
    unlabeled_embeddings, _ = load_embeddings(unlabeled_embedding_output_dir)
    # The .npy loader provides no per-image paths, so use the row index of
    # each unlabeled embedding as its identifier to keep later cells working.
    # NOTE(review): replace with real image paths if they are stored somewhere.
    image_paths = np.array([str(row) for row in range(len(unlabeled_embeddings))])

else:
    # Load the phashes and expand each one into a row of bits.
    labeled_embeddings, labeled_templates, _ = load_phases(labeled_phases_files)
    labeled_embeddings = phash_to_bin(labeled_embeddings)
    unlabeled_embeddings, _, image_paths = load_phases(unlabeled_phases_files)
    unlabeled_embeddings = phash_to_bin(unlabeled_embeddings)

print("finished loading")


FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Murgi/Documents/GitHub/meme_research/src/pHash/labeled_phashes.txt'

In [None]:
# Step 1: Cluster the unlabeled data
# NOTE(review): confirm that the installed cuML HDBSCAN accepts
# metric='hamming'; its documentation should be checked for the list of
# supported metrics before relying on this call.
clusterer = HDBSCAN(min_cluster_size=2, metric='hamming')
unlabeled_clusters = clusterer.fit_predict(unlabeled_embeddings)
print(clusterer.labels_)

# Cluster ids that were found, excluding the noise id -1 (np.unique returns
# them sorted and de-duplicated, so no second np.unique is needed).
unique_clusters = np.unique(unlabeled_clusters[unlabeled_clusters != -1])

# Calculate the centroid for each cluster (same order as unique_clusters)
centroids = [
    np.mean(unlabeled_embeddings[unlabeled_clusters == cluster], axis=0)
    for cluster in unique_clusters
]

# Step 2: Assign labeled data to closest cluster
# -1 marks "not assigned"; it stays in place when no cluster exists at all,
# a case in which the distance computation would otherwise fail.
assigned_labels = [-1] * len(labeled_embeddings)
if len(unique_clusters) > 0:
    centroid_matrix = np.asarray(centroids)
    for i, embedding in enumerate(labeled_embeddings):
        distances = np.linalg.norm(centroid_matrix - embedding, axis=1)
        # Store the cluster id, not the argmin position: Step 3 compares these
        # values with cluster ids, which only matched positions by coincidence.
        assigned_labels[i] = unique_clusters[np.argmin(distances)]

# Step 3: Determine cluster labels and confidence scores
assigned_array = np.asarray(assigned_labels)
cluster_labels = []
confidence_scores = []
for cluster in unique_clusters:
    # Get the templates assigned to the cluster
    assigned_templates = labeled_templates[assigned_array == cluster]

    # Count the occurrences of each template in the cluster
    template_counts = Counter(assigned_templates)

    if template_counts:
        # The most common template becomes the label; its share of the
        # assigned templates is the confidence score.
        top_template, top_count = template_counts.most_common(1)[0]
        cluster_labels.append(top_template)
        confidence_scores.append(top_count / len(assigned_templates))
    else:
        # No labeled item landed in this cluster: there is nothing to vote
        # with, so record a placeholder instead of raising IndexError.
        cluster_labels.append("unknown")
        confidence_scores.append(float("nan"))

# Step 4: Gather image paths for each cluster and noise
all_cluster_ids = np.unique(unlabeled_clusters)
cluster_image_paths = []
for cluster in all_cluster_ids:
    cluster_image_indices = np.where(unlabeled_clusters == cluster)[0]
    cluster_image_paths.append(image_paths[cluster_image_indices].tolist())

# Step 5: Save results to JSON file
# Map each cluster id to its position in cluster_labels / confidence_scores
# once instead of searching the id list for every cluster.
cluster_position = {
    cluster_id: position
    for position, cluster_id in enumerate(unique_clusters.tolist())
}
cluster_data = []
for i, cluster in enumerate(all_cluster_ids):
    if cluster == -1:
        cluster_dict = {
            "cluster_no": str(cluster),
            "template_label": "noise",
            "confidence_score": "NaN",
            "images": cluster_image_paths[i]
        }
    else:
        position = cluster_position[int(cluster)]
        score = confidence_scores[position]
        cluster_dict = {
            "cluster_no": str(cluster),
            "template_label": cluster_labels[position],
            # Use the same "NaN" spelling as the noise entry for missing scores.
            "confidence_score": "NaN" if np.isnan(score) else str(score),
            "images": cluster_image_paths[i]
        }
    cluster_data.append(cluster_dict)

# Open the file only after the payload is complete so that an error while
# building it cannot leave a truncated JSON file behind.
with open('cluster_results.json', 'w') as json_file:
    json.dump(cluster_data, json_file, indent=4)
print(cluster_labels)
print(confidence_scores)


np.save('cluster_labels.npy', cluster_labels)
np.save('confidence_scores.npy', confidence_scores)
np.save('unlabeled_clusters.npy', unlabeled_clusters)
np.save('assigned_labels.npy', assigned_labels)