In [1]:
import os
import pickle
# Path of the pickled frame-to-cluster mapping produced by one clustering run.
# NOTE: despite the name, `frame_folder` is the path of the .pkl file itself,
# not of a directory.
frame_folder = "cluster_analysis/nov/18/23_18_56_10000_0.6_0.1_0.1_0.1_0.1/frame_to_cluster_mapping.pkl"

# Deserialize the mapping from disk. Unpickling can execute arbitrary code, so
# only point this at files written by a trusted pipeline.
with open(frame_folder, mode='rb') as mapping_file:
    frame_to_cluster_mapping = pickle.load(mapping_file)


In [2]:
# Reduce every entry of the loaded mapping to a plain list of cluster IDs:
# each record is unpacked as a 2-item pair and only its second item is kept.
frame_to_cluster_mapping_transformed = {
    video_key: [cluster_id for _first, cluster_id in frame_records]
    for video_key, frame_records in frame_to_cluster_mapping.items()
}

In [22]:
# Tally how many times each cluster ID appears, pooled over every video
cluster_frequencies = {}
for video_clusters in frame_to_cluster_mapping_transformed.values():
    for cluster_id in video_clusters:
        # .get() supplies 0 the first time a cluster ID is encountered
        cluster_frequencies[cluster_id] = cluster_frequencies.get(cluster_id, 0) + 1

# Rank (cluster_id, count) pairs from most to least frequent; the sort is
# stable, so clusters with equal counts keep their first-seen order
sorted_clusters = sorted(
    cluster_frequencies.items(),
    key=lambda id_and_count: id_and_count[1],
    reverse=True,
)

# Report the six most frequent clusters
print("Most common cluster IDs and their frequencies:")
for cluster_id, freq in sorted_clusters[:6]:
    print(f"Cluster {cluster_id}: {freq} occurrences")


Most common cluster IDs and their frequencies:
Cluster 421: 256 occurrences
Cluster 4376: 238 occurrences
Cluster 7698: 215 occurrences
Cluster 9316: 213 occurrences
Cluster 3976: 207 occurrences
Cluster 177: 192 occurrences


In [5]:
# Analyze run lengths of cluster IDs across all videos.
# A "run" is a maximal stretch of consecutive entries in one video's sequence
# that share the same cluster ID. `run_length_stats` maps each cluster ID to
# the list of its run lengths, pooled over every video.
run_length_stats = {}

for video_name, cluster_sequence in frame_to_cluster_mapping_transformed.items():
    # A video with no entries has no runs; without this guard the [0] lookup
    # below would raise IndexError.
    if not cluster_sequence:
        continue

    current_cluster = cluster_sequence[0]
    current_run = 1

    # Walk the rest of the sequence, closing a run whenever the ID changes
    for cluster_id in cluster_sequence[1:]:
        if cluster_id == current_cluster:
            current_run += 1
        else:
            run_length_stats.setdefault(current_cluster, []).append(current_run)
            current_cluster = cluster_id
            current_run = 1

    # The final run is never closed by an ID change, so record it explicitly
    run_length_stats.setdefault(current_cluster, []).append(current_run)

# Average run length per cluster, computed once and reused for both the table
# and the top-5 ranking below
avg_run_lengths = {
    cluster_id: sum(runs) / len(runs)
    for cluster_id, runs in run_length_stats.items()
}

# Per-cluster table, ordered by cluster ID
print("Run length statistics by cluster ID:")
print("Cluster ID | Avg Run Length | Max Run Length | Total Runs")
print("-" * 55)

for cluster_id in sorted(run_length_stats.keys()):
    runs = run_length_stats[cluster_id]
    avg_run = avg_run_lengths[cluster_id]
    max_run = max(runs)
    num_runs = len(runs)

    print(f"{cluster_id:^10d} | {avg_run:^13.2f} | {max_run:^13d} | {num_runs:^10d}")

# Clusters with the longest average runs
print("\nTop 5 clusters with longest average runs:")
top_runs = sorted(avg_run_lengths.items(), key=lambda x: x[1], reverse=True)[:5]
for cluster_id, avg_length in top_runs:
    print(f"Cluster {cluster_id}: {avg_length:.2f} frames average")


Run length statistics by cluster ID:
Cluster ID | Avg Run Length | Max Run Length | Total Runs
-------------------------------------------------------
    0      |     8.00      |       8       |     2     
    1      |     26.00     |      26       |     2     
    2      |     12.30     |      26       |     10    
    3      |     2.50      |       3       |     4     
    4      |     8.00      |       8       |     2     
    5      |     8.00      |      11       |     4     
    6      |     6.50      |      19       |     8     
    7      |     6.40      |       9       |     10    
    8      |     11.25     |      34       |     8     
    9      |     9.00      |      23       |     8     
    10     |     3.00      |       6       |     8     
    11     |     4.27      |       7       |     11    
    12     |     3.15      |      12       |     20    
    13     |     2.17      |       4       |     12    
    14     |     11.00     |      11       |     4     
    15   

In [10]:
# Summary statistics for the number of distinct cluster IDs used by each video
unique_clusters_per_video = [
    len(set(cluster_sequence))
    for cluster_sequence in frame_to_cluster_mapping_transformed.values()
]

avg_unique_clusters = sum(unique_clusters_per_video) / len(unique_clusters_per_video)
min_unique_clusters = min(unique_clusters_per_video)
max_unique_clusters = max(unique_clusters_per_video)

# Quartiles by direct index into the sorted counts (no interpolation; for an
# even number of videos the "median" is the upper of the two middle values).
# Deliberately NOT named `sorted_clusters`: that name holds the frequency-ranked
# (cluster_id, count) pairs built by the cluster-frequency cell, and rebinding
# it to a list of ints here would break any cell that unpacks those pairs.
sorted_unique_counts = sorted(unique_clusters_per_video)
num_videos = len(sorted_unique_counts)
q1_idx = num_videos // 4
q2_idx = num_videos // 2  # median
q3_idx = 3 * num_videos // 4

q1 = sorted_unique_counts[q1_idx]
median = sorted_unique_counts[q2_idx]
q3 = sorted_unique_counts[q3_idx]

print("\nUnique clusters per video statistics:")
print(f"Average: {avg_unique_clusters:.2f}")
print(f"Minimum: {min_unique_clusters}")
print(f"Maximum: {max_unique_clusters}")
print(f"Q1: {q1}")
print(f"Median: {median}")
print(f"Q3: {q3}")



Unique clusters per video statistics:
Average: 15.85
Minimum: 2
Maximum: 43
Q1: 11
Median: 15
Q3: 20


In [36]:
# Selection parameters, named so that comments and printed messages cannot
# drift away from the values the code actually uses.
COMMON_CLUSTER_FRACTION = 0.4  # share of distinct clusters (ranked by frequency) treated as "common"
MIN_UNIQUE_CLUSTERS = 14       # a video must use at least this many distinct clusters
NUM_SELECTED_VIDEOS = 3        # how many of the shortest qualifying videos to keep

# Rank clusters directly from `cluster_frequencies` so this cell does not
# depend on whatever `sorted_clusters` currently holds (a generic name that is
# easy to rebind elsewhere in a notebook).
clusters_by_frequency = sorted(cluster_frequencies.items(), key=lambda x: x[1], reverse=True)

# Number of top-ranked clusters that count as "common"
num_clusters = len(clusters_by_frequency)
num_common_clusters = int(num_clusters * COMMON_CLUSTER_FRACTION)

# Set of the most frequent cluster IDs
common_cluster_ids = {cluster_id for cluster_id, _ in clusters_by_frequency[:num_common_clusters]}
# NOTE: this prints the whole set, which can be thousands of IDs long
print(common_cluster_ids)

# Keep videos that use only common clusters and at least MIN_UNIQUE_CLUSTERS distinct ones
good_videos = []
for video_name, cluster_sequence in frame_to_cluster_mapping_transformed.items():
    # Distinct clusters used by this video
    unique_clusters = set(cluster_sequence)

    if len(unique_clusters) >= MIN_UNIQUE_CLUSTERS and unique_clusters.issubset(common_cluster_ids):
        good_videos.append((video_name, cluster_sequence))

# Shortest videos first, then keep the first NUM_SELECTED_VIDEOS
good_videos.sort(key=lambda x: len(x[1]))
selected_videos = good_videos[:NUM_SELECTED_VIDEOS]

print(f"\nFound {len(good_videos)} videos using only common clusters and having at least {MIN_UNIQUE_CLUSTERS} unique clusters")
# Report the number actually selected, which may be fewer than requested
print(f"\nSelected {len(selected_videos)} shortest videos:")
for video_name, clusters in selected_videos:
    print(f"\nVideo: {video_name}")
    print(f"Length: {len(clusters)} frames")
    print(f"Unique clusters used: {len(set(clusters))}")


{8193, 2, 1, 8196, 5, 6, 7, 8, 9, 8198, 11, 12, 8205, 14, 8207, 8209, 19, 20, 8212, 22, 21, 8216, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 8233, 42, 8235, 46, 8242, 50, 52, 8245, 54, 55, 53, 57, 8250, 59, 60, 56, 58, 63, 8257, 67, 68, 8259, 69, 71, 72, 73, 74, 8264, 76, 8268, 78, 80, 8274, 83, 84, 8278, 86, 88, 89, 8282, 8283, 92, 91, 8281, 95, 96, 98, 99, 101, 102, 8295, 104, 103, 8296, 107, 109, 111, 8304, 114, 115, 8308, 118, 120, 121, 123, 124, 8315, 125, 127, 128, 129, 130, 131, 132, 8321, 134, 8328, 137, 138, 139, 140, 8334, 142, 144, 8335, 146, 147, 148, 8341, 150, 151, 152, 153, 154, 8345, 157, 158, 159, 160, 8352, 162, 8355, 8357, 166, 167, 165, 172, 8364, 8366, 8365, 8367, 177, 8370, 180, 8373, 8374, 183, 184, 8376, 186, 187, 8380, 188, 8381, 185, 192, 8384, 194, 8387, 8388, 191, 195, 199, 200, 8393, 202, 8391, 204, 8396, 8398, 206, 209, 8401, 8403, 212, 213, 8405, 215, 8407, 217, 218, 219, 220, 221, 8413, 8414, 8412, 8416, 1695, 227, 228, 230, 234, 8426, 236, 237, 238