In [27]:
import os
import cv2
import numpy as np
import time
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import shutil
from tqdm import tqdm

# Load pre-trained VGG16 model and remove the top layer
base_model = VGG16(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc1').output)

# Function to extract features from a batch of images
def extract_features_batch(image_paths, model):
    batch_images = []
    for image_path in image_paths:
        img = cv2.imread(image_path)
        img = cv2.resize(img, (224, 224))
        img = img.astype('float32')
        img = preprocess_input(img)
        batch_images.append(img)
    batch_images = np.array(batch_images)
    features = model.predict(batch_images)
    return features

# Function to load images and extract features in batches
def load_and_extract_features_in_batches(folder_path, batch_size=32):
    image_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.png')]
    features = []
    start_time = time.time()
    for i in tqdm(range(0, len(image_paths), batch_size), desc="Extracting features", unit="batch"):
        batch_paths = image_paths[i:i + batch_size]
        batch_features = extract_features_batch(batch_paths, model)
        features.append(batch_features)
    end_time = time.time()
    print(f"Feature extraction time: {end_time - start_time} seconds")
    return np.vstack(features), image_paths

In [28]:

# Load features
folder_path = './extracted_data/frames'  # Replace with your folder path
features, image_paths = load_and_extract_features_in_batches(folder_path)

Extracting features:   0%|          | 0/305 [00:00<?, ?batch/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step


Extracting features:   0%|          | 1/305 [00:04<22:30,  4.44s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step


Extracting features:   1%|          | 2/305 [00:09<24:51,  4.92s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step


Extracting features:   1%|          | 3/305 [00:14<25:24,  5.05s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step


Extracting features:   1%|▏         | 4/305 [00:20<25:38,  5.11s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step


Extracting features:   2%|▏         | 5/305 [00:25<25:48,  5.16s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step


Extracting features:   2%|▏         | 6/305 [00:30<25:34,  5.13s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step


Extracting features:   2%|▏         | 7/305 [00:35<25:08,  5.06s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step


Extracting features:   3%|▎         | 8/305 [00:40<24:41,  4.99s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step


Extracting features:   3%|▎         | 9/305 [00:45<24:47,  5.03s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step


Extracting features:   3%|▎         | 10/305 [00:50<24:43,  5.03s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step


Extracting features:   4%|▎         | 11/305 [00:55<25:22,  5.18s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step


Extracting features:   4%|▍         | 12/305 [01:03<28:42,  5.88s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:   4%|▍         | 13/305 [01:12<32:44,  6.73s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11s/step


Extracting features:   5%|▍         | 14/305 [01:23<40:03,  8.26s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:   5%|▍         | 15/305 [01:32<40:25,  8.36s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:   5%|▌         | 16/305 [01:41<40:45,  8.46s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step


Extracting features:   6%|▌         | 17/305 [01:50<42:27,  8.85s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step


Extracting features:   6%|▌         | 18/305 [02:01<44:28,  9.30s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12s/step


Extracting features:   6%|▌         | 19/305 [02:14<49:26, 10.37s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:   7%|▋         | 20/305 [02:23<48:08, 10.13s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step


Extracting features:   7%|▋         | 21/305 [02:33<47:24, 10.01s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:   7%|▋         | 22/305 [02:41<44:41,  9.47s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:   8%|▊         | 23/305 [02:49<43:00,  9.15s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10s/step


Extracting features:   8%|▊         | 24/305 [03:00<44:49,  9.57s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step


Extracting features:   8%|▊         | 25/305 [03:10<45:02,  9.65s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step


Extracting features:   9%|▊         | 26/305 [03:20<45:35,  9.80s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step


Extracting features:   9%|▉         | 27/305 [03:30<45:57,  9.92s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:   9%|▉         | 28/305 [03:39<43:40,  9.46s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:  10%|▉         | 29/305 [03:47<41:58,  9.12s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:  10%|▉         | 30/305 [03:55<40:44,  8.89s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step


Extracting features:  10%|█         | 31/305 [04:03<39:18,  8.61s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:  10%|█         | 32/305 [04:12<39:11,  8.61s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step


Extracting features:  11%|█         | 33/305 [04:22<40:52,  9.02s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step


Extracting features:  11%|█         | 34/305 [04:31<41:15,  9.13s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:  11%|█▏        | 35/305 [04:39<39:55,  8.87s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:  12%|█▏        | 36/305 [04:48<38:57,  8.69s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step


Extracting features:  12%|█▏        | 37/305 [04:56<38:09,  8.54s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step


Extracting features:  12%|█▏        | 38/305 [05:04<37:04,  8.33s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step


Extracting features:  13%|█▎        | 39/305 [05:11<35:23,  7.98s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:  13%|█▎        | 40/305 [05:19<35:57,  8.14s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step


Extracting features:  13%|█▎        | 41/305 [05:27<35:34,  8.08s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step


Extracting features:  14%|█▍        | 42/305 [05:35<34:20,  7.84s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step


Extracting features:  14%|█▍        | 43/305 [05:44<35:59,  8.24s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step


Extracting features:  14%|█▍        | 44/305 [05:51<34:44,  7.99s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:  15%|█▍        | 45/305 [06:00<36:03,  8.32s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:  15%|█▌        | 46/305 [06:09<36:46,  8.52s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:  15%|█▌        | 47/305 [06:18<37:15,  8.67s/batch]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step


Extracting features:  16%|█▌        | 48/305 [06:27<36:31,  8.53s/batch]

In [None]:
# Apply PCA for dimensionality reduction
pca_start_time = time.time()
pca = PCA(n_components=50)
reduced_features = pca.fit_transform(features)
pca_end_time = time.time()
print(f"PCA time: {pca_end_time - pca_start_time} seconds")

In [None]:
# Apply KMeans clustering
kmeans_start_time = time.time()
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(reduced_features)
kmeans_end_time = time.time()
print(f"KMeans time: {kmeans_end_time - kmeans_start_time} seconds")

In [None]:
# Function to select diverse images from each cluster
def select_diverse_images(features, clusters, num_images):
    selected_images = []
    for cluster in range(num_clusters):
        cluster_features = features[clusters == cluster]
        cluster_indices = np.where(clusters == cluster)[0]
        variances = np.var(cluster_features, axis=0)
        diverse_indices = cluster_indices[np.argsort(-variances)[:num_images]]
        selected_images.extend(diverse_indices)
    return selected_images

In [None]:
# Select diverse images
num_images_per_cluster = 15000
selection_start_time = time.time()
selected_indices = select_diverse_images(reduced_features, clusters, num_images_per_cluster)
selection_end_time = time.time()
print(f"Selection time: {selection_end_time - selection_start_time} seconds")

# Create folders and move images
folders = ['Bruno', 'Hikari', 'Tucci', 'Fabio', 'Godoy', 'rest']
for folder in folders:
    os.makedirs(folder, exist_ok=True)

for idx in selected_indices:
    cluster = clusters[idx]
    folder_name = folders[cluster]
    shutil.move(image_paths[idx], os.path.join(folder_name, os.path.basename(image_paths[idx])))

# Move remaining images to 'rest' folder
remaining_indices = set(range(len(image_paths))) - set(selected_indices)
for idx in remaining_indices:
    shutil.move(image_paths[idx], os.path.join('rest', os.path.basename(image_paths[idx])))