<a href="https://colab.research.google.com/github/izhilina/ll/blob/master/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install diffusers

In [None]:
!pip install git+https://github.com/huggingface/accelerate.git

In [None]:
!pip install accelerate
!accelerate config default
!accelerate env

# New section

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import os
import numpy as np
from scipy.spatial.distance import pdist, squareform

import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.models import resnet50

from PIL import Image
import torch
from transformers import AutoImageProcessor, AutoModel
from diffusers import StableDiffusionXLPipeline

import torchvision
import pathlib
from tqdm import tqdm

from scipy.spatial.distance import pdist, squareform


# Module-level switch read by generate_images_with_prompt: when true, the
# diffusion pipeline is moved to the "cuda" device.
cuda_flag = True
# Ask PyTorch to release the unused GPU memory held by its caching allocator.
torch.cuda.empty_cache()

def generate_images_with_prompt(prompt, output_dir, num_images=10, image_size=(256, 256), image_mode='RGB'):
    """
    Generate images from a single prompt with Stable Diffusion XL and save them as PNG files.

    Parameters:
        - prompt: Prompt passed unchanged to the pipeline for every image.
        - output_dir (str): Directory the images are written to; created if it does not exist.
        - num_images (int): Number of images to generate.
        - image_size (tuple): Currently unused; kept so existing callers keep working.
        - image_mode (str): Currently unused; kept so existing callers keep working.

    Returns:
        - None. Images are saved as ``generated_image_<i>.png`` (``i`` from 0 to
          ``num_images - 1``) inside ``output_dir`` and each saved path is printed.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Nothing to generate: skip the expensive model load entirely.
    if num_images <= 0:
        return

    # Load the pipeline once. It does not depend on the loop index, so
    # re-loading it per image only repeats the weight loading and the
    # host-to-GPU transfer.
    pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
    )
    if cuda_flag:
        pipe = pipe.to("cuda")
    # pipe.enable_model_cpu_offload()

    for i in range(num_images):
        # Each call samples a new image for the same prompt.
        image = pipe(prompt).images[0]
        # Release cached GPU memory between generations.
        torch.cuda.empty_cache()

        # Save the generated image
        image_path = os.path.join(output_dir, f"generated_image_{i}.png")
        image.save(image_path)

        print(f"Generated image saved at: {image_path}")

import functools


@functools.lru_cache(maxsize=1)
def _load_dinov2():
    """Load the 'facebook/dinov2-small' processor and model once and reuse them across calls."""
    from transformers import AutoImageProcessor, AutoModel

    processor = AutoImageProcessor.from_pretrained('facebook/dinov2-small')
    model = AutoModel.from_pretrained('facebook/dinov2-small')
    return processor, model


def extract_features(path):
    """
    Compute DINOv2 features for the image stored at ``path``.

    Parameters:
        - path (str): Path of the image file to open.

    Returns:
        - The ``last_hidden_state`` tensor produced by the 'facebook/dinov2-small'
          model for the image. It is computed without gradient tracking.
    """
    from PIL import Image

    # The processor and model are cached, so processing a directory of images
    # loads the weights once instead of once per image.
    processor, model = _load_dinov2()

    # The context manager closes the underlying file once the pixels are read.
    with Image.open(path) as image:
        inputs = processor(images=image, return_tensors="pt")

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state

def process_images_in_directory(directory, method_function, *args, **kwargs):
    """
    Process all images in the specified directory using the provided method.

    Files are visited in natural filename order (``image_2`` before
    ``image_10``) so that the position of a result in the returned list is
    deterministic; ``os.listdir`` alone returns entries in arbitrary order.

    Parameters:
        - directory (str): Path to the directory containing images.
        - method_function (callable): The method/function to run for each image;
          it receives the image path as its first argument.
        - *args, **kwargs: Additional arguments to pass to the method_function.

    Returns:
        - List: A list containing the results of applying the method to each
          ``.png`` / ``.jpg`` / ``.jpeg`` file, in natural filename order.
    """
    import re

    def _natural_key(name):
        # re.split with a capturing group alternates text, digits, text, ...;
        # odd positions are therefore always digit runs and compare as ints.
        tokens = re.split(r'([0-9]+)', name)
        ordered = [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]
        # The raw name breaks ties such as "a1.png" vs "a01.png".
        return ordered, name

    image_names = sorted(
        (name for name in os.listdir(directory) if name.endswith(('.png', '.jpg', '.jpeg'))),
        key=_natural_key,
    )

    # Run the method on the full path of every image, keeping the sorted order.
    return [
        method_function(os.path.join(directory, name), *args, **kwargs)
        for name in image_names
    ]

def cluster_embeddings(embeddings, num_clusters, dmin_c, min_avg_similarity=0.5):
    """
    Cluster embeddings with K-Means++ and keep only large, self-similar clusters.

    The embeddings are L2-normalized, clustered with K-Means++ (fixed
    ``random_state=42``), and every cluster is kept only if it has at least
    ``dmin_c`` members and its mean intra-cluster cosine similarity reaches
    ``min_avg_similarity``.

    Parameters:
        - embeddings: 2-D array-like with one embedding per row.
        - num_clusters (int): Number of clusters for K-Means++.
        - dmin_c (int): Minimum cluster size threshold.
        - min_avg_similarity (float): Minimum mean cosine similarity a cluster
          must reach to be kept. Defaults to 0.5.

    Returns:
        - List: One dict per kept cluster with the keys ``'label'`` (cluster
          label), ``'size'`` (number of members) and ``'cluster'`` (array of
          row indices into ``embeddings``).

    Notes:
        - The mean similarity is taken over the full member-by-member block of
          the similarity matrix, so each member's similarity with itself is
          included.
        - This function only filters clusters; it does not pick the single most
          cohesive one (cohesion being the average distance between the members
          of a cluster and its centroid).
    """
    # Convert embeddings list to numpy array
    embeddings = np.array(embeddings)

    # Normalize embeddings
    embeddings_normalized = normalize(embeddings)

    # Perform K-Means++ clustering
    kmeans = KMeans(n_clusters=num_clusters, init='k-means++', random_state=42)
    cluster_labels = kmeans.fit_predict(embeddings_normalized)

    # Calculate cosine similarity matrix
    cosine_sim_matrix = cosine_similarity(embeddings_normalized)

    filtered_clusters = []

    for cluster_label in np.unique(cluster_labels):
        # Row indices of the embeddings assigned to the current cluster
        cluster_indices = np.where(cluster_labels == cluster_label)[0]

        # Mean cosine similarity over the member-by-member sub-matrix
        # TODO: switch to cohesion (average member-to-centroid distance)
        avg_similarity = np.mean(cosine_sim_matrix[np.ix_(cluster_indices, cluster_indices)])

        # Keep the cluster only if it is big enough and similar enough
        if len(cluster_indices) >= dmin_c and avg_similarity >= min_avg_similarity:
            filtered_clusters.append({
                'label': cluster_label,
                'size': len(cluster_indices),
                'cluster': cluster_indices
            })

    return filtered_clusters


def calculate_average_distance(embeddings):
    """
    Calculate the average pairwise Euclidean distance between embeddings.

    The mean is taken over the distinct pairs only. Averaging the full square
    distance matrix would also count the zero self-distances on the diagonal
    and under-report the result by a factor of (N - 1) / N, which biases
    comparisons between groups of different sizes.

    Parameters:
        - embeddings (np.ndarray): Array of shape (N, D) containing N embeddings of dimension D.

    Returns:
        - float: The average pairwise Euclidean distance, or 0.0 when there are
          fewer than two embeddings (no pairs exist).
    """
    points = np.asarray(embeddings)

    # With fewer than two rows there are no pairs to average.
    if points.ndim == 2 and points.shape[0] < 2:
        return 0.0

    # pdist returns each unordered pair exactly once (condensed form).
    return np.mean(pdist(points, metric='euclidean'))

In [94]:

import os
import shutil

def copy_selected_files(src_directory, dest_directory, selected_file_ids, selected_file_extensions):
    """
    Populate a freshly emptied destination directory with chosen files from a source directory.

    The destination is created when missing; otherwise every regular file
    directly inside it is removed first (sub-directories are left alone). Each
    entry of the source directory is printed, and an entry is copied when its
    lower-cased extension appears in ``selected_file_extensions`` and its name
    appears in ``selected_file_ids``.

    Parameters:
        - src_directory (str): Directory to copy from.
        - dest_directory (str): Directory to copy into.
        - selected_file_ids (list): File names (including extension) to copy.
        - selected_file_extensions (list): Allowed extensions, e.g. ``'.png'``.

    Returns:
        - None
    """
    if os.path.exists(dest_directory):
        # Start from a clean slate: drop the files left from a previous run.
        for stale_name in os.listdir(dest_directory):
            stale_path = os.path.join(dest_directory, stale_name)
            try:
                if os.path.isfile(stale_path):
                    os.unlink(stale_path)
            except Exception as e:
                print(f"Error deleting {stale_path}: {e}")
    else:
        os.makedirs(dest_directory)

    for entry in os.listdir(src_directory):
        source_path = os.path.join(src_directory, entry)

        print(source_path)

        extension = os.path.splitext(entry)[1].lower()

        # Skip anything that is not both an allowed type and explicitly selected.
        if extension not in selected_file_extensions or entry not in selected_file_ids:
            continue

        target_path = os.path.join(dest_directory, entry)
        shutil.copy2(source_path, target_path)

        print(f"File copied: {entry}")


In [None]:
# Fetch the diffusers repository and install the requirements of its
# text-to-image example.
!git clone https://github.com/huggingface/diffusers
# NOTE(review): each `!` command runs in its own subshell, so this `cd` does not
# change the notebook's working directory; the install line below accordingly
# uses a path that starts at the clone's parent directory.
!cd diffusers
!pip install -r diffusers/examples/text_to_image/requirements.txt

In [95]:
# prompt = "beautiful woman wearing glasses in the style of naïve drawing, in the style of quick drawing --W 256 --H 256"

#     # Set the output directory
# output_directory = "generated_images"

#     # Set the number of images to generate
# num_images_to_generate = 10

#     # Generate and save N images with the same prompt
# generate_images_with_prompt(prompt, output_directory, num_images_to_generate)


# # !rm -R generated_images/class/.ipynb_checkpoints
# # !rm -R generated_images/.ipynb_checkpoints
# # !ls generated_images/ -a

# embeddings = process_images_in_directory(output_directory, extract_features)

# embeddings=[e.detach().numpy() for e in embeddings]
# print(np.array(embeddings).shape)
# embeddings = np.array(embeddings).reshape(10, -1)

# filtered_clusters = cluster_embeddings(embeddings, num_clusters=5, dmin_c=2)

# print(f'Filtered clusters: {filtered_clusters}')

# min_distance = 0
# final_cluster_n = 0
# for c in range(len(filtered_clusters)):
#     selected_cluster = [embeddings[i] for i in filtered_clusters[c]['cluster']]
#     dis = calculate_average_distance(np.array(selected_cluster).reshape(len(selected_cluster), -1))
#     print(f'cluster: {c}, distance: {dis}')
#     if dis>0 and dis < min_distance:
#       min_distance=dis
#       final_cluster_n = c
#     elif min_distance==0 and dis>min_distance:
#       min_distance=dis
#       final_cluster_n = c

# print(f'Selected cluster: {final_cluster_n}')

# final_cluster_imgs = filtered_clusters[final_cluster_n]['cluster']

# NOTE(review): `final_cluster_imgs` and `output_directory` are only assigned in
# the commented-out lines above, so this cell presumably relies on values left
# in the notebook session by an earlier run - confirm before running top to bottom.
print(f'Selected images: {final_cluster_imgs}')

# The selected images are copied into a "lora_dataset" sub-directory of the
# directory that holds the generated images.
destination_directory = os.path.join(output_directory, "lora_dataset")
# File names to copy, built from the selected indices.
# NOTE(review): this assumes index i refers to generated_image_{i}.png - verify
# that the embeddings were produced in file-number order.
selected_file_ids = [f'generated_image_{i}.png' for i in final_cluster_imgs]
selected_extensions = ['.png', '.jpg']

copy_selected_files(output_directory, destination_directory, selected_file_ids, selected_extensions)





Selected images: [2 3 6]
generated_images/generated_image_2.png
File copied: generated_image_2.png
generated_images/generated_image_7.png
generated_images/generated_image_10.png
generated_images/generated_image_8.png
generated_images/lora_dataset
generated_images/generated_image_1.png
generated_images/generated_image_5.png
generated_images/generated_image_9.png
generated_images/generated_image_6.png
File copied: generated_image_6.png
generated_images/generated_image_4.png
generated_images/generated_image_3.png
File copied: generated_image_3.png
