# In [None]:
import os
import cv2
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import tensorflow as tf

# Load the model
# The Keras model is read from a path relative to the current working
# directory; extract_features() calls model.predict() on it.
model = tf.keras.models.load_model('model/style.h5')

# Dataset path
# Directory whose image files are listed (non-recursively, via os.listdir)
# and fed to the feature extractor.
data_dir = "D:\\cvLearning\\clustering\\all"

# 3. Dimensionality reduction
def reduce_dimensionality(features, n_components=3):
    """Project feature vectors onto their leading principal components.

    Args:
        features: 2-D array-like of shape (n_samples, n_features).
        n_components: Number of principal components to keep. Defaults to 3.

    Returns:
        The PCA-transformed features, one row per input sample and
        ``n_components`` columns.

    Raises:
        ValueError: Propagated from scikit-learn when the input cannot
            support the requested number of components.
    """
    # A fresh PCA is fitted on every call, so the projection is specific to
    # the data passed in and cannot be reused for other samples.
    pca = PCA(n_components=n_components)
    return pca.fit_transform(features)

# 4. Incremental clustering
def cluster_authors(features, max_clusters, random_state=None):
    """Pick the k-means labeling with the best silhouette score.

    Runs MiniBatchKMeans for every cluster count from 2 up to
    ``max_clusters`` and keeps the labeling whose average silhouette score
    is highest.

    Args:
        features: 2-D array-like of shape (n_samples, n_features).
        max_clusters: Largest number of clusters to try (inclusive).
        random_state: Optional seed forwarded to MiniBatchKMeans so that
            runs can be reproduced. ``None`` keeps the unseeded behaviour.

    Returns:
        The cluster labels of the best-scoring run, or ``None`` when no
        cluster count could be evaluated (``max_clusters`` below 2, or
        fewer than 3 samples).
    """
    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1,
    # and k-means cannot fit more clusters than samples, so cap the search
    # range instead of crashing on small datasets.
    n_samples = len(features)
    upper_bound = min(max_clusters, n_samples - 1)

    best_score = -1
    best_cluster_labels = None
    for num_clusters in range(2, upper_bound + 1):
        kmeans = MiniBatchKMeans(
            n_clusters=num_clusters, random_state=random_state
        )
        cluster_labels = kmeans.fit_predict(features)
        # Degenerate data (e.g. all-identical points) can collapse into a
        # single label, which silhouette_score rejects; skip such runs.
        if len(np.unique(cluster_labels)) < 2:
            continue
        silhouette_avg = silhouette_score(features, cluster_labels)
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_cluster_labels = cluster_labels
    return best_cluster_labels

# 1. Preprocessing in the feature extraction function
def preprocess_image(image, target_size=(224, 224)):
    """Resize an image and scale its pixel values by 1/255.

    Args:
        image: Image array as returned by ``cv2.imread``.
        target_size: ``(width, height)`` passed to ``cv2.resize``. Defaults
            to ``(224, 224)``.

    Returns:
        The resized image divided by 255.0 (a floating-point array).

    Raises:
        ValueError: If ``image`` is ``None``, which is what ``cv2.imread``
            returns for a missing or undecodable file.
    """
    # Fail with a clear message instead of an opaque OpenCV assertion from
    # cv2.resize when the upstream read failed.
    if image is None:
        raise ValueError(
            "preprocess_image received None; the image could not be read"
        )
    # NOTE(review): no colour-channel reordering is done here; confirm the
    # model expects the channel order produced by the image loader.
    resized = cv2.resize(image, target_size)
    return resized / 255.0  # Normalize

# 1. Feature extraction
def extract_features(image_paths):
    """Run the model on a group of images and return flat feature vectors.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        Array with one row per image, each row being the flattened model
        output for that image, in the order of ``image_paths``. For an
        empty input an empty 1-D array is returned.

    Raises:
        ValueError: If an image file cannot be read.
    """
    images = []
    for image_path in image_paths:
        image = cv2.imread(image_path)
        # cv2.imread signals failure by returning None rather than raising,
        # so report the offending path here.
        if image is None:
            raise ValueError("Could not read image: {}".format(image_path))
        # Preprocess the image according to the model's needs
        images.append(preprocess_image(image))

    if not images:
        return np.array([])

    # One predict call for the whole group instead of one call per image.
    predictions = model.predict(np.stack(images))
    # Flatten each sample's output into a single feature vector.
    return np.asarray(predictions).reshape(len(images), -1)

# 2. Batch read the dataset and extract features
batch_size = 16
# Match extensions case-insensitively so files such as "photo.JPG" are not
# silently skipped, and sort because os.listdir returns entries in arbitrary
# order; sorting keeps the row order (and label order) reproducible.
image_paths = sorted(
    os.path.join(data_dir, filename)
    for filename in os.listdir(data_dir)
    if filename.lower().endswith(('.jpg', '.jpeg', '.png'))
)
if not image_paths:
    # Without this check np.concatenate fails below with an unhelpful
    # "need at least one array to concatenate" message.
    raise ValueError(
        "No .jpg/.jpeg/.png images found in {}".format(data_dir)
    )

feature_vectors = []
for i in range(0, len(image_paths), batch_size):
    batch_image_paths = image_paths[i:i + batch_size]
    batch_features = extract_features(batch_image_paths)
    feature_vectors.append(batch_features)
# Stack the per-batch arrays into one array with one row per image, in the
# same order as image_paths.
feature_vectors = np.concatenate(feature_vectors)

# Clustering
reduced_features = reduce_dimensionality(feature_vectors)
# Cluster on the PCA-reduced features so that the dimensionality-reduction
# step actually feeds the clustering step.
cluster_labels = cluster_authors(reduced_features, max_clusters=10)
# feature_vectors has one row per image, so axis 1 is the feature dimension
# (axis 0 would be the number of samples).
dim = feature_vectors.shape[1]
print("Dimension of feature vectors:", dim)
# Output the clustering labels for each sample (same order as the rows of
# feature_vectors)
print(cluster_labels)
