In [8]:
import os
import csv
from PIL import Image
import pandas as pd
import numpy as np
from prettytable import PrettyTable

# Root folder of the dataset, expected directly under the current working directory.
# Built with os.path.join so the separator always matches the host OS
# (plain concatenation with '/' yields mixed separators on Windows).
dataset_folder = os.path.join(os.getcwd(), 'Gemstones_dataset_short')

def count_images_per_category(root_dir, extensions=None):
    """Count the files contained in each category sub-folder of ``root_dir``.

    Every immediate sub-directory of ``root_dir`` is treated as one category.
    Only regular files directly inside a category folder are counted; nested
    directories are ignored so they are not mistaken for images.

    Args:
        root_dir: Path of the folder whose sub-directories are the categories.
        extensions: Optional file extension (``'.jpg'``) or iterable of
            extensions used to restrict the count. Matching is
            case-insensitive. ``None`` (default) counts every regular file.

    Returns:
        dict mapping each category folder name to its file count. Categories
        without matching files are present with a count of 0.

    Raises:
        OSError: propagated from ``os.listdir`` when ``root_dir`` cannot be read.
    """
    # Normalise the filter once: accept a bare string and compare lower-cased.
    if isinstance(extensions, str):
        extensions = (extensions,)
    if extensions is not None:
        extensions = tuple(ext.lower() for ext in extensions)

    category_counts = {}
    for foldername in os.listdir(root_dir):
        folder_path = os.path.join(root_dir, foldername)
        if not os.path.isdir(folder_path):
            # Loose files sitting next to the category folders are not categories.
            continue

        file_count = 0
        for entry in os.listdir(folder_path):
            if not os.path.isfile(os.path.join(folder_path, entry)):
                continue
            if extensions is None or entry.lower().endswith(extensions):
                file_count += 1
        category_counts[foldername] = file_count
    return category_counts

# Per-split counts: the train and test splits sit side by side under the dataset root
train_folder, test_folder = (os.path.join(dataset_folder, split) for split in ('train', 'test'))
train_category_counts = count_images_per_category(train_folder)
test_category_counts = count_images_per_category(test_folder)

# Every category seen in either split, in alphabetical order, with its combined count
all_categories = sorted(train_category_counts.keys() | test_category_counts.keys())
combined_category_counts = {
    name: train_category_counts.get(name, 0) + test_category_counts.get(name, 0)
    for name in all_categories
}

# Grand total over all categories and both splits
total_count = sum(combined_category_counts.values())

# One row per category (a split that lacks the category shows 0), then a summary row
table = PrettyTable(['Category', 'Train Count', 'Test Count', 'Total Count'])
for name in all_categories:
    table.add_row([
        name,
        train_category_counts.get(name, 0),
        test_category_counts.get(name, 0),
        combined_category_counts[name],
    ])

train_total = sum(train_category_counts.values())
test_total = sum(test_category_counts.values())
table.add_row(["Total", train_total, test_total, total_count])
print(table)

num_categories = len(combined_category_counts)
print(f"\nNumber of categories: {num_categories}")

+-------------------+-------------+------------+-------------+
|      Category     | Train Count | Test Count | Total Count |
+-------------------+-------------+------------+-------------+
|    Alexandrite    |      68     |     8      |      76     |
|     Almandine     |      62     |     9      |      71     |
|     Amazonite     |      64     |     8      |      72     |
|       Amber       |      62     |     8      |      70     |
|      Amethyst     |      70     |     8      |      78     |
|      Ametrine     |      70     |     8      |      78     |
|     Andalusite    |      64     |     8      |      72     |
|     Andradite     |      62     |     8      |      70     |
|     Aquamarine    |      72     |     10     |      82     |
|  Aventurine Green |      90     |     10     |     100     |
| Aventurine Yellow |      68     |     8      |      76     |
|       Total       |     752     |     93     |     845     |
+-------------------+-------------+------------+-------

In [12]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from colorthief import ColorThief
from sklearn.cluster import KMeans

# Set the target directory.
# Resolved relative to the working directory so the cell does not depend on
# one user's absolute local path.
target_dir = os.path.join(os.getcwd(), 'Gemstones_dataset_short')

# Processing parameters
IMAGE_SIZE = (128, 128)      # size every image is resized to
WHITE_THRESHOLD = 200        # a pixel is "white" when R, G and B all exceed this
N_CLUSTERS = 4               # number of k-means colour clusters per image
PALETTE_COLOR_COUNT = 3      # color_count passed to ColorThief.get_palette
N_DOMINANT_COLORS = 4        # number of dominant_color_* columns in the DataFrame

# Create a list to store the vectorized images and dominant colors
# (vectorized_images is not populated in this cell; kept so the name stays defined)
vectorized_images = []
labels = []
dominant_colors = []

# Counter to track the number of folders processed (not updated in this cell)
folder_count = 0


def _flatten_white_background(image, threshold=WHITE_THRESHOLD, size=IMAGE_SIZE):
    """Return an RGB copy of ``image`` resized to ``size`` on a pure white background.

    Pixels whose red, green and blue values all exceed ``threshold`` are made
    fully transparent, the image is resized, and the result is pasted on a
    white canvas using its own alpha channel as the mask.
    """
    rgba = np.array(image.convert("RGBA"))
    near_white = (rgba[..., :3] > threshold).all(axis=-1)
    rgba[near_white, 3] = 0  # alpha channel -> transparent
    cutout = Image.fromarray(rgba).resize(size)

    canvas = Image.new("RGBA", cutout.size, "WHITE")  # Create a white rgba background
    canvas.paste(cutout, (0, 0), cutout)               # Paste the image on the background.
    return canvas.convert('RGB')


def _quantize_colors(image, n_clusters=N_CLUSTERS, seed=42):
    """Return a copy of RGB ``image`` where each pixel is replaced by its k-means cluster centre."""
    pixel_grid = np.asarray(image)
    pixels = pixel_grid.reshape(-1, 3)

    kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
    kmeans.fit(pixels)

    # Turn each data point into its cluster center, then restore the image shape
    quantized = kmeans.cluster_centers_[kmeans.labels_].reshape(pixel_grid.shape)
    return Image.fromarray(quantized.astype(np.uint8))


# Loop through each sub-directory
for subdir, dirs, files in os.walk(target_dir):
    dirs.sort()  # makes the traversal (and therefore the row order) deterministic
    for file in sorted(files):
        # Check if the file is a JPEG image (extension compared case-insensitively)
        if not file.lower().endswith(('.jpg', '.jpeg')):
            continue

        # Construct the full file path
        file_path = os.path.join(subdir, file)

        # Open the image inside a context manager so the file handle is released
        with Image.open(file_path) as source_image:
            image = _flatten_white_background(source_image)

        new_image = _quantize_colors(image)
        image.close()

        # Save the resized image as PNG, due to issues with transparency.
        # NOTE: the PNG is written next to the source JPEG, so any listing-based
        # file count of these folders will include it afterwards.
        # splitext keeps file names that contain extra dots intact.
        new_file_path = os.path.join(subdir, f'{os.path.splitext(file)[0]}.png')
        new_image.save(new_file_path)

        # Get the dominant colors using ColorThief
        color_thief = ColorThief(new_file_path)
        palette = list(color_thief.get_palette(color_count=PALETTE_COLOR_COUNT, quality=1))
        # Force a fixed width so the DataFrame construction below cannot fail
        # when the returned palette is shorter or longer than expected.
        palette = (palette + [None] * N_DOMINANT_COLORS)[:N_DOMINANT_COLORS]

        # Record label and colours together so both lists always stay aligned.
        # basename works with either path separator, unlike splitting on "\\".
        labels.append(os.path.basename(subdir))
        dominant_colors.append(palette)

        print(f'Resized {file} to {IMAGE_SIZE[0]}x{IMAGE_SIZE[1]} and saved as {new_file_path}')

# Convert the image data and labels to DataFrame
df = pd.DataFrame(
    dominant_colors,
    columns=[f'dominant_color_{i+1}' for i in range(N_DOMINANT_COLORS)],
)
df['label'] = labels
df.head()

Resized alexandrite_18.jpg to 128x128 and saved as C:/Users/natal/Desktop/School/SEM_6/MAL1/Final/Gemstones_dataset_short\test\Alexandrite\alexandrite_18.png
Resized alexandrite_28.jpg to 128x128 and saved as C:/Users/natal/Desktop/School/SEM_6/MAL1/Final/Gemstones_dataset_short\test\Alexandrite\alexandrite_28.png
Resized alexandrite_3.jpg to 128x128 and saved as C:/Users/natal/Desktop/School/SEM_6/MAL1/Final/Gemstones_dataset_short\test\Alexandrite\alexandrite_3.png
Resized alexandrite_6.jpg to 128x128 and saved as C:/Users/natal/Desktop/School/SEM_6/MAL1/Final/Gemstones_dataset_short\test\Alexandrite\alexandrite_6.png
Resized almandine_18.jpg to 128x128 and saved as C:/Users/natal/Desktop/School/SEM_6/MAL1/Final/Gemstones_dataset_short\test\Almandine\almandine_18.png
Resized almandine_28.jpg to 128x128 and saved as C:/Users/natal/Desktop/School/SEM_6/MAL1/Final/Gemstones_dataset_short\test\Almandine\almandine_28.png
Resized almandine_3.jpg to 128x128 and saved as C:/Users/natal/Deskt

Unnamed: 0,dominant_color_1,dominant_color_2,dominant_color_3,dominant_color_4,label
0,"(68, 100, 108)","(36, 44, 60)","(100, 156, 156)","(68, 116, 132)",Alexandrite
1,"(156, 188, 188)","(92, 124, 124)","(28, 44, 52)","(160, 136, 160)",Alexandrite
2,"(52, 116, 124)","(116, 172, 188)","(12, 60, 60)","(68, 84, 108)",Alexandrite
3,"(140, 172, 180)","(60, 87, 76)","(92, 176, 148)","(92, 176, 148)",Alexandrite
4,"(148, 52, 52)","(52, 28, 28)","(148, 132, 132)","(152, 52, 92)",Almandine


In [10]:
# Write the DataFrame to CSV in the working directory, leaving out the row index
df.to_csv(path_or_buf='image_data_short.csv', index=False)