# Set Up

In [None]:
import os
import shutil
import zipfile
import requests

zip_file = 'Data/outfit_dataset.zip'
output_dir = 'Downloaded-Images/KaggleFashion'  # Where the extracted images should go

# Extract the dataset
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(output_dir)

# Create a directory to store the downloaded images.
# NOTE(review): this is the same path as output_dir, so a successful download
# is written into the tree that was just extracted and replaces any top-level
# file of the same name -- confirm this is intended.
download_dir = 'Downloaded-Images/KaggleFashion'
os.makedirs(download_dir, exist_ok=True)

# Iterate over the extracted .png files and try to download each one.
for root, dirs, files in os.walk(output_dir):
    for file in files:
        if not file.endswith('.png'):
            continue

        image_file = os.path.join(download_dir, file)

        # Construct the download link: the id is the text between the last
        # underscore of the file name and the first dot that follows it.
        file_id = file.split('_')[-1].split('.')[0]
        download_link = f"https://drive.google.com/uc?id={file_id}&export=download"

        # Download the image using requests. The timeout keeps one stalled
        # connection from hanging the whole run, and the context manager
        # releases the streamed connection when the body has been copied.
        try:
            with requests.get(download_link, stream=True, timeout=30) as response:
                if response.status_code != 200:
                    continue
                with open(image_file, 'wb') as f:
                    response.raw.decode_content = True
                    shutil.copyfileobj(response.raw, f)
        except requests.RequestException as exc:
            print(f"Failed to download {file}: {exc}")
            continue

        print(f"Downloaded image: {file}")
print("All images downloaded.")

All images downloaded.


# CSV With Link to All Images: Saved in DataFrame and CSV Folder

In [None]:
import os
import csv
import pandas as pd

# Root folder that is scanned for images.
# NOTE(review): the output files of this cell are named "KaggleFashion" while
# this path points at "PracticeTest" -- confirm the intended input folder.
download_dir = 'Downloaded-Images/PracticeTest'
batch_size = 50  # Number of images in each batch (not referenced in this cell)

# Rows of [file name, absolute file path], filled in by process_files()
image_info = []

def process_files(root):
    """Append one ``[file name, absolute path]`` row to ``image_info`` per image.

    Walks *root* recursively. A file is recorded when its name ends with
    ``.jpg`` and does not start with a dot (hidden files are skipped).

    Args:
        root: Directory to scan.
    """
    for file in os.listdir(root):
        file_path = os.path.join(root, file)

        if os.path.isdir(file_path):
            process_files(file_path)  # Recursively process subdirectories
        # The hidden-file test must look at the file name itself: the joined
        # path starts with the directory, so testing it never matches.
        elif file.endswith('.jpg') and not file.startswith('.'):
            image_info.append([file, os.path.abspath(file_path)])

# Scan download_dir (including all of its subfolders) and fill image_info
process_files(download_dir)

# Create a DataFrame from the image information
df = pd.DataFrame(image_info, columns=['Image File', 'File Path'])

# The same CSV is written to two folders. exist_ok=True replaces the separate
# exists() check, which could race with another process creating the folder.
folder1 = 'CSV/Color-Analysis/KaggleFashion'
os.makedirs(folder1, exist_ok=True)

folder2 = 'CSV/KaggleFashion'
os.makedirs(folder2, exist_ok=True)

# Save the DataFrame as a CSV file in the first folder
csv_file1 = os.path.join(folder1, 'outfit-unprocessed-KaggleFashion.csv')
df.to_csv(csv_file1, index=False)

# Save the DataFrame as a CSV file in the second folder
csv_file2 = os.path.join(folder2, 'outfit-unprocessed-KaggleFashion.csv')
df.to_csv(csv_file2, index=False)

print(f"DataFrame saved in Folder 1: {csv_file1}")
print(f"DataFrame saved in Folder 2: {csv_file2}")

ImportError: dlopen(/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/_libs/interval.cpython-311-darwin.so, 0x0002): tried: '/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/_libs/interval.cpython-311-darwin.so' (mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64')), '/System/Volumes/Preboot/Cryptexes/OS/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/_libs/interval.cpython-311-darwin.so' (no such file), '/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/_libs/interval.cpython-311-darwin.so' (mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64'))

# DataFrame: Month and Year Image Was Collected

In [None]:
import os
import pandas as pd
from PIL import Image
import datetime

# Folder that is scanned (recursively) for images
input_folder = "Downloaded-Images/KaggleFashion"
# CSV that is loaded below; it must provide a 'File Path' column
existing_csv = "CSV/KaggleFashion/outfit-unprocessed-KaggleFashion.csv"
# Destination folder and file for the DataFrame with the collection dates
dataframe_folder = "CSV/KaggleFashion"
new_dataframe_csv = os.path.join(dataframe_folder, "KaggleFashion-TimeCollection.csv")
# Two further folders that receive a copy of the updated CSV
updated_csv_folder = "CSV/TimeCollection"
existing_updated_csv_folder = "CSV/KaggleFashion"  # Same path as dataframe_folder

# Read the existing CSV file into a pandas DataFrame
df = pd.read_csv(existing_csv)

# Extract file name from the 'File Path' column (text after the last '/').
# NOTE(review): splitting on '/' assumes POSIX-style paths in the CSV.
df['File Name'] = df['File Path'].str.split('/').str[-1]

# One {'File Name', 'Date of Collection'} dict per image, filled by process_files()
image_details = []

# Function to process files in a directory recursively
def process_files(root):
    """Record the collection month of every ``.jpg`` image under *root*.

    For each non-hidden ``.jpg`` file the ``YYYY-MM`` month of the file's
    ctime is written to the 'Date of Collection' column of the module-level
    ``df`` (rows whose 'File Name' equals the file name) and appended to
    ``image_details``.

    Args:
        root: Directory to scan; subdirectories are processed recursively.
    """
    for file in os.listdir(root):
        file_path = os.path.join(root, file)

        if os.path.isdir(file_path):
            process_files(file_path)  # Recursively process subdirectories
            continue

        # Test the file name, not the joined path: the path starts with the
        # directory, so checking it for a leading dot never skips hidden files.
        if not file.endswith('.jpg') or file.startswith('.'):
            continue

        # Open the file with PIL so that a file PIL cannot identify still
        # raises; the context manager closes the handle, which was leaked.
        with Image.open(file_path):
            pass

        # NOTE(review): os.path.getctime is the creation time on Windows but
        # the last metadata change on Unix -- confirm this is the date wanted.
        creation_time = os.path.getctime(file_path)

        # Extract year and month from the timestamp
        dt = datetime.datetime.fromtimestamp(creation_time)
        month_year = dt.strftime("%Y-%m")

        # Update the 'Date of Collection' column for the matching rows
        file_indices = df[df['File Name'] == file].index
        df.loc[file_indices, 'Date of Collection'] = month_year

        # Keep the same information as a plain record
        image_details.append({
            'File Name': file,
            'Date of Collection': month_year
        })

# Scan input_folder (including all of its subfolders) and update df
process_files(input_folder)

# Remove the 'Collection Time' column if it exists
if 'Collection Time' in df.columns:
    df.drop(columns=['Collection Time'], inplace=True)

# to_csv does not create missing directories, so make sure every destination
# folder exists before writing (CSV/TimeCollection is not created elsewhere
# in this cell).
os.makedirs(dataframe_folder, exist_ok=True)
os.makedirs(updated_csv_folder, exist_ok=True)
os.makedirs(existing_updated_csv_folder, exist_ok=True)

# Save the updated DataFrame to a new CSV file in the DataFrame folder
df.to_csv(new_dataframe_csv, index=False)

# Save the updated DataFrame to multiple CSV files in different folders
updated_csv_file_1 = os.path.join(updated_csv_folder, "TimeCollection-KaggleFashion.csv")
updated_csv_file_2 = os.path.join(existing_updated_csv_folder, "TimeCollection-KaggleFashion.csv")
df.to_csv(updated_csv_file_1, index=False)
df.to_csv(updated_csv_file_2, index=False)

print("New DataFrame CSV file created: " + new_dataframe_csv)
print("Updated CSV files created: " + updated_csv_file_1 + ", " + updated_csv_file_2)
# NOTE(review): this cell performs no image compression; the message is kept
# unchanged so that it matches the recorded output.
print("Image compression and DataFrame update completed!")

New DataFrame CSV file created: CSV/KaggleFashion/KaggleFashion-TimeCollection.csv
Updated CSV files created: CSV/TimeCollection/TimeCollection-KaggleFashion.csv, CSV/KaggleFashion/TimeCollection-KaggleFashion.csv
Image compression and DataFrame update completed!


# DataFrame: Raw Images Pie Chart + CSV (Top 3)

In [None]:
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import pandas as pd
import matplotlib.pyplot as plt

def RGB_HEX(color):
    """Return the ``#rrggbb`` hex string for an RGB triple.

    The first three components of *color* are truncated with ``int()`` and
    written as lowercase hexadecimal, zero-padded to at least two digits.
    """
    red, green, blue = int(color[0]), int(color[1]), int(color[2])
    return f"#{red:02x}{green:02x}{blue:02x}"

def get_colors(image_path, number_of_colors):
    """Return the dominant colours of an image, most frequent first.

    The image is read with OpenCV, converted from BGR to RGB, resized with
    ``cv2.resize(image, (600, 400))`` and its pixels are clustered with
    KMeans. The cluster centres are ordered by the number of pixels assigned
    to them, so slicing the result (e.g. ``[:3]``) yields the top colours.

    Args:
        image_path: Path of the image file.
        number_of_colors: Number of KMeans clusters (colours) to extract.

    Returns:
        ``(hex_colors, rgb_colors)``: two lists of length *number_of_colors*
        in the same order; hex strings from ``RGB_HEX`` and the cluster
        centres as arrays.

    Raises:
        ValueError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports an unreadable file by returning None
        raise ValueError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Flatten the resized image into one row of 3 channel values per pixel
    pixels = cv2.resize(image, (600, 400)).reshape(-1, 3)

    clf = KMeans(n_clusters=number_of_colors)
    clf.fit(pixels)
    center_colors = clf.cluster_centers_

    # Cluster indices carry no ranking, so sort them by cluster size
    # (largest first) before exposing them as the ordered colours.
    counts = np.bincount(clf.labels_, minlength=number_of_colors)
    order = np.argsort(-counts, kind="stable")

    ordered_colors = [center_colors[i] for i in order]
    hex_colors = [RGB_HEX(color) for color in ordered_colors]
    rgb_colors = list(ordered_colors)

    return hex_colors, rgb_colors

# Folder whose images are analysed
input_folder = 'Downloaded-Images/KaggleFashion'

# Destinations for the pie charts and for the two copies of each CSV
output_folder_pie = 'Pie-Charts/Raw/Dominant/KaggleFashion'
output_folder_csv_1 = 'CSV/Color-Analysis/KaggleFashion'
output_folder_csv_2 = 'CSV/KaggleFashion'

# Make sure all destinations exist
os.makedirs(output_folder_pie, exist_ok=True)
os.makedirs(output_folder_csv_1, exist_ok=True)
os.makedirs(output_folder_csv_2, exist_ok=True)

# Rows collected since the last CSV was written
results = []

# Images are written out in batches of 1250
count = 0  # Number of images processed so far
batch_count = 0  # Number of full batches written so far
for root, _, files in os.walk(input_folder):
    for file in files:
        # Only PNG/JPEG files are analysed
        if not file.endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(root, file)
        hex_colors, rgb_colors = get_colors(image_path, number_of_colors=8)

        # Keep the first three of the eight colours returned by get_colors
        results.append({'Image': file, 'Colors': rgb_colors[:3]})

        # Draw three equal slices in those colours and save the chart
        plt.figure()
        plt.pie([1, 1, 1], colors=hex_colors[:3])
        pie_chart_path = os.path.join(output_folder_pie, f"{file}.png")
        plt.savefig(pie_chart_path)
        plt.close()

        count += 1

        # Nothing more to do until a full batch of 1250 has accumulated
        if count % 1250 != 0:
            continue

        batch_count += 1
        print(f"{count} processed")

        # Write the batch to both CSV folders
        df = pd.DataFrame(results)
        output_csv_file_batch_1 = os.path.join(output_folder_csv_1, f'Dominant-Raw-Batch{batch_count}.csv')
        output_csv_file_batch_2 = os.path.join(output_folder_csv_2, f'Dominant-Raw-Batch{batch_count}.csv')
        df.to_csv(output_csv_file_batch_1, index=False)
        df.to_csv(output_csv_file_batch_2, index=False)

        # Start the next batch with an empty list
        results.clear()

# Rows left over after the last full batch go to a separate pair of files
if results:
    df = pd.DataFrame(results)

    output_csv_file_remaining_1 = os.path.join(output_folder_csv_1, 'Dominant-Raw-KaggleFashion.csv')
    output_csv_file_remaining_2 = os.path.join(output_folder_csv_2, 'Dominant-Raw-KaggleFashion.csv')
    df.to_csv(output_csv_file_remaining_1, index=False)
    df.to_csv(output_csv_file_remaining_2, index=False)

print("Processing completed.")

# DataFrame: Processed Images Pie Chart + CSV (Top 3)

In [None]:
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import pandas as pd
import matplotlib.pyplot as plt

def RGB_HEX(color):
    """Format the first three components of *color* as a ``#rrggbb`` string.

    Components are truncated with ``int()``; each is written as lowercase
    hexadecimal, zero-padded to at least two digits.
    """
    channels = (int(color[0]), int(color[1]), int(color[2]))
    return "#%02x%02x%02x" % channels

def get_colors(image_path, number_of_colors):
    """Return the dominant colours of an image, most frequent first.

    The image is read with OpenCV, converted from BGR to RGB, resized with
    ``cv2.resize(image, (600, 400))`` and its pixels are clustered with
    KMeans. The cluster centres are ordered by the number of pixels assigned
    to them, so slicing the result (e.g. ``[:3]``) yields the top colours.

    Args:
        image_path: Path of the image file.
        number_of_colors: Number of KMeans clusters (colours) to extract.

    Returns:
        ``(hex_colors, rgb_colors)``: two lists of length *number_of_colors*
        in the same order; hex strings from ``RGB_HEX`` and the cluster
        centres as arrays.

    Raises:
        ValueError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports an unreadable file by returning None
        raise ValueError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Flatten the resized image into one row of 3 channel values per pixel
    pixels = cv2.resize(image, (600, 400)).reshape(-1, 3)

    clf = KMeans(n_clusters=number_of_colors)
    clf.fit(pixels)
    center_colors = clf.cluster_centers_

    # Cluster indices carry no ranking, so sort them by cluster size
    # (largest first) before exposing them as the ordered colours.
    counts = np.bincount(clf.labels_, minlength=number_of_colors)
    order = np.argsort(-counts, kind="stable")

    ordered_colors = [center_colors[i] for i in order]
    hex_colors = [RGB_HEX(color) for color in ordered_colors]
    rgb_colors = list(ordered_colors)

    return hex_colors, rgb_colors

# Folder whose images are analysed
input_folder = 'Processed-Images/KaggleFashion'

# Destinations for the pie charts and for the two copies of each CSV
output_folder_pie = 'Pie-Charts/Processed/Dominant/KaggleFashion'
output_folder_csv_1 = 'CSV/Color-Analysis/KaggleFashion'
output_folder_csv_2 = 'CSV/KaggleFashion'

# Make sure all destinations exist
os.makedirs(output_folder_pie, exist_ok=True)
os.makedirs(output_folder_csv_1, exist_ok=True)
os.makedirs(output_folder_csv_2, exist_ok=True)

# Rows collected since the last CSV was written
results = []

# Images are written out in batches of 1250
count = 0  # Number of images processed so far
batch_count = 0  # Number of full batches written so far
for root, _, files in os.walk(input_folder):
    for file in files:
        # Only PNG/JPEG files are analysed
        if not file.endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(root, file)
        hex_colors, rgb_colors = get_colors(image_path, number_of_colors=8)

        # Keep the first three of the eight colours returned by get_colors
        results.append({'Image': file, 'Colors': rgb_colors[:3]})

        # Draw three equal slices in those colours and save the chart
        plt.figure()
        plt.pie([1, 1, 1], colors=hex_colors[:3])
        pie_chart_path = os.path.join(output_folder_pie, f"{file}.png")
        plt.savefig(pie_chart_path)
        plt.close()

        count += 1

        # Nothing more to do until a full batch of 1250 has accumulated
        if count % 1250 != 0:
            continue

        batch_count += 1
        print(f"{count} processed")

        # Write the batch to both CSV folders
        df = pd.DataFrame(results)
        output_csv_file_batch_1 = os.path.join(output_folder_csv_1, f'Dominant-Processed-Batch{batch_count}.csv')
        output_csv_file_batch_2 = os.path.join(output_folder_csv_2, f'Dominant-Processed-Batch{batch_count}.csv')
        df.to_csv(output_csv_file_batch_1, index=False)
        df.to_csv(output_csv_file_batch_2, index=False)

        # Start the next batch with an empty list
        results.clear()

# Rows left over after the last full batch go to a separate pair of files
if results:
    df = pd.DataFrame(results)

    output_csv_file_remaining_1 = os.path.join(output_folder_csv_1, 'Dominant-Processed-KaggleFashion.csv')
    output_csv_file_remaining_2 = os.path.join(output_folder_csv_2, 'Dominant-Processed-KaggleFashion.csv')
    df.to_csv(output_csv_file_remaining_1, index=False)
    df.to_csv(output_csv_file_remaining_2, index=False)

print("Processing completed.")



# DataFrame: Most Popular Color from the Dataset of Processed Images

In [None]:
# Goes through 1250 images at a time of Pie Charts
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import pandas as pd

def RGB_HEX(color):
    """Return the ``#rrggbb`` hex string for an RGB triple.

    The first three components of *color* are truncated with ``int()`` and
    written as lowercase hexadecimal, zero-padded to at least two digits.
    """
    digits = [format(int(color[i]), "02x") for i in range(3)]
    return "#" + "".join(digits)

def get_most_prominent_color(image_paths):
    """Return one representative colour for a set of images as ``#rrggbb``.

    Every readable image is converted to RGB, resized with
    ``cv2.resize(image, (600, 400))`` and flattened to pixels; the pixels of
    all images are pooled and fitted with a single-cluster KMeans whose
    centre is formatted with ``RGB_HEX``.

    NOTE(review): with ``n_clusters=1`` the fitted centre is the mean of the
    pooled pixels, i.e. an average colour rather than the most frequent one.

    Args:
        image_paths: Sized collection of image file paths.

    Returns:
        The hex colour string.

    Raises:
        ValueError: If none of the images could be read.
    """
    pixel_blocks = []

    counter = 0
    print('input paths ' + str(len(image_paths)))

    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None for unreadable files; skip them instead
            # of failing the whole batch inside cvtColor.
            print('Skipping unreadable image: ' + str(image_path))
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Keep one (pixels, 3) array per image. Extending a Python list with
        # the array would create a separate object for every single pixel.
        pixel_blocks.append(cv2.resize(image, (600, 400)).reshape(-1, 3))
        counter = counter + 1

    if not pixel_blocks:
        raise ValueError('No readable images to analyse')

    colors = np.concatenate(pixel_blocks)
    clf = KMeans(n_clusters=1)  # Only one cluster for the most prominent color
    clf.fit(colors)
    center_color = clf.cluster_centers_[0]
    hex_color = RGB_HEX(center_color)
    print('counter ' + str(counter))

    return hex_color

# Set the input folder containing images
input_folder = 'Pie-Charts/Processed/Dominant/KaggleFashion'

# Set the output folder paths for saving the DataFrame as CSV
output_folder_1 = 'CSV/Color-Analysis/KaggleFashion'
output_folder_2 = 'CSV/KaggleFashion'

# Create the output folders if they don't exist
os.makedirs(output_folder_1, exist_ok=True)
os.makedirs(output_folder_2, exist_ok=True)

# Set the batch size
batch_size = 1250

# Image paths of the batch that is currently being collected
image_paths = []

# Running count of images handed to process_batch across all batches
total_processed = 0

# Function to process a batch of image paths
def process_batch(image_paths):
    """Compute the prominent colour of one batch and save it as a one-row CSV.

    NOTE(review): every call writes to the same two file names, so each batch
    replaces the CSV written by the previous one and only the last batch's
    colour remains on disk -- confirm whether per-batch results should be kept.

    Args:
        image_paths: Image file paths that make up the batch.
    """
    # Get the most prominent color from the image paths
    prominent_color = get_most_prominent_color(image_paths)

    # Create a DataFrame with a single row containing the most prominent color
    df = pd.DataFrame({'Prominent Color': [prominent_color]})

    # Print the DataFrame
    print(df)

    # Save the DataFrame as CSV in two different folders
    output_csv_file_1 = os.path.join(output_folder_1, 'MostPopular-Processed-Dominant-KaggleFashionP.csv')
    output_csv_file_2 = os.path.join(output_folder_2, 'MostPopular-Processed-Dominant-KaggleFashionP.csv')

    df.to_csv(output_csv_file_1, index=False)
    df.to_csv(output_csv_file_2, index=False)

# Traverse subfolders and collect image paths from the input folder
for root, dirs, files in os.walk(input_folder):
    for file in files:
        if file.endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(root, file)
            image_paths.append(image_path)

            # Process the batch when it reaches the specified batch size
            if len(image_paths) == batch_size:
                process_batch(image_paths)
                total_processed += len(image_paths)

                # Clear the image paths list for the next batch
                image_paths = []

                # Print the number of images processed
                print('Processed', batch_size, 'images')

# Process any remaining image paths in the last batch
if len(image_paths) > 0:
    process_batch(image_paths)
    total_processed += len(image_paths)

    # Print the number of images processed
    print('Processed', len(image_paths), 'images')

# Print the total number of image paths processed. image_paths only holds the
# last (partial) batch at this point, so the running total is reported.
print('Total image paths processed:', total_processed)

In [None]:
# Goes through 1250 images at a time of Images
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import pandas as pd

def RGB_HEX(color):
    """Return the ``#rrggbb`` hex string for an RGB triple.

    The first three components of *color* are truncated with ``int()`` and
    written as lowercase hexadecimal, zero-padded to at least two digits.
    """
    red, green, blue = int(color[0]), int(color[1]), int(color[2])
    return f"#{red:02x}{green:02x}{blue:02x}"

def get_most_prominent_color(image_paths):
    """Return one representative colour for a set of images as ``#rrggbb``.

    Every readable image is converted to RGB, resized with
    ``cv2.resize(image, (600, 400))`` and flattened to pixels; the pixels of
    all images are pooled and fitted with a single-cluster KMeans whose
    centre is formatted with ``RGB_HEX``.

    NOTE(review): with ``n_clusters=1`` the fitted centre is the mean of the
    pooled pixels, i.e. an average colour rather than the most frequent one.

    Args:
        image_paths: Sized collection of image file paths.

    Returns:
        The hex colour string.

    Raises:
        ValueError: If none of the images could be read.
    """
    pixel_blocks = []

    counter = 0
    print('input paths ' + str(len(image_paths)))

    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None for unreadable files; skip them instead
            # of failing the whole batch inside cvtColor.
            print('Skipping unreadable image: ' + str(image_path))
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Keep one (pixels, 3) array per image. Extending a Python list with
        # the array would create a separate object for every single pixel.
        pixel_blocks.append(cv2.resize(image, (600, 400)).reshape(-1, 3))
        counter = counter + 1

    if not pixel_blocks:
        raise ValueError('No readable images to analyse')

    colors = np.concatenate(pixel_blocks)
    clf = KMeans(n_clusters=1)  # Only one cluster for the most prominent color
    clf.fit(colors)
    center_color = clf.cluster_centers_[0]
    hex_color = RGB_HEX(center_color)
    print('counter ' + str(counter))

    return hex_color

# Set the input folder containing images
input_folder = 'Processed-Images/KaggleFashion'

# Set the output folder paths for saving the DataFrame as CSV
output_folder_1 = 'CSV/Color-Analysis/KaggleFashion'
output_folder_2 = 'CSV/KaggleFashion'

# Create the output folders if they don't exist
os.makedirs(output_folder_1, exist_ok=True)
os.makedirs(output_folder_2, exist_ok=True)

# Set the batch size
batch_size = 1250

# Image paths of the batch that is currently being collected
image_paths = []

# Running count of images handed to process_batch across all batches
total_processed = 0

# Function to process a batch of image paths
def process_batch(image_paths):
    """Compute the prominent colour of one batch and save it as a one-row CSV.

    NOTE(review): every call writes to the same two file names, so each batch
    replaces the CSV written by the previous one and only the last batch's
    colour remains on disk -- confirm whether per-batch results should be kept.

    Args:
        image_paths: Image file paths that make up the batch.
    """
    # Get the most prominent color from the image paths
    prominent_color = get_most_prominent_color(image_paths)

    # Create a DataFrame with a single row containing the most prominent color
    df = pd.DataFrame({'Prominent Color': [prominent_color]})

    # Print the DataFrame
    print(df)

    # Save the DataFrame as CSV in two different folders
    output_csv_file_1 = os.path.join(output_folder_1, 'MostPopular-Processed-Dominant-KaggleFashionI.csv')
    output_csv_file_2 = os.path.join(output_folder_2, 'MostPopular-Processed-Dominant-KaggleFashionI.csv')

    df.to_csv(output_csv_file_1, index=False)
    df.to_csv(output_csv_file_2, index=False)

# Traverse subfolders and collect image paths from the input folder
for root, dirs, files in os.walk(input_folder):
    for file in files:
        if file.endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(root, file)
            image_paths.append(image_path)

            # Process the batch when it reaches the specified batch size
            if len(image_paths) == batch_size:
                process_batch(image_paths)
                total_processed += len(image_paths)

                # Clear the image paths list for the next batch
                image_paths = []

                # Print the number of images processed
                print('Processed', batch_size, 'images')

# Process any remaining image paths in the last batch
if len(image_paths) > 0:
    process_batch(image_paths)
    total_processed += len(image_paths)

    # Print the number of images processed
    print('Processed', len(image_paths), 'images')

# Print the total number of image paths processed. image_paths only holds the
# last (partial) batch at this point, so the running total is reported.
print('Total image paths processed:', total_processed)

# DataFrame: Least Popular Color from the Dataset of Processed Images

In [None]:
# Goes through 1250 images at a time of Pie Charts
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import pandas as pd

def RGB_HEX(color):
    """Format the first three components of *color* as a ``#rrggbb`` string.

    Components are truncated with ``int()``; each is written as lowercase
    hexadecimal, zero-padded to at least two digits.
    """
    channels = (int(color[0]), int(color[1]), int(color[2]))
    return "#%02x%02x%02x" % channels

def get_least_common_color(image_paths):
    """Return one colour for a set of images as ``#rrggbb``.

    Every readable image is converted to RGB, resized with
    ``cv2.resize(image, (600, 400))`` and flattened to pixels; the pixels of
    all images are pooled and fitted with a single-cluster KMeans
    (``init='random'``) whose centre is formatted with ``RGB_HEX``.

    NOTE(review): with ``n_clusters=1`` the fitted centre is the mean of the
    pooled pixels whatever the initialisation, so this yields an average
    colour and does not isolate a rare one -- confirm the intended algorithm.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        The hex colour string.

    Raises:
        ValueError: If none of the images could be read.
    """
    pixel_blocks = []

    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None for unreadable files; skip them instead
            # of failing the whole batch inside cvtColor.
            print('Skipping unreadable image: ' + str(image_path))
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Keep one (pixels, 3) array per image. Extending a Python list with
        # the array would create a separate object for every single pixel.
        pixel_blocks.append(cv2.resize(image, (600, 400)).reshape(-1, 3))

    if not pixel_blocks:
        raise ValueError('No readable images to analyse')

    colors = np.concatenate(pixel_blocks)
    clf = KMeans(n_clusters=1, init='random')  # Only one cluster for the least common color
    clf.fit(colors)
    center_color = clf.cluster_centers_[0]
    hex_color = RGB_HEX(center_color)

    return hex_color

# Set the input folder containing images
input_folder = 'Pie-Charts/Processed/Dominant/KaggleFashion'

# Set the output folder paths for saving the DataFrame as CSV
output_folder_1 = 'CSV/Color-Analysis/KaggleFashion'
output_folder_2 = 'CSV/KaggleFashion'

# Create the output folders if they don't exist
os.makedirs(output_folder_1, exist_ok=True)
os.makedirs(output_folder_2, exist_ok=True)

# Set the batch size
batch_size = 1250

# Image paths of the batch that is currently being collected
image_paths = []

# Running count of images handed to process_batch across all batches
total_processed = 0

# Function to process a batch of image paths
def process_batch(image_paths):
    """Compute the least common colour of one batch and save it as a one-row CSV.

    NOTE(review): every call writes to the same two file names, so each batch
    replaces the CSV written by the previous one and only the last batch's
    colour remains on disk -- confirm whether per-batch results should be kept.

    Args:
        image_paths: Image file paths that make up the batch.
    """
    # Get the least common color from the image paths
    least_common_color = get_least_common_color(image_paths)

    # Create a DataFrame with a single row containing the least common color
    df = pd.DataFrame({'Least Common Color': [least_common_color]})

    # Print the DataFrame
    print(df)

    # Save the DataFrame as CSV in two different folders
    output_csv_file_1 = os.path.join(output_folder_1, 'LeastPopular-Processed-Dominant-KaggleFashionP.csv')
    output_csv_file_2 = os.path.join(output_folder_2, 'LeastPopular-Processed-Dominant-KaggleFashionP.csv')

    df.to_csv(output_csv_file_1, index=False)
    df.to_csv(output_csv_file_2, index=False)

# Traverse subfolders and collect image paths from the input folder
for root, dirs, files in os.walk(input_folder):
    for file in files:
        if file.endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(root, file)
            image_paths.append(image_path)

            # Process the batch when it reaches the specified batch size
            if len(image_paths) == batch_size:
                process_batch(image_paths)
                total_processed += len(image_paths)

                # Clear the image paths list for the next batch
                image_paths = []

# Process any remaining image paths in the last batch
if len(image_paths) > 0:
    process_batch(image_paths)
    total_processed += len(image_paths)

# Print the total number of image paths processed. image_paths only holds the
# last (partial) batch at this point, so the running total is reported.
print('Total image paths processed:', total_processed)

In [None]:
# Goes through 1250 images at a time of Images
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import pandas as pd

def RGB_HEX(color):
    """Return the ``#rrggbb`` hex string for an RGB triple.

    The first three components of *color* are truncated with ``int()`` and
    written as lowercase hexadecimal, zero-padded to at least two digits.
    """
    digits = [format(int(color[i]), "02x") for i in range(3)]
    return "#" + "".join(digits)

def get_least_common_color(image_paths):
    """Return one colour for a set of images as ``#rrggbb``.

    Every readable image is converted to RGB, resized with
    ``cv2.resize(image, (600, 400))`` and flattened to pixels; the pixels of
    all images are pooled and fitted with a single-cluster KMeans
    (``init='random'``) whose centre is formatted with ``RGB_HEX``.

    NOTE(review): with ``n_clusters=1`` the fitted centre is the mean of the
    pooled pixels whatever the initialisation, so this yields an average
    colour and does not isolate a rare one -- confirm the intended algorithm.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        The hex colour string.

    Raises:
        ValueError: If none of the images could be read.
    """
    pixel_blocks = []

    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None for unreadable files; skip them instead
            # of failing the whole batch inside cvtColor.
            print('Skipping unreadable image: ' + str(image_path))
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Keep one (pixels, 3) array per image. Extending a Python list with
        # the array would create a separate object for every single pixel.
        pixel_blocks.append(cv2.resize(image, (600, 400)).reshape(-1, 3))

    if not pixel_blocks:
        raise ValueError('No readable images to analyse')

    colors = np.concatenate(pixel_blocks)
    clf = KMeans(n_clusters=1, init='random')  # Only one cluster for the least common color
    clf.fit(colors)
    center_color = clf.cluster_centers_[0]
    hex_color = RGB_HEX(center_color)

    return hex_color

# Set the input folder containing images
input_folder = 'Processed-Images/KaggleFashion'

# Set the output folder paths for saving the DataFrame as CSV
output_folder_1 = 'CSV/Color-Analysis/KaggleFashion'
output_folder_2 = 'CSV/KaggleFashion'

# Create the output folders if they don't exist
os.makedirs(output_folder_1, exist_ok=True)
os.makedirs(output_folder_2, exist_ok=True)

# Set the batch size
batch_size = 1250

# Image paths of the batch that is currently being collected
image_paths = []

# Running count of images handed to process_batch across all batches
total_processed = 0

# Function to process a batch of image paths
def process_batch(image_paths):
    """Compute the least common colour of one batch and save it as a one-row CSV.

    NOTE(review): every call writes to the same two file names, so each batch
    replaces the CSV written by the previous one and only the last batch's
    colour remains on disk -- confirm whether per-batch results should be kept.

    Args:
        image_paths: Image file paths that make up the batch.
    """
    # Get the least common color from the image paths
    least_common_color = get_least_common_color(image_paths)

    # Create a DataFrame with a single row containing the least common color
    df = pd.DataFrame({'Least Common Color': [least_common_color]})

    # Print the DataFrame
    print(df)

    # Save the DataFrame as CSV in two different folders
    output_csv_file_1 = os.path.join(output_folder_1, 'LeastPopular-Processed-Dominant-KaggleFashionI.csv')
    output_csv_file_2 = os.path.join(output_folder_2, 'LeastPopular-Processed-Dominant-KaggleFashionI.csv')

    df.to_csv(output_csv_file_1, index=False)
    df.to_csv(output_csv_file_2, index=False)

# Traverse subfolders and collect image paths from the input folder
for root, dirs, files in os.walk(input_folder):
    for file in files:
        if file.endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(root, file)
            image_paths.append(image_path)

            # Process the batch when it reaches the specified batch size
            if len(image_paths) == batch_size:
                process_batch(image_paths)
                total_processed += len(image_paths)

                # Clear the image paths list for the next batch
                image_paths = []

# Process any remaining image paths in the last batch
if len(image_paths) > 0:
    process_batch(image_paths)
    total_processed += len(image_paths)

# Print the total number of image paths processed. image_paths only holds the
# last (partial) batch at this point, so the running total is reported.
print('Total image paths processed:', total_processed)

# DataFrame: Most Popular Color from the Dataset of Raw Images

In [None]:
# Goes through 1250 images at a time of Pie Charts
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import pandas as pd

def RGB_HEX(color):
    """Return the ``#rrggbb`` hex string for an RGB triple.

    The first three components of *color* are truncated with ``int()`` and
    written as lowercase hexadecimal, zero-padded to at least two digits.
    """
    red, green, blue = int(color[0]), int(color[1]), int(color[2])
    return f"#{red:02x}{green:02x}{blue:02x}"

def get_most_prominent_color(image_paths):
    """Return one representative colour for a set of images as ``#rrggbb``.

    Every readable image is converted to RGB, resized with
    ``cv2.resize(image, (600, 400))`` and flattened to pixels; the pixels of
    all images are pooled and fitted with a single-cluster KMeans whose
    centre is formatted with ``RGB_HEX``.

    NOTE(review): with ``n_clusters=1`` the fitted centre is the mean of the
    pooled pixels, i.e. an average colour rather than the most frequent one.

    Args:
        image_paths: Sized collection of image file paths.

    Returns:
        The hex colour string.

    Raises:
        ValueError: If none of the images could be read.
    """
    pixel_blocks = []

    counter = 0
    print('input paths ' + str(len(image_paths)))

    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None for unreadable files; skip them instead
            # of failing the whole batch inside cvtColor.
            print('Skipping unreadable image: ' + str(image_path))
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Keep one (pixels, 3) array per image. Extending a Python list with
        # the array would create a separate object for every single pixel.
        pixel_blocks.append(cv2.resize(image, (600, 400)).reshape(-1, 3))
        counter = counter + 1

    if not pixel_blocks:
        raise ValueError('No readable images to analyse')

    colors = np.concatenate(pixel_blocks)
    clf = KMeans(n_clusters=1)  # Only one cluster for the most prominent color
    clf.fit(colors)
    center_color = clf.cluster_centers_[0]
    hex_color = RGB_HEX(center_color)
    print('counter ' + str(counter))

    return hex_color

# Set the input folder containing images
input_folder = 'Pie-Charts/Raw/Dominant/KaggleFashion'

# Set the output folder paths for saving the DataFrame as CSV
output_folder_1 = 'CSV/Color-Analysis/KaggleFashion'
output_folder_2 = 'CSV/KaggleFashion'

# Create the output folders if they don't exist
os.makedirs(output_folder_1, exist_ok=True)
os.makedirs(output_folder_2, exist_ok=True)

# Set the batch size
batch_size = 1250

# Image paths of the batch that is currently being collected
image_paths = []

# Running count of images handed to process_batch across all batches
total_processed = 0

# Function to process a batch of image paths
def process_batch(image_paths):
    """Compute the prominent colour of one batch and save it as a one-row CSV.

    NOTE(review): every call writes to the same two file names, so each batch
    replaces the CSV written by the previous one and only the last batch's
    colour remains on disk -- confirm whether per-batch results should be kept.

    Args:
        image_paths: Image file paths that make up the batch.
    """
    # Get the most prominent color from the image paths
    prominent_color = get_most_prominent_color(image_paths)

    # Create a DataFrame with a single row containing the most prominent color
    df = pd.DataFrame({'Prominent Color': [prominent_color]})

    # Print the DataFrame
    print(df)

    # Save the DataFrame as CSV in two different folders
    output_csv_file_1 = os.path.join(output_folder_1, 'MostPopular-Raw-Dominant-KaggleFashionP.csv')
    output_csv_file_2 = os.path.join(output_folder_2, 'MostPopular-Raw-Dominant-KaggleFashionP.csv')

    df.to_csv(output_csv_file_1, index=False)
    df.to_csv(output_csv_file_2, index=False)

# Traverse subfolders and collect image paths from the input folder
for root, dirs, files in os.walk(input_folder):
    for file in files:
        if file.endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(root, file)
            image_paths.append(image_path)

            # Process the batch when it reaches the specified batch size
            if len(image_paths) == batch_size:
                process_batch(image_paths)
                total_processed += len(image_paths)

                # Clear the image paths list for the next batch
                image_paths = []

                # Print the number of images processed
                print('Processed', batch_size, 'images')

# Process any remaining image paths in the last batch
if len(image_paths) > 0:
    process_batch(image_paths)
    total_processed += len(image_paths)

    # Print the number of images processed
    print('Processed', len(image_paths), 'images')

# Print the total number of image paths processed. image_paths only holds the
# last (partial) batch at this point, so the running total is reported.
print('Total image paths processed:', total_processed)

In [None]:
# Goes through 1250 images at a time of Images
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import pandas as pd

def RGB_HEX(color):
    """Format the first three components of *color* as a ``#rrggbb`` string.

    Components are truncated with ``int()``; each is written as lowercase
    hexadecimal, zero-padded to at least two digits.
    """
    channels = (int(color[0]), int(color[1]), int(color[2]))
    return "#%02x%02x%02x" % channels

def get_most_prominent_color(image_paths):
    """Return one representative colour for a set of images as ``#rrggbb``.

    Every readable image is converted to RGB, resized with
    ``cv2.resize(image, (600, 400))`` and flattened to pixels; the pixels of
    all images are pooled and fitted with a single-cluster KMeans whose
    centre is formatted with ``RGB_HEX``.

    NOTE(review): with ``n_clusters=1`` the fitted centre is the mean of the
    pooled pixels, i.e. an average colour rather than the most frequent one.

    Args:
        image_paths: Sized collection of image file paths.

    Returns:
        The hex colour string.

    Raises:
        ValueError: If none of the images could be read.
    """
    pixel_blocks = []

    counter = 0
    print('input paths ' + str(len(image_paths)))

    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None for unreadable files; skip them instead
            # of failing the whole batch inside cvtColor.
            print('Skipping unreadable image: ' + str(image_path))
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Keep one (pixels, 3) array per image. Extending a Python list with
        # the array would create a separate object for every single pixel.
        pixel_blocks.append(cv2.resize(image, (600, 400)).reshape(-1, 3))
        counter = counter + 1

    if not pixel_blocks:
        raise ValueError('No readable images to analyse')

    colors = np.concatenate(pixel_blocks)
    clf = KMeans(n_clusters=1)  # Only one cluster for the most prominent color
    clf.fit(colors)
    center_color = clf.cluster_centers_[0]
    hex_color = RGB_HEX(center_color)
    print('counter ' + str(counter))

    return hex_color

# Set the input folder containing images
input_folder = 'Downloaded-Images/KaggleFashion'

# Set the output folder paths for saving the DataFrame as CSV
output_folder_1 = 'CSV/Color-Analysis/KaggleFashion'
output_folder_2 = 'CSV/KaggleFashion'

# Create the output folders if they don't exist
os.makedirs(output_folder_1, exist_ok=True)
os.makedirs(output_folder_2, exist_ok=True)

# Set the batch size
batch_size = 1250

# Create a list to store the image paths
image_paths = []

# Function to process a batch of image paths
def process_batch(image_paths):
    """Compute the prominent colour of one batch and save it as CSV.

    The colour returned by ``get_most_prominent_color`` is placed in a
    one-row DataFrame (column ``Prominent Color``), printed, and written to
    the same file name inside ``output_folder_1`` and ``output_folder_2``.

    Note:
        ``to_csv`` is called with its default write mode, so every call
        replaces the files written by the previous call.
    """
    csv_name = 'MostPopular-Raw-Dominant-KaggleFashionI.csv'
    destinations = [
        os.path.join(output_folder_1, csv_name),
        os.path.join(output_folder_2, csv_name),
    ]

    # Single-row table holding the colour of this batch
    df = pd.DataFrame({'Prominent Color': [get_most_prominent_color(image_paths)]})
    print(df)

    # Same content goes to both output locations
    for destination in destinations:
        df.to_csv(destination, index=False)

# Walk input_folder recursively and hand the images to process_batch in
# chunks of batch_size.
# image_paths is emptied after every full batch, so the number of processed
# paths is tracked separately to make the final summary correct.
total_processed = 0

for root, dirs, files in os.walk(input_folder):
    for file in files:
        # Only common image extensions are considered (case-sensitive)
        if not file.endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(root, file)
        image_paths.append(image_path)

        # Process the batch when it reaches the specified batch size
        if len(image_paths) == batch_size:
            process_batch(image_paths)
            total_processed += len(image_paths)

            # Start a fresh list for the next batch
            image_paths = []

            # Print the number of images processed
            print('Processed', batch_size, 'images')

# Process any remaining image paths in the last, partially filled batch
if image_paths:
    process_batch(image_paths)
    total_processed += len(image_paths)

    # Print the number of images processed
    print('Processed', len(image_paths), 'images')

# Print the total number of image paths processed across all batches
print('Total image paths processed:', total_processed)

# DataFrame: Least Popular Color from the Dataset of Raw Images

In [None]:
# Processes the pie-chart images in batches of 1250
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import pandas as pd

def RGB_HEX(color):
    """Turn the first three items of ``color`` into a ``#rrggbb`` string.

    Each channel is truncated with ``int()`` and rendered as lowercase,
    zero-padded hexadecimal.
    """
    channels = (int(color[0]), int(color[1]), int(color[2]))
    return "#" + "".join(format(channel, "02x") for channel in channels)

def get_least_common_color(image_paths, n_clusters=1):
    """Return the colour of the smallest pixel cluster in a batch as ``#rrggbb``.

    Every readable image is converted from BGR to RGB, resized to 600x400 and
    flattened to rows of three channel values. The pixels of all images are
    pooled and clustered with KMeans (``init='random'``). The centre of the
    populated cluster that owns the fewest pixels is formatted with
    ``RGB_HEX``.

    Note:
        With the default ``n_clusters=1`` there is only one cluster, whose
        centre is the mean of all pooled pixels. Pass a larger value to
        actually separate rare colours from common ones.

    Args:
        image_paths: Iterable of image file paths to load with ``cv2.imread``.
        n_clusters: Number of colour clusters to fit. Defaults to 1, which
            reproduces the previous behaviour.

    Returns:
        The chosen cluster centre as a hex colour string.

    Raises:
        ValueError: If none of the paths could be read as an image.
    """
    # One (pixels, 3) array per image. Extending a Python list pixel by pixel
    # would create 240,000 small objects per image.
    pixel_blocks = []

    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports a missing or undecodable file by returning
            # None; skip it instead of letting cvtColor abort the whole batch.
            print('Skipping unreadable image: ' + str(image_path))
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        resized_image = cv2.resize(image, (600, 400))
        pixel_blocks.append(resized_image.reshape(-1, 3))

    if not pixel_blocks:
        raise ValueError('No readable images were found in the given image paths')

    colors = np.vstack(pixel_blocks)
    clf = KMeans(n_clusters=n_clusters, init='random')
    clf.fit(colors)

    # Count the pixels assigned to each cluster and take the smallest one,
    # ignoring clusters that ended up with no pixels at all.
    cluster_sizes = np.bincount(clf.labels_, minlength=n_clusters)
    populated = np.flatnonzero(cluster_sizes)
    rarest = int(populated[np.argmin(cluster_sizes[populated])])

    center_color = clf.cluster_centers_[rarest]
    hex_color = RGB_HEX(center_color)

    return hex_color

# Folder that is walked recursively for the pie-chart images
input_folder = 'Pie-Charts/Raw/Dominant/KaggleFashion'

# Batching: how many images go into one process_batch call, and the list
# that collects the paths of the batch currently being filled
batch_size = 1250
image_paths = []

# The resulting CSV is written to both of these folders
output_folder_1 = 'CSV/Color-Analysis/KaggleFashion'
output_folder_2 = 'CSV/KaggleFashion'

# Make sure both destinations exist before anything is written
for _out_dir in (output_folder_1, output_folder_2):
    os.makedirs(_out_dir, exist_ok=True)

# Function to process a batch of image paths
def process_batch(image_paths):
    """Compute the least common colour of one batch and save it as CSV.

    The colour returned by ``get_least_common_color`` is placed in a one-row
    DataFrame (column ``Least Common Color``), printed, and written to the
    same file name inside ``output_folder_1`` and ``output_folder_2``.

    Note:
        ``to_csv`` is called with its default write mode, so every call
        replaces the files written by the previous call.
    """
    csv_name = 'LeastPopular-Raw-Dominant-KaggleFashionP.csv'
    color = get_least_common_color(image_paths)

    # Single-row table holding the colour of this batch
    df = pd.DataFrame({'Least Common Color': [color]})
    print(df)

    # Resolve both destinations first, then write the same content to each
    first_csv = os.path.join(output_folder_1, csv_name)
    second_csv = os.path.join(output_folder_2, csv_name)
    for csv_path in (first_csv, second_csv):
        df.to_csv(csv_path, index=False)

# Walk input_folder recursively and hand the images to process_batch in
# chunks of batch_size.
# image_paths is emptied after every full batch, so the number of processed
# paths is tracked separately to make the final summary correct.
total_processed = 0

for root, dirs, files in os.walk(input_folder):
    for file in files:
        # Only common image extensions are considered (case-sensitive)
        if not file.endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(root, file)
        image_paths.append(image_path)

        # Process the batch when it reaches the specified batch size
        if len(image_paths) == batch_size:
            process_batch(image_paths)
            total_processed += len(image_paths)

            # Start a fresh list for the next batch
            image_paths = []

# Process any remaining image paths in the last, partially filled batch
if image_paths:
    process_batch(image_paths)
    total_processed += len(image_paths)

# Print the total number of image paths processed across all batches
print('Total image paths processed:', total_processed)

In [None]:
# Processes the raw images in batches of 1250
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import pandas as pd

def RGB_HEX(color):
    """Build a ``#rrggbb`` colour string from an RGB sequence.

    The first three items of ``color`` are truncated with ``int()`` and each
    is emitted as lowercase hexadecimal, zero-padded to two digits.
    """
    r = int(color[0])
    g = int(color[1])
    b = int(color[2])
    return "#%02x%02x%02x" % (r, g, b)

def get_least_common_color(image_paths, n_clusters=1):
    """Return the colour of the smallest pixel cluster in a batch as ``#rrggbb``.

    Every readable image is converted from BGR to RGB, resized to 600x400 and
    flattened to rows of three channel values. The pixels of all images are
    pooled and clustered with KMeans (``init='random'``). The centre of the
    populated cluster that owns the fewest pixels is formatted with
    ``RGB_HEX``.

    Note:
        With the default ``n_clusters=1`` there is only one cluster, whose
        centre is the mean of all pooled pixels. Pass a larger value to
        actually separate rare colours from common ones.

    Args:
        image_paths: Iterable of image file paths to load with ``cv2.imread``.
        n_clusters: Number of colour clusters to fit. Defaults to 1, which
            reproduces the previous behaviour.

    Returns:
        The chosen cluster centre as a hex colour string.

    Raises:
        ValueError: If none of the paths could be read as an image.
    """
    # Collect one (pixels, 3) array per image and stack them once at the end;
    # this avoids a Python list holding a separate object for every pixel.
    per_image_pixels = []

    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None for a missing or undecodable file;
            # skip it so one bad file does not abort the whole batch.
            print('Skipping unreadable image: ' + str(image_path))
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        resized_image = cv2.resize(image, (600, 400))
        per_image_pixels.append(resized_image.reshape(-1, 3))

    if not per_image_pixels:
        raise ValueError('No readable images were found in the given image paths')

    colors = np.vstack(per_image_pixels)
    clf = KMeans(n_clusters=n_clusters, init='random')
    clf.fit(colors)

    # Count the pixels assigned to each cluster and take the smallest one,
    # ignoring clusters that ended up with no pixels at all.
    cluster_sizes = np.bincount(clf.labels_, minlength=n_clusters)
    populated = np.flatnonzero(cluster_sizes)
    rarest = int(populated[np.argmin(cluster_sizes[populated])])

    center_color = clf.cluster_centers_[rarest]
    hex_color = RGB_HEX(center_color)

    return hex_color

# Source images are read from here (subfolders included)
input_folder = 'Downloaded-Images/KaggleFashion'

# Two destinations receive an identical copy of the result CSV
output_folder_1 = 'CSV/Color-Analysis/KaggleFashion'
output_folder_2 = 'CSV/KaggleFashion'

# Create any destination that does not exist yet
for _csv_dir in [output_folder_1, output_folder_2]:
    os.makedirs(_csv_dir, exist_ok=True)

# Images per process_batch call
batch_size = 1250

# Paths gathered for the batch that is currently being filled
image_paths = []

# Function to process a batch of image paths
def process_batch(image_paths):
    """Compute the least common colour of one batch and save it as CSV.

    Calls ``get_least_common_color`` on the batch, wraps the result in a
    one-row DataFrame (column ``Least Common Color``), prints it, and writes
    it under the same file name to ``output_folder_1`` and
    ``output_folder_2``.

    Note:
        ``to_csv`` is called with its default write mode, so every call
        replaces the files written by the previous call.
    """
    file_name = 'LeastPopular-Raw-Dominant-KaggleFashionI.csv'

    # One row, one column: the colour of this batch
    df = pd.DataFrame({'Least Common Color': [get_least_common_color(image_paths)]})
    print(df)

    # Build both target paths up front, then write each copy
    targets = [os.path.join(folder, file_name)
               for folder in (output_folder_1, output_folder_2)]
    for target in targets:
        df.to_csv(target, index=False)

# Walk input_folder recursively and hand the images to process_batch in
# chunks of batch_size.
# image_paths is emptied after every full batch, so the number of processed
# paths is tracked separately to make the final summary correct.
total_processed = 0

for root, dirs, files in os.walk(input_folder):
    for file in files:
        # Only common image extensions are considered (case-sensitive)
        if not file.endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(root, file)
        image_paths.append(image_path)

        # Process the batch when it reaches the specified batch size
        if len(image_paths) == batch_size:
            process_batch(image_paths)
            total_processed += len(image_paths)

            # Start a fresh list for the next batch
            image_paths = []

# Process any remaining image paths in the last, partially filled batch
if image_paths:
    process_batch(image_paths)
    total_processed += len(image_paths)

# Print the total number of image paths processed across all batches
print('Total image paths processed:', total_processed)