In [None]:
!pip install transformers numpy pandas colorthief pillow opencv-python requests

In [None]:
import pandas as pd
import requests
from PIL import Image
import numpy as np
from io import BytesIO
from colorthief import ColorThief
import cv2

# Define the functions (from the previous step)
def download_image(image_url, timeout=30):
    """Download an image over HTTP and return it as a PIL image.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The PIL image opened from the response body.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response status.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail on HTTP errors here rather than handing an error page to PIL.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))

def get_dominant_color(image_url):
    """Return the dominant color of the image found at ``image_url``.

    The image bytes are fetched once and handed to ColorThief as an in-memory
    file object (ColorThief takes a file, not a URL).

    Args:
        image_url: URL of the image to analyse.

    Returns:
        Whatever ``ColorThief.get_color(quality=1)`` returns for the image
        (presumably an ``(r, g, b)`` tuple; see the ColorThief docs).

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response status.
    """
    response = requests.get(image_url, timeout=30)
    # Fail on HTTP errors here rather than handing an error page to ColorThief.
    response.raise_for_status()
    color_thief = ColorThief(BytesIO(response.content))
    return color_thief.get_color(quality=1)

def is_patterned(image_url, edge_sum_threshold=100000):
    """Classify the image at ``image_url`` as 'patterned' or 'plain'.

    The image is converted to grayscale, run through Canny edge detection,
    and labelled 'patterned' when the summed edge map exceeds the threshold.

    Args:
        image_url: URL of the image to analyse.
        edge_sum_threshold: Minimum sum of the Canny edge map for the image
            to count as patterned. The sum grows with image size, so tune
            this to the resolution of the images being processed.

    Returns:
        The string 'patterned' or 'plain'.
    """
    # Normalise to 3-channel RGB so grayscale, palette and RGBA images all
    # reach cvtColor with the channel layout it is told to expect.
    image = download_image(image_url).convert('RGB')
    img_rgb = np.array(image)
    # PIL arrays are in RGB order, so the RGB (not BGR) conversion code is
    # required for the correct per-channel luma weights.
    img_gray = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY)
    edges = cv2.Canny(img_gray, threshold1=100, threshold2=200)
    if np.sum(edges) > edge_sum_threshold:
        return 'patterned'
    return 'plain'

def compare_with_pattern(image_url, expected_pattern):
    """Report whether the pattern detected for the image equals ``expected_pattern``.

    Args:
        image_url: URL of the image passed on to ``is_patterned``.
        expected_pattern: Value to compare against the detected label.

    Returns:
        The result of comparing the detected label with ``expected_pattern``
        using ``==``.
    """
    return is_patterned(image_url) == expected_pattern

# Read the input CSV into a DataFrame
file_path = './preprocessed_dataset.csv'  # Location of the dataset on disk
data = pd.read_csv(filepath_or_buffer=file_path)

# Preprocessing functions
def preprocess_data(df):
    """Add dominant-color and pattern-detection columns to ``df``.

    For each row, the image at ``row['image_url']`` is analysed and three
    columns are filled in: ``dominant_color``, ``detected_pattern`` and
    ``pattern_match`` (whether the detected pattern equals ``row['pattern']``).
    Missing values in ``pattern`` are then replaced with 'unknown'.

    Rows whose image cannot be downloaded or decoded are reported and skipped;
    their three new columns stay ``None`` so one bad URL does not abort the run.

    Args:
        df: DataFrame with at least ``image_url`` and ``pattern`` columns.
            It is modified in place.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    total_rows = len(df)

    # Add dominant color and pattern detection columns
    df['dominant_color'] = None
    df['detected_pattern'] = None
    df['pattern_match'] = None

    # Track progress by position, not index label: labels need not be 0..n-1
    # (e.g. after filtering), which would make the percentage meaningless.
    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        progress = position / total_rows * 100
        print(f"Processing row {position}/{total_rows} - {progress:.2f}% completed", end='\r')

        image_url = row['image_url']

        try:
            dominant_color = get_dominant_color(image_url)
            detected_pattern = is_patterned(image_url)
        except (requests.RequestException, OSError, cv2.error) as err:
            # Leading newline keeps the message from being overwritten by the
            # carriage-return progress line.
            print(f"\nSkipping row {idx} ({image_url}): {err}")
            continue

        # Store results in the DataFrame
        df.at[idx, 'dominant_color'] = dominant_color
        df.at[idx, 'detected_pattern'] = detected_pattern
        # Reuse the label computed above instead of detecting (and
        # downloading) a second time. The comparison deliberately runs before
        # the fillna below, so a missing expected pattern never matches.
        df.at[idx, 'pattern_match'] = detected_pattern == row['pattern']

    # Fill missing values in the 'pattern' column
    df['pattern'] = df['pattern'].fillna('unknown')

    return df

# Run the enrichment pass over the loaded dataset
preprocessed_data = preprocess_data(df=data)

# Write the enriched dataset to disk without the index column
preprocessed_data.to_csv(
    path_or_buf='./preprocessed_dataset_with_pattern_and_color.csv',
    index=False,
)

# Preview the first five rows of the enriched dataset
print(preprocessed_data.head(n=5))


KeyboardInterrupt: 

In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from transformers import ViTModel, ViTFeatureExtractor
from sklearn.cluster import KMeans
import torch

# Accumulators filled while walking the image directory
image_paths = list()
image_features = list()

# Folder containing the raw input images
image_dir = './raw_images'

# Load the pre-trained ViT input preprocessor and backbone used for feature extraction
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

# Function to extract features using Vision Transformer (ViT)
def extract_vit_features(image):
    """Compute a pooled ViT embedding for ``image`` as a NumPy array.

    The image is run through the module-level ``feature_extractor`` and
    ``vit_model`` with gradients disabled, and the model's
    ``last_hidden_state`` is averaged along dimension 1.

    Args:
        image: Image in a form accepted by ``feature_extractor``.

    Returns:
        The averaged hidden state converted with ``.numpy()``
        (presumably shaped (1, hidden_size) for one image; confirm
        against the model configuration).
    """
    model_inputs = feature_extractor(images=image, return_tensors="pt")
    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        outputs = vit_model(**model_inputs)
    token_embeddings = outputs.last_hidden_state
    # Collapse dimension 1 to obtain a single global representation.
    pooled = token_embeddings.mean(dim=1)
    return pooled.numpy()

# List the directory once, keeping regular files only: notebook folders often
# contain sub-directories such as .ipynb_checkpoints that Image.open cannot
# read. Sorting makes the row order of the output reproducible.
image_files = sorted(
    name for name in os.listdir(image_dir)
    if os.path.isfile(os.path.join(image_dir, name))
)
total_images = len(image_files)

# Iterate over the images and extract features
for index, image_file in enumerate(image_files, start=1):
    image_path = os.path.join(image_dir, image_file)
    print(f"Processing image {index}/{total_images}: {image_file}")

    # Open the image; convert() loads the pixel data into a new image, so the
    # underlying file can be closed as soon as the with-block exits.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Resize the image to 224x224 for ViT
    image = image.resize((224, 224))

    # Extract features using ViT
    features = extract_vit_features(image)

    # Store the results
    image_paths.append(image_path)
    image_features.append(features)

# Convert features to a numpy array for clustering
image_features = np.vstack(image_features)

# Perform clustering to generate pseudo-labels for each target label
# Adjust n_clusters based on your needs
# NOTE(review): no random_state is set, so cluster ids can differ between
# runs; pass random_state=... if reproducible labels are needed.
clothing_type_kmeans = KMeans(n_clusters=5).fit(image_features)
occasion_kmeans = KMeans(n_clusters=3).fit(image_features)
pattern_kmeans = KMeans(n_clusters=4).fit(image_features)
season_kmeans = KMeans(n_clusters=4).fit(image_features)
specifications_kmeans = KMeans(n_clusters=3).fit(image_features)

# Assign clusters as labels
clothing_type_labels = clothing_type_kmeans.labels_
occasion_labels = occasion_kmeans.labels_
pattern_labels = pattern_kmeans.labels_
season_labels = season_kmeans.labels_
specification_labels = specifications_kmeans.labels_

# Create a DataFrame to store the results
data = {
    "image": image_paths,
    "clothing_type": clothing_type_labels,
    "occasion": occasion_labels,
    "pattern": pattern_labels,
    "season": season_labels,
    "specifications": specification_labels
}

df = pd.DataFrame(data)

# Save the DataFrame as a CSV file
df.to_csv('labeled_fashion_dataset.csv', index=False)

print("Dataset creation complete. Saved to 'labeled_fashion_dataset.csv'.")
