In [None]:
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from torchsummary import summary
import matplotlib.pyplot as plt

In [None]:
# Use the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
import pandas as pd
import os

# Locations of the catalogue metadata and of the product images.
csv_path = "./path/to/images/styles.csv"
image_folder = "./path/to/images/images"

# Read only the listed metadata columns from the CSV.
df = pd.read_csv(
    csv_path,
    usecols=[
        "id",
        "gender",
        "masterCategory",
        "subCategory",
        "articleType",
        "baseColour",
        "season",
        "year",
        "usage",
        "productDisplayName",
    ],
)

# The brand is taken to be the first whitespace-separated token of the display name.
df["brand"] = df["productDisplayName"].str.split().str.get(0)

# Store the bare file name ("<id>.jpg"); the folder is joined on at the point of use.
df["image_path"] = df["id"].astype(str) + ".jpg"

df.head()

In [None]:
import os
import random
import pandas as pd
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Send scraper diagnostics (INFO and above) to a log file next to the notebook.
logging.basicConfig(
    filename="scraper_log.txt",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

image_folder = "./path/to/images/images"
csv_path = "./path/to/images/styles.csv"

# Reload the catalogue metadata so this cell starts from a fresh frame.
df = pd.read_csv(
    csv_path,
    usecols=[
        "id",
        "gender",
        "masterCategory",
        "subCategory",
        "articleType",
        "baseColour",
        "season",
        "year",
        "usage",
        "productDisplayName",
    ],
)

# Derived columns; "price" and "website" start empty and are filled in by the scraper loop.
df["brand"] = df["productDisplayName"].str.split().str.get(0)
df["price"] = None
df["website"] = None
df["image_path"] = df["id"].astype(str) + ".jpg"


# Configure Edge to start with the local "Default" user profile.
# NOTE(review): Edge may refuse to start if another instance already has this
# profile open — confirm before running.
options = webdriver.EdgeOptions()
options.add_argument(f"--user-data-dir={os.path.expanduser('~')}/AppData/Local/Microsoft/Edge/User Data")
options.add_argument("--profile-directory=Default")

# The options must be handed to the driver; previously they were built and then
# ignored, so the browser always started with a blank temporary profile.
driver = webdriver.Edge(options=options)


# Walk the catalogue, upload each product image to Google Lens and record the
# first "$" price (and the retailer at the same result position) on the row.
# Stops after 500 images have been processed without raising.
image_count = 0
try:
    for index, row in df.iterrows():
        if image_count >= 500:
            print("Processed 500 images, stopping...")
            break

        image_file = os.path.join(image_folder, row["image_path"])
        if not os.path.exists(image_file):
            logging.warning(f"Image not found: {image_file}")
            continue

        try:
            driver.get("https://lens.google.com/")
            time.sleep(random.uniform(.02, .05))
            print(f"Searching image: {row['image_path']}")

            upload_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'upload a file')]"))
            )
            upload_button.click()
            time.sleep(random.uniform(.02, .05))

            upload_input = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "input[type='file']"))
            )
            # image_file is built from a relative folder; a file <input> needs the
            # full path, so resolve it before sending.
            upload_input.send_keys(os.path.abspath(image_file))
            # NOTE(review): hard-coded file name — presumably the first image of the
            # run, where a CAPTCHA has to be solved by hand; confirm.
            if row["image_path"] == "15970.jpg":
                print("Waiting extra time for CAPTCHA...")
                time.sleep(20)
            print(f"Uploaded image: {row['image_path']}")

            time.sleep(random.uniform(.005, .01))
            driver.execute_script("window.scrollBy(0, 300);")

            time.sleep(random.uniform(.5, 1.5))  # Give time for results to appear

            price_elements = driver.find_elements(By.XPATH, "//span[@class='EwVMFc']")
            retailer_elements = driver.find_elements(By.XPATH, "//div[@class='R8BTeb q8U8x LJEGod du278d i0Rdmd']")

            found_price = None
            found_retailer = None

            for i, price_elem in enumerate(price_elements):
                price_text = price_elem.text.strip()
                # startswith() is safe on empty text; indexing [0] raised IndexError
                # there and aborted the whole image via the except block below.
                if price_text.startswith("$"):
                    found_price = price_text
                    # Pair the price with the retailer at the same position, if any.
                    if i < len(retailer_elements):
                        found_retailer = retailer_elements[i].text.strip()
                    break

            # A row is only updated when both a price and a retailer were found.
            if found_price and found_retailer:
                df.at[index, "price"] = found_price
                df.at[index, "website"] = found_retailer
                print(f"Found Price: {found_price} at {found_retailer}")
            else:
                logging.warning(f"No $ price found for image: {row['image_path']}")

            time.sleep(random.uniform(.05, .5))
            image_count += 1

        except Exception as e:
            # Best-effort scraping: record the failure (with traceback) and move on
            # to the next image. The image is not retried and not counted.
            logging.exception(f"Error processing image {row['image_path']}: {e}")
            print(f"Error on {row['image_path']} (possibly a CAPTCHA); sleeping 10 seconds before continuing...")
            time.sleep(10)
finally:
    # Always release the browser, even on KeyboardInterrupt or an unexpected error.
    driver.quit()
    print("WebDriver closed.")

print("\nDataFrame updated successfully!")

In [None]:
import pandas as pd

# csv_path = "./path/to/images/styles.csv"
# df = pd.read_csv(csv_path, usecols=["id", "price"])

# **Function to Extract Currency Symbol**
def get_currency(price):
    """Return the leading currency character of a price string.

    Args:
        price: A scraped price such as "$12.99". Any non-string value
            (None, NaN, numbers, ...) is treated as missing.

    Returns:
        The first non-whitespace character of ``price``, or None when the
        value is missing, not a string, or blank.
    """
    # A str can never be NA, so the type check alone covers None/NaN and also
    # avoids pd.isna() returning an array for list-like input.
    if not isinstance(price, str):
        return None
    stripped = price.strip()
    # Blank strings have no first character; indexing them raised IndexError.
    if not stripped:
        return None
    return stripped[0]

# Derive a "currency" column by running get_currency over every scraped price.
df["currency"] = df["price"].map(get_currency)

print(df.head())

# Persist the full frame, without the index column.
df.to_csv("updated_prices.csv", index=False)
print("Currency symbols extracted successfully!")

In [None]:
# Show the first 500 rows of the frame.
df.iloc[:500]

In [None]:
import pandas as pd

# Keep only the rows for which a price was recorded.
df_filtered = df[df["price"].notna()]

# Cap the working set at the first 500 priced rows.
df_limited = df_filtered.iloc[:500]

def get_currency(price):
    """Return the leading currency character of a price string.

    Args:
        price: A scraped price such as "$12.99". Any non-string value
            (None, NaN, numbers, ...) is treated as missing.

    Returns:
        The first non-whitespace character of ``price``, or None when the
        value is missing, not a string, or blank.
    """
    # A str can never be NA, so the type check alone covers None/NaN and also
    # avoids pd.isna() returning an array for list-like input.
    if not isinstance(price, str):
        return None
    stripped = price.strip()
    # Blank strings have no first character; indexing them raised IndexError.
    if not stripped:
        return None
    return stripped[0]

# df_limited was produced by head() on another frame, so writing a column into
# it in place can trigger pandas' SettingWithCopyWarning. assign() returns a new
# frame with the extra column instead.
df_limited = df_limited.assign(currency=df_limited["price"].apply(get_currency))

print(df_limited.head())

# NOTE(review): same file name as the earlier export cell, so this overwrites
# that file — confirm this is intended.
df_limited.to_csv("updated_prices.csv", index=False)
print("Filtered dataset saved with first 500 images that have pricing!")

In [None]:
# Number of rows kept after filtering and limiting.
df_limited.shape[0]

In [None]:
# Inspect the last five rows of the limited frame.
df_limited.tail(n=5)

In [None]:
import os
# Sanity check that a known image is reachable from the current working directory.
# The string literal was missing its opening quote, which made this cell a SyntaxError.
image_path = "./path/to/images/images/15970.jpg"
print("File Exists:", os.path.exists(image_path))

In [None]:
# Print the kernel's current working directory; the relative "./path/..." paths
# used in this notebook are resolved against it.
print(os.getcwd())  

In [None]:
# Inspect the last five rows of the limited frame.
df_limited.tail(n=5)

In [None]:
# List the distinct article types present in the limited frame.
print(df_limited.loc[:, "articleType"].unique())

In [None]:
# List the distinct currency markers extracted from the prices.
print(df_limited.loc[:, "currency"].unique())

In [None]:
# Currency markers to keep. Each entry is a single leading character as produced
# by get_currency.
# NOTE(review): 'C', 'P', 'S' and 'A' are presumably the first letters of
# multi-character currency prefixes, and None presumably targets rows without a
# currency — confirm.
currencies = ['₹', 'C', 'P', '€', 'S', '£', 'A', None]

# Select the rows whose currency marker is in the list above and preview up to 50.
filtered_df = df_limited.loc[df_limited['currency'].isin(currencies)]
filtered_df.head(50)