# Automation of Dataset Creation

To run this notebook, download `products_asos.csv` from [Kaggle](https://www.kaggle.com/datasets/trainingdatapro/asos-e-commerce-dataset-30845-products).

In [2]:
import os
import pandas as pd
import requests

In [7]:
# 1) Read in the CSV
df = pd.read_csv("products_asos.csv", on_bad_lines='skip')
df = df.dropna(subset=['images'])

In [2]:
df['images']

Unnamed: 0,images
0,['https://images.asos-media.com/products/new-l...
1,['https://images.asos-media.com/products/new-l...
2,['https://images.asos-media.com/products/new-l...
3,['https://images.asos-media.com/products/new-l...
4,['https://images.asos-media.com/products/strad...
...,...
30840,['https://images.asos-media.com/products/urban...
30841,['https://images.asos-media.com/products/asos-...
30842,['https://images.asos-media.com/products/asyou...
30843,['https://images.asos-media.com/products/miss-...


In [8]:
df = df.drop_duplicates(subset=['images'])

In [4]:
df['images']

Unnamed: 0,images
0,['https://images.asos-media.com/products/new-l...
4,['https://images.asos-media.com/products/strad...
8,['https://images.asos-media.com/products/jdy-o...
10,['https://images.asos-media.com/products/nike-...
14,['https://images.asos-media.com/products/asos-...
...,...
30840,['https://images.asos-media.com/products/urban...
30841,['https://images.asos-media.com/products/asos-...
30842,['https://images.asos-media.com/products/asyou...
30843,['https://images.asos-media.com/products/miss-...


In [9]:
# Take first 5000 rows for now
df = df.head(5000)

In [10]:
# Clean the 'images' column: remove brackets and quotes, extract the first URL
def clean_image_cell(cell):
    if pd.isna(cell):
        return None
    # Remove brackets and single/double quotes
    cleaned = cell.strip("[]").replace("'", "").replace('"', '')
    # Return the first URL (split on comma if needed)
    return cleaned.split(",")[0].strip()

df['images'] = df['images'].apply(clean_image_cell)

In [7]:
print(df['images'][0])

https://images.asos-media.com/products/new-look-trench-coat-in-camel/204351106-4?$n_1920w$&wid=1926&fit=constrain


In [8]:
for url in df['images'][0].split(","):
  print(url)

https://images.asos-media.com/products/new-look-trench-coat-in-camel/204351106-4?$n_1920w$&wid=1926&fit=constrain


In [9]:
# 2) Create a folder to hold the downloaded images
os.makedirs("asos_images", exist_ok=True)

# 3) Loop over each row's image URL in the 'images' column
index = 75 # currently have 74 images in dataset
for i, row in df.iterrows():
    image_url = row["images"]
    if pd.isna(image_url):
        # If there's no image URL, skip
        continue

    # 4) Download the image
    try:
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()  # Raise an error if status != 200

        # 5) Save the image to disk
        filename = os.path.join("asos_images", f"{index}.png")
        with open(filename, "wb") as f:
            f.write(response.content)
        index += 1
    except Exception as e:
        print(f"Failed to download {image_url}: {e}")

In [10]:
print(index) # next index should start here

5075


In [None]:
!zip -r asos_images.zip asos_images


In [14]:
from google.colab import files
files.download("asos_images.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Concatenating with original dataset

In [3]:
orig_df = pd.read_excel('dataset.xlsx', sheet_name='Sheet1')

In [4]:
orig_df = orig_df.drop(columns=['image','user','brand'])

In [5]:
orig_df

Unnamed: 0,name,description,price
0,Twill Wide-Leg Cargo Pants,A pair of twill pants featuring contrasting to...,24.49
1,"Denim for all seasons, 3-length vintage Faded ...",Country of Origin : China(OEM)\nMaterial : 100...,25.68
2,Mid-Rise Wide-Leg Cargo Pants,A pair of twill pants featuring a mid-rise wai...,24.49
3,Fine layered check blouse,Country of Origin : China(OEM)\nMaterial : Cot...,33.53
4,Blena Heart Strap Button Lace Cropped Sleevele...,Country of Origin : Korea/China(OEM)\nMaterial...,24.57
...,...,...,...
68,Short tweed dress,The studio has reinterpreted the Parisian offi...,445.00
69,Floral sleeveless jumper,"For the new collection, the Maje studio celebr...",265.00
70,Belted wool coat,Long double-breasted coat in fine pure wool br...,875.00
71,Long Castleford Trench Coat,A trench coat made in England from shower-resi...,2890.00


In [11]:
new_df = df[['name','description','price']]

In [12]:
new_df

Unnamed: 0,name,description,price
0,New Look trench coat in camel,[{'Product Details': 'Coats & Jackets by New L...,49.99
4,Stradivarius double breasted wool coat in grey,[{'Product Details': 'Coats & Jackets by Strad...,59.99
8,JDY oversized trench coat in stone,[{'Product Details': 'Coats & Jackets by JDYLo...,45.00
10,Nike Running hooded jacket in pink,[{'Product Details': 'Coats & Jackets by Nike ...,84.95
14,ASOS DESIGN Tall linen mix trench coat in natural,[{'Product Details': 'Coats & Jackets by ASOS ...,75.00
...,...,...,...
5528,Mango high neck jumper in cream,[{'Product Details': 'Jumpers & Cardigans by M...,35.99
5529,New Look ribbed crew neck knitted jumper in bu...,[{'Product Details': 'Jumpers & Cardigans by N...,16.99
5530,Noisy May crew neck knitted jumper in dark green,[{'Product Details': 'Jumper by Noisy May Soft...,Now 16.80
5531,Wednesday's Girl longline cardigan in cable knit,"[{'Product Details': ""Cardigans by Wednesday's...",28.00


In [60]:
df_combined = pd.concat([orig_df, new_df], ignore_index=True)

In [14]:
df_combined

Unnamed: 0,name,description,price
0,Twill Wide-Leg Cargo Pants,A pair of twill pants featuring contrasting to...,24.49
1,"Denim for all seasons, 3-length vintage Faded ...",Country of Origin : China(OEM)\nMaterial : 100...,25.68
2,Mid-Rise Wide-Leg Cargo Pants,A pair of twill pants featuring a mid-rise wai...,24.49
3,Fine layered check blouse,Country of Origin : China(OEM)\nMaterial : Cot...,33.53
4,Blena Heart Strap Button Lace Cropped Sleevele...,Country of Origin : Korea/China(OEM)\nMaterial...,24.57
...,...,...,...
5068,Mango high neck jumper in cream,[{'Product Details': 'Jumpers & Cardigans by M...,35.99
5069,New Look ribbed crew neck knitted jumper in bu...,[{'Product Details': 'Jumpers & Cardigans by N...,16.99
5070,Noisy May crew neck knitted jumper in dark green,[{'Product Details': 'Jumper by Noisy May Soft...,Now 16.80
5071,Wednesday's Girl longline cardigan in cable knit,"[{'Product Details': ""Cardigans by Wednesday's...",28.00


In [70]:
df_combined.to_excel('5k_dataset.xlsx', index=False)

## Adding men's clothing (do not use for now)

In [15]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [16]:
import pandas as pd
import os
import glob


In [27]:
folder_path = '/content/drive/My Drive/Penn Spring 2025/STAT 4830/men'
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

In [41]:
men_df = pd.DataFrame()
for file in csv_files:
    try:
        # Read the CSV file
        desired_cols = ['product_name', 'product_images', 'price', 'details']
        curr_df = pd.read_csv(file)
        curr_df.columns = curr_df.columns.str.strip().str.lower()
        curr_df = curr_df[desired_cols]
        # Append to the combined DataFrame
        men_df = pd.concat([men_df, curr_df], ignore_index=True)
    except ValueError as e:
        print(f"Skipping file {file} due to missing columns: {e}")

In [48]:
men_df = men_df.dropna()

In [49]:
men_df

Unnamed: 0,product_name,product_images,price,details
0,100% LINEN SUIT BLAZER,[{'https://static.zara.net/photos///2023/I/0/2...,"₹ 11,990.00",Regular-fit blazer made of linen. Notched lape...
1,JACKET WITH POCKETS,[{'https://static.zara.net/photos///2023/I/0/2...,"₹ 7,590.00",Faded regular-fit collared jacket with long sl...
2,100% LINEN PLEATED TROUSERS,[{'https://static.zara.net/photos///2023/I/0/2...,"₹ 5,990.00",Regular fit trousers made of a linen fabric. w...
3,KNIT COTTON POLO SHIRT,[{'https://static.zara.net/photos///2023/I/0/2...,"₹ 3,290.00",Round collar knit polo shirt in spun cotton fa...
4,OXFORD SHIRT,[{'https://static.zara.net/photos///2023/I/0/2...,"₹ 4,990.00",Regular-fit shirt made of a textured cotton fa...
...,...,...,...,...
936,CROPPED DENIM SWEATSHIRT,[{'https://static.zara.net/photos///2023/I/0/2...,"₹ 3,290.00",Oversize fit cropped faded sweatshirt featurin...
937,DENIM T-SHIRT WITH ABSTRACT PRINT,[{'https://static.zara.net/photos///2023/I/0/2...,"₹ 2,990.00",Lightweight denim T-shirt with a round necklin...
938,VINTAGE FADED DENIM T-SHIRT,[{'https://static.zara.net/photos///2023/I/0/2...,"₹ 2,590.00",T-shirt made of lightweight denim. Featuring a...
939,TIE-DYE PRINT SWEATSHIRT,[{'https://static.zara.net/photos///2023/I/0/2...,"₹ 3,290.00",Round neck sweatshirt with short sleeves and i...


In [45]:
men_df['product_images'][0]

"[{'https://static.zara.net/photos///2023/I/0/2/p/1564/102/506/2/w/489/1564102506_1_1_1.jpg?ts=1688573213489': 'Image 0 of 100% LINEN SUIT BLAZER from Zara'}, {'https://static.zara.net/photos///2023/I/0/2/p/1564/102/506/2/w/489/1564102506_2_1_1.jpg?ts=1688573213424': 'Image 1 of 100% LINEN SUIT BLAZER from Zara'}]"

In [50]:
import ast

def clean_image_cell(cell):
    if pd.isna(cell):
        return None
    try:
        # Convert stringified list of dicts to actual Python list
        image_list = ast.literal_eval(cell)
        if isinstance(image_list, list) and len(image_list) > 0:
            first_dict = image_list[0]
            if isinstance(first_dict, dict):
                return list(first_dict.keys())[0]
    except Exception as e:
        print(f"Failed to parse image cell: {cell[:100]}... Error: {e}")
    return None

men_df['product_images'] = men_df['product_images'].apply(clean_image_cell)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  men_df['product_images'] = men_df['product_images'].apply(clean_image_cell)


In [53]:
men_df['product_images'][0]

'https://static.zara.net/photos///2023/I/0/2/p/1564/102/506/2/w/489/1564102506_1_1_1.jpg?ts=1688573213489'

In [55]:
men_df = men_df.rename(columns={'product_name': 'name', 'product_images': 'images', 'details': 'description'})

In [61]:
df_combined = pd.concat([df_combined, men_df[['name','description','price']]], ignore_index=True)

In [62]:
df_combined

Unnamed: 0,name,description,price
0,Twill Wide-Leg Cargo Pants,A pair of twill pants featuring contrasting to...,24.49
1,"Denim for all seasons, 3-length vintage Faded ...",Country of Origin : China(OEM)\nMaterial : 100...,25.68
2,Mid-Rise Wide-Leg Cargo Pants,A pair of twill pants featuring a mid-rise wai...,24.49
3,Fine layered check blouse,Country of Origin : China(OEM)\nMaterial : Cot...,33.53
4,Blena Heart Strap Button Lace Cropped Sleevele...,Country of Origin : Korea/China(OEM)\nMaterial...,24.57
...,...,...,...
6007,CROPPED DENIM SWEATSHIRT,Oversize fit cropped faded sweatshirt featurin...,"₹ 3,290.00"
6008,DENIM T-SHIRT WITH ABSTRACT PRINT,Lightweight denim T-shirt with a round necklin...,"₹ 2,990.00"
6009,VINTAGE FADED DENIM T-SHIRT,T-shirt made of lightweight denim. Featuring a...,"₹ 2,590.00"
6010,TIE-DYE PRINT SWEATSHIRT,Round neck sweatshirt with short sleeves and i...,"₹ 3,290.00"


In [66]:
# Create a folder to hold the downloaded images
os.makedirs("zara_men_images4", exist_ok=True)
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}

# Loop over each row's image URL in the 'images' column
index = 5075 # currently have 5074 images in dataset
for i, row in men_df.iterrows():
    image_url = row["images"]
    if pd.isna(image_url):
        # If there's no image URL, skip
        continue

    # 4) Download the image
    try:
        response = requests.get(image_url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an error if status != 200

        # 5) Save the image to disk
        filename = os.path.join("zara_men_images", f"{index}.jpg")
        with open(filename, "wb") as f:
            f.write(response.content)
        index += 1
    except Exception as e:
        print(f"Failed to download {image_url}: {e}")

Failed to download https://static.zara.net/photos///2023/I/0/2/p/9598/305/712/2/w/0/9598305712_1_1_1.jpg?ts=1691753368278: 404 Client Error: Not Found for url: https://static.zara.net/photos///2023/I/0/2/p/9598/305/712/2/w/0/9598305712_1_1_1.jpg?ts=1691753368278
Failed to download https://static.zara.net/photos///2023/I/0/2/p/7627/305/426/2/w/0/7627305426_1_1_1.jpg?ts=1689319990245: 404 Client Error: Not Found for url: https://static.zara.net/photos///2023/I/0/2/p/7627/305/426/2/w/0/7627305426_1_1_1.jpg?ts=1689319990245
Failed to download https://static.zara.net/photos///2023/I/0/2/p/2795/313/712/2/w/0/2795313712_1_1_1.jpg?ts=1692286398339: 404 Client Error: Not Found for url: https://static.zara.net/photos///2023/I/0/2/p/2795/313/712/2/w/0/2795313712_1_1_1.jpg?ts=1692286398339
Failed to download https://static.zara.net/photos///2023/I/0/2/p/5039/503/800/2/w/0/5039503800_1_1_1.jpg?ts=1692956953130: 404 Client Error: Not Found for url: https://static.zara.net/photos///2023/I/0/2/p/5039/

In [None]:
print(index) # next index should start here