# Automation of Dataset Creation

To run this notebook, download `products_asos.csv` from [Kaggle](https://www.kaggle.com/datasets/trainingdatapro/asos-e-commerce-dataset-30845-products).

In [1]:
import os
import pandas as pd
import requests

In [2]:
# 1) Read in the CSV
df = pd.read_csv("products_asos.csv", on_bad_lines='skip')
df = df.dropna(subset=['images'])

In [None]:
df['images']

Unnamed: 0,images
0,['https://images.asos-media.com/products/new-l...
1,['https://images.asos-media.com/products/new-l...
2,['https://images.asos-media.com/products/new-l...
3,['https://images.asos-media.com/products/new-l...
4,['https://images.asos-media.com/products/strad...
...,...
30840,['https://images.asos-media.com/products/urban...
30841,['https://images.asos-media.com/products/asos-...
30842,['https://images.asos-media.com/products/asyou...
30843,['https://images.asos-media.com/products/miss-...


In [3]:
df = df.drop_duplicates(subset=['images'])

In [None]:
df['images']

Unnamed: 0,images
0,['https://images.asos-media.com/products/new-l...
4,['https://images.asos-media.com/products/strad...
8,['https://images.asos-media.com/products/jdy-o...
10,['https://images.asos-media.com/products/nike-...
14,['https://images.asos-media.com/products/asos-...
...,...
30840,['https://images.asos-media.com/products/urban...
30841,['https://images.asos-media.com/products/asos-...
30842,['https://images.asos-media.com/products/asyou...
30843,['https://images.asos-media.com/products/miss-...


In [4]:
# Take first 5000 rows for now
df = df.head(5000)

In [5]:
# Clean the 'images' column: remove brackets and quotes, extract the first URL
def clean_image_cell(cell):
    if pd.isna(cell):
        return None
    # Remove brackets and single/double quotes
    cleaned = cell.strip("[]").replace("'", "").replace('"', '')
    # Return the first URL (split on comma if needed)
    return cleaned.split(",")[0].strip()

df['images'] = df['images'].apply(clean_image_cell)

In [None]:
print(df['images'][0])

https://images.asos-media.com/products/new-look-trench-coat-in-camel/204351106-4?$n_1920w$&wid=1926&fit=constrain


In [None]:
for url in df['images'][0].split(","):
  print(url)

https://images.asos-media.com/products/new-look-trench-coat-in-camel/204351106-4?$n_1920w$&wid=1926&fit=constrain


In [None]:
# 2) Create a folder to hold the downloaded images
os.makedirs("asos_images", exist_ok=True)

# 3) Loop over each row's image URL in the 'images' column
index = 75 # currently have 74 images in dataset
for i, row in df.iterrows():
    image_url = row["images"]
    if pd.isna(image_url):
        # If there's no image URL, skip
        continue

    # 4) Download the image
    try:
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()  # Raise an error if status != 200

        # 5) Save the image to disk
        filename = os.path.join("asos_images", f"{index}.png")
        with open(filename, "wb") as f:
            f.write(response.content)
        index += 1
    except Exception as e:
        print(f"Failed to download {image_url}: {e}")

In [None]:
print(index) # next index should start here

5075


In [None]:
!zip -r asos_images.zip asos_images


In [None]:
from google.colab import files
files.download("asos_images.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Concatenating with original dataset

In [6]:
orig_df = pd.read_excel('dataset.xlsx', sheet_name='Sheet1')

In [7]:
orig_df = orig_df.drop(columns=['image','user','brand'])

In [None]:
orig_df

Unnamed: 0,name,description,price
0,Twill Wide-Leg Cargo Pants,A pair of twill pants featuring contrasting to...,24.49
1,"Denim for all seasons, 3-length vintage Faded ...",Country of Origin : China(OEM)\nMaterial : 100...,25.68
2,Mid-Rise Wide-Leg Cargo Pants,A pair of twill pants featuring a mid-rise wai...,24.49
3,Fine layered check blouse,Country of Origin : China(OEM)\nMaterial : Cot...,33.53
4,Blena Heart Strap Button Lace Cropped Sleevele...,Country of Origin : Korea/China(OEM)\nMaterial...,24.57
...,...,...,...
68,Short tweed dress,The studio has reinterpreted the Parisian offi...,445.00
69,Floral sleeveless jumper,"For the new collection, the Maje studio celebr...",265.00
70,Belted wool coat,Long double-breasted coat in fine pure wool br...,875.00
71,Long Castleford Trench Coat,A trench coat made in England from shower-resi...,2890.00


In [8]:
new_df = df[['name','description','price']]

In [None]:
new_df

Unnamed: 0,name,description,price
0,New Look trench coat in camel,[{'Product Details': 'Coats & Jackets by New L...,49.99
4,Stradivarius double breasted wool coat in grey,[{'Product Details': 'Coats & Jackets by Strad...,59.99
8,JDY oversized trench coat in stone,[{'Product Details': 'Coats & Jackets by JDYLo...,45.00
10,Nike Running hooded jacket in pink,[{'Product Details': 'Coats & Jackets by Nike ...,84.95
14,ASOS DESIGN Tall linen mix trench coat in natural,[{'Product Details': 'Coats & Jackets by ASOS ...,75.00
...,...,...,...
5528,Mango high neck jumper in cream,[{'Product Details': 'Jumpers & Cardigans by M...,35.99
5529,New Look ribbed crew neck knitted jumper in bu...,[{'Product Details': 'Jumpers & Cardigans by N...,16.99
5530,Noisy May crew neck knitted jumper in dark green,[{'Product Details': 'Jumper by Noisy May Soft...,Now 16.80
5531,Wednesday's Girl longline cardigan in cable knit,"[{'Product Details': ""Cardigans by Wednesday's...",28.00


In [9]:
df_combined = pd.concat([orig_df, new_df], ignore_index=True)

In [None]:
df_combined

Unnamed: 0,name,description,price
0,Twill Wide-Leg Cargo Pants,A pair of twill pants featuring contrasting to...,24.49
1,"Denim for all seasons, 3-length vintage Faded ...",Country of Origin : China(OEM)\nMaterial : 100...,25.68
2,Mid-Rise Wide-Leg Cargo Pants,A pair of twill pants featuring a mid-rise wai...,24.49
3,Fine layered check blouse,Country of Origin : China(OEM)\nMaterial : Cot...,33.53
4,Blena Heart Strap Button Lace Cropped Sleevele...,Country of Origin : Korea/China(OEM)\nMaterial...,24.57
...,...,...,...
5068,Mango high neck jumper in cream,[{'Product Details': 'Jumpers & Cardigans by M...,35.99
5069,New Look ribbed crew neck knitted jumper in bu...,[{'Product Details': 'Jumpers & Cardigans by N...,16.99
5070,Noisy May crew neck knitted jumper in dark green,[{'Product Details': 'Jumper by Noisy May Soft...,Now 16.80
5071,Wednesday's Girl longline cardigan in cable knit,"[{'Product Details': ""Cardigans by Wednesday's...",28.00


In [None]:
df_combined.to_excel('5k_dataset.xlsx', index=False)

## Adding men's clothing (do not use for now)

In [10]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [11]:
import pandas as pd
import os
import glob


In [12]:
folder_path = '/content/drive/My Drive/Penn Spring 2025/STAT 4830/men myntra'
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

In [13]:
men_df = pd.DataFrame()
for file in csv_files:
    try:
        # Read the CSV file
        desired_cols = ['NAME', 'DESCRIPTION & COLOR', 'IMAGE', 'PRICE']
        curr_df = pd.read_csv(file)
        curr_df.columns = curr_df.columns.str.strip()
        curr_df = curr_df[desired_cols]
        curr_df = curr_df.head(100)
        # Append to the combined DataFrame
        men_df = pd.concat([men_df, curr_df], ignore_index=True)
    except ValueError as e:
        print(f"Skipping file {file} due to missing columns: {e}")

In [14]:
men_df = men_df.dropna()

In [15]:
men_df

Unnamed: 0,NAME,DESCRIPTION & COLOR,IMAGE,PRICE
0,Van Heusen Men Green Slim Fit Formal Shirt,"Van Heusen Men Green Slim Fit Formal Shirt, V...","https://assets.myntassets.com/h_1440,q_100,w_1...",749
1,Blackberrys Men White & Purple Slim Fit Self-C...,Blackberrys Men White & Purple Slim Fit Self C...,"https://assets.myntassets.com/h_1440,q_100,w_1...",1197
2,INVICTUS Blue Slim Fit Formal Shirt,"INVICTUS Blue Slim Fit Formal Shirt, INVICTUS...","https://assets.myntassets.com/h_1440,q_100,w_1...",719
3,U.S. Polo Assn. Men Pink Regular Fit Striped F...,U.S. Polo Assn. Men Pink Regular Fit Striped F...,"https://assets.myntassets.com/h_1440,q_100,w_1...",839
4,Peter England Men Green & Yellow Slim Fit Chec...,Peter England Men Green & Yellow Slim Fit Chec...,"https://assets.myntassets.com/h_1440,q_100,w_1...",999
...,...,...,...,...
1142,SUITLTD Men Men Navy Striped Single-Breasted R...,SUITLTD Men Men Navy Striped Single Breasted R...,"https://assets.myntassets.com/h_1440,q_100,w_1...",3276
1143,Parx Men Blue Single-Breasted Urban Fit Formal...,Parx Men Blue Single Breasted Urban Fit Formal...,"https://assets.myntassets.com/h_1440,q_100,w_1...",3999
1144,Peter England Elite Men Black Single-Breasted ...,Peter England Elite Men Black Single Breasted ...,"https://assets.myntassets.com/h_1440,q_100,w_1...",5999
1145,Parx Men Brown Single-Breasted Urban Fit Forma...,Parx Men Brown Single Breasted Urban Fit Forma...,"https://assets.myntassets.com/h_1440,q_100,w_1...",3199


In [None]:
men_df['IMAGE'][0]

'https://assets.myntassets.com/h_1440,q_100,w_1080/v1/assets/images/7647697/2018/11/22/2a69b8bb-9b0b-40b8-a0bf-53417246d8e01542881881200-Van-Heusen-Men-Shirts-7731542881881013-1.jpg'

In [16]:
men_df = men_df.rename(columns={'NAME': 'name', 'IMAGE': 'images', 'DESCRIPTION & COLOR': 'description', 'PRICE': 'price'})

In [17]:
df_combined2 = pd.concat([df_combined, men_df[['name','description','price']]], ignore_index=True)

In [18]:
df_combined2

Unnamed: 0,name,description,price
0,Twill Wide-Leg Cargo Pants,A pair of twill pants featuring contrasting to...,24.49
1,"Denim for all seasons, 3-length vintage Faded ...",Country of Origin : China(OEM)\nMaterial : 100...,25.68
2,Mid-Rise Wide-Leg Cargo Pants,A pair of twill pants featuring a mid-rise wai...,24.49
3,Fine layered check blouse,Country of Origin : China(OEM)\nMaterial : Cot...,33.53
4,Blena Heart Strap Button Lace Cropped Sleevele...,Country of Origin : Korea/China(OEM)\nMaterial...,24.57
...,...,...,...
6215,SUITLTD Men Men Navy Striped Single-Breasted R...,SUITLTD Men Men Navy Striped Single Breasted R...,3276
6216,Parx Men Blue Single-Breasted Urban Fit Formal...,Parx Men Blue Single Breasted Urban Fit Formal...,3999
6217,Peter England Elite Men Black Single-Breasted ...,Peter England Elite Men Black Single Breasted ...,5999
6218,Parx Men Brown Single-Breasted Urban Fit Forma...,Parx Men Brown Single Breasted Urban Fit Forma...,3199


In [19]:
# Create a folder to hold the downloaded images
os.makedirs("myntra_men_images", exist_ok=True)
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}

# Loop over each row's image URL in the 'images' column
index = 5075 # currently have 5074 images in dataset
for i, row in men_df.iterrows():
    image_url = row["images"]
    if pd.isna(image_url):
        # If there's no image URL, skip
        continue

    # 4) Download the image
    try:
        response = requests.get(image_url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an error if status != 200

        # 5) Save the image to disk
        filename = os.path.join("myntra_men_images", f"{index}.jpg")
        with open(filename, "wb") as f:
            f.write(response.content)
        index += 1
    except Exception as e:
        print(f"Failed to download {image_url}: {e}")

Failed to download https://assets.myntassets.com/h_1440,q_100,w_1080/v1/assets/images/productimage/2018/11/24/1c32ae09-63fc-4f4c-aa01-67a8bf96ef901543046449728-1.jpg: 503 Server Error: Service Unavailable for url: https://assets.myntassets.com/h_1440,q_100,w_1080/v1/assets/images/productimage/2018/11/24/1c32ae09-63fc-4f4c-aa01-67a8bf96ef901543046449728-1.jpg


In [20]:
print(index) # next index should start here

6221


In [21]:
!zip -r myntra_men_images.zip myntra_men_images

  adding: myntra_men_images/ (stored 0%)
  adding: myntra_men_images/5228.jpg (deflated 3%)
  adding: myntra_men_images/5962.jpg (deflated 0%)
  adding: myntra_men_images/5991.jpg (deflated 1%)
  adding: myntra_men_images/5592.jpg (deflated 0%)
  adding: myntra_men_images/6204.jpg (deflated 6%)
  adding: myntra_men_images/5390.jpg (deflated 3%)
  adding: myntra_men_images/5604.jpg (deflated 4%)
  adding: myntra_men_images/5902.jpg (deflated 3%)
  adding: myntra_men_images/6119.jpg (deflated 4%)
  adding: myntra_men_images/6053.jpg (deflated 8%)
  adding: myntra_men_images/5687.jpg (deflated 3%)
  adding: myntra_men_images/5115.jpg (deflated 2%)
  adding: myntra_men_images/5809.jpg (deflated 4%)
  adding: myntra_men_images/5778.jpg (deflated 8%)
  adding: myntra_men_images/5827.jpg (deflated 5%)
  adding: myntra_men_images/6165.jpg (deflated 6%)
  adding: myntra_men_images/5942.jpg (deflated 3%)
  adding: myntra_men_images/6197.jpg (deflated 6%)
  adding: myntra_men_images/6027.jpg (def

In [22]:
from google.colab import files
files.download("myntra_men_images.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
df_combined2.iloc[6040:6074, :]

Unnamed: 0,name,description,price
6040,Speedo Men Grey Solid Swim Shorts 8007220012,"Speedo Men Grey Solid Swim Shorts 8007220012, ...",949
6041,Speedo Unisex Assorted Slogan Print Swim Cap,"Speedo Unisex Assorted Slogan Print Swim Cap, ...",799
6042,Speedo Unisex FASTSKIN SPESOCKET 2 MIR Swimmin...,Speedo Unisex FASTSKIN SPESOCKET 2 MIR Swimmin...,2379
6043,Speedo Kids Grey & Black Junior Slogan Print S...,Speedo Kids Grey & Black Junior Slogan Print S...,799
6044,Speedo Kids Pink Pace Swimming Cap 8720641341,"Speedo Kids Pink Pace Swimming Cap 8720641341,...",899
6045,Speedo Kids Red Slogan Printed Swim Cap,"Speedo Kids Red Slogan Printed Swim Cap, Spee...",799
6046,Speedo Kids Blue Slogan Print Swimming Cap,"Speedo Kids Blue Slogan Print Swimming Cap, S...",799
6047,Speedo Kids Orange Pace Swim Cap,"Speedo Kids Orange Pace Swim Cap, Speedo, Swi...",899
6048,Speedo Kids White Slogan Printed Swim Cap,"Speedo Kids White Slogan Printed Swim Cap, Sp...",799
6049,Black Swimming Goggle,"Black Swimming Goggle, Speedo, Swimwear Acces...",1999


In [28]:
df_combined2 = df_combined2.drop(index=6062)

In [29]:
df_combined2['image_key'] = df_combined2.index + 2

In [30]:
df_combined2

Unnamed: 0,name,description,price,image_key
0,Twill Wide-Leg Cargo Pants,A pair of twill pants featuring contrasting to...,24.49,2
1,"Denim for all seasons, 3-length vintage Faded ...",Country of Origin : China(OEM)\nMaterial : 100...,25.68,3
2,Mid-Rise Wide-Leg Cargo Pants,A pair of twill pants featuring a mid-rise wai...,24.49,4
3,Fine layered check blouse,Country of Origin : China(OEM)\nMaterial : Cot...,33.53,5
4,Blena Heart Strap Button Lace Cropped Sleevele...,Country of Origin : Korea/China(OEM)\nMaterial...,24.57,6
...,...,...,...,...
6215,SUITLTD Men Men Navy Striped Single-Breasted R...,SUITLTD Men Men Navy Striped Single Breasted R...,3276,6217
6216,Parx Men Blue Single-Breasted Urban Fit Formal...,Parx Men Blue Single Breasted Urban Fit Formal...,3999,6218
6217,Peter England Elite Men Black Single-Breasted ...,Peter England Elite Men Black Single Breasted ...,5999,6219
6218,Parx Men Brown Single-Breasted Urban Fit Forma...,Parx Men Brown Single Breasted Urban Fit Forma...,3199,6220


In [31]:
df_combined2.to_excel('dataset_m.xlsx', index=False)