## La Garçonne Product Scraping & Cleaning

In [None]:
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import re

In [None]:
# Scrape the La Garçonne "all products" listing, visit every product page, and
# collect one cleaned record per product into `lagar_df`.
lagarconne_product_list = []

# Seconds to wait on any single HTTP request before giving up, so one stalled
# connection cannot hang the whole scrape.
REQUEST_TIMEOUT = 30

# Manual product removal (safety net)
# NOTE(review): matching is a plain substring test on the lower-cased product
# name, so short entries such as 'on', 'up', 'bar' or 'ring' also exclude names
# that merely contain them (e.g. "cotton") — confirm this is intended.
remove_keywords = [
    # Jewelry
    'necklace', 'hops', 'earrings', 'earring', 'ring', 'hoops', 'bracelet', 'pendant', 'solid',
    '16"', '8"', '18"', '20"', '80CM','45CM', '42CM', '40CM', 'collar', 'strand', 'claw','sunglasses',
    'bangle','cuff','choker',

    # Home supplies
    'body wash', 'conditioner', 'shampoo', 'soap', 'hand balm', 'hand wash',
    'spray set', 'candle', 'perfume', 'neck pillow', 'bar', 'gel',
    'lotion', 'wash', 'pillow',

    # Bags
    'envelope', 'phone case', 'backpack', 'handbag', 'clutch','medium',
    'bowling', 'tote', 'crossbody','small','large','extra-large',

    # other types
    'heel','boot','all-in-one','on','up','short', 'bootie','heel','tunic','strap','shoes','slip on','mule'
]

# The product name is lower-cased before filtering, so the keywords must be
# lower-cased too; otherwise entries like '80CM' could never match.
_removal_terms = tuple(keyword.lower() for keyword in remove_keywords)

# Material detection vocabulary (loop-invariant, so defined once up front)
material_list = [
    "silk", "cashmere", "merino wool", "mohair", "alpaca", "vicuna",
    "egyptian cotton", "pima cotton", "linen", "leather", "suede",
    "velvet", "satin", "crepe", "tweed", "selvedge denim", "lace",
    "calfskin", "lambskin", "crocodile", "alligator", "ostrich",
    "lizard", "patent leather", "saffiano", "canvas", "coated canvas",
    "nylon", "technical nylon", "microfiber", "raffia", "straw",
    "jacquard", "cotton", "polyester", "wool", "polyamide", "yak khullu",
    "shearling", "viscose", "triacetate", "stretch poly", "rubber", "fabric",
    "polyurethane", "faux pearl"
]

# Category cleaning and sorting: raw last-word category -> normalised category.
# The mapping is applied exactly once (single pass), so e.g. 'BASELAYER' ends
# up as 'LONGSLEEVE' and is not mapped a second time to 'TOP'.
_CATEGORY_GROUPS = {
    'SCARF': ('CAPE', 'BANDANA', 'STOLE', 'SHAWL', 'NECKWARMER'),
    'SNEAKERS': ('REJUVEN8', 'SNEAKER'),
    'PANTS': ('PANT', 'TROUSER', 'TROUSERS', 'DENIM', 'JOGGER', 'JEANS', 'JEAN'),
    'BOOTS': ('I', 'BOOT', 'BOOTS', 'SLIDE', 'LOAFER'),
    'HAT': ('CAP', 'BEANIE'),
    'LONGSLEEVE': ('BASELAYER',),
    'TOP': ('T', 'TEE', 'T-SHIRT', 'SHIRT', 'LONGSLEEVE', 'TANK', 'BOATNECK',
            'OVERSHIRT', 'BLOUSE'),
    'PULLOVER': ('ANORAK', 'SWEATSHIRT', 'HOODIE', 'POLO'),
    'SWEATER': ('CREWNECK', 'CARDIGAN', 'TURTLENECK', 'JUMPER', 'KNIT', 'GUERNSEY'),
    'JACKET': ('PEACOAT', 'COAT', 'PARKA', 'OVERCOAT', 'BLAZER'),
}
_CATEGORY_ALIASES = {
    alias: target
    for target, aliases in _CATEGORY_GROUPS.items()
    for alias in aliases
}


def _price_tier(price_value):
    """Return the luxury-tier label for a price given in dollars."""
    if price_value < 500:
        return 'Affordable fashion'
    if price_value < 2000:
        return 'Accessible Luxury'
    if price_value < 10000:
        return 'High Luxury'
    return 'Ultra Luxury'


def _extract_category(name_text):
    """Derive a normalised category from a product name.

    The category is the last word of the title; when the name has the form
    "Title — Color", only the part before the dash is considered. Returns
    None when there is no usable title.
    """
    if not name_text:
        return None

    # Without a dash, split('—')[0] is simply the whole name.
    title_words = name_text.split('—')[0].split()
    if not title_words:
        return None

    category = title_words[-1].upper()

    if category == 'NECK':
        # Look at the word preceding "Neck" within the title itself (not the
        # color suffix), and tolerate a one-word title.
        # NOTE(review): the result stays CREWNECK/TURTLENECK and is not folded
        # into SWEATER, matching the original single-pass chain — confirm.
        is_crew = len(title_words) >= 2 and title_words[-2].upper() == 'CREW'
        return 'CREWNECK' if is_crew else 'TURTLENECK'

    return _CATEGORY_ALIASES.get(category, category)


# run through the pages of the website
for lagarconne_page in range(1, 20):
    lagarconne_url = f'https://lagarconne.com/collections/all?page={lagarconne_page}'
    lagarconne_response = requests.get(lagarconne_url, timeout=REQUEST_TIMEOUT)
    lagarconne_soup = BeautifulSoup(lagarconne_response.text, 'html.parser')

    lagarconne_products = lagarconne_soup.find_all(
        'div', class_="lg-col-6 lg-col-md-3 lg-product-list-item"
    )

    for product in lagarconne_products:
        name = product.find('h3', class_="lg-product-list-item-title")
        price = product.find('span', class_="money")
        vendor = product.find('p', class_="lg-product-list-item-vendor")

        name_text = name.get_text(strip=True) if name else None
        vendor_text = vendor.get_text(strip=True) if vendor else None
        price_text = price.get_text(strip=True) if price else None

        # Products without a listed price are skipped entirely
        if not price_text:
            continue

        price_value = float(price_text.replace('$', '').replace(',', ''))
        price_tag = _price_tier(price_value)

        # Extract product link
        link_tag = product.find('a', href=True)
        if not link_tag:
            continue

        # Apply the keyword filter before downloading the product page: it only
        # depends on the name, so excluded products cost no extra request.
        name_lower = name_text.lower() if name_text else ''
        if any(word in name_lower for word in _removal_terms):
            continue

        product_link = 'https://lagarconne.com' + link_tag['href']
        product_page = requests.get(product_link, timeout=REQUEST_TIMEOUT)
        product_page_soup = BeautifulSoup(product_page.text, 'html.parser')

        # Description: first paragraph of the description container, tokenised
        # on whitespace. A page without the container yields an empty
        # description instead of raising.
        desc_div = product_page_soup.find(
            'div', class_="lg-desc-product lg-col-md-8 lg-no-gutters"
        )
        desc_paragraphs = desc_div.find_all('p') if desc_div else []
        descrip = desc_paragraphs[0] if desc_paragraphs else None
        product_desc = descrip.get_text(" ", strip=True).split() if descrip else []

        # Lexical diversity: share of distinct tokens in the description, in %
        len_desc = len(product_desc)
        lex_diversity = round(len(set(product_desc)) / len_desc * 100, 2) if len_desc else 0
        product_desc_str = ' '.join(product_desc)

        # Product details: first paragraph that mentions "color"
        product_details = next(
            (
                p.get_text(" ", strip=True)
                for p in desc_paragraphs
                if "color" in p.get_text(strip=True).lower()
            ),
            None,
        )

        # Size guide
        size_guide = bool(
            product_page_soup.find('a', {'class': 'lg-link product-sizechart'})
        )

        # Image extraction
        pictures = product_page_soup.find_all('div', class_='lg-product-item')
        pic_count = len(pictures)

        image_links = []
        image_tags = product_page_soup.find_all(
            'a', attrs={'data-product-single-thumbnail': True}
        )

        for tag in image_tags:
            href = tag.get('href')
            if href:
                # Protocol-relative URLs need an explicit scheme
                if href.startswith("//"):
                    href = "https:" + href
                image_links.append(href)
        # Only the first thumbnail link is kept in the record
        image_link = image_links[0] if image_links else None

        # Category extraction, cleaning and sorting
        category = _extract_category(name_text)

        # Compile found materials. The three text sources are joined with
        # spaces so words at the boundaries are not fused together.
        combined_text = " ".join(
            [product_desc_str, product_details or "", name_text or ""]
        ).lower()

        material_found = [m for m in material_list if m in combined_text]

        # Append clean product
        lagarconne_product_list.append({
            'Retailer': 'La Garconne',
            'Vendor': vendor_text,
            'Category': category,
            'Name': name_text,
            'Price': price_value,
            'Product Link': product_link,
            'Description': product_desc_str,
            'Product Details': product_details,
            'Description Lexical Diversity': lex_diversity,
            'Description Length': len_desc,
            'Materials': material_found,
            'Size Guide': size_guide,
            'Luxury Tier': price_tag,
            'Number of pictures': pic_count,
            'Image Links': image_link
        })

# DataFrame
lagar_df = pd.DataFrame(lagarconne_product_list)
lagar_df

Unnamed: 0,Retailer,Vendor,Category,Name,Price,Product Link,Description,Product Details,Description Lexical Diversity,Description Length,Materials,Size Guide,Luxury Tier,Number of pictures,Image Links
0,La Garconne,6397,SWEATER,Andes Wool Sweater,995.0,https://lagarconne.com/collections/all/product...,6397 beige cardigan with a rounded collar and ...,Color: Mouse. 100% Eco Wool. Made in Peru.,79.07,43,[wool],True,Accessible Luxury,4,https://lagarconne.com/cdn/shop/files/Feb2021S...
1,La Garconne,6397,SWEATER,Track Zip Alpaca Sweater,695.0,https://lagarconne.com/collections/all/product...,6397 dark green sweater jacket with a high nec...,"Color: Mud. 57% Alpaca, 30% Viscose, 12% Recyc...",94.74,38,"[merino wool, alpaca, nylon, wool, viscose]",True,Accessible Luxury,4,https://lagarconne.com/cdn/shop/files/Feb2021S...
2,La Garconne,6397,SWEATER,Elbow V-Neck Cashmere Sweater,595.0,https://lagarconne.com/collections/all/product...,6397 brown sweater top with a high V-neckline ...,Color: Mud. 100% Cashmere. Imported.,87.88,33,[cashmere],True,Accessible Luxury,4,https://lagarconne.com/cdn/shop/files/Feb2021S...
3,La Garconne,6397,TOP,Roll-Neck Merino Wool Tank — Putty,325.0,https://lagarconne.com/collections/all/product...,"6397 light mauve sleeveless tank with a high, ...",Color: Putty. 100% Merino Wool. Imported.,94.74,19,"[merino wool, wool]",True,Affordable fashion,4,https://lagarconne.com/cdn/shop/files/Feb2021S...
4,La Garconne,6397,TOP,Roll-Neck Merino Wool Tank — Black,325.0,https://lagarconne.com/collections/all/product...,"6397 black sleeveless tank with a high, rolled...",Color: Black. 100% Merino Wool. Imported.,94.44,18,"[merino wool, wool]",True,Affordable fashion,4,https://lagarconne.com/cdn/shop/files/Feb2021S...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,La Garconne,The Row,MOCCASIN,Soft Moccasin,1290.0,https://lagarconne.com/collections/all/product...,The Row light beige moccasin with an oval toe ...,Color: Yellow Linen. 100% Calf Suede. Made in ...,95.83,24,"[linen, leather, suede, calfskin]",True,Accessible Luxury,3,https://lagarconne.com/cdn/shop/files/softloaf...
659,La Garconne,The Row,BOOTS,Ama Slide — Toile,496.0,https://lagarconne.com/collections/all/product...,The Row beige curved slide sandal with a conto...,Color: Toile. 100% Rubber. Made in Italy.,100.00,17,[rubber],True,Affordable fashion,5,https://lagarconne.com/cdn/shop/files/amatoile...
660,La Garconne,The Row,BOOTS,Ama Slide — Laquer Red,496.0,https://lagarconne.com/collections/all/product...,The Row red curved slide sandal with a contour...,Color: Laquer Red. 100% Rubber. Made in Italy.,100.00,17,[rubber],True,Affordable fashion,4,https://lagarconne.com/cdn/shop/files/amared1_...
661,La Garconne,The Row,BOOTS,Ama Slide — Black,496.0,https://lagarconne.com/collections/all/product...,The Row black curved slide sandal with a conto...,Color: Black. 100% Rubber. Made in Italy.,100.00,17,[rubber],True,Affordable fashion,4,https://lagarconne.com/cdn/shop/files/rubbersl...


In [None]:
# Drop sparsely populated categories (fewer than 5 items) for better predictions

# Number of products in each category
category_counts = lagar_df['Category'].value_counts()

# Categories that appear at least 5 times
valid_categories = category_counts.loc[category_counts.ge(5)].index

# Retain only the products in those categories and renumber the rows from 0
in_valid_category = lagar_df['Category'].isin(valid_categories)
lagar_df = lagar_df.loc[in_valid_category].reset_index(drop=True)

In [None]:
# Write the cleaned product table to CSV (without the index column) and
# trigger a browser download of that file from Colab
from google.colab import files

csv_filename = 'lagarconne_products_cleaned.csv'
lagar_df.to_csv(csv_filename, index=False)
files.download(csv_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>