In [1]:
import pandas as pd
from loguru import logger

from pet_products_scraper import (
    PetProductsETL,
    ZooplusETL,
    PetsAtHomeETL,
    JollyesETL,
    LilysKitchenETL,
    BitibaETL,
    PetSupermarketETL,
    PetPlanetETL,
    PurinaETL,
    DirectVetETL,
    FishKeeperETL,
    PetDrugsOnlineETL,
    ViovetETL,
    PetShopETL,
    VetShopETL,
    VetUKETL,
    BurnsPetETL,
    AsdaETL,
    TheRangeETL,
    OcadoETL,
    HarringtonsETL,
    BernPetFoodsETL,
    PetsCornerETL,
    OrijenETL,
    ThePetExpressETL,
    PetShopOnlineETL,
    TaylorPetFoodsETL,
    TheNaturalPetStoreETL,
    HealthyPetStoreETL,
    FarmAndPetPlaceETL,
    NaturesMenuETL,
)

# Registry mapping a shop name to its ETL scraper instance. The keys are
# compared against the `shop_name` column of the product CSV further down,
# so they must match those values exactly.
# Every listed scraper is instantiated when this cell runs; entries that are
# commented out are disabled and `run_etl` will reject them.
# NOTE(review): the output captured for this cell shows HTTP requests being
# logged, so at least one constructor appears to hit the network - confirm.
factory = {
    # "Zooplus": ZooplusETL(),  
    "PetsAtHome": PetsAtHomeETL(),  
    "Jollyes": JollyesETL(),  
    "LilysKitchen": LilysKitchenETL(),  
    # "Bitiba": BitibaETL(),
    "PetSupermarket": PetSupermarketETL(),  
    "PetPlanet": PetPlanetETL(),  
    "Purina": PurinaETL(),  
    "DirectVet": DirectVetETL(),  
    "FishKeeper": FishKeeperETL(),  
    "PetDrugsOnline": PetDrugsOnlineETL(),  
    "Viovet": ViovetETL(),  
    "PetShop": PetShopETL(),  
    "VetShop": VetShopETL(),  
    "VetUK": VetUKETL(),  
    "BurnsPet": BurnsPetETL(),  
    "ASDAGroceries": AsdaETL(),  
    # "TheRange": TheRangeETL(),
    "Ocado": OcadoETL(),  
    "Harringtons": HarringtonsETL(),  
    "BernPetFoods": BernPetFoodsETL(),  
    "PetsCorner": PetsCornerETL(),  
    "Orijen": OrijenETL(),  
    "ThePetExpress": ThePetExpressETL(),  
    "PetShopOnline": PetShopOnlineETL(),  
    "TaylorPetFoods": TaylorPetFoodsETL(),  
    "TheNaturalPetStore": TheNaturalPetStoreETL(),  
    "HealthyPetStore": HealthyPetStoreETL(),  
    "FarmAndPetPlace": FarmAndPetPlaceETL(),  
    "NaturesMenu": NaturesMenuETL(),  
}

def run_etl(shop: str):
    """Return the ETL scraper registered in ``factory`` for ``shop``.

    Despite the name, nothing is executed here: the function only looks up
    and returns the scraper object stored under ``shop``.

    Args:
        shop: Shop name; must be one of the keys of ``factory``.

    Returns:
        The object stored in ``factory`` under ``shop``.

    Raises:
        ValueError: If ``shop`` is not a key of ``factory``.
    """
    # Guard clause: reject unknown shops up front.
    if shop not in factory:
        raise ValueError(
            f"Shop {shop} is not supported. Please pass a valid shop.")
    return factory[shop]



[32m2025-04-28 19:13:52.707[0m | [1mINFO    [0m | [36mpet_products_scraper._pet_products_etl[0m:[36mextract_from_url[0m:[36m59[0m - [1mSuccessfully extracted data from https://www.direct-vet.co.uk 200[0m
[32m2025-04-28 19:13:52.708[0m | [1mINFO    [0m | [36mpet_products_scraper._pet_products_etl[0m:[36mextract_from_url[0m:[36m64[0m - [1mSleeping for 1.3688383360361043 seconds...[0m
[32m2025-04-28 19:13:55.262[0m | [1mINFO    [0m | [36mpet_products_scraper._pet_products_etl[0m:[36mextract_from_url[0m:[36m59[0m - [1mSuccessfully extracted data from https://www.direct-vet.co.uk 200[0m
[32m2025-04-28 19:13:55.263[0m | [1mINFO    [0m | [36mpet_products_scraper._pet_products_etl[0m:[36mextract_from_url[0m:[36m64[0m - [1mSleeping for 1.0014036308613041 seconds...[0m


In [2]:
# Product-variant table; the code below relies on its `shop_name`,
# `base_url` and `url` columns.
d = pd.read_csv('./csv/pet_product_variant_urls.csv')
# Build the absolute product URL. Strip only trailing slashes from the base
# instead of blindly dropping its last character, so a base URL that has no
# trailing slash is not truncated.
d['full_url'] = d['base_url'].str.rstrip('/') + d['url']

In [3]:
# Shop names found in the CSV, most frequent first. The names are read from
# the index of value_counts(): the column layout produced by
# value_counts().reset_index() differs between pandas 1.x and 2.x, and on 1.x
# the 'shop_name' column would contain the counts rather than the names.
company_list = d['shop_name'].value_counts().index.to_list()
# Keep only the shops that have a scraper registered in `factory`.
valid_companies = [company for company in company_list if company in factory]

valid_companies

['Viovet',
 'PetShop',
 'ThePetExpress',
 'Ocado',
 'FarmAndPetPlace',
 'PetSupermarket',
 'PetsCorner',
 'PetsAtHome',
 'PetDrugsOnline',
 'VetUK',
 'Jollyes',
 'HealthyPetStore',
 'TheNaturalPetStore',
 'PetPlanet',
 'PetShopOnline',
 'DirectVet',
 'ASDAGroceries',
 'TaylorPetFoods',
 'LilysKitchen',
 'NaturesMenu',
 'BernPetFoods',
 'Harringtons',
 'Orijen']

In [None]:
def extract(companies):
    """Scrape every distinct product URL of each shop and pass it on.

    For each shop in ``companies`` the distinct ``full_url`` values of its
    rows in ``d`` are passed one by one to the shop scraper's
    ``image_scrape_product``. The collected results are then handed to
    ``transform`` together with the shop name.

    Args:
        companies: Iterable of shop names; each must be a key of ``factory``.

    Raises:
        ValueError: If a shop is not supported (raised by ``run_etl``) or if
            the scraper returns ``None`` for a link.
    """
    for c in companies:
        scraper = run_etl(c)
        sample_df = d[d['shop_name'] == c]
        # Distinct links, most frequent first. Reading the index of
        # value_counts() behaves the same on pandas 1.x and 2.x, unlike
        # value_counts().reset_index()['full_url'].
        scrape_link = sample_df['full_url'].value_counts().index.to_list()
        if not scrape_link:
            # With no links the payload would be empty and the merge in
            # transform() would fail on the missing 'url' column.
            logger.warning(f"No product URLs found for shop {c}; skipping.")
            continue

        scrape_payload = []
        for link in scrape_link:
            scrape_df = scraper.image_scrape_product(link)
            if scrape_df is None:
                # Name the shop and link so the failing page can be found.
                raise ValueError(
                    f"Scraper for shop {c} returned no data for {link}")
            scrape_payload.append(scrape_df)

        transform(c, scrape_payload)


def transform(company, df):
    """Join the scraped records with the shop's rows of ``d`` and save them.

    Args:
        company: Shop name used to filter ``d`` and passed on to ``load``.
        df: Scraped records for that shop, in any form accepted by
            ``pd.DataFrame``. The result must expose a ``url`` column (the
            join key) and an ``image_urls`` column - presumably produced by
            the scraper; verify against ``image_scrape_product``.
    """
    scraped = pd.DataFrame(df)
    shop_rows = d.loc[d['shop_name'] == company]

    # Inner join: only rows whose absolute URL was actually scraped survive.
    merged = pd.merge(
        shop_rows,
        scraped,
        how="inner",
        left_on="full_url",
        right_on="url",
    )
    output_columns = ['id', 'shop_name', 'base_url', 'image_urls']
    load(merged[output_columns], company)


def load(df, company):
    """Write ``df`` to ``./csv/<company>.csv`` without the index column.

    Args:
        df: DataFrame to persist.
        company: Shop name used as the file stem.
    """
    destination = f'./csv/{company}.csv'
    df.to_csv(destination, index=False)



# Run the pipeline for the Orijen shop only: extract() scrapes its links and
# hands them to transform(), which calls load() to write ./csv/Orijen.csv.
extract(['Orijen'])

[32m2025-04-28 19:13:56.169[0m | [1mINFO    [0m | [36mpet_products_scraper._pet_products_etl[0m:[36mextract_from_url[0m:[36m59[0m - [1mSuccessfully extracted data from https://www.orijenpetfoods.co.uk/product/orijen-puppy/ 200[0m
[32m2025-04-28 19:13:56.170[0m | [1mINFO    [0m | [36mpet_products_scraper._pet_products_etl[0m:[36mextract_from_url[0m:[36m64[0m - [1mSleeping for 0.8206286537220977 seconds...[0m
[32m2025-04-28 19:13:56.426[0m | [1mINFO    [0m | [36mpet_products_scraper._pet_products_etl[0m:[36mextract_from_url[0m:[36m59[0m - [1mSuccessfully extracted data from https://www.orijenpetfoods.co.uk/product/orijen-original/ 200[0m
[32m2025-04-28 19:13:56.427[0m | [1mINFO    [0m | [36mpet_products_scraper._pet_products_etl[0m:[36mextract_from_url[0m:[36m64[0m - [1mSleeping for 1.5676284173567165 seconds...[0m
[32m2025-04-28 19:13:56.679[0m | [1mINFO    [0m | [36mpet_products_scraper._pet_products_etl[0m:[36mextract_from_url[0m