In [1]:
import requests
from bs4 import BeautifulSoup, NavigableString
import pandas as pd
import re

In [2]:
def extract_asoup(url, parser='lxml'):
  page = requests.get(url)
  soup = BeautifulSoup(page.content, parser)
  return soup

In [3]:
watches_url = "https://www.jaquet-droz.com/en/watches"

In [4]:
soup = extract_asoup(watches_url, 'html.parser')

In [5]:
links = soup.find_all('a', href=True)
watches_urls = [link['href'] for link in links if link['href'].startswith("https://www.jaquet-droz.com/en/watches/")]
watches_urls

['https://www.jaquet-droz.com/en/watches/timepieces',
 'https://www.jaquet-droz.com/en/watches/automata',
 'https://www.jaquet-droz.com/en/watches/ateliers-d-art',
 'https://www.jaquet-droz.com/en/watches/grande-seconde',
 'https://www.jaquet-droz.com/en/watches/sw',
 'https://www.jaquet-droz.com/en/watches/petite-heure-minute',
 'https://www.jaquet-droz.com/en/watches/lady-8',
 'https://www.jaquet-droz.com/en/watches/astrale']

In [6]:
watches_data = []
for watch_url in watches_urls:
    watch_soup = extract_asoup(watch_url, 'html.parser')
    content = watch_soup.find('div', class_ = 'block block-system')
    if content is None:
        continue
    parent_model = content.find('h1').text
    item_list = content.find_all('a', href=True)
    urls =[link['href'] for link in item_list]
    for watch in urls:
        watch_URL = watch
        watch_infos = extract_asoup(watch, 'html.parser')
        watch_info = watch_infos.find('div', class_ = 'watch-infos')
        specific_model = watch_info.find('h1', class_ = 'title-node').text
        description = watch_info.find('div', class_ = 'description').text.strip()
        watch_spec = watch_info.find('div', class_ = 'watch-spec')
        features = watch_spec.find('th', text='Indications').find_next_sibling('td').text if watch_spec.find('th', text='Indications') else None
        jewels = watch_spec.find('th', text='Jewelling').find_next_sibling('td').text \
            if watch_spec.find('th', text='Jewelling') else None
        frequency = watch_spec.find('th', text='Frequency').find_next_sibling('td').text if watch_spec.find('th', text='Frequency') else None
        power_reserve = watch_spec.find('th', text='Power reserve').find_next_sibling('td').text if watch_spec.find('th', text='Power reserve') else None
        caliber = watch_spec.find('th', text='Movement').find_next_sibling('td').text if watch_spec.find('th', text='Movement') else None
        movement = caliber
        clasp_type = watch_spec.find('th', text='Buckle').find_next_sibling('td').text if watch_spec.find('th', text='Buckle') else None
        bracelet_color = watch_spec.find('th', text='Strap').find_next_sibling('td').text if watch_spec.find('th', text='Strap') else None
        bracelet_material = bracelet_color
        dial_color = watch_spec.find('th', text='Dial').find_next_sibling('td').text if watch_spec.find('th', text='Dial') else None
        water_resistance = watch_spec.find('th', text=lambda text: text and 'resistance' in text).find_next_sibling('td').text if watch_spec.find('th', text=lambda text: text and 'resistance' in text) else None
        case_thickness = watch_spec.find('th', text='Case').find_next_sibling('td').text if watch_spec.find('th', text='Case') else None
        diameter = case_thickness
        case_material = case_thickness

        image_URL = watch_infos.find('div', class_='watch-picture').find('img').get('src')
        price ="N/A"
        currency = "N/A"
        brand = "Jaquet Droz"
        reference_number = watch_spec.find('th', text='Reference').find_next_sibling('td').text

        if watch_infos.select('div.variantes li.variante:not(.active)'):
            list_colors = [li.find('a')['href'] for li in watch_infos.select('div.variantes li.variante:not(.active)')]

            index = urls.index(watch)
            urls[index:index] = [item for item in list_colors if item not in urls]



        watches_data.append({
            "field name": "value",
            "reference_number": reference_number,
            "watch_URL": watch_URL,
            "type": '',
            "brand": brand,
            "year_introduced": '',
            "parent_model": parent_model,
            "specific_model": specific_model,
            "nickname": '',
            "marketing_name": '',
            "style": '',
            "currency": currency,
            "price": price,
            "image_URL": image_URL,
            "made_in": '',
            "case_shape": '',
            "case_material": case_material,
            "case_finish": '',
            "caseback": '',
            "diameter": diameter,
            "between_lugs": '',
            "lug_to_lug": '',
            "case_thickness": case_thickness,
            "bezel_material": '',
            "bezel_color": '',
            "crystal": '',
            "water_resistance": water_resistance,
            "weight": '',
            "dial_color": dial_color,
            "numerals": '',
            "bracelet_material": bracelet_material,
            "bracelet_color": bracelet_color,
            "clasp_type": clasp_type,
            "caliber": caliber,
           "power_reserve": power_reserve,
           "frequency": frequency,
           "jewels": jewels,
           "features": features,
           "description": description,
           "short_description": '',
        })

  features = watch_spec.find('th', text='Indications').find_next_sibling('td').text if watch_spec.find('th', text='Indications') else None
  if watch_spec.find('th', text='Jewelling') else None
  jewels = watch_spec.find('th', text='Jewelling').find_next_sibling('td').text \
  frequency = watch_spec.find('th', text='Frequency').find_next_sibling('td').text if watch_spec.find('th', text='Frequency') else None
  power_reserve = watch_spec.find('th', text='Power reserve').find_next_sibling('td').text if watch_spec.find('th', text='Power reserve') else None
  caliber = watch_spec.find('th', text='Movement').find_next_sibling('td').text if watch_spec.find('th', text='Movement') else None
  clasp_type = watch_spec.find('th', text='Buckle').find_next_sibling('td').text if watch_spec.find('th', text='Buckle') else None
  bracelet_color = watch_spec.find('th', text='Strap').find_next_sibling('td').text if watch_spec.find('th', text='Strap') else None
  dial_color = watch_spec.find('th', text='D

In [7]:
df_watches_data = pd.DataFrame(watches_data)
df_watches_data

Unnamed: 0,field name,reference_number,watch_URL,type,brand,year_introduced,parent_model,specific_model,nickname,marketing_name,...,bracelet_material,bracelet_color,clasp_type,caliber,power_reserve,frequency,jewels,features,description,short_description
0,value,J0328330011,https://www.jaquet-droz.com/en/watches/automat...,,Jaquet Droz,,Timepieces,The Rolling Stones Automaton,,,...,Fabric with printed patterns in rubber,Fabric with printed patterns in rubber,18-karat red gold folding clasp,"Jaquet Droz 2653 AT2, self-winding mechanical ...",68 hours,"28,800 v.p.h.",56 jewels,Off-centered hours and minutes,Black onyx dial and 18-karat red gold applied ...,
1,value,J0328330011,https://www.jaquet-droz.com/en/watches/automat...,,Jaquet Droz,,Timepieces,The Rolling Stones Automaton,,,...,Fabric with printed patterns in rubber,Fabric with printed patterns in rubber,18-karat red gold folding clasp,"Jaquet Droz 2653 AT2, self-winding mechanical ...",68 hours,"28,800 v.p.h.",56 jewels,Off-centered hours and minutes,Black onyx dial and 18-karat red gold applied ...,
2,value,J013523242,https://www.jaquet-droz.com/en/watches/grande-...,,Jaquet Droz,,Timepieces,Tourbillon Skelet,,,...,Rolled-edge hand-made black alligator,Rolled-edge hand-made black alligator,18-karat red gold folding clasp,"Jaquet Droz 2625SQ, self-winding skeleton tour...",7 days,"21,600 v.p.h.",30 jewels,Off-centered hours and minutes at 6 o'clock. T...,Sapphire dial with metallic sapphire base. \r\...,
3,value,J013523242,https://www.jaquet-droz.com/en/watches/grande-...,,Jaquet Droz,,Timepieces,Tourbillon Skelet,,,...,Rolled-edge hand-made black alligator,Rolled-edge hand-made black alligator,18-karat red gold folding clasp,"Jaquet Droz 2625SQ, self-winding skeleton tour...",7 days,"21,600 v.p.h.",30 jewels,Off-centered hours and minutes at 6 o'clock. T...,Sapphire dial with metallic sapphire base. \r\...,
4,value,J0135230011,https://www.jaquet-droz.com/en/watches/grande-...,,Jaquet Droz,,Timepieces,Tourbillon Skelet Skull,,,...,Rubber strap,Rubber strap,18-karat red gold folding clasp,"Jaquet Droz 2625SQ, self-winding skeleton tour...",7 days,"21,600 v.p.h.",30 jewels,Off-centered hours and minutes at 6 o'clock. T...,Hand-engraved and hand-painted 18-karat gold d...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,value,J013013281,https://www.jaquet-droz.com/en/watches/grande-...,,Jaquet Droz,,GRANDE SECONDE,Tourbillon Jadeite,,,...,Rolled-edge hand-made green satin,Rolled-edge hand-made green satin,18-karat red gold folding clasp set with 32 di...,"Jaquet Droz 25JD, self-winding tourbillon move...",7 days,"21,600 v.p.h.",31 jewels,Hours and minutes at 6 o'clock Tourbillon fram...,"Jadeite dial, white mother-of-pearl subdial. \...",
127,value,J013034240,https://www.jaquet-droz.com/en/watches/grande-...,,Jaquet Droz,,GRANDE SECONDE,Tourbillon Côtes de Genève,,,...,Rolled-edge hand-made black alligator leather ...,Rolled-edge hand-made black alligator leather ...,18-carat white gold folding clasp,"Jaquet Droz 25JD, self-winding tourbillon move...",7 days,"21,600 v.p.h",31 jewels,Hours and minutes at 6 o'clock\r\nTourbillon f...,"Côtes de Genève technical dial,\r\nsapphire ri...",
128,value,J013013200,https://www.jaquet-droz.com/en/watches/grande-...,,Jaquet Droz,,GRANDE SECONDE,Tourbillon,,,...,Rolled-edge hand-made black alligator leather ...,Rolled-edge hand-made black alligator leather ...,18-carat red gold folding clasp\r\n,"Jaquet Droz 25JD, self-winding tourbillon move...",7 days,"21,600 v.p.h",31 jewels\r\n,Hours and minutes at 6 o'clock\r\nTourbillon f...,"Ivory Grand Feu enamel, tourbillon frame. 18-c...",
129,value,J013033200,https://www.jaquet-droz.com/en/watches/grande-...,,Jaquet Droz,,GRANDE SECONDE,Tourbillon Ivory Enamel,,,...,Rolled-edge hand-made black alligator leather ...,Rolled-edge hand-made black alligator leather ...,18-carat red gold folding clasp,"Jaquet Droz 25JD, self-winding tourbillon move...",7 days,"21,600 v.p.h",31 jewels,Hours and minutes at 6 o'clock\r\nTourbillon f...,Ivory Grand Feu enameled dial.\r\n18-carat red...,


In [8]:
df_watches_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 40 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   field name         131 non-null    object
 1   reference_number   131 non-null    object
 2   watch_URL          131 non-null    object
 3   type               131 non-null    object
 4   brand              131 non-null    object
 5   year_introduced    131 non-null    object
 6   parent_model       131 non-null    object
 7   specific_model     131 non-null    object
 8   nickname           131 non-null    object
 9   marketing_name     131 non-null    object
 10  style              131 non-null    object
 11  currency           131 non-null    object
 12  price              131 non-null    object
 13  image_URL          131 non-null    object
 14  made_in            131 non-null    object
 15  case_shape         131 non-null    object
 16  case_material      131 non-null    object
 1

In [9]:
df_watches_data.isna().sum()

field name           0
reference_number     0
watch_URL            0
type                 0
brand                0
year_introduced      0
parent_model         0
specific_model       0
nickname             0
marketing_name       0
style                0
currency             0
price                0
image_URL            0
made_in              0
case_shape           0
case_material        0
case_finish          0
caseback             0
diameter             0
between_lugs         0
lug_to_lug           0
case_thickness       0
bezel_material       0
bezel_color          0
crystal              0
water_resistance     7
weight               0
dial_color           5
numerals             0
bracelet_material    5
bracelet_color       5
clasp_type           5
caliber              0
power_reserve        0
frequency            5
jewels               3
features             5
description          0
short_description    0
dtype: int64

In [10]:
df_watches_data.to_csv('df_watches_data.csv', index=False, encoding='utf-8-sig')

In [11]:
import pandas as pd
df = pd.read_csv('df_watches_data.csv')
df

Unnamed: 0,field name,reference_number,watch_URL,type,brand,year_introduced,parent_model,specific_model,nickname,marketing_name,...,bracelet_material,bracelet_color,clasp_type,caliber,power_reserve,frequency,jewels,features,description,short_description
0,value,J0328330011,https://www.jaquet-droz.com/en/watches/automat...,,Jaquet Droz,,Timepieces,The Rolling Stones Automaton,,,...,Fabric with printed patterns in rubber,Fabric with printed patterns in rubber,18-karat red gold folding clasp,"Jaquet Droz 2653 AT2, self-winding mechanical ...",68 hours,"28,800 v.p.h.",56 jewels,Off-centered hours and minutes,Black onyx dial and 18-karat red gold applied ...,
1,value,J0328330011,https://www.jaquet-droz.com/en/watches/automat...,,Jaquet Droz,,Timepieces,The Rolling Stones Automaton,,,...,Fabric with printed patterns in rubber,Fabric with printed patterns in rubber,18-karat red gold folding clasp,"Jaquet Droz 2653 AT2, self-winding mechanical ...",68 hours,"28,800 v.p.h.",56 jewels,Off-centered hours and minutes,Black onyx dial and 18-karat red gold applied ...,
2,value,J013523242,https://www.jaquet-droz.com/en/watches/grande-...,,Jaquet Droz,,Timepieces,Tourbillon Skelet,,,...,Rolled-edge hand-made black alligator,Rolled-edge hand-made black alligator,18-karat red gold folding clasp,"Jaquet Droz 2625SQ, self-winding skeleton tour...",7 days,"21,600 v.p.h.",30 jewels,Off-centered hours and minutes at 6 o'clock. T...,Sapphire dial with metallic sapphire base. \r\...,
3,value,J013523242,https://www.jaquet-droz.com/en/watches/grande-...,,Jaquet Droz,,Timepieces,Tourbillon Skelet,,,...,Rolled-edge hand-made black alligator,Rolled-edge hand-made black alligator,18-karat red gold folding clasp,"Jaquet Droz 2625SQ, self-winding skeleton tour...",7 days,"21,600 v.p.h.",30 jewels,Off-centered hours and minutes at 6 o'clock. T...,Sapphire dial with metallic sapphire base. \r\...,
4,value,J0135230011,https://www.jaquet-droz.com/en/watches/grande-...,,Jaquet Droz,,Timepieces,Tourbillon Skelet Skull,,,...,Rubber strap,Rubber strap,18-karat red gold folding clasp,"Jaquet Droz 2625SQ, self-winding skeleton tour...",7 days,"21,600 v.p.h.",30 jewels,Off-centered hours and minutes at 6 o'clock. T...,Hand-engraved and hand-painted 18-karat gold d...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,value,J013013281,https://www.jaquet-droz.com/en/watches/grande-...,,Jaquet Droz,,GRANDE SECONDE,Tourbillon Jadeite,,,...,Rolled-edge hand-made green satin,Rolled-edge hand-made green satin,18-karat red gold folding clasp set with 32 di...,"Jaquet Droz 25JD, self-winding tourbillon move...",7 days,"21,600 v.p.h.",31 jewels,Hours and minutes at 6 o'clock Tourbillon fram...,"Jadeite dial, white mother-of-pearl subdial. \...",
127,value,J013034240,https://www.jaquet-droz.com/en/watches/grande-...,,Jaquet Droz,,GRANDE SECONDE,Tourbillon Côtes de Genève,,,...,Rolled-edge hand-made black alligator leather ...,Rolled-edge hand-made black alligator leather ...,18-carat white gold folding clasp,"Jaquet Droz 25JD, self-winding tourbillon move...",7 days,"21,600 v.p.h",31 jewels,Hours and minutes at 6 o'clock\r\nTourbillon f...,"Côtes de Genève technical dial,\r\nsapphire ri...",
128,value,J013013200,https://www.jaquet-droz.com/en/watches/grande-...,,Jaquet Droz,,GRANDE SECONDE,Tourbillon,,,...,Rolled-edge hand-made black alligator leather ...,Rolled-edge hand-made black alligator leather ...,18-carat red gold folding clasp\r\n,"Jaquet Droz 25JD, self-winding tourbillon move...",7 days,"21,600 v.p.h",31 jewels\r\n,Hours and minutes at 6 o'clock\r\nTourbillon f...,"Ivory Grand Feu enamel, tourbillon frame. 18-c...",
129,value,J013033200,https://www.jaquet-droz.com/en/watches/grande-...,,Jaquet Droz,,GRANDE SECONDE,Tourbillon Ivory Enamel,,,...,Rolled-edge hand-made black alligator leather ...,Rolled-edge hand-made black alligator leather ...,18-carat red gold folding clasp,"Jaquet Droz 25JD, self-winding tourbillon move...",7 days,"21,600 v.p.h",31 jewels,Hours and minutes at 6 o'clock\r\nTourbillon f...,Ivory Grand Feu enameled dial.\r\n18-carat red...,


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 40 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   field name         131 non-null    object 
 1   reference_number   131 non-null    object 
 2   watch_URL          131 non-null    object 
 3   type               0 non-null      float64
 4   brand              131 non-null    object 
 5   year_introduced    0 non-null      float64
 6   parent_model       131 non-null    object 
 7   specific_model     131 non-null    object 
 8   nickname           0 non-null      float64
 9   marketing_name     0 non-null      float64
 10  style              0 non-null      float64
 11  currency           0 non-null      float64
 12  price              0 non-null      float64
 13  image_URL          131 non-null    object 
 14  made_in            0 non-null      float64
 15  case_shape         0 non-null      float64
 16  case_material      131 non

In [13]:
df.isna().sum()

field name             0
reference_number       0
watch_URL              0
type                 131
brand                  0
year_introduced      131
parent_model           0
specific_model         0
nickname             131
marketing_name       131
style                131
currency             131
price                131
image_URL              0
made_in              131
case_shape           131
case_material          0
case_finish          131
caseback             131
diameter               0
between_lugs         131
lug_to_lug           131
case_thickness         0
bezel_material       131
bezel_color          131
crystal              131
water_resistance       7
weight               131
dial_color             5
numerals             131
bracelet_material      5
bracelet_color         5
clasp_type             5
caliber                0
power_reserve          0
frequency              5
jewels                 3
features               5
description            0
short_description    131
