### Importation des Bibliothèques et Packages

In [64]:
import requests
import os
import re
import time
from datetime import datetime
import unidecode
import uuid

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException

import pandas as pd
from pandas.errors import EmptyDataError
import numpy as np

In [2]:
url = "https://www.largus.fr/Toutes-Marques.html"
port = 59795


### Définition des fonction

In [4]:
def download_image(url, folder_path):
    response = requests.get(url)
    if response.status_code == 200:
        with open(folder_path, 'wb') as f:
            f.write(response.content)
        print(f"Image téléchargée avec succès: {folder_path}")
    else:
        print(f"Échec du téléchargement de l'image depuis l'URL: {url}")

In [5]:
def get_driver():
    chrome_option = Options()
    headless = True
    chrome_option.binary_location = '/Applications/Brave Browser.app'
    
    # if port:
    #     chrome_options.add_argument(f'--remote-debugging-port={port}')
    # if headless:
    #     chrome_options.add_argument('--headless')

    service = Service()
    driver = webdriver.Chrome(service=service, options=chrome_option)
    
    return driver

In [6]:
def get_page_html(url, port=None, headless=False):
    driver = get_driver()
  
    driver.get(url)

    driver.implicitly_wait(30)

    html_content = driver.page_source

    # Fermer le navigateur
    # driver.quit()

    return html_content

In [None]:
html_content = get_page_html(url)

In [None]:
soup = BeautifulSoup(html_content, "html.parser")

In [26]:
marques = []

for item in soup.find_all("div", class_="liste-mm-item"):
    marque = {}
    marque["libelle"] = item.find("a", class_="libelle").text.strip()
    marque["lien_url"] = "https://www.largus.fr" + item.find("a", class_="libelle")["href"].replace("\\/", "/")
    marque["logo_url"] = "https://www.largus.fr" + item.find("img")["src"].replace("\\/", "/")
    marque["alt_text"] = item.find("img")["alt"]
    marques.append(marque)

In [27]:
marques

[{'libelle': 'Abarth',
  'lien_url': 'https://www.largus.fr/Abarth.html',
  'logo_url': 'https://www.largus.fr/img/logos/abarth.png',
  'alt_text': 'Abarth'},
 {'libelle': 'Aiways',
  'lien_url': 'https://www.largus.fr/Aiways.html',
  'logo_url': 'https://www.largus.fr/img/logos/aiways.png',
  'alt_text': 'Aiways'},
 {'libelle': 'Alfa-Romeo',
  'lien_url': 'https://www.largus.fr/Alfa-Romeo.html',
  'logo_url': 'https://www.largus.fr/img/logos/alfa-romeo.png',
  'alt_text': 'Alfa-Romeo'},
 {'libelle': 'Alpine',
  'lien_url': 'https://www.largus.fr/Alpine.html',
  'logo_url': 'https://www.largus.fr/img/logos/alpine.png',
  'alt_text': 'Alpine'},
 {'libelle': 'Aston-Martin',
  'lien_url': 'https://www.largus.fr/Aston-Martin.html',
  'logo_url': 'https://www.largus.fr/img/logos/aston-martin.png',
  'alt_text': 'Aston-Martin'},
 {'libelle': 'Audi',
  'lien_url': 'https://www.largus.fr/Audi.html',
  'logo_url': 'https://www.largus.fr/img/logos/audi.png',
  'alt_text': 'Audi'},
 {'libelle': '

In [28]:
df_marques = pd.DataFrame(marques)

In [163]:
df_marques

Unnamed: 0,libelle,lien_url,logo_url,alt_text,Traiter
0,2twenty,https://www.largus.fr/2twenty.html,https://www.largus.fr/v4/includes/images/refon...,,1
1,Abarth,https://www.largus.fr/Abarth.html,https://www.largus.fÏr/img/logos/abarth.png,Abarth,1
2,Acma,https://www.largus.fr/Acma.html,https://www.largus.fr/img/logos/acma.png,Acma,1
3,Adiva,https://www.largus.fr/Adiva.html,https://www.largus.fr/v4/includes/images/refon...,,1
4,Aeon,https://www.largus.fr/Aeon.html,https://www.largus.fr/img/logos/aeon.png,Aeon,1
...,...,...,...,...,...
161,Yamaha,https://www.largus.fr/Yamaha.html,https://www.largus.fr/img/logos/yamaha.jpg,Yamaha,0
162,Ycf,https://www.largus.fr/Ycf.html,https://www.largus.fr/v4/includes/images/refon...,,0
163,Yiying,https://www.largus.fr/Yiying.html,https://www.largus.fr/img/logos/yiying.png,Yiying,0
164,Youbee Motors,https://www.largus.fr/Youbee-Motors.html,https://www.largus.fr/v4/includes/images/refon...,,0


In [164]:
df_marques.to_json("marques.json", orient="records")

In [165]:
df_marques.to_csv("marques.csv", index=False)

In [40]:
# Création des dossiers et téléchargement des logos
for index, row in df_marques.iterrows():
    libelle = row['libelle']
    logo_url = row['logo_url']
    
    # Créer un dossier avec le libellé de la marque
    folder_path = os.path.join(f"Marque Folder/{libelle.capitalize()}", 'Logo')
    os.makedirs(folder_path, exist_ok=True)

    # Télécharger le logo dans le dossier 'logo'
    logo_filename = os.path.basename(logo_url)
    logo_path = os.path.join(folder_path, logo_filename)
    download_image(logo_url, logo_path)
    

Image téléchargée avec succès: Marque Folder/Abarth/Logo/abarth.png
Image téléchargée avec succès: Marque Folder/Aiways/Logo/aiways.png
Image téléchargée avec succès: Marque Folder/Alfa-romeo/Logo/alfa-romeo.png
Image téléchargée avec succès: Marque Folder/Alpine/Logo/alpine.png
Image téléchargée avec succès: Marque Folder/Aston-martin/Logo/aston-martin.png
Image téléchargée avec succès: Marque Folder/Audi/Logo/audi.png
Image téléchargée avec succès: Marque Folder/Bmw/Logo/bmw.png
Image téléchargée avec succès: Marque Folder/Bmw-alpina/Logo/bmw-alpina.png
Image téléchargée avec succès: Marque Folder/Byd/Logo/no-photo_134x91.png
Image téléchargée avec succès: Marque Folder/Bentley/Logo/bentley.png
Image téléchargée avec succès: Marque Folder/Bluecar/Logo/bluecar.png
Image téléchargée avec succès: Marque Folder/Cadillac/Logo/cadillac.png
Image téléchargée avec succès: Marque Folder/Chevrolet/Logo/chevrolet.png
Image téléchargée avec succès: Marque Folder/Chevrolet usa/Logo/chevrolet-usa.

In [5]:
df_marques = pd.read_json("marques.json")
df_marques = df_marques.sort_values(by='libelle')
df_marques.head()

Unnamed: 0,libelle,lien_url,logo_url,alt_text,Traiter
0,2twenty,https://www.largus.fr/2twenty.html,https://www.largus.fr/v4/includes/images/refon...,,1
1,Abarth,https://www.largus.fr/Abarth.html,https://www.largus.fÏr/img/logos/abarth.png,Abarth,1
2,Acma,https://www.largus.fr/Acma.html,https://www.largus.fr/img/logos/acma.png,Acma,1
3,Adiva,https://www.largus.fr/Adiva.html,https://www.largus.fr/v4/includes/images/refon...,,1
4,Aeon,https://www.largus.fr/Aeon.html,https://www.largus.fr/img/logos/aeon.png,Aeon,1


In [6]:
filtered_df = df_marques[df_marques['Traiter'] == 1]

In [7]:
filtered_df.shape

(166, 5)

In [8]:
df_marques_json = pd.read_json("object_mark_json.json")
df_marques_json = df_marques_json.sort_values(by='Name')
df_marques_json.head()

Unnamed: 0,id,fullpath,Name,Logo
134,35177,/Vehicules/Mark/2twenty/2twenty,2twenty,/Vehicules/Marque Folder/2twenty/Logo/no-photo...
97,35103,/Vehicules/Mark/Abarth/Abarth,Abarth,/Vehicules/Marque Folder/Abarth/Logo/abarth.png
88,35085,/Vehicules/Mark/Acma/Acma,Acma,/Vehicules/Marque Folder/Acma/Logo/acma.png
161,35230,/Vehicules/Mark/Adiva/Adiva,Adiva,/Vehicules/Marque Folder/Adiva/Logo/no-photo_1...
150,35209,/Vehicules/Mark/Aeon/Aeon,Aeon,/Vehicules/Marque Folder/Aeon/Logo/aeon.png


In [9]:
df_marques_json.shape

(166, 4)

In [10]:
print(df_marques.columns)

Index(['libelle', 'lien_url', 'logo_url', 'alt_text', 'Traiter'], dtype='object')


In [11]:
df_marques.shape

(166, 5)

## Modeles

In [12]:
def extract_vehicle_info(driver):
    # Trouver tous les éléments de produit
    try:
        products_elements = driver.find_elements(By.CSS_SELECTOR, 'a.product-wrap')
        if not products_elements:
            print("No product elements found.")
            return []
    except NoSuchElementException:
        print("Error finding product elements.")
        return []

    
    vehicles = []

    # Extraire les informations pour chaque véhicule
    for element in products_elements:
        try:
            vehicle_url = element.get_attribute('href')
            vehicle_model = element.get_attribute('data-model')
            vehicle_make = element.get_attribute('data-make')
            vehicle_title = element.find_element(By.CSS_SELECTOR, 'span.product-title').text

            # Add extracted information to the list
            vehicles.append({
                'url': vehicle_url,
                'model': vehicle_model,
                'make': vehicle_make,
                'title': vehicle_title
            })
        except NoSuchElementException as e:
            print(f"Error extracting data from element: {e}")
            continue

    return vehicles


def save_vehicles_to_csv(vehicles):
    if vehicles:
        folder = "Modeles"
        if not os.path.exists(folder):
            os.makedirs(folder)
            print(f"Folder created at {folder}")

        file_name = f"{vehicles[0]['make']}.csv"
        save_path = os.path.join(folder, file_name)
        pd.DataFrame(vehicles).to_csv(save_path, index=False)
        print(f"Data saved to {save_path}")


def scrape_multiple_urls(driver, url):
    print(f"Scraping URL: {url}")
    driver.get(url)

    vehicles_info = extract_vehicle_info(driver)

    if vehicles_info:
        save_vehicles_to_csv(vehicles_info)

def process_links(driver, dataframe):
    filtered_df = df_marques[df_marques['Traiter'] == 1]
    treated_links = set(filtered_df['lien_url'])  # Un ensemble pour stocker les liens déjà traités

    # Vérifier si la colonne Traiter existe déjà
    if 'Traiter' not in dataframe.columns:
        dataframe['Traiter'] = 0

    counter = 0
    for index, row in dataframe.iterrows():
        link_url = row['lien_url']
        # Vérifier si le lien a déjà été traité
        
        if link_url in treated_links:
            continue
            
        if link_url not in treated_links:
            scrape_multiple_urls(driver, link_url)
            dataframe.at[index, 'Traiter'] = 1
            treated_links.add(link_url)
            dataframe.to_json("marques.json", orient="records")
            counter += 1

            print(f"Waiting for 1 minute before the next URL...{counter}")
            time.sleep(1)

        if counter >= 50:
            print("Arrêt après 50 itérations.")
            break
    print(f"Arrêt toutes les liens, un total de {counter} ont été traitées !.")
    #return dataframe

In [8]:
url_central = 'https://www.largus.fr/Bmw.html'

In [214]:
driver = get_driver()
process_links(driver, df_marques)

Scraping URL: https://www.largus.fr/Swm.html
No product elements found.
Waiting for 1 minute before the next URL...1
Arrêt toutes les liens, un total de 1 ont été traitées !.
Scraping URL: https://www.largus.fr/Tesla.html
Data saved to Modeles/TESLA.csv
Waiting for 1 minute before the next URL...2
Arrêt toutes les liens, un total de 2 ont été traitées !.
Scraping URL: https://www.largus.fr/Toyota.html
Data saved to Modeles/TOYOTA.csv
Waiting for 1 minute before the next URL...3
Arrêt toutes les liens, un total de 3 ont été traitées !.
Scraping URL: https://www.largus.fr/Triumph.html
Data saved to Modeles/TRIUMPH.csv
Waiting for 1 minute before the next URL...4
Arrêt toutes les liens, un total de 4 ont été traitées !.
Scraping URL: https://www.largus.fr/Venturi.html
Data saved to Modeles/VENTURI.csv
Waiting for 1 minute before the next URL...5
Arrêt toutes les liens, un total de 5 ont été traitées !.
Scraping URL: https://www.largus.fr/Vespa.html
Data saved to Modeles/VESPA.csv
Waiting 

In [216]:
driver.quit()

### Concatener le dossier

In [13]:
folder_column_path = 'Vehiculs/Models'
folder_path = 'Models'

In [218]:
# Liste pour stocker les DataFrames


In [219]:
final_df.head()

Unnamed: 0,url,model,make,title
0,https://www.largus.fr/Piaggio_1.html,1,PIAGGIO,
1,https://www.largus.fr/Piaggio_Medley.html,Medley,PIAGGIO,
2,https://www.largus.fr/Piaggio_Mymoover.html,Mymoover,PIAGGIO,
3,https://www.largus.fr/Piaggio_Porter.html,Porter,PIAGGIO,
4,https://www.largus.fr/Piaggio_Strom.html,Strom,PIAGGIO,


In [226]:
df_marques_json.head()

Unnamed: 0,id,fullpath,Name,Logo
134,35177,/Vehicules/Mark/2twenty/2twenty,2twenty,/Vehicules/Marque Folder/2twenty/Logo/no-photo...
97,35103,/Vehicules/Mark/Abarth/Abarth,Abarth,/Vehicules/Marque Folder/Abarth/Logo/abarth.png
88,35085,/Vehicules/Mark/Acma/Acma,Acma,/Vehicules/Marque Folder/Acma/Logo/acma.png
161,35230,/Vehicules/Mark/Adiva/Adiva,Adiva,/Vehicules/Marque Folder/Adiva/Logo/no-photo_1...
150,35209,/Vehicules/Mark/Aeon/Aeon,Aeon,/Vehicules/Marque Folder/Aeon/Logo/aeon.png


In [220]:
final_df.shape

(2848, 4)

In [267]:
df_marques_json['Name'] = df_marques_json['Name'].str.lower()
final_df['make'] = final_df['make'].str.lower()

In [268]:
# Fusionner les DataFrames sur les colonnes 'make' et 'Name'
merged_df = pd.merge(final_df, df_marques_json[['id', 'Name']], left_on='make', right_on='Name', how='left')

In [277]:
merged_df.dropna(subset=['id'], inplace=True)

In [284]:
non_finite_values = merged_df['id'][~merged_df['id'].apply(np.isfinite)]

In [287]:
if len(non_finite_values) == 0:
    merged_df['id'] = merged_df['id'].astype(int)

In [288]:
merged_df['id'].dtype

dtype('int64')

In [289]:
merged_df.head()

Unnamed: 0,url,model,make,title,id,Name
0,https://www.largus.fr/Piaggio_1.html,1,piaggio,,35183,piaggio
1,https://www.largus.fr/Piaggio_Medley.html,Medley,piaggio,,35183,piaggio
2,https://www.largus.fr/Piaggio_Mymoover.html,Mymoover,piaggio,,35183,piaggio
3,https://www.largus.fr/Piaggio_Porter.html,Porter,piaggio,,35183,piaggio
4,https://www.largus.fr/Piaggio_Strom.html,Strom,piaggio,,35183,piaggio


In [294]:
merged_df['folder_column_path'] = merged_df['make'].apply(lambda make: f"Vehiculs/Models/{make.upper()}")

In [295]:
merged_df.head()

Unnamed: 0,url,model,make,title,id,Name,folder_column_path
0,https://www.largus.fr/Piaggio_1.html,1,piaggio,,35183,piaggio,Vehiculs/Models/PIAGGIO
1,https://www.largus.fr/Piaggio_Medley.html,Medley,piaggio,,35183,piaggio,Vehiculs/Models/PIAGGIO
2,https://www.largus.fr/Piaggio_Mymoover.html,Mymoover,piaggio,,35183,piaggio,Vehiculs/Models/PIAGGIO
3,https://www.largus.fr/Piaggio_Porter.html,Porter,piaggio,,35183,piaggio,Vehiculs/Models/PIAGGIO
4,https://www.largus.fr/Piaggio_Strom.html,Strom,piaggio,,35183,piaggio,Vehiculs/Models/PIAGGIO


In [296]:
final_df_cleaned = merged_df[['url', 'model', 'make', 'title', 'id', 'folder_column_path']]

In [297]:
final_df_cleaned.head()

Unnamed: 0,url,model,make,title,id,folder_column_path
0,https://www.largus.fr/Piaggio_1.html,1,piaggio,,35183,Vehiculs/Models/PIAGGIO
1,https://www.largus.fr/Piaggio_Medley.html,Medley,piaggio,,35183,Vehiculs/Models/PIAGGIO
2,https://www.largus.fr/Piaggio_Mymoover.html,Mymoover,piaggio,,35183,Vehiculs/Models/PIAGGIO
3,https://www.largus.fr/Piaggio_Porter.html,Porter,piaggio,,35183,Vehiculs/Models/PIAGGIO
4,https://www.largus.fr/Piaggio_Strom.html,Strom,piaggio,,35183,Vehiculs/Models/PIAGGIO


In [253]:
df_marques_json.columns

Index(['id', 'fullpath', 'Name', 'Logo'], dtype='object')

In [254]:
final_df.columns

Index(['url', 'model', 'make', 'title'], dtype='object')

In [300]:
# Enregistrer le DataFrame final dans un nouveau fichier CSV
final_df_cleaned.to_csv('Modeles/model.csv', index=False)

In [14]:
final_df_cleaned = pd.read_csv('Modeles/model.csv')
final_df_cleaned.shape

(2704, 6)

In [15]:
final_df_cleaned.head()

Unnamed: 0,url,model,make,title,id,folder_column_path
0,https://www.largus.fr/Piaggio_1.html,1,piaggio,,35183,Vehiculs/Models/PIAGGIO
1,https://www.largus.fr/Piaggio_Medley.html,Medley,piaggio,,35183,Vehiculs/Models/PIAGGIO
2,https://www.largus.fr/Piaggio_Mymoover.html,Mymoover,piaggio,,35183,Vehiculs/Models/PIAGGIO
3,https://www.largus.fr/Piaggio_Porter.html,Porter,piaggio,,35183,Vehiculs/Models/PIAGGIO
4,https://www.largus.fr/Piaggio_Strom.html,Strom,piaggio,,35183,Vehiculs/Models/PIAGGIO


In [15]:
driver = get_driver()
driver.get(url_central)
time.sleep(2)
html_content = driver.page_source

In [67]:
driver.quit()

In [19]:
soup = BeautifulSoup(html_content, "html.parser")

In [113]:
# Liste pour stocker les informations extraites
models = extract_vehicle_info(driver)

MaxRetryError: HTTPConnectionPool(host='localhost', port=52051): Max retries exceeded with url: /session/355f01012138d3872f91364770120825/elements (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1299ed510>: Failed to establish a new connection: [Errno 61] Connection refused'))

## les fiches techniques

In [15]:
url_fiche_technique = 'https://www.largus.fr/Audi_A3-Berline.html'

In [8]:
driver = get_driver()
driver.get(url_fiche_technique)
time.sleep(2)
html_content_fiche_technique = driver.page_source

In [115]:
driver.quit()

In [6]:
def extract_make_from_url(url):
    match = re.search(r'/fiche-technique/([^/]+)/', url)
    if match:
        return match.group(1)
    return None

def extract_year_from_libelle(libelle):
    try:
        match = re.search(r'\b\d{4}\b', libelle)
        if match:
            return match.group(0)
        return None
    except Exception as e:
        print(f"Error extracting year from libelle: {e}")
        return None
    
# Fonction pour extraire le lien "Toutes les fiches techniques"
def extract_all_fiches_techniques_url(html_content):
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        section = soup.select_one('section.stacking-block.section-fiches-techniques')

        if section:
            lien_tout = section.select_one('a.lien-tout')
            if lien_tout:
                return lien_tout.get('href')
        return None
    except Exception as e:
        print(f"Error extracting 'lien-tout': {e}")
        return None

# Fonction pour extraire les fiches techniques d'une page donnée
def extract_fiches_techniques(driver, url, model):
    try:
        driver.get(url)
        time.sleep(1)
        response = driver.page_source
    except Exception as e:
        print(f"Error fetching URL: {url}. Exception: {e}")
        return []

    try:
        soup = BeautifulSoup(response, 'html.parser')
        fiches = []
        marque = extract_make_from_url(url)
    except Exception as e:
        print(f"Error parsing HTML content from URL: {url}. Exception: {e}")
        return []

    try:
        for item in soup.select('ul.liste-millesimes li a.item'):
            try:
                libelle = item.select_one('span.libelle').text.strip()
                lien = item.get('href')
                year = extract_year_from_libelle(libelle)
                fiches.append({
                    'Libelle': libelle,
                    'Marque': marque,
                    'Model': model,
                    'Lien': f"https://www.largus.fr{lien}",
                    'Annee': year
                })
            except AttributeError as e:
                print(f"Error extracting data from an item: {e}")
    except Exception as e:
        print(f"Error processing items from URL: {url}. Exception: {e}")
        return []

    return fiches


def process_all_fiches_techniques(html_content_fiche, model):
    all_urls = extract_all_fiches_techniques_url(html_content_fiche)
        
    if all_urls:
        print(f"Lien vers toutes les fiches techniques: {all_urls}")

        # Compléter l'URL si nécessaire
        if not all_urls.startswith('http'):
            all_urls = f'https://www.largus.fr{all_urls}'

        # Extraire les fiches techniques de la page "Toutes les fiches techniques"
        fiches_techniques = extract_fiches_techniques(driver, all_urls, model)

        if fiches_techniques:
            # Sauvegarder les fiches techniques dans un fichier CSV
            df_fiches_technique = pd.DataFrame(fiches_techniques)
            marque = df_fiches_technique['Marque'][0].capitalize()
            folder_path = f"Fiches Techniques/{marque.capitalize()}"

            if not os.path.exists(folder_path):
                os.makedirs(folder_path)
                print(f"Folder created at {folder_path}")

            file_name = f"fiches_techniques_{model.lower()}.csv"
            save_path = os.path.join(folder_path, file_name)

            df_fiches_technique.to_csv(save_path, index=False)
            return df_fiches_technique
        else:
            print("Aucune fiche technique trouvée.")
            return None
    else:
        print("Lien vers Toutes les fiches techniques non trouvé.")
        return None


def process_fiche_technique_file_links(driver, dataframe, column):
    # Vérifier si la colonne Traiter existe déjà
    if 'Traiter' not in dataframe.columns:
        dataframe['Traiter'] = 0

    filtered_df = dataframe[dataframe['Traiter'] == 1]
    treated_links = set(filtered_df[column])  # Un ensemble pour stocker les liens déjà traités

    counter = 0
    for index, row in dataframe[len(treated_links):].iterrows():
        link_url = row[column]
        model = row['model']
        make = row['make']
        
        # Vérifier si le lien a déjà été traité
        if link_url in treated_links:
            continue

        if link_url not in treated_links:
            driver.get(link_url)
            html_content_fiche = driver.page_source
            df_fiches_technique = process_all_fiches_techniques(html_content_fiche, model)
            
            if df_fiches_technique is None:
                continue
                
            dataframe.at[index, 'Traiter'] = 1
            treated_links.update(link_url)
            save_file_path = f"Models/{make}.csv"
            save_file_path = unidecode.unidecode(save_file_path).strip().lower().replace(' ', '_').replace("'", "")
            dataframe.to_csv(save_file_path, index=False)
            
            counter += 1

            print(f"Waiting for 1 minute before the next URL...{counter}")
            time.sleep(1)

        if counter >= 25:
            print("Arrêt après 50 itérations.")
            break

    print(f"Arrêt toutes les liens, un total de {counter} liens ont été traitées !.")

In [10]:
# Extraire le lien vers "Toutes les fiches techniques"
all_url = extract_all_fiches_techniques_url(html_content_fiche_technique)
all_url

'/fiche-technique/Audi/A3+Berline.html'

In [19]:
df_fiches_techniques = pd.read_csv('Fiches Techniques/Audi/fiches_techniques_fiche technique audi a3 berline 2024.csv')
df_fiches_techniques.head()

Unnamed: 0,Libelle,Marque,Lien,Annee
0,Fiche technique Audi A3 Berline 2024,Audi,https://www.largus.fr/fiche-technique/Audi/A3+...,2024
1,Fiche technique Audi A3 Berline 2023,Audi,https://www.largus.fr/fiche-technique/Audi/A3+...,2023
2,Fiche technique Audi A3 Berline 2022,Audi,https://www.largus.fr/fiche-technique/Audi/A3+...,2022
3,Fiche technique Audi A3 Berline 2021,Audi,https://www.largus.fr/fiche-technique/Audi/A3+...,2021
4,Fiche technique Audi A3 Berline 2020,Audi,https://www.largus.fr/fiche-technique/Audi/A3+...,2020


### Fiche technique par lot

In [28]:
bmw_model_path = "Models/BMW.csv"
df_model_bmw = pd.read_csv(bmw_model_path)
df_model_bmw.head()

Unnamed: 0,url,model,make,title,Traiter
0,https://www.largus.fr/Bmw_1m-Coupe.html,1m Coupe,BMW,,1
1,https://www.largus.fr/Bmw_C.html,C,BMW,,1
2,https://www.largus.fr/Bmw_C1.html,C1,BMW,,1
3,https://www.largus.fr/Bmw_F.html,F,BMW,,1
4,https://www.largus.fr/Bmw_G.html,G,BMW,,1


In [10]:
df_model_bmw.shape

(76, 5)

In [20]:
print(df_model_bmw[df_model_bmw['Traiter'] == 1].shape)

(69, 5)


In [21]:
length = len(df_model_bmw[df_model_bmw['Traiter'] == 1])

In [22]:
df_model_bmw[length:]

Unnamed: 0,url,model,make,title,Traiter
69,https://www.largus.fr/Bmw_Z3-M-Roadster.html,Z3 M Roadster,BMW,,1
70,https://www.largus.fr/Bmw_Z3-Roadster.html,Z3 Roadster,BMW,,0
71,https://www.largus.fr/Bmw_Z4.html,Z4,BMW,,0
72,https://www.largus.fr/Bmw_Z4-M-Coupe.html,Z4 M Coupe,BMW,,0
73,https://www.largus.fr/Bmw_Z4-M-Roadster.html,Z4 M Roadster,BMW,,0
74,https://www.largus.fr/Bmw_Z4-Roadster.html,Z4 Roadster,BMW,,0
75,https://www.largus.fr/Bmw_Z8-Roadster.html,Z8 Roadster,BMW,,0


In [30]:
driver = get_driver()
process_fiche_technique_file_links(driver, df_model_bmw, 'url')

Lien vers toutes les fiches techniques: /fiche-technique/Bmw/Z4.html
Aucune fiche technique trouvée.
Lien vers toutes les fiches techniques: /fiche-technique/Bmw/Z4+M+Coupe.html
Waiting for 1 minute before the next URL...1
Lien vers toutes les fiches techniques: /fiche-technique/Bmw/Z4+M+Roadster.html
Waiting for 1 minute before the next URL...2
Lien vers toutes les fiches techniques: /fiche-technique/Bmw/Z4+Roadster.html
Waiting for 1 minute before the next URL...3
Lien vers toutes les fiches techniques: /fiche-technique/Bmw/Z8+Roadster.html
Waiting for 1 minute before the next URL...4
Arrêt toutes les liens, un total de 4 liens ont été traitées !.


In [31]:
driver.quit()

In [36]:
def load_and_concatenate_csvs(folder_path):
    """
    Load all CSV files in the specified folder and concatenate them into a single DataFrame.

    Parameters:
    folder_path (str): The path to the folder containing the CSV files.

    Returns:
    pd.DataFrame: The concatenated DataFrame containing all the data from the CSV files.
    """
    dataframes = []

    # Parcourir tous les fichiers dans le dossier
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(folder_path, filename)
            # Lire le fichier CSV et l'ajouter à la liste des DataFrames
            _df = pd.read_csv(file_path)
            dataframes.append(_df)

    # Concaténer tous les DataFrames en un seul
    final_df = pd.concat(dataframes, ignore_index=True)
    return final_df

In [34]:
folder_path = 'Fiches Techniques/Bmw'
df_fiches_techniques_final = load_and_concatenate_csvs(folder_path)
df_fiches_techniques_final.head()

Unnamed: 0,Libelle,Marque,Model,Lien,Annee
0,Fiche technique BMW M8 Gran Coupe 2024,Bmw,M8 Gran Coupe,https://www.largus.fr/fiche-technique/Bmw/M8+G...,2024
1,Fiche technique BMW M8 Gran Coupe 2023,Bmw,M8 Gran Coupe,https://www.largus.fr/fiche-technique/Bmw/M8+G...,2023
2,Fiche technique BMW M8 Gran Coupe 2022,Bmw,M8 Gran Coupe,https://www.largus.fr/fiche-technique/Bmw/M8+G...,2022
3,Fiche technique BMW M8 Gran Coupe 2021,Bmw,M8 Gran Coupe,https://www.largus.fr/fiche-technique/Bmw/M8+G...,2021
4,Fiche technique BMW M8 Gran Coupe 2020,Bmw,M8 Gran Coupe,https://www.largus.fr/fiche-technique/Bmw/M8+G...,2020


In [35]:
df_fiches_techniques_final.shape

(714, 5)

In [36]:
df_fiches_techniques_final['Annee'].min()

1995

In [37]:
df_fiches_techniques_final.to_csv('fiches_techniques_final.csv', index=False)

Unnamed: 0,Libelle,Marque,Model,Lien,Annee,Traiter
0,Fiche technique BMW M8 Gran Coupe 2024,Bmw,M8 Gran Coupe,https://www.largus.fr/fiche-technique/Bmw/M8+G...,2024,1
1,Fiche technique BMW M8 Gran Coupe 2023,Bmw,M8 Gran Coupe,https://www.largus.fr/fiche-technique/Bmw/M8+G...,2023,1
2,Fiche technique BMW M8 Gran Coupe 2022,Bmw,M8 Gran Coupe,https://www.largus.fr/fiche-technique/Bmw/M8+G...,2022,1
3,Fiche technique BMW M8 Gran Coupe 2021,Bmw,M8 Gran Coupe,https://www.largus.fr/fiche-technique/Bmw/M8+G...,2021,1
4,Fiche technique BMW M8 Gran Coupe 2020,Bmw,M8 Gran Coupe,https://www.largus.fr/fiche-technique/Bmw/M8+G...,2020,1


## Version

In [18]:
url_version = "https://www.largus.fr/fiche-technique/Audi/A5/2024.html"

In [23]:
driver = get_driver()
driver.get(url_version)
# Attendre que la page se charge correctement (si nécessaire)
driver.implicitly_wait(2)
html_content = driver.page_source

In [56]:
def extract_version_data(driver, url_version, html_content, df_rows):
    """
    Extract version data from a given URL and HTML content using Selenium and BeautifulSoup.

    Parameters:
    driver (WebDriver): The Selenium WebDriver instance.
    url_version (str): The URL containing the version information.
    html_content (str): The HTML content of the page.

    Returns:
    list: A list of lists containing version data.
    str: The filename for the CSV file.
    """
    # Extraire l'année de l'URL à l'aide d'une expression régulière
    match = re.search(r'/(\d{4})\.html', url_version)
    if match:
        year = match.group(1)
    else:
        year = datetime.now().year

    # Localiser la table
    table = driver.find_element(By.ID, 'listeVersions')
    
    if table is not None:
        # Extraire les lignes de la table
        rows = table.find_elements(By.TAG_NAME, 'tr')
    
        # Préparer une liste pour stocker les données
        data_versions = []
        mark = df_rows['Marque']
        model = df_rows['Model']
    
        # Boucler à travers les lignes pour extraire les données
        for row in rows[1:]:  # Ignorer l'en-tête
            cols = row.find_elements(By.TAG_NAME, 'td')
            if cols:
                version = cols[0].text
                version_link = cols[0].find_element(By.TAG_NAME, 'a').get_attribute('href')
                carrosserie = cols[1].text
                energy = cols[2].text
                boite = cols[3].text
                puissance_fiscale = cols[4].text
                data_versions.append([version, carrosserie, energy, boite, puissance_fiscale, version_link, year, mark, model])
    
        # Déterminer le nom du fichier CSV
        if data_versions:
            soup = BeautifulSoup(html_content, "html.parser")
            title_tag = soup.find('h1', class_='title lvl1-title')
            if title_tag:
                title_text = title_tag.text.strip().lower()
                title_text = re.sub(r'\s+', '_', title_text)  # Remplacer les espaces par des underscores
                csv_filename = f'{normalize_label(title_text)}.csv'
            else:
                csv_filename = f'fiches_techniques_{year}.csv'
                
            # Créer un DataFrame Pandas à partir des données
            df_versions = pd.DataFrame(data_versions, columns=['Version', 'Carrosserie', 'Energie', 'Boîte', 'Puissance Fiscale', 'Url', 'Année', 'Marque', 'Modele'])
            folder_path = f"Versions/{mark}/{model}"
            
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)
            
            save_path = os.path.join(folder_path, csv_filename)
                
            df_versions.to_csv(save_path, index=False)
      
    
        return data_versions

In [69]:
def process_versions_links(driver, dataframe, column_link='Lien'):
    # Vérifier si la colonne Traiter existe déjà
    if 'Traiter' not in dataframe.columns:
        dataframe['Traiter'] = 0

    filtered_df = dataframe[dataframe['Traiter'] == 1]
    treated_links = set(filtered_df[column_link])  # Un ensemble pour stocker les liens déjà traités

    counter = 0
    captcha = 0
    for index, row in dataframe[len(treated_links):].iterrows():
        link_url = row[column_link]

        # Vérifier si le lien a déjà été traité
        if link_url in treated_links:
            continue

        if link_url not in treated_links:
            driver.get(link_url)
            time.sleep(1)
            html_content = driver.page_source
            data_versions = extract_version_data(driver, link_url, html_content, row)

            if data_versions is None:
                captcha += 1
                continue

            dataframe.at[index, 'Traiter'] = 1
            treated_links.update(link_url)
            save_file_path = "Fiches Techniques/fiches_techniques_final.csv"
            dataframe.to_csv(save_file_path, index=False)

            counter += 1

            print(f"Waiting for 1 minute before the next URL...{counter}")
            time.sleep(1)
            
        if captcha >= 2:
            print("Detection de captcha")
            break
            
        if counter >= 50:
            print("Arrêt après 50 itérations.")
            break

    print(f"Arrêt toutes les liens, un total de {counter} liens ont été traitées !.")

In [154]:
df_fiches_techniques = pd.read_csv('Fiches Techniques/fiches_techniques_final.csv')
len(df_fiches_techniques[df_fiches_techniques['Traiter'] == 1]), len(df_fiches_techniques[df_fiches_techniques['Traiter'] == 0])

(714, 0)

In [151]:
driver = get_driver()

In [152]:
process_versions_links(driver, df_fiches_techniques)

Waiting for 1 minute before the next URL...1
Waiting for 1 minute before the next URL...2
Waiting for 1 minute before the next URL...3
Waiting for 1 minute before the next URL...4
Waiting for 1 minute before the next URL...5
Arrêt toutes les liens, un total de 5 liens ont été traitées !.


In [153]:
driver.quit()

In [5]:
df_version = pd.read_csv('Fiches Techniques/Audi/fiches_techniques_audi_a5_2024.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'Fiches Techniques/Audi/fiches_techniques_audi_a5_2024.csv'

In [20]:
df_version.head()

Unnamed: 0,Version,Carrosserie,Énergie,Boîte,Puissance Fiscale,Url,Année
0,II 35 TDI 163ch Avus S tronic 7 9cv,Coupés,Diesel,Automatique,9 CV,https://www.largus.fr/fiche-technique/Audi/A5/...,2024
1,II 35 TDI 163ch Avus S tronic 7,Cabriolets,Diesel,Automatique,9 CV,https://www.largus.fr/fiche-technique/Audi/A5+...,2024
2,II 35 TDI 163ch Competition S tronic 7,Coupés,Diesel,Automatique,9 CV,https://www.largus.fr/fiche-technique/Audi/A5/...,2024
3,II 35 TDI 163ch Design S tronic 7 9cv,Coupés,Diesel,Automatique,9 CV,https://www.largus.fr/fiche-technique/Audi/A5/...,2024
4,II 35 TDI 163ch S Edition S tronic 7,Coupés,Diesel,Automatique,9 CV,https://www.largus.fr/fiche-technique/Audi/A5/...,2024


In [45]:
driver.quit()

## Information Fiche technique

In [8]:
url_fiche = 'https://www.largus.fr/fiche-technique/Bmw/X6/I+E71/2008/Break+5+Portes/30da+235+Exclusive-966560.html'

In [9]:
driver = get_driver()
driver.get(url_fiche)
# Attendre que la page se charge correctement (si nécessaire)
driver.implicitly_wait(1)
page_source = driver.page_source

In [10]:
driver.quit()

### Fonctions

In [19]:
def extract_vehicle_name(header):
    vehicle_name_tag = header.find('span', class_='libelle-vehicule')
    vehicle_name = vehicle_name_tag.text.strip() if vehicle_name_tag else None
    return vehicle_name

def extract_date_lancement(header):
    date_lancement_tag = header.find('span', class_='date-lancement')
    date_lancement = date_lancement_tag.text.strip() if date_lancement_tag else None
    return date_lancement

def extract_prix(header):
    prix_tag = header.find('div', class_='prix')
    prix = prix_tag.text.strip().replace('\u00a0', ' ') if prix_tag else None
    return prix

def extract_gallery_images(soup, base_url="https://www.largus.fr"):
    gallery_div = soup.find('div', class_='galerieFT')
    images = gallery_div.find_all('img') if gallery_div else []
    image_urls = [base_url + img['src'] for img in images if 'src' in img.attrs]
    return image_urls

In [11]:
# Analyser le contenu de la page avec BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')

In [18]:
def extract_header_data(soup):
    """
    Extract vehicle information from the header section.

    Parameters:
    soup (BeautifulSoup): The BeautifulSoup object of the page.

    Returns:
    dict: A dictionary containing the vehicle name, date of launch, and price.
    """
    # Extraire les informations
    header = soup.find('div', class_='title-bar clearfix')
    vehicle = extract_vehicle_name(header)
    date = extract_date_lancement(header)
    price = extract_prix(header)


    return [vehicle, price, date]

In [13]:
data_header = extract_header_data(soup)
data_header

{'Vehicule': 'BMW X6 I (E71) 3.0dA 235ch Exclusive',
 'Date': '06-2008',
 'Prix': '74 550 €'}

In [14]:
df_data_header = pd.DataFrame([data_header])
df_data_header.head()

Unnamed: 0,Vehicule,Date,Prix
0,BMW X6 I (E71) 3.0dA 235ch Exclusive,06-2008,74 550 €


### Gestion Images

In [15]:
gallery_images = extract_gallery_images(soup)
gallery_images

['https://www.largus.fr/images/photos/rsi/_G_JPG/Voitures/BMW/X6/I_E71/Ph1/Break_5_portes/troisquartavant.jpg',
 'https://www.largus.fr/images/photos/rsi/_G_JPG/Voitures/BMW/X6/I_E71/Ph1/Break_5_portes/blocoptiquearriere.jpg',
 'https://www.largus.fr/images/photos/rsi/_G_JPG/Voitures/BMW/X6/I_E71/Ph1/Break_5_portes/coffre.jpg',
 'https://www.largus.fr/images/photos/rsi/_G_JPG/Voitures/BMW/X6/I_E71/Ph1/Break_5_portes/facearriere.jpg',
 'https://www.largus.fr/images/photos/rsi/_G_JPG/Voitures/BMW/X6/I_E71/Ph1/Break_5_portes/faceavant.jpg',
 'https://www.largus.fr/images/photos/rsi/_G_JPG/Voitures/BMW/X6/I_E71/Ph1/Break_5_portes/interieuravant.jpg',
 'https://www.largus.fr/images/photos/rsi/_G_JPG/Voitures/BMW/X6/I_E71/Ph1/Break_5_portes/planchedebord.jpg',
 'https://www.largus.fr/images/photos/rsi/_G_JPG/Voitures/BMW/X6/I_E71/Ph1/Break_5_portes/planchedebord1.jpg',
 'https://www.largus.fr/images/photos/rsi/_G_JPG/Voitures/BMW/X6/I_E71/Ph1/Break_5_portes/profil.jpg',
 'https://www.largus.

In [32]:
gallery = {
    'Gallery Images' : gallery_images
}
df_gallery = pd.DataFrame([gallery])
df_gallery.head()

Unnamed: 0,Gallery Images
0,[https://www.largus.fr/images/photos/rsi/_G_JP...


### Details

In [17]:
def normalize_label(label):
    return unidecode.unidecode(label).strip().replace(' ', '_').replace("'", "")

def extract_vehicle_resume(soup):
    resume_div = soup.find('div', id='resume')

    details = {}
    # Extraire les informations détaillées
    info_lines = resume_div.find_all('div', class_='ligneInfo')

    for line in info_lines:
        label = line.find('span', class_='labelInfo').text.strip().lower().replace(' ', '_')
        value_element = line.find('span', class_='valeur')

        if value_element:
            value = ' '.join(value_element.text.split())
        else:
            value = '-'

        details[normalize_label(label).upper()] = value

    return details

In [18]:
vehicle_resume = extract_vehicle_resume(soup)
vehicle_resume

{'ENERGIE': 'Diesel',
 'PUISSANCE_COMMERCIALE': 'nc',
 'PUISSANCE_FISCALE': '15 CV',
 'CONSOMMATION_MIXTE': '8,2 L/100 Km',
 'EMISSION_DE_CO2': '217 g/km F',
 'BOITE_DE_VITESSES': 'Automatique',
 'CARROSSERIE': '4*4/SUV/Crossovers',
 'DATE_DE_FIN_DE_COMMERCIALISATION': '02/06/2010'}

In [19]:
resume = {
    'Vehicule Resume': [vehicle_resume],
}
resume

{'Vehicule Resume': [{'ENERGIE': 'Diesel',
   'PUISSANCE_COMMERCIALE': 'nc',
   'PUISSANCE_FISCALE': '15 CV',
   'CONSOMMATION_MIXTE': '8,2 L/100 Km',
   'EMISSION_DE_CO2': '217 g/km F',
   'BOITE_DE_VITESSES': 'Automatique',
   'CARROSSERIE': '4*4/SUV/Crossovers',
   'DATE_DE_FIN_DE_COMMERCIALISATION': '02/06/2010'}]}

In [20]:
df_resume = pd.DataFrame(resume)
df_resume

Unnamed: 0,Vehicule Resume
0,"{'ENERGIE': 'Diesel', 'PUISSANCE_COMMERCIALE':..."


### Fonctions d'extraction par sous-titre

In [16]:
def extract_dimensions(soup):
    dimensions = {}
    dimensions_div = soup.find_all('div', class_='panel-dimPoids')
    if dimensions_div:
        for div in dimensions_div:
            if div.find('h3', class_='sous-titre').text.strip().upper() == "DIMENSIONS":
                dimension_lines = div.find_all('div', class_='ligneInfo')
                for line_div in dimension_lines:
                    label = line_div.find('span', class_='labelInfo').text.strip().lower().replace(' ', '_')
                    value = ' '.join(line_div.find('span', class_='valeur').text.split())
                    dimensions[normalize_label(label).upper()] = value
        return dimensions

def extract_weight(soup):
    weights = {}
    weight_divs = soup.find_all('div', class_='panel-dimPoids')
    for div in weight_divs:
        if div.find('h3', class_='sous-titre').text.strip().lower() == "poids":
            weight_lines = div.find_all('div', class_='ligneInfo')
            for line in weight_lines:
                label = line.find('span', class_='labelInfo').text.strip().lower().replace(' ', '_')
                value = ' '.join(line.find('span', class_='valeur').text.split())
                weights[normalize_label(label).upper()] = value
    return weights

def extract_habitability(soup):
    habitability = {}
    habitability_divs = soup.find_all('div', class_='panel-dimPoids')
    for div in habitability_divs:
        if div.find('h3', class_='sous-titre').text.strip().lower() == "habitabilité":
            habitability_lines = div.find_all('div', class_='ligneInfo')
            for line in habitability_lines:
                label = line.find('span', class_='labelInfo').text.strip().lower().replace(' ', '_')
                value = ' '.join(line.find('span', class_='valeur').text.split())
                habitability[normalize_label(label).upper()] = value
    return habitability

def extract_tires(soup):
    tires = {}
    tires_divs = soup.find_all('div', class_='panel-dimPoids')
    for div in tires_divs:
        if div.find('h3', class_='sous-titre').text.strip().lower() == "pneumatiques":
            tires_lines = div.find_all('div', class_='ligneInfo')
            for line in tires_lines:
                label = line.find('span', class_='labelInfo').text.strip().lower().replace(' ', '_')
                value = ' '.join(line.find('span', class_='valeur').text.split())
                tires[normalize_label(label).upper()] = value
    return tires

def extract_vehicle_details(soup):
    return [extract_dimensions(soup), extract_weight(soup), extract_habitability(soup), extract_tires(soup)]

In [83]:
vehicle_details = extract_vehicle_details(soup)
vehicle_details

[{'LONGUEUR': '4,88 m',
  'LARGEUR': '1,98 m',
  'HAUTEUR': '1,69 m',
  'EMPATTEMENT': '2,93 m',
  'RESERVOIR': '85 l',
  'VOIES_AVANT': '1,644 m',
  'VOIES_ARRIERE': '1,706 m'},
 {'POIDS_A_VIDE': '2 150 kg',
  'PTAC': '2 675 kg',
  'CHARGE_UTILE': '600 kg',
  'POIDS_TRACTE_FREINE': '2 700 kg',
  'POIDS_TRACTE_NON_FREINE': '750 kg'},
 {'NOMBRE_DE_PLACES': '4',
  'VOLUME_DE_COFFRE': '570 l',
  'VOLUME_DE_COFFRE_UTILE': '1 450 l'},
 {'TYPES_DE_PNEUMATIQUES': 'Classique',
  'MATERIAU_DES_JANTES': 'Aluminium',
  'TAILLE_DES_ROUES_AVANT': '275/40 R20',
  'TAILLE_DES_ROUES_ARRIERE': '275/40 R20'}]

In [76]:
df_vehicle_details = pd.DataFrame(vehicle_details)
df_vehicle_details

Unnamed: 0,Dimensions,Weight,Habitability,Tires
0,"{'LONGUEUR': '4,88 m', 'LARGEUR': '1,98 m', 'H...","{'POIDS_A_VIDE': '2 150 kg', 'PTAC': '2 675 kg...","{'NOMBRE_DE_PLACES': '4', 'VOLUME_DE_COFFRE': ...","{'TYPES_DE_PNEUMATIQUES': 'Classique', 'MATERI..."


### Caractéristiques Techniques

In [15]:
def extract_engine_details(soup):
    engine_details = {}
    engine_div = soup.find('h3', class_='sous-titre', string='Moteur').find_next('div', class_='conteneur-infosFT')
    if engine_div:
        engine_lines = engine_div.find_all('div', class_='ligneInfo')
        for line in engine_lines:
            label = line.find('span', class_='labelInfo').text
            value = ' '.join(line.find('span', class_='valeur').text.split())
            engine_details[normalize_label(label).upper()] = value
    return engine_details

def extract_transmission_details(soup):
    transmission_details = {}
    transmission_div = soup.find('h3', class_='sous-titre', string='Transmission').find_next('div', class_='conteneur-infosFT')
    if transmission_div:
        transmission_lines = transmission_div.find_all('div', class_='ligneInfo')
        for line in transmission_lines:
            label = line.find('span', class_='labelInfo').text
            value = ' '.join(line.find('span', class_='valeur').text.split())
            transmission_details[normalize_label(label).upper()] = value
    return transmission_details

def extract_technical_details(soup):
    technical_details = {}
    technical_div = soup.find('h3', class_='sous-titre', string='Technique').find_next('div', class_='conteneur-infosFT')
    if technical_div:
        technical_lines = technical_div.find_all('div', class_='ligneInfo')
        for line in technical_lines:
            label = line.find('span', class_='labelInfo').text
            value = ' '.join(line.find('span', class_='valeur').text.split())
            technical_details[normalize_label(label).upper()] = value
    return technical_details

def extract_vehicle_characteristics(soup):
    characteristics = {
        'Engine': [extract_engine_details(soup)],
        'Transmission': [extract_transmission_details(soup)],
        'Technical': [extract_technical_details(soup)],
    }
    return characteristics

In [25]:
vehicle_characteristics = extract_vehicle_characteristics(soup)
vehicle_characteristics

{'Engine': [{'NOM_DU_MOTEUR': '3.0d235',
   'ENERGIE': 'Diesel',
   'ARCHITECTURE': 'Six cylindres en ligne',
   'ALIMENTATION': 'Turbo à géométrie variable',
   'INJECTION': 'Injection directe à rampe commune',
   'CYLINDREE': '2 993 cm³',
   'PUISSANCE_REELLE_MAXI': '235 ch / 173 kW',
   'AU_REGIME_DE': '2 000 tr/min',
   'COUPLE_MAXI': '520 Nm',
   'NOMBRE_DE_SOUPAPES': '24',
   'ALESAGE/COURSE': '84 x 90',
   'RAPPORT_VOLUMETRIQUE': '17,0 : 1',
   'NORME_ANTI-POLLUTION': 'Euro 4',
   'DISPOSITION_DU_MOTEUR': 'Longitudinale Avant'}],
 'Transmission': [{'BOITE_DE_VITESSES': 'Automatique 6 rapports',
   'MODE_DE_TRANSMISSION': 'Transmission Intégrale'}],
 'Technical': [{'TYPE_DE_CHASSIS': 'Monocoque',
   'MATERIAU_DU_CHASSIS': 'Acier',
   'DIRECTION_ASSISTEE': 'Oui',
   'TYPE_DE_DIRECTION': 'A crémaillère',
   'TYPE_DASSISTANCE': 'A assitance variable',
   'DIAMETRE_DE_BRAQUAGE_(MUR)': '12,8 m',
   'TYPE_DE_SUSPENSION_AVANT': 'essieu à doubles triangles; train avant à bras oscillants 

In [26]:
df_vehicle_characteristics = pd.DataFrame(vehicle_characteristics)
df_vehicle_characteristics

Unnamed: 0,Engine,Transmission,Technical
0,"{'NOM_DU_MOTEUR': '3.0d235', 'ENERGIE': 'Diese...",{'BOITE_DE_VITESSES': 'Automatique 6 rapports'...,"{'TYPE_DE_CHASSIS': 'Monocoque', 'MATERIAU_DU_..."


### Performances et les consommations du véhicule

In [14]:
def extract_performance(soup):
    performance_div = soup.find('div', class_='panel-heading', id='titre-pc')
    if performance_div:
        _div = performance_div.find_next_sibling('div', class_='panel-collapse').find('h3', string='Performances')
        if _div:
            performance_div = _div.find_next_sibling('div', class_='conteneur-infosFT')
            if performance_div:
                performance_data = {}
                for info in performance_div.find_all('div', class_='ligneInfo'):
                    label = info.find('span', class_='labelInfo').text.strip()
                    value = info.find('span', class_='valeur').text.strip()
                    performance_data[normalize_label(label).upper()] = value
                return performance_data
    return None

def extract_consumption(soup):
    consumption_div = soup.find('div', class_='panel-heading', id='titre-pc')
    if consumption_div:
        _div = consumption_div.find_next_sibling('div', class_='panel-collapse').find('h3', string='Consommations')
        if _div:
            consumption_div = _div.find_next('div', class_='conteneur-infosFT')
            if consumption_div:
                consumption_data = {}
                for info in consumption_div.find_all('div', class_='ligneInfo'):
                    label = info.find('span', class_='labelInfo').text.strip()
                    value = info.find('span', class_='valeur').text.strip()
                    consumption_data[normalize_label(label).upper()] = value
                return consumption_data
    return None

In [28]:
performance_data = extract_performance(soup)
consumption_data = extract_consumption(soup)

In [29]:
performance_and_consumption = {
    'Performance': [performance_data],
    'Consumption': [consumption_data]
}

performance_and_consumption

{'Performance': [{'VITESSE_MAXIMALE': '210 km/h',
   '0_A_100_KM/H': '8,0 s',
   '0_A_1000_M_DA': '29,1 s'}],
 'Consumption': [{'CYCLE_URBAIN': '10,4 L/100km',
   'EXTRA_URBAIN': '7,0 L/100km',
   'MIXTE': '8,2 L/100km',
   'EMISSION_DE_CO2': '217 g/km'}]}

In [30]:
df_performance_and_consumption = pd.DataFrame(performance_and_consumption)
df_performance_and_consumption

Unnamed: 0,Performance,Consumption
0,"{'VITESSE_MAXIMALE': '210 km/h', '0_A_100_KM/H...","{'CYCLE_URBAIN': '10,4 L/100km', 'EXTRA_URBAIN..."


In [13]:
def combine_dataframes(fiche_technical_detail):
    df_fiche_technical_detail = pd.concat(fiche_technical_detail, ignore_index=False, axis=1)
    return df_fiche_technical_detail

In [54]:
fiche_technical_details = [
    df_data_header,
    df_resume,
    df_vehicle_details,
    df_vehicle_characteristics,
    df_performance_and_consumption,
    df_gallery
]

In [57]:
df_fiche_technical_details = combine_dataframes(fiche_technical_details)
df_fiche_technical_details.head()

Unnamed: 0,Vehicule,Date,Prix,Vehicule Resume,Dimensions,Weight,Habitability,Tires,Engine,Transmission,Technical,Performance,Consumption,Gallery Images
0,BMW X6 I (E71) 3.0dA 235ch Exclusive,06-2008,74 550 €,"{'ENERGIE': 'Diesel', 'PUISSANCE_COMMERCIALE':...","{'LONGUEUR': '4,88 m', 'LARGEUR': '1,98 m', 'H...","{'POIDS_A_VIDE': '2 150 kg', 'PTAC': '2 675 kg...","{'NOMBRE_DE_PLACES': '4', 'VOLUME_DE_COFFRE': ...","{'TYPES_DE_PNEUMATIQUES': 'Classique', 'MATERI...","{'NOM_DU_MOTEUR': '3.0d235', 'ENERGIE': 'Diese...",{'BOITE_DE_VITESSES': 'Automatique 6 rapports'...,"{'TYPE_DE_CHASSIS': 'Monocoque', 'MATERIAU_DU_...","{'VITESSE_MAXIMALE': '210 km/h', '0_A_100_KM/H...","{'CYCLE_URBAIN': '10,4 L/100km', 'EXTRA_URBAIN...",[https://www.largus.fr/images/photos/rsi/_G_JP...


In [58]:
df_fiche_technical_details.shape

(1, 14)

In [46]:
def read_csv_files_from_directory(root_dir):
    all_dataframes = []

    # Parcours du répertoire racine et de ses sous-répertoires
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.endswith('.csv'):
                file_path = os.path.join(dirpath, filename)
                try:
                    _df = pd.read_csv(file_path)
                    all_dataframes.append(_df)
                except Exception as e:
                    print(f"Erreur lors de la lecture de {file_path}: {e}")

    # Concaténer tous les DataFrames en un seul DataFrame
    if all_dataframes:
        combined_df = pd.concat(all_dataframes, ignore_index=True)
    else:
        combined_df = pd.DataFrame()

    return combined_df

In [59]:
df_fiche_technical_details.to_csv('Fiches Technical Details/fiches_technical_details.csv', index=False)

In [47]:
path_version = 'Versions/Bmw'
df_versions = read_csv_files_from_directory(path_version)

In [50]:
df_versions.head()

Unnamed: 0,Version,Carrosserie,Energie,Boîte,Puissance Fiscale,Url,Année,Marque,Modele
0,I (E36) 2.8i 193ch,Coupés,Essence,Manuelle,13 CV,https://www.largus.fr/fiche-technique/Bmw/Z3+C...,1999,Bmw,Z3
1,I (E36) 3.0i 231ch,Coupés,Essence,Manuelle,15 CV,https://www.largus.fr/fiche-technique/Bmw/Z3+C...,2001,Bmw,Z3
2,I (E36) 2.8i 193ch,Coupés,Essence,Manuelle,13 CV,https://www.largus.fr/fiche-technique/Bmw/Z3+C...,2000,Bmw,Z3
3,I (E36) 3.0i 231ch,Coupés,Essence,Manuelle,15 CV,https://www.largus.fr/fiche-technique/Bmw/Z3+C...,2002,Bmw,Z3
4,I (F34) 318d 150ch Business Lounge,Berlines,Diesel,Manuelle,8 CV,https://www.largus.fr/fiche-technique/Bmw/Seri...,2017,Bmw,Serie 3 Gran Turismo


In [51]:
df_versions.shape

(32070, 9)

In [57]:
df_versions.to_csv(f"{path_version}/{df_versions['Marque'][0]}.csv", index=False)

In [52]:
# df_fiche_technical_details['Modele'] = pd.Series(dtype='str')
# df_fiche_technical_details['Marque'] = pd.Series(dtype='str')
# df_fiche_technical_details['Annee'] = pd.Series(dtype='str')
# 
# for index, row in df_version_bmx.iterrows():
#     model = row['Modele']
#     mark = row['Marque']
#     year = row['Année']
# 
#     df_fiche_technical_details.loc[index, ['Modele', 'Marque', 'Annee']] = [model, mark, year]
#     
#     break
# 
# df_fiche_technical_details.head()

In [59]:
fiche_technical_details.head()

Unnamed: 0,Version,Carrosserie,Energie,Boîte,Puissance Fiscale,Url,Année,Marque,Modele
0,I (E36) 2.8i 193ch,Coupés,Essence,Manuelle,13 CV,https://www.largus.fr/fiche-technique/Bmw/Z3+C...,1999,Bmw,Z3
1,I (E36) 3.0i 231ch,Coupés,Essence,Manuelle,15 CV,https://www.largus.fr/fiche-technique/Bmw/Z3+C...,2001,Bmw,Z3
2,I (E36) 2.8i 193ch,Coupés,Essence,Manuelle,13 CV,https://www.largus.fr/fiche-technique/Bmw/Z3+C...,2000,Bmw,Z3
3,I (E36) 3.0i 231ch,Coupés,Essence,Manuelle,15 CV,https://www.largus.fr/fiche-technique/Bmw/Z3+C...,2002,Bmw,Z3
4,I (F34) 318d 150ch Business Lounge,Berlines,Diesel,Manuelle,8 CV,https://www.largus.fr/fiche-technique/Bmw/Seri...,2017,Bmw,Serie 3 Gran Turismo


In [101]:
def generate_immatriculation():
    return str(uuid.uuid4())

def process_vehicle_data(driver, save_file_path, column_link='Url'):
    dataframe = pd.read_csv(save_file_path)
   
    # Vérifier si la colonne Traiter existe déjà
    if 'Traiter' not in dataframe.columns:
        dataframe['Traiter'] = 0

    filtered_df = dataframe[dataframe['Traiter'] == 1]
    treated_links = set(filtered_df[column_link])  # Un ensemble pour stocker les liens déjà traités

    counter = 0
    details = {
        'Marque': [],
        'Modele': [],
        'Annee': [],
        'Vehicule': [],
        'Prix': [],
        'Date Publication': [],
        'Resumer': [],
        'Dimensions': [],
        'Weight': [],
        'Habitability': [],
        'Tires': [],
        'Engine': [],
        'Transmission': [],
        'Technical': [],
        'Performance': [],
        'Consumption': [],
        'Gallery Images': [],
    }

    for index, row in dataframe[len(treated_links):].iterrows():
        link_url = row[column_link]

        # Vérifier si le lien a déjà été traité
        if link_url in treated_links:
            continue

        model = row['Modele']
        mark = row['Marque']
        year = row['Année']

        
        driver.get(link_url)
        time.sleep(1)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        data_header = extract_header_data(soup)
        vehicle_resume = extract_vehicle_resume(soup)
        vehicle_details = extract_vehicle_details(soup)
        vehicle_characteristics = extract_vehicle_characteristics(soup)
        performance_data = extract_performance(soup)
        consumption_data = extract_consumption(soup)
        gallery_images = extract_gallery_images(soup)

        details['Marque'].append(mark)
        details['Modele'].append(model)
        details['Annee'].append(year)
        details['Vehicule'].append(data_header[0])
        details['Prix'].append(data_header[1])
        details['Date Publication'].append(data_header[2])
        details['Resumer'].append(vehicle_resume)
        details['Dimensions'].append(vehicle_details[0])
        details['Weight'].append(vehicle_details[1])
        details['Habitability'].append(vehicle_details[2])
        details['Tires'].append(vehicle_details[3])
        details['Engine'].append(vehicle_characteristics['Engine'])
        details['Transmission'].append(vehicle_characteristics['Transmission'])
        details['Technical'].append(vehicle_characteristics['Technical'])
        details['Performance'].append(performance_data)
        details['Consumption'].append(consumption_data)
        details['Gallery Images'].append(gallery_images)

        dataframe.at[index, 'Traiter'] = 1
        treated_links.update(link_url)
        counter += 1

        print(f"Waiting for 1 minute before the next URL...{counter}")
        time.sleep(1)

        if counter >= 50:
            print(f"Arrêt après {counter} itérations.")
            break

    dataframe.to_csv(save_file_path, index=False)

    print(f"Arrêt après {counter} itérations.")

    return details

In [94]:
def process_create_fiche_technical_df(data, folder):
    columns = ['Resumer', 'Dimensions', 'Weight', 'Habitability', 'Tires', 'Engine',
     'Transmission', 'Technical', 'Performance', 'Consumption']

    # Vérifier si le fichier existe et charger les données existantes, sinon créer un DataFrame vide
    if os.path.exists(folder):
        try:
            df_save = pd.read_csv(folder)
        except EmptyDataError:
            df_save = pd.DataFrame(columns=['Marque', 'Modele', 'Annee', 'Vehicule', 'Prix', 'Date Publication',
                                            'Resumer', 'Dimensions', 'Weight', 'Habitability', 'Tires', 'Engine',
                                            'Transmission', 'Technical', 'Performance', 'Consumption',
                                            'Gallery Images'])
    else:
        df_save = pd.DataFrame(columns=['Marque', 'Modele', 'Annee', 'Vehicule', 'Prix', 'Date Publication',
                                        'Resumer', 'Dimensions', 'Weight', 'Habitability', 'Tires', 'Engine',
                                        'Transmission', 'Technical', 'Performance', 'Consumption',
                                        'Gallery Images'])
        
    df_fiche = pd.DataFrame(data)

    # Ajouter une colonne Immatriculation avec des valeurs uniques
    df_fiche['Immatriculation'] = df_fiche.apply(lambda _: generate_immatriculation(), axis=1)

    # Ajouter une colonne object_folder avec le chemin formaté
    df_fiche['object_folder'] = df_fiche.apply(
        lambda row: f"Vehiculs/Version/{row['Marque'].capitalize()}/{row['Annee']}/{row['Vehicule'].lower()}", axis=1
    )
    
    for column in columns:
        # Ajouter la clé Immatriculation dans chaque dictionnaire de colonne
        df_fiche[column] = df_fiche.apply(
            lambda row: {**row[column], 'Immatriculation': row['Immatriculation']} if isinstance(row[column], dict) else row[column], axis=1
        )
        # Ajouter la clé Object_Folder_{column} dans chaque dictionnaire de colonne
        df_fiche[column] = df_fiche.apply(
            lambda row: {**row[column], f"Object_Folder_{column}": f"Vehiculs/Models/{row['Marque'].upper()}/{column}"} if isinstance(row[column], dict) else row[column], axis=1
        )

    # Concaténer le DataFrame original avec le nouveau DataFrame
    df_save = pd.concat([df_save, df_fiche], ignore_index=True)
    # Enregistrer le DataFrame concaténé dans le fichier CSV
    df_save.to_csv(folder, index=False)
    
    return df_save

In [110]:
from datetime import datetime
import time
import asyncio
import nest_asyncio

In [112]:
# Appliquer le patch nest_asyncio
nest_asyncio.apply()

async def process_data(waiting_time):
    folder = "Versions/Bmw/Bmw.csv"
    save_file_path = "Fiches Technical Details/fiches_technical_details.csv"

    async with True:
        start = time.time()
        while time.time() <= start + waiting_time * 60:
            driver = get_driver()
            data = process_vehicle_data(driver, save_file_path)
            driver.quit()
            
            if len(data) > 0:
                try:
                    process_create_fiche_technical_df(data, folder)
                except Exception as e:
                    print(f"Error sending event data batch")
            print(f"Données traitées et sauvegardées. En attente de {waiting_time} minutes avant la prochaine exécution.")
            await asyncio.sleep(waiting_time * 60)

def run(duration, frequency):
    start_time = time.time()
    while time.time() < start_time + 60 * duration:
        asyncio.run(process_data(frequency))

def main():
    run(duration=30, frequency=5)

In [113]:
if __name__ == "__main__":
    main()

  return compile(source, filename, mode, flags,


TypeError: 'bool' object does not support the asynchronous context manager protocol

RuntimeError: asyncio.run() cannot be called from a running event loop

In [97]:
fiche_technical_details = process_vehicle_data(driver, folder)

Waiting for 1 minute before the next URL...1
Waiting for 1 minute before the next URL...2
Waiting for 1 minute before the next URL...3
Waiting for 1 minute before the next URL...4
Waiting for 1 minute before the next URL...5
Waiting for 1 minute before the next URL...6
Waiting for 1 minute before the next URL...7
Waiting for 1 minute before the next URL...8
Waiting for 1 minute before the next URL...9
Waiting for 1 minute before the next URL...10
Waiting for 1 minute before the next URL...11
Waiting for 1 minute before the next URL...12
Waiting for 1 minute before the next URL...13
Waiting for 1 minute before the next URL...14
Waiting for 1 minute before the next URL...15
Waiting for 1 minute before the next URL...16
Waiting for 1 minute before the next URL...17
Waiting for 1 minute before the next URL...18
Waiting for 1 minute before the next URL...19
Waiting for 1 minute before the next URL...20
Waiting for 1 minute before the next URL...21
Waiting for 1 minute before the next URL...

In [98]:

df = process_create_fiche_technical_df(fiche_technical_details, save_folder)

In [99]:
df.shape

(156, 19)

In [100]:
driver.quit()

Unnamed: 0,Marque,Modele,Annee,Vehicule,Prix,Date Publication,Resumer,Dimensions,Weight,Habitability,Tires,Engine,Transmission,Technical,Performance,Consumption,Gallery Images
0,Bmw,X6,2008,BMW X6 I (E71) 3.5iA 306ch Exclusive,75 750 €,06-2008,"{'ENERGIE': 'Essence', 'PUISSANCE_COMMERCIALE'...","{'LONGUEUR': '4,88 m', 'LARGEUR': '1,98 m', 'H...","{'POIDS_A_VIDE': '2 145 kg', 'PTAC': '2 670 kg...","{'NOMBRE_DE_PLACES': '4', 'VOLUME_DE_COFFRE': ...","{'TYPES_DE_PNEUMATIQUES': 'Classique', 'MATERI...","[{'NOM_DU_MOTEUR': '3.5i', 'ENERGIE': 'Essence...",[{'BOITE_DE_VITESSES': 'Automatique 6 rapports...,"[{'TYPE_DE_CHASSIS': 'Monocoque', 'MATERIAU_DU...","{'VITESSE_MAXIMALE': '240 km/h', '0_A_100_KM/H...","{'CYCLE_URBAIN': '14,9 L/100km', 'EXTRA_URBAIN...",[https://www.largus.fr/images/photos/rsi/_G_JP...
1,Bmw,X6,2008,BMW X6 I (E71) 3.5iA 306ch Luxe,66 850 €,06-2008,"{'ENERGIE': 'Essence', 'PUISSANCE_COMMERCIALE'...","{'LONGUEUR': '4,88 m', 'LARGEUR': '1,98 m', 'H...","{'POIDS_A_VIDE': '2 145 kg', 'PTAC': '2 670 kg...","{'NOMBRE_DE_PLACES': '4', 'VOLUME_DE_COFFRE': ...","{'TYPES_DE_PNEUMATIQUES': 'Classique', 'MATERI...","[{'NOM_DU_MOTEUR': '3.5i', 'ENERGIE': 'Essence...",[{'BOITE_DE_VITESSES': 'Automatique 6 rapports...,"[{'TYPE_DE_CHASSIS': 'Monocoque', 'MATERIAU_DU...","{'VITESSE_MAXIMALE': '240 km/h', '0_A_100_KM/H...","{'CYCLE_URBAIN': '14,9 L/100km', 'EXTRA_URBAIN...",[https://www.largus.fr/images/photos/rsi/_G_JP...
2,Bmw,X6,2008,BMW X6 I (E71) 5.0iA 407ch Exclusive,91 000 €,06-2008,"{'ENERGIE': 'Essence', 'PUISSANCE_COMMERCIALE'...","{'LONGUEUR': '4,88 m', 'LARGEUR': '1,98 m', 'H...","{'POIDS_A_VIDE': '2 265 kg', 'PTAC': '2 840 kg...","{'NOMBRE_DE_PLACES': '4', 'VOLUME_DE_COFFRE': ...","{'TYPES_DE_PNEUMATIQUES': 'Classique', 'MATERI...","[{'NOM_DU_MOTEUR': '5.0i', 'ENERGIE': 'Essence...",[{'BOITE_DE_VITESSES': 'Automatique 6 rapports...,"[{'TYPE_DE_CHASSIS': 'Monocoque', 'MATERIAU_DU...","{'VITESSE_MAXIMALE': '250 km/h', '0_A_100_KM/H...","{'CYCLE_URBAIN': '17,6 L/100km', 'EXTRA_URBAIN...",[https://www.largus.fr/images/photos/rsi/_G_JP...


In [166]:
df['Resumer'][1]

{'ENERGIE': 'Diesel',
 'PUISSANCE_COMMERCIALE': 'nc',
 'PUISSANCE_FISCALE': '15 CV',
 'CONSOMMATION_MIXTE': '8,2 L/100 Km',
 'EMISSION_DE_CO2': '217 g/km F',
 'BOITE_DE_VITESSES': 'Automatique',
 'CARROSSERIE': '4*4/SUV/Crossovers',
 'DATE_DE_FIN_DE_COMMERCIALISATION': '02/06/2010',
 'Immatriculation': '2aacbae2-8417-40f1-9d47-8f9cfd881166',
 'Object_Folder_Resumer': 'Vehiculs/Models/BMW/Resumer'}

In [1]:
df['Resumer'][0]['ENERGIE']

NameError: name 'df' is not defined

In [136]:
df.to_csv('Fiches Technical Details/fiches_technical_details.csv', index=False)

In [None]:

# save_folder = f"Fiches Technical Details/{mark.capitalize()}"
# csv_file_path = f"Fiches_Technical_Details_{mark.capitalize()}.csv"

#save_file_path = unidecode.unidecode(save_file_path).strip().replace(' ', '_').replace("'", "")

In [62]:
test = pd.read_csv('Fiches Technical Details/fiches_technical_details.csv')

EmptyDataError: No columns to parse from file