### Importation des Bibliothèques et Packages

In [None]:
import requests
import os
import re
import time
from datetime import datetime
import unidecode
import uuid

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException

import pandas as pd
from pandas.errors import EmptyDataError
import numpy as np

In [None]:
url = "https://www.largus.fr/Toutes-Marques.html"
port = 59795

### Définition des fonctions

In [None]:
def download_image(url, folder_path, timeout=30):
    """
    Download the resource at ``url`` and write its bytes to ``folder_path``.

    Parameters:
    url (str): Address of the image to fetch.
    folder_path (str): Destination FILE path (despite the name, this is the
        path passed to ``open``, not a directory).
    timeout (float): Seconds to wait for the server before giving up.
        requests waits indefinitely when no timeout is given.

    Returns:
    None. Success or failure is reported on stdout; a failed download does
    not raise, so a caller looping over many images keeps going.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failure (DNS, refused connection, timeout, ...):
        # treated like a non-200 answer so one bad logo does not stop a batch.
        print(f"Échec du téléchargement de l'image depuis l'URL: {url} ({exc})")
        return

    if response.status_code != 200:
        print(f"Échec du téléchargement de l'image depuis l'URL: {url}")
        return

    with open(folder_path, 'wb') as f:
        f.write(response.content)
    print(f"Image téléchargée avec succès: {folder_path}")

In [None]:
def get_driver(port=None, headless=False):
    """
    Build a Selenium Chrome driver that launches the Brave browser binary.

    Parameters:
    port (int | None): When given, passed to the browser as
        ``--remote-debugging-port``. Default: no debugging port.
    headless (bool): When True, the browser is started with ``--headless``.
        Default False, i.e. a visible window, which is what a call without
        arguments has always produced.

    Returns:
    WebDriver: The started driver. The caller is responsible for calling
    ``quit()`` on it.
    """
    chrome_options = Options()
    # Hard-coded macOS install location of Brave.
    chrome_options.binary_location = '/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'

    if port:
        chrome_options.add_argument(f'--remote-debugging-port={port}')
    if headless:
        chrome_options.add_argument('--headless')

    service = Service()
    return webdriver.Chrome(service=service, options=chrome_options)

In [None]:
def get_page_html(url, port=None, headless=False):
    """
    Open ``url`` in a fresh browser and return the page source.

    Parameters:
    url (str): Page to load.
    port: Accepted for interface compatibility; not used by this function.
    headless: Accepted for interface compatibility; not used by this function.

    Returns:
    str: The HTML reported by ``driver.page_source`` after the page load.
    """
    driver = get_driver()
    try:
        driver.get(url)

        # Implicit wait only affects later element lookups; kept from the
        # original flow.
        driver.implicitly_wait(30)

        return driver.page_source
    finally:
        # The driver is local to this function and never handed back to the
        # caller, so it must be closed here or the browser process leaks.
        driver.quit()

In [None]:
html_content = get_page_html(url)

In [None]:
soup = BeautifulSoup(html_content, "html.parser")

In [None]:
# Build one record per brand tile ("liste-mm-item") found on the page.
marques = []

for item in soup.find_all("div", class_="liste-mm-item"):
    brand_link = item.find("a", class_="libelle")
    brand_logo = item.find("img")
    marques.append({
        "libelle": brand_link.text.strip(),
        # Site-relative paths may contain escaped slashes ("\/"); unescape
        # them and prefix the site root.
        "lien_url": "https://www.largus.fr" + brand_link["href"].replace("\\/", "/"),
        "logo_url": "https://www.largus.fr" + brand_logo["src"].replace("\\/", "/"),
        "alt_text": brand_logo["alt"],
    })

In [None]:
marques

In [None]:
df_marques = pd.DataFrame(marques)

In [None]:
df_marques

In [None]:
df_marques.to_json("marques.json", orient="records")

In [None]:
df_marques.to_csv("marques.csv", index=False)

In [None]:
# Create one folder per brand and download the brand logo into it
for libelle, logo_url in zip(df_marques['libelle'], df_marques['logo_url']):
    # Destination directory: "Marque Folder/<Brand>/Logo"
    folder_path = os.path.join(f"Marque Folder/{libelle.capitalize()}", 'Logo')
    os.makedirs(folder_path, exist_ok=True)

    # The logo keeps the file name it has at the end of its URL
    logo_path = os.path.join(folder_path, os.path.basename(logo_url))
    download_image(logo_url, logo_path)
    

In [None]:
df_marques = pd.read_json("marques.json")
df_marques = df_marques.sort_values(by='libelle')
df_marques.head()

In [None]:
filtered_df = df_marques[df_marques['Traiter'] == 1]

In [None]:
filtered_df.shape

In [None]:
df_marques_json = pd.read_json("object_mark_json.json")
df_marques_json = df_marques_json.sort_values(by='Name')
df_marques_json.head()

In [None]:
df_marques_json.shape

In [None]:
print(df_marques.columns)

In [None]:
df_marques.shape

## Modèles

In [None]:
def extract_vehicle_info(driver):
    """
    Collect url / model / make / title for every ``a.product-wrap`` element
    on the page currently loaded in ``driver``.

    Returns:
    list: One dict per product with the keys 'url', 'model', 'make' and
    'title'. Empty when no product element is found. Products whose
    ``span.product-title`` is missing are reported on stdout and skipped.
    """
    try:
        cards = driver.find_elements(By.CSS_SELECTOR, 'a.product-wrap')
    except NoSuchElementException:
        print("Error finding product elements.")
        return []

    if not cards:
        print("No product elements found.")
        return []

    collected = []
    for card in cards:
        try:
            record = {
                'url': card.get_attribute('href'),
                'model': card.get_attribute('data-model'),
                'make': card.get_attribute('data-make'),
                'title': card.find_element(By.CSS_SELECTOR, 'span.product-title').text,
            }
        except NoSuchElementException as e:
            print(f"Error extracting data from element: {e}")
            continue
        collected.append(record)

    return collected


def save_vehicles_to_csv(vehicles):
    """
    Write ``vehicles`` to "Modeles/<make>.csv", where <make> is the 'make'
    value of the first record. Does nothing when ``vehicles`` is empty.
    """
    if not vehicles:
        return

    target_dir = "Modeles"
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
        print(f"Folder created at {target_dir}")

    # The file is named after the make of the first vehicle in the list.
    destination = os.path.join(target_dir, f"{vehicles[0]['make']}.csv")
    frame = pd.DataFrame(vehicles)
    frame.to_csv(destination, index=False)
    print(f"Data saved to {destination}")


def scrape_multiple_urls(driver, url):
    """
    Load ``url`` in ``driver``, extract the vehicles listed on the page and
    save them to CSV when at least one was found.
    """
    print(f"Scraping URL: {url}")
    driver.get(url)

    found = extract_vehicle_info(driver)
    if not found:
        return

    save_vehicles_to_csv(found)

def process_links(driver, dataframe, max_links=50, delay_seconds=1):
    """
    Scrape the model list of every brand link not yet flagged as processed.

    Parameters:
    driver (WebDriver): Selenium driver handed to scrape_multiple_urls.
    dataframe (DataFrame): Brand table with a 'lien_url' column. A 'Traiter'
        column (1 = processed) is created when missing and updated in place.
    max_links (int): Stop once this many links have been scraped by this call.
    delay_seconds (float): Pause between two scraped links.

    Returns:
    None. After each scraped link the whole table is written to
    "marques.json" so an interrupted run can be resumed.
    """
    # The progress column must exist BEFORE it is read; this check used to
    # come after the first read and raised KeyError on a fresh table.
    if 'Traiter' not in dataframe.columns:
        dataframe['Traiter'] = 0

    # Links already processed, taken from the table passed in (not from a
    # notebook-level global).
    treated_links = set(dataframe.loc[dataframe['Traiter'] == 1, 'lien_url'])

    counter = 0
    for index, row in dataframe.iterrows():
        link_url = row['lien_url']

        if link_url in treated_links:
            continue

        scrape_multiple_urls(driver, link_url)
        dataframe.at[index, 'Traiter'] = 1
        treated_links.add(link_url)
        dataframe.to_json("marques.json", orient="records")
        counter += 1

        print(f"Waiting {delay_seconds} second(s) before the next URL...{counter}")
        time.sleep(delay_seconds)

        if counter >= max_links:
            print(f"Arrêt après {max_links} itérations.")
            break
    print(f"Arrêt toutes les liens, un total de {counter} ont été traitées !.")

In [None]:
url_central = 'https://www.largus.fr/Bmw.html'

In [None]:
driver = get_driver()
process_links(driver, df_marques)

In [None]:
driver.quit()

### Concaténer le dossier

In [None]:
folder_column_path = 'Vehiculs/Models'
folder_path = 'Models'

In [None]:
# Liste pour stocker les DataFrames


In [None]:
final_df.head()

In [None]:
df_marques_json.head()

In [None]:
final_df.shape

In [None]:
df_marques_json['Name'] = df_marques_json['Name'].str.lower()
final_df['make'] = final_df['make'].str.lower()

In [None]:
# Fusionner les DataFrames sur les colonnes 'make' et 'Name'
merged_df = pd.merge(final_df, df_marques_json[['id', 'Name']], left_on='make', right_on='Name', how='left')

In [None]:
merged_df.dropna(subset=['id'], inplace=True)

In [None]:
non_finite_values = merged_df['id'][~merged_df['id'].apply(np.isfinite)]

In [None]:
if len(non_finite_values) == 0:
    merged_df['id'] = merged_df['id'].astype(int)

In [None]:
merged_df['id'].dtype

In [None]:
merged_df.head()

In [None]:
merged_df['folder_column_path'] = merged_df['make'].apply(lambda make: f"Vehiculs/Models/{make.upper()}")

In [None]:
merged_df.head()

In [None]:
final_df_cleaned = merged_df[['url', 'model', 'make', 'title', 'id', 'folder_column_path']]

In [None]:
final_df_cleaned.head()

In [None]:
df_marques_json.columns

In [None]:
final_df.columns

In [None]:
# Enregistrer le DataFrame final dans un nouveau fichier CSV
final_df_cleaned.to_csv('Modeles/model.csv', index=False)

In [None]:
final_df_cleaned = pd.read_csv('Modeles/model.csv')
final_df_cleaned.shape

In [None]:
final_df_cleaned.head()

In [None]:
driver = get_driver()
driver.get(url_central)
time.sleep(2)
html_content = driver.page_source

In [None]:
driver.quit()

In [None]:
soup = BeautifulSoup(html_content, "html.parser")

In [None]:
# Liste pour stocker les informations extraites
models = extract_vehicle_info(driver)

## Les fiches techniques

In [None]:
url_fiche_technique = 'https://www.largus.fr/Audi_A3-Berline.html'

In [None]:
def detect_captcha(soup):
    """
    Tell whether the first iframe of the page is the captcha challenge.

    Parameters:
    soup (BeautifulSoup): Parsed page.

    Returns:
    bool: True when the first ``<iframe>`` has a ``src`` starting with the
    captcha-delivery URL; False when there is no iframe, no ``src``, or a
    different ``src``.
    """
    iframe = soup.find('iframe')
    # find() yields None (not -1) when nothing matches.
    if iframe is None:
        return False

    # An iframe without a src attribute cannot be the captcha frame.
    src = iframe.get('src') or ''
    return src.startswith('https://geo.captcha-delivery.com/captcha/?initialCid=')

In [None]:
driver = get_driver()
driver.get(url_fiche_technique)
time.sleep(2)
html_content = driver.page_source
soup = BeautifulSoup(html_content, 'html.parser')

In [None]:
iframe = soup.find('iframe')
iframe

In [None]:
print(detect_captcha(soup))

In [None]:
driver.quit()

In [None]:
def extract_make_from_url(url):
    """Return the path segment following '/fiche-technique/' in ``url``, or None."""
    found = re.search(r'/fiche-technique/([^/]+)/', url)
    return found.group(1) if found else None

def extract_year_from_libelle(libelle):
    """
    Return the first standalone 4-digit number found in ``libelle`` as a
    string, or None when there is none. An input the regex cannot search is
    reported on stdout and also yields None.
    """
    try:
        found = re.search(r'\b\d{4}\b', libelle)
    except Exception as e:
        print(f"Error extracting year from libelle: {e}")
        return None

    if not found:
        return None
    return found.group(0)
    
# Extract the "Toutes les fiches techniques" link
def extract_all_fiches_techniques_url(html_content):
    """
    Return the href of the ``a.lien-tout`` link located inside the
    ``section.stacking-block.section-fiches-techniques`` block.

    Returns:
    The href value, or None when the section or the link is missing or when
    parsing fails (the error is printed).
    """
    try:
        parsed = BeautifulSoup(html_content, 'html.parser')
        block = parsed.select_one('section.stacking-block.section-fiches-techniques')
        link = block.select_one('a.lien-tout') if block else None
        return link.get('href') if link else None
    except Exception as e:
        print(f"Error extracting 'lien-tout': {e}")
        return None

# Extract the technical-sheet entries listed on a given page
def extract_fiches_techniques(driver, url, model):
    """
    Load ``url`` and return one record per ``ul.liste-millesimes li a.item``.

    Parameters:
    driver (WebDriver): Selenium driver used to fetch the page.
    url (str): Page listing the technical sheets; the make is read from it.
    model: Value copied as-is into the 'Model' field of every record.

    Returns:
    list: Dicts with the keys 'Libelle', 'Marque', 'Model', 'Lien', 'Annee'.
    An empty list when fetching, parsing or iterating fails (the error is
    printed). Items lacking a ``span.libelle`` are reported and skipped.
    """
    # Step 1: fetch the page.
    try:
        driver.get(url)
        time.sleep(1)
        page_html = driver.page_source
    except Exception as e:
        print(f"Error fetching URL: {url}. Exception: {e}")
        return []

    # Step 2: parse it and work out the make from the URL.
    try:
        parsed = BeautifulSoup(page_html, 'html.parser')
        entries = []
        brand = extract_make_from_url(url)
    except Exception as e:
        print(f"Error parsing HTML content from URL: {url}. Exception: {e}")
        return []

    # Step 3: one record per listed item.
    try:
        for anchor in parsed.select('ul.liste-millesimes li a.item'):
            try:
                label = anchor.select_one('span.libelle').text.strip()
                href = anchor.get('href')
                annee = extract_year_from_libelle(label)
                entries.append({
                    'Libelle': label,
                    'Marque': brand,
                    'Model': model,
                    'Lien': f"https://www.largus.fr{href}",
                    'Annee': annee
                })
            except AttributeError as e:
                print(f"Error extracting data from an item: {e}")
    except Exception as e:
        print(f"Error processing items from URL: {url}. Exception: {e}")
        return []

    return entries


def process_all_fiches_techniques(html_content_fiche, model, make, driver=None):
    """
    Follow the "Toutes les fiches techniques" link of a model page and save
    the technical sheets it lists.

    Parameters:
    html_content_fiche (str): HTML of the model page.
    model (str): Model name; used in the records and in the CSV file name.
    make (str): Make name; used for the output sub-folder.
    driver (WebDriver | None): Selenium driver used to load the listing page.
        When omitted, a module-level variable named ``driver`` is used, which
        is what this function silently relied on before.

    Returns:
    DataFrame | None: The sheets written to
    "Data/Fiches Techniques/<Make>/fiches_techniques_<model>.csv", or None
    when the link or the sheets could not be found.
    """
    if driver is None:
        # Backward compatibility with callers that pass only three arguments.
        driver = globals().get('driver')

    all_urls = extract_all_fiches_techniques_url(html_content_fiche)
    if not all_urls:
        print("Lien vers Toutes les fiches techniques non trouvé.")
        return None

    print(f"Lien vers toutes les fiches techniques: {all_urls}")

    # Complete site-relative links.
    if not all_urls.startswith('http'):
        all_urls = f'https://www.largus.fr{all_urls}'

    fiches_techniques = extract_fiches_techniques(driver, all_urls, model)
    if not fiches_techniques:
        print("Aucune fiche technique trouvée.")
        return None

    df_fiches_technique = pd.DataFrame(fiches_techniques)
    folder_path = f"Data/Fiches Techniques/{make.capitalize()}"

    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Folder created at {folder_path}")

    file_name = f"fiches_techniques_{model.lower()}.csv"
    save_path = os.path.join(folder_path, file_name)

    df_fiches_technique.to_csv(save_path, index=False)
    return df_fiches_technique


def process_fiche_technique_file_links(driver, dataframe, column, max_links=5, delay_seconds=1):
    """
    Process every model link of ``dataframe`` not yet flagged as processed.

    Parameters:
    driver (WebDriver): Selenium driver used for all page loads.
    dataframe (DataFrame): Model table with 'model', 'make' and ``column``
        columns. A 'Traiter' column (1 = processed) is created when missing
        and updated in place.
    column (str): Name of the column holding the model page URL.
    max_links (int): Stop once this many links have been processed.
    delay_seconds (float): Pause between two processed links.

    Returns:
    None. After each processed link the whole table is written to the
    normalised path derived from "Data/Models/<make>.csv".
    """
    if 'Traiter' not in dataframe.columns:
        dataframe['Traiter'] = 0

    treated_links = set(dataframe.loc[dataframe['Traiter'] == 1, column])

    counter = 0
    # Iterate over ALL rows: processed rows are not guaranteed to sit at the
    # top of the table (a row left unprocessed can precede processed ones),
    # so skipping the first len(treated_links) rows would lose work. The
    # membership test below is what skips processed links.
    for index, row in dataframe.iterrows():
        link_url = row[column]
        if link_url in treated_links:
            continue

        model = row['model']
        make = row['make']

        driver.get(link_url)
        html_content_fiche = driver.page_source
        df_fiches_technique = process_all_fiches_techniques(
            html_content_fiche, model, make, driver=driver
        )

        if df_fiches_technique is None:
            # Left unflagged so a later run retries it.
            continue

        dataframe.at[index, 'Traiter'] = 1
        # add(), not update(): update() on a string inserts its characters.
        treated_links.add(link_url)

        save_file_path = f"Data/Models/{make}.csv"
        save_file_path = unidecode.unidecode(save_file_path).strip().lower().replace(' ', '_').replace("'", "")
        # The normalisation above also lower-cases the directory part, so
        # make sure that directory exists before writing.
        os.makedirs(os.path.dirname(save_file_path), exist_ok=True)
        dataframe.to_csv(save_file_path, index=False)

        counter += 1

        print(f"Waiting {delay_seconds} second(s) before the next URL...{counter}")
        time.sleep(delay_seconds)

        if counter >= max_links:
            print(f"Arrêt après {counter} itérations.")
            break

    print(f"Arrêt toutes les liens, un total de {counter} liens ont été traitées !.")

In [None]:
# Extraire le lien vers "Toutes les fiches techniques"
all_url = extract_all_fiches_techniques_url(html_content_fiche_technique)
all_url

In [None]:
df_fiches_techniques = pd.read_csv('Fiches Techniques/Audi/fiches_techniques_fiche technique audi a3 berline 2024.csv')
df_fiches_techniques.head()

### Fiche technique par lot

In [None]:
model_path = "Data/all_models.csv"
df_model = pd.read_csv(model_path)
df_model[df_model['Traiter'] == 0].shape[0]

In [None]:
len(df_model[df_model['Traiter'] == 0])

In [None]:

df_model.to_csv('Data/all_models.csv', index=False)

In [None]:
driver = get_driver()
process_fiche_technique_file_links(driver, df_model, 'url')
driver.quit()

In [None]:
def load_and_concatenate_csvs(folder_path):
    """
    Read every ``.csv`` file under ``folder_path`` (sub-folders included) and
    stack them into one DataFrame.

    Parameters:
    folder_path (str): Root directory to walk.

    Returns:
    DataFrame: All rows with a fresh 0..n-1 index. Files are read in sorted
    order within each directory, and sub-directories are visited in sorted
    order, so the row order is reproducible. An empty DataFrame is returned
    when no CSV file is found.
    """
    dataframes = []

    for root, dirs, files in os.walk(folder_path):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for filename in sorted(files):
            if filename.endswith('.csv'):
                dataframes.append(pd.read_csv(os.path.join(root, filename)))

    # pd.concat raises on an empty list; an empty folder is not an error here.
    if not dataframes:
        return pd.DataFrame()

    return pd.concat(dataframes, ignore_index=True)

In [None]:
folder_path = 'Data/Fiches Techniques'
df_fiches_techniques_final = load_and_concatenate_csvs(folder_path)
df_fiches_techniques_final.head()

In [None]:
df_fiches_techniques_final.shape

In [None]:
df_fiches_techniques_final['Annee'].min()

In [None]:
df_fiches_techniques_final[df_fiches_techniques_final['Marque'] == 'Bmw']

In [None]:
# Flag every row whose 'Marque' is 'Bmw' as already processed (Traiter = 1)
df_fiches_techniques_final.loc[df_fiches_techniques_final['Marque'] == 'Bmw', 'Traiter'] = 1

In [None]:
df_fiches_techniques_final['Traiter'] = 0

In [None]:
# Mélanger les lignes du DataFrame
df_fiches_techniques_final = df_fiches_techniques_final.sample(frac=1).reset_index(drop=True)

In [None]:
df_fiches_techniques_final.head(10)

In [None]:
df_fiches_techniques_final.to_csv('Data/Fiches Techniques/fiches_techniques_final.csv', index=False)

## Version

In [None]:
url_version = "https://www.largus.fr/fiche-technique/Audi/A5/2024.html"

In [None]:
driver = get_driver()
driver.get(url_version)
# Attendre que la page se charge correctement (si nécessaire)
driver.implicitly_wait(2)
html_content = driver.page_source

In [None]:
driver.quit()

In [None]:
def extract_version_data(driver, url_version, html_content, df_rows):
    """
    Extract the version table of the page currently loaded in ``driver`` and
    save it as a CSV file.

    Parameters:
    driver (WebDriver): The Selenium WebDriver instance, already on the page.
    url_version (str): The URL of the page; the year is read from it.
    html_content (str): The HTML content of the page (used for the title).
    df_rows (Series | dict): Source row providing 'Marque' and 'Model'.

    Returns:
    list | None: A list of lists
    [version, carrosserie, energy, boite, puissance_fiscale, url, year,
    mark, model]. None when the page has no element with id 'listeVersions',
    which the caller treats as a blocked / captcha page.
    """
    # Year from the URL (".../2024.html"); falls back to the current year.
    # Kept as a string in both cases so the 'Année' column has one type.
    match = re.search(r'/(\d{4})\.html', url_version)
    year = match.group(1) if match else str(datetime.now().year)

    # find_element raises instead of returning None when nothing matches.
    try:
        table = driver.find_element(By.ID, 'listeVersions')
    except NoSuchElementException:
        return None

    rows = table.find_elements(By.TAG_NAME, 'tr')

    data_versions = []
    mark = df_rows['Marque']
    model = df_rows['Model']

    for row in rows[1:]:  # skip the header row
        cols = row.find_elements(By.TAG_NAME, 'td')
        # Five cells are read below; shorter rows are skipped.
        if len(cols) < 5:
            continue
        version = cols[0].text
        version_link = cols[0].find_element(By.TAG_NAME, 'a').get_attribute('href')
        carrosserie = cols[1].text
        energy = cols[2].text
        boite = cols[3].text
        puissance_fiscale = cols[4].text
        data_versions.append([version, carrosserie, energy, boite, puissance_fiscale, version_link, year, mark, model])

    if data_versions:
        # File name: normalised page title, or a year-based fallback.
        soup = BeautifulSoup(html_content, "html.parser")
        title_tag = soup.find('h1', class_='title lvl1-title')
        if title_tag:
            title_text = title_tag.text.strip().lower()
            title_text = re.sub(r'\s+', '_', title_text)  # whitespace runs -> underscore
            csv_filename = f'{normalize_label(title_text)}.csv'
        else:
            csv_filename = f'fiches_techniques_{year}.csv'

        df_versions = pd.DataFrame(data_versions, columns=['Version', 'Carrosserie', 'Energie', 'Boîte', 'Puissance Fiscale', 'Url', 'Année', 'Marque', 'Modele'])
        folder_path = f"Versions/{mark}/{model}"
        os.makedirs(folder_path, exist_ok=True)

        save_path = os.path.join(folder_path, csv_filename)
        df_versions.to_csv(save_path, index=False)

    return data_versions

In [None]:
def process_versions_links(driver, dataframe, column_link='Lien',
                           save_file_path="Fiches Techniques/fiches_techniques_final.csv",
                           max_links=50, max_captcha=2, delay_seconds=1):
    """
    Extract the version table of every link not yet flagged as processed.

    Parameters:
    driver (WebDriver): Selenium driver used for all page loads.
    dataframe (DataFrame): Table with 'Marque', 'Model' and ``column_link``
        columns. A 'Traiter' column (1 = processed) is created when missing
        and updated in place.
    column_link (str): Name of the column holding the page URL.
    save_file_path (str): Where the progress table is written after each
        processed link.
        NOTE(review): this notebook reads the table back from
        'Data/Fiches Techniques/fiches_techniques_final.csv' — confirm which
        location is intended.
    max_links (int): Stop once this many links have been processed.
    max_captcha (int): Stop once this many pages came back without a
        version table.
    delay_seconds (float): Pause between two processed links.

    Returns:
    None.
    """
    if 'Traiter' not in dataframe.columns:
        dataframe['Traiter'] = 0

    treated_links = set(dataframe.loc[dataframe['Traiter'] == 1, column_link])

    counter = 0
    captcha = 0
    # Iterate over ALL rows; processed rows are skipped by the membership
    # test, so they do not need to be contiguous at the top of the table.
    for index, row in dataframe.iterrows():
        link_url = row[column_link]
        if link_url in treated_links:
            continue

        driver.get(link_url)
        time.sleep(1)
        html_content = driver.page_source
        try:
            data_versions = extract_version_data(driver, link_url, html_content, row)
        except NoSuchElementException:
            # A missing version table surfaces as this exception.
            data_versions = None

        if data_versions is None:
            captcha += 1
            # Checked here: a check placed at the end of the loop body would
            # be jumped over by the `continue` below.
            if captcha >= max_captcha:
                print("Detection de captcha")
                break
            continue

        dataframe.at[index, 'Traiter'] = 1
        # add(), not update(): update() on a string inserts its characters.
        treated_links.add(link_url)

        save_dir = os.path.dirname(save_file_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        dataframe.to_csv(save_file_path, index=False)

        counter += 1

        print(f"Waiting {delay_seconds} second(s) before the next URL...{counter}")
        time.sleep(delay_seconds)

        if counter >= max_links:
            print(f"Arrêt après {max_links} itérations.")
            break

    print(f"Arrêt toutes les liens, un total de {counter} liens ont été traitées !.")

In [None]:
df_fiches_techniques = pd.read_csv('Data/Fiches Techniques/fiches_techniques_final.csv')
len(df_fiches_techniques[df_fiches_techniques['Traiter'] == 1]), len(df_fiches_techniques[df_fiches_techniques['Traiter'] == 0])

In [None]:
driver = get_driver()

In [None]:
process_versions_links(driver, df_fiches_techniques)

In [None]:
driver.quit()

In [None]:
df_version = pd.read_csv('Fiches Techniques/Audi/fiches_techniques_audi_a5_2024.csv')

In [None]:
df_version.head()

In [None]:
driver.quit()

## Information Fiche technique

In [None]:
url_fiche = 'https://www.largus.fr/fiche-technique/Bmw/X6/I+E71/2008/Break+5+Portes/30da+235+Exclusive-966560.html'

In [None]:
driver = get_driver()
driver.get(url_fiche)
# Attendre que la page se charge correctement (si nécessaire)
driver.implicitly_wait(1)
page_source = driver.page_source

In [None]:
driver.quit()

### Fonctions

In [None]:
def extract_vehicle_name(header):
    """Return the stripped text of ``span.libelle-vehicule`` in ``header``, or None."""
    tag = header.find('span', class_='libelle-vehicule')
    if not tag:
        return None
    return tag.text.strip()

def extract_date_lancement(header):
    """Return the stripped text of ``span.date-lancement`` in ``header``, or None."""
    tag = header.find('span', class_='date-lancement')
    if not tag:
        return None
    return tag.text.strip()

def extract_prix(header):
    """
    Return the stripped text of ``div.prix`` in ``header`` with non-breaking
    spaces turned into regular spaces, or None when the div is missing.
    """
    tag = header.find('div', class_='prix')
    if not tag:
        return None
    stripped = tag.text.strip()
    return stripped.replace('\u00a0', ' ')

def extract_gallery_images(soup, base_url="https://www.largus.fr"):
    """
    Return the absolute URL of every image in the ``div.galerieFT`` gallery.

    Parameters:
    soup (BeautifulSoup): Parsed page.
    base_url (str): Site root against which each ``src`` is resolved.

    Returns:
    list: One URL per ``<img>`` that has a ``src`` attribute; empty when the
    gallery div is missing.
    """
    from urllib.parse import urljoin

    gallery_div = soup.find('div', class_='galerieFT')
    if not gallery_div:
        return []

    # urljoin leaves absolute and protocol-relative sources untouched and
    # resolves site-relative ones; plain concatenation mangled every case
    # except a src starting with a single "/".
    return [
        urljoin(base_url, img['src'])
        for img in gallery_div.find_all('img')
        if 'src' in img.attrs
    ]

In [None]:
# Analyser le contenu de la page avec BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')

In [None]:
def extract_header_data(soup):
    """
    Extract vehicle information from the header section.

    Parameters:
    soup (BeautifulSoup): The BeautifulSoup object of the page.

    Returns:
    list: ``[vehicle_name, price, launch_date]`` — note the order. Each item
    is None when the matching tag is missing; all three are None when the
    page has no 'title-bar clearfix' header at all.
    """
    header = soup.find('div', class_='title-bar clearfix')
    if header is None:
        # No header block: nothing to look into.
        return [None, None, None]

    vehicle = extract_vehicle_name(header)
    date = extract_date_lancement(header)
    price = extract_prix(header)

    return [vehicle, price, date]

In [None]:
data_header = extract_header_data(soup)
data_header

In [None]:
df_data_header = pd.DataFrame([data_header])
df_data_header.head()

### Gestion Images

In [None]:
gallery_images = extract_gallery_images(soup)
gallery_images

In [None]:
gallery = {
    'Gallery Images' : gallery_images
}
df_gallery = pd.DataFrame([gallery])
df_gallery.head()

### Details

In [None]:
def normalize_label(label):
    """
    Transliterate ``label`` with unidecode, trim it, replace spaces with
    underscores and drop apostrophes.
    """
    ascii_label = unidecode.unidecode(label)
    trimmed = ascii_label.strip()
    underscored = trimmed.replace(' ', '_')
    return underscored.replace("'", "")

def extract_vehicle_resume(soup):
    """
    Read the label/value lines of the ``div#resume`` summary block.

    Parameters:
    soup (BeautifulSoup): Parsed page.

    Returns:
    dict: Normalised upper-case label -> value with whitespace collapsed.
    A line without a value span gets '-'. Empty when the page has no
    ``div#resume``.
    """
    details = {}

    resume_div = soup.find('div', id='resume')
    if resume_div is None:
        return details

    for line in resume_div.find_all('div', class_='ligneInfo'):
        label_element = line.find('span', class_='labelInfo')
        if label_element is None:
            # A line without a label has no key to be stored under.
            continue
        label = label_element.text.strip().lower().replace(' ', '_')

        value_element = line.find('span', class_='valeur')
        if value_element:
            value = ' '.join(value_element.text.split())
        else:
            value = '-'

        details[normalize_label(label).upper()] = value

    return details

In [None]:
vehicle_resume = extract_vehicle_resume(soup)
vehicle_resume

In [None]:
resume = {
    'Vehicule Resume': [vehicle_resume],
}
resume

In [None]:
df_resume = pd.DataFrame(resume)
df_resume

### Fonctions d'extraction par sous-titre

In [None]:
def _extract_panel_section(soup, title):
    """
    Read the label/value lines of every 'panel-dimPoids' block whose
    'sous-titre' heading equals ``title`` (case-insensitive).

    Returns:
    dict: Normalised upper-case label -> value with whitespace collapsed.
    Always a dict, empty when no block matches. A line without a value span
    gets '-'; a line without a label span is skipped.
    """
    section = {}
    wanted = title.lower()

    for div in soup.find_all('div', class_='panel-dimPoids'):
        heading = div.find('h3', class_='sous-titre')
        if heading is None or heading.text.strip().lower() != wanted:
            continue

        for line in div.find_all('div', class_='ligneInfo'):
            label_element = line.find('span', class_='labelInfo')
            if label_element is None:
                continue
            label = label_element.text.strip().lower().replace(' ', '_')

            value_element = line.find('span', class_='valeur')
            value = ' '.join(value_element.text.split()) if value_element else '-'

            section[normalize_label(label).upper()] = value

    return section


def extract_dimensions(soup):
    """Return the "Dimensions" block as a dict ({} when the page has none)."""
    return _extract_panel_section(soup, "dimensions")


def extract_weight(soup):
    """Return the "Poids" block as a dict ({} when the page has none)."""
    return _extract_panel_section(soup, "poids")


def extract_habitability(soup):
    """Return the "Habitabilité" block as a dict ({} when the page has none)."""
    return _extract_panel_section(soup, "habitabilité")


def extract_tires(soup):
    """Return the "Pneumatiques" block as a dict ({} when the page has none)."""
    return _extract_panel_section(soup, "pneumatiques")


def extract_vehicle_details(soup):
    """Return [dimensions, weight, habitability, tires], each one a dict."""
    return [extract_dimensions(soup), extract_weight(soup), extract_habitability(soup), extract_tires(soup)]

In [None]:
vehicle_details = extract_vehicle_details(soup)
vehicle_details

In [None]:
df_vehicle_details = pd.DataFrame(vehicle_details)
df_vehicle_details

### Caractéristiques Techniques

In [None]:
def _extract_subtitle_section(soup, title):
    """
    Read the label/value lines of the 'conteneur-infosFT' block that follows
    the 'sous-titre' heading whose text is exactly ``title``.

    Returns:
    dict: Normalised upper-case label -> value with whitespace collapsed.
    Empty when the heading or its container is missing. A line without a
    value span gets '-'; a line without a label span is skipped.
    """
    details = {}

    heading = soup.find('h3', class_='sous-titre', string=title)
    if heading is None:
        return details

    container = heading.find_next('div', class_='conteneur-infosFT')
    if not container:
        return details

    for line in container.find_all('div', class_='ligneInfo'):
        label_element = line.find('span', class_='labelInfo')
        if label_element is None:
            continue
        value_element = line.find('span', class_='valeur')
        value = ' '.join(value_element.text.split()) if value_element else '-'
        details[normalize_label(label_element.text).upper()] = value

    return details


def extract_engine_details(soup):
    """Return the "Moteur" section as a dict ({} when missing)."""
    return _extract_subtitle_section(soup, 'Moteur')


def extract_transmission_details(soup):
    """Return the "Transmission" section as a dict ({} when missing)."""
    return _extract_subtitle_section(soup, 'Transmission')


def extract_technical_details(soup):
    """Return the "Technique" section as a dict ({} when missing)."""
    return _extract_subtitle_section(soup, 'Technique')


def extract_vehicle_characteristics(soup):
    """
    Bundle the engine, transmission and technical sections.

    Returns:
    dict: 'Engine', 'Transmission' and 'Technical', each mapped to a
    one-element list holding the section dict (one row per key when handed
    to ``pd.DataFrame``).
    """
    characteristics = {
        'Engine': [extract_engine_details(soup)],
        'Transmission': [extract_transmission_details(soup)],
        'Technical': [extract_technical_details(soup)],
    }
    return characteristics

In [None]:
vehicle_characteristics = extract_vehicle_characteristics(soup)
vehicle_characteristics

In [None]:
df_vehicle_characteristics = pd.DataFrame(vehicle_characteristics)
df_vehicle_characteristics

### Performances et les consommations du véhicule

In [None]:
def _extract_pc_section(soup, title, sibling_only):
    """
    Read one section of the panel that follows the 'titre-pc' heading.

    Parameters:
    soup (BeautifulSoup): Parsed page.
    title (str): Exact text of the section's ``<h3>``.
    sibling_only (bool): True to look for the 'conteneur-infosFT' container
        among the following siblings of the ``<h3>`` only; False to accept
        the next one anywhere later in the document.

    Returns:
    dict | None: Normalised upper-case label -> stripped value, or None when
    any element on the way (heading, panel, section title, container) is
    missing.
    """
    heading_div = soup.find('div', class_='panel-heading', id='titre-pc')
    if not heading_div:
        return None

    # Each lookup below may find nothing; stop instead of dereferencing None.
    panel = heading_div.find_next_sibling('div', class_='panel-collapse')
    if not panel:
        return None

    section_title = panel.find('h3', string=title)
    if not section_title:
        return None

    if sibling_only:
        container = section_title.find_next_sibling('div', class_='conteneur-infosFT')
    else:
        container = section_title.find_next('div', class_='conteneur-infosFT')
    if not container:
        return None

    data = {}
    for info in container.find_all('div', class_='ligneInfo'):
        label = info.find('span', class_='labelInfo').text.strip()
        value = info.find('span', class_='valeur').text.strip()
        data[normalize_label(label).upper()] = value
    return data


def extract_performance(soup):
    """Return the "Performances" section as a dict, or None when missing."""
    return _extract_pc_section(soup, 'Performances', sibling_only=True)


def extract_consumption(soup):
    """Return the "Consommations" section as a dict, or None when missing."""
    return _extract_pc_section(soup, 'Consommations', sibling_only=False)

In [None]:
performance_data = extract_performance(soup)
consumption_data = extract_consumption(soup)

In [None]:
performance_and_consumption = {
    'Performance': [performance_data],
    'Consumption': [consumption_data]
}

performance_and_consumption

In [None]:
df_performance_and_consumption = pd.DataFrame(performance_and_consumption)
df_performance_and_consumption

In [None]:
def combine_dataframes(fiche_technical_detail):
    """Join the given DataFrames side by side into one wide DataFrame.

    Args:
        fiche_technical_detail: Iterable of DataFrames to place next to each
            other (concatenation along the column axis).

    Returns:
        The concatenated DataFrame; the original column labels are kept.
    """
    return pd.concat(fiche_technical_detail, axis=1, ignore_index=False)

In [None]:
# DataFrames to be joined side by side by combine_dataframes.
# NOTE(review): presumably each one was built in an earlier cell — confirm.
fiche_technical_details = [
    df_data_header,
    df_resume,
    df_vehicle_details,
    df_vehicle_characteristics,
    df_performance_and_consumption,
    df_gallery
]

In [None]:
# Merge the partial DataFrames column-wise and preview the result.
df_fiche_technical_details = combine_dataframes(fiche_technical_details)
df_fiche_technical_details.head()

In [None]:
# (rows, columns) of the combined technical sheet.
df_fiche_technical_details.shape

In [None]:
def read_csv_files_from_directory(root_dir):
    """Load every ``.csv`` file found under ``root_dir`` into one DataFrame.

    The tree is walked in sorted order (directories and file names), so the
    row order of the result is the same on every run and every filesystem.

    Args:
        root_dir: Directory to scan recursively.

    Returns:
        The row-wise concatenation (fresh 0..n-1 index) of all readable CSV
        files, or an empty DataFrame when none was read.
    """
    all_dataframes = []

    # Walk the root directory and its sub-directories.
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Sorting in place makes os.walk descend in a stable order.
        dirnames.sort()
        for filename in sorted(filenames):
            if not filename.endswith('.csv'):
                continue
            file_path = os.path.join(dirpath, filename)
            try:
                all_dataframes.append(pd.read_csv(file_path))
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Erreur lors de la lecture de {file_path}: {e}")

    if not all_dataframes:
        return pd.DataFrame()

    # Stack all frames into a single DataFrame.
    return pd.concat(all_dataframes, ignore_index=True)

In [None]:
# Persist the combined technical sheet without the index column.
# NOTE(review): assumes the 'Fiches Technical Details' directory already exists — confirm.
df_fiche_technical_details.to_csv('Fiches Technical Details/fiches_technical_details.csv', index=False)

In [None]:
# Gather every CSV found under the brand directory into one DataFrame.
path_version = 'Versions/Bmw'
df_versions = read_csv_files_from_directory(path_version)

In [None]:
# Preview the first rows of the combined versions table.
df_versions.head()

In [None]:
# (rows, columns) of the combined versions table.
df_versions.shape

In [None]:
# Save the combined table as "<first 'Marque' value>.csv" inside path_version.
# NOTE(review): the file lands in the same directory that
# read_csv_files_from_directory(path_version) scans, so re-running that read
# would presumably load this combined file too and duplicate rows — confirm.
# NOTE(review): indexing ['Marque'][0] requires at least one row.
df_versions.to_csv(f"{path_version}/{df_versions['Marque'][0]}.csv", index=False)

In [None]:
# df_fiche_technical_details['Modele'] = pd.Series(dtype='str')
# df_fiche_technical_details['Marque'] = pd.Series(dtype='str')
# df_fiche_technical_details['Annee'] = pd.Series(dtype='str')
# 
# for index, row in df_version_bmx.iterrows():
#     model = row['Modele']
#     mark = row['Marque']
#     year = row['Année']
# 
#     df_fiche_technical_details.loc[index, ['Modele', 'Marque', 'Annee']] = [model, mark, year]
#     
#     break
# 
# df_fiche_technical_details.head()

In [None]:
# `fiche_technical_details` is a plain list of DataFrames and has no
# `.head()`; the combined DataFrame is the object to preview.
df_fiche_technical_details.head()

In [None]:
def generate_immatriculation():
    """Return a freshly generated random UUID (version 4) as a string."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)

def process_vehicle_data(driver, save_file_path, column_link='Url', max_items=50, delay=1):
    """Scrape the technical sheet of every not-yet-treated row of a CSV file.

    The CSV is loaded, each row whose ``Traiter`` flag is not 1 is visited
    with ``driver``, the page is parsed and handed to the ``extract_*``
    helpers, and the row is flagged ``Traiter = 1``. The CSV is rewritten
    in place once the loop ends.

    Args:
        driver: Browser driver exposing ``get(url)`` and ``page_source``.
        save_file_path: Path of the CSV to read and rewrite. It must provide
            the ``column_link``, ``Modele``, ``Marque`` and ``Année`` columns.
        column_link: Name of the column holding the page URL.
        max_items: Maximum number of pages processed in one call.
        delay: Seconds slept after loading a page and again after handling it.

    Returns:
        A dict of equally long lists, one entry per processed page.
    """
    dataframe = pd.read_csv(save_file_path)

    # Create the progress flag on first use.
    if 'Traiter' not in dataframe.columns:
        dataframe['Traiter'] = 0

    # Links already handled, so duplicated URLs are visited only once.
    treated_links = set(dataframe.loc[dataframe['Traiter'] == 1, column_link])

    detail_keys = (
        'Marque', 'Modele', 'Annee', 'Vehicule', 'Prix', 'Date Publication',
        'Resumer', 'Dimensions', 'Weight', 'Habitability', 'Tires', 'Engine',
        'Transmission', 'Technical', 'Performance', 'Consumption',
        'Gallery Images',
    )
    details = {key: [] for key in detail_keys}
    counter = 0

    # Select pending rows by their flag rather than by position: treated rows
    # are not guaranteed to form a contiguous prefix of the file.
    pending = dataframe[dataframe['Traiter'] != 1]
    for index, row in pending.iterrows():
        if counter >= max_items:
            break

        link_url = row[column_link]
        if link_url in treated_links:
            continue

        driver.get(link_url)
        time.sleep(delay)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        data_header = extract_header_data(soup)
        vehicle_resume = extract_vehicle_resume(soup)
        vehicle_details = extract_vehicle_details(soup)
        vehicle_characteristics = extract_vehicle_characteristics(soup)
        performance_data = extract_performance(soup)
        consumption_data = extract_consumption(soup)
        gallery_images = extract_gallery_images(soup)

        details['Marque'].append(row['Marque'])
        details['Modele'].append(row['Modele'])
        details['Annee'].append(row['Année'])
        details['Vehicule'].append(data_header[0])
        details['Prix'].append(data_header[1])
        details['Date Publication'].append(data_header[2])
        details['Resumer'].append(vehicle_resume)
        details['Dimensions'].append(vehicle_details[0])
        details['Weight'].append(vehicle_details[1])
        details['Habitability'].append(vehicle_details[2])
        details['Tires'].append(vehicle_details[3])
        details['Engine'].append(vehicle_characteristics['Engine'])
        details['Transmission'].append(vehicle_characteristics['Transmission'])
        details['Technical'].append(vehicle_characteristics['Technical'])
        details['Performance'].append(performance_data)
        details['Consumption'].append(consumption_data)
        details['Gallery Images'].append(gallery_images)

        dataframe.at[index, 'Traiter'] = 1
        # add(), not update(): update() would insert every character of the
        # URL instead of the URL itself.
        treated_links.add(link_url)
        counter += 1

        print(f"Waiting for {delay} second(s) before the next URL...{counter}")
        time.sleep(delay)

    dataframe.to_csv(save_file_path, index=False)

    print(f"Arrêt après {counter} itérations.")

    return details

In [None]:
def process_create_fiche_technical_df(data, folder):
    """Append freshly scraped technical sheets to the CSV stored at ``folder``.

    Every new row receives an ``Immatriculation`` identifier and an
    ``object_folder`` path. In each of the dict-valued columns, the dict also
    receives that identifier and an ``Object_Folder_<column>`` entry; values
    that are not dicts are left untouched.

    Args:
        data: Mapping of column name to list of values, as returned by
            ``process_vehicle_data``.
        folder: Path of the CSV *file* to extend (despite its name). It is
            created, together with its parent directory, when missing.

    Returns:
        The full DataFrame (previous rows followed by the new ones) that was
        written to ``folder``.
    """
    dict_columns = ['Resumer', 'Dimensions', 'Weight', 'Habitability', 'Tires', 'Engine',
                    'Transmission', 'Technical', 'Performance', 'Consumption']
    base_columns = ['Marque', 'Modele', 'Annee', 'Vehicule', 'Prix', 'Date Publication',
                    *dict_columns, 'Gallery Images']

    # Load what was saved before; start from an empty frame when the file is
    # missing or has no content.
    try:
        df_save = pd.read_csv(folder)
    except (FileNotFoundError, EmptyDataError):
        df_save = pd.DataFrame(columns=base_columns)

    df_fiche = pd.DataFrame(data)

    # Plain lists are used instead of row-wise DataFrame.apply, which returns
    # a DataFrame (not a Series) when there are no rows and then cannot be
    # assigned to a single column.
    df_fiche['Immatriculation'] = [generate_immatriculation() for _ in range(len(df_fiche))]
    df_fiche['object_folder'] = [
        f"Vehiculs/Version/{marque.capitalize()}/{annee}/{vehicule.lower()}"
        for marque, annee, vehicule in zip(df_fiche['Marque'], df_fiche['Annee'], df_fiche['Vehicule'])
    ]

    for column in dict_columns:
        # Tag each dict with the row identifier and its storage folder.
        df_fiche[column] = [
            {
                **value,
                'Immatriculation': immatriculation,
                f"Object_Folder_{column}": f"Vehiculs/Models/{marque.upper()}/{column}",
            }
            if isinstance(value, dict) else value
            for value, immatriculation, marque in zip(
                df_fiche[column], df_fiche['Immatriculation'], df_fiche['Marque']
            )
        ]

    # Previous rows first, then the new ones.
    df_save = pd.concat([df_save, df_fiche], ignore_index=True)

    parent = os.path.dirname(folder)
    if parent:
        os.makedirs(parent, exist_ok=True)
    df_save.to_csv(folder, index=False)

    return df_save

In [None]:
# Scrape the pending rows; the result is a dict of lists.
# NOTE(review): `driver` and `folder` are not assigned in the nearby cells —
# presumably they already exist in the kernel; confirm.
fiche_technical_details = process_vehicle_data(driver, folder)

In [None]:
# Append the scraped sheets to the CSV at `save_folder`.
# NOTE(review): `save_folder` is not assigned in the nearby cells — confirm.
df = process_create_fiche_technical_df(fiche_technical_details, save_folder)

In [None]:
# (rows, columns) of the saved technical sheets.
df.shape

In [None]:
# Shut the browser session down.
driver.quit()

In [None]:
from modules.largus import Largus, TechnicalDataSearch

In [None]:
# Instantiate the scraper class imported from modules.largus.
technical_data_search = TechnicalDataSearch()

In [None]:
# NOTE: `folder` holds the path of a CSV file, not a directory.
folder = "Data/Versions/Bmw/Bmw.csv"
df_versions = pd.read_csv(folder)

In [None]:
# Number of rows flagged Traiter == 1.
len(df_versions[df_versions['Traiter'] == 1])

In [None]:
# Same scraping step, this time through the TechnicalDataSearch instance.
# NOTE(review): `save_file_path` is not assigned in the nearby cells (only
# `folder` is) — confirm which path is intended.
driver = technical_data_search.get_driver()
data = technical_data_search.process_vehicle_data(driver, save_file_path)

In [None]:
# Load the versions CSV from disk into `df`.
folder = "Data/Versions/Bmw/Bmw.csv"
df = pd.read_csv(folder)

In [None]:
# Display the 'Url' column.
df['Url']

In [None]:
import ast

# `df` comes from pd.read_csv, so a dict stored in the file is read back as
# its text form and must be parsed before it can be indexed by key.
# NOTE(review): assumes the loaded CSV has a 'Resumer' column — confirm.
resume = df['Resumer'][0]
if isinstance(resume, str):
    resume = ast.literal_eval(resume)
resume['ENERGIE']

In [None]:

# save_folder = f"Fiches Technical Details/{mark.capitalize()}"
# csv_file_path = f"Fiches_Technical_Details_{mark.capitalize()}.csv"

#save_file_path = unidecode.unidecode(save_file_path).strip().replace(' ', '_').replace("'", "")

In [None]:
def read_directories(root_folder):
    """Describe the sub-directory tree rooted at ``root_folder``.

    Args:
        root_folder: Path of the directory to describe.

    Returns:
        A nested dict ``{'name': <directory name>, 'subdirectories': [...]}``
        where each child has the same shape. Children are sorted by name,
        files and symbolic links to directories are ignored, and a directory
        that cannot be listed (PermissionError) is reported without children.
    """
    def read_subdirectories(folder):
        dir_dict = {
            # normpath drops a trailing separator, which would otherwise
            # make basename return an empty string.
            'name': os.path.basename(os.path.normpath(folder)),
            'subdirectories': [],
        }
        try:
            # The listing is closed before recursing, so a deep tree does
            # not keep one open directory handle per level.
            with os.scandir(folder) as entries:
                child_paths = sorted(
                    entry.path
                    for entry in entries
                    if entry.is_dir(follow_symlinks=False)
                )
        except PermissionError:
            return dir_dict

        dir_dict['subdirectories'] = [read_subdirectories(path) for path in child_paths]
        return dir_dict

    return read_subdirectories(root_folder)

In [None]:
# Build and display the nested description of the directory tree.
root_folder = 'Data/Formats Type'  # Replace with the path of your root folder
directory_structure = read_directories(root_folder)
directory_structure