In [11]:
import os
import requests
from datetime import datetime
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re

In [12]:
# Selenium setup: build the single Chrome driver shared by the cells below.
options = Options()
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
# NOTE(review): this flag makes Chrome accept invalid TLS certificates;
# confirm it is really needed for the target site.
options.add_argument("--ignore-certificate-errors")
# The ChromeDriver location is machine-specific. It can be supplied through the
# CHROMEDRIVER_PATH environment variable; when that is unset the original
# hard-coded location is used, so existing setups keep working unchanged.
service = Service(os.environ.get("CHROMEDRIVER_PATH", "C:/chromedriver.exe"))
driver = webdriver.Chrome(service=service, options=options)

In [13]:
# Directory (relative to the working directory) that receives the results.
OUTPUT_DIR = "Data"
# Highest listing page number requested by the scraping loop (pages 1..MAX_PAGES).
MAX_PAGES = 20
# CSV file the cleaned product rows are written to.
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "Newegg_Tablets.csv")
# Base listing URL; "/Page-<n>" is appended to it for each page.
START_URL = "https://www.newegg.com/Tablets/SubCategory/ID-2557"

In [14]:
def parse_product_name(product_name):
    """
    Extract structured information from a product title.

    Parameters
    ----------
    product_name : str
        Free-text product title. Anything that is not a string (e.g. None)
        is treated as an empty title instead of raising.

    Returns
    -------
    dict
        Keys "Brand", "Model" and "Storage". Each value is the text found in
        the title, or the string "Unknown" when nothing matched.

        * Brand   - first known brand name, with the casing used in the title.
        * Model   - first known model family plus the word that follows it.
        * Storage - the largest capacity mentioned, formatted "<n>GB" or
          "<n>TB" (the largest value is assumed to be storage, not RAM).
    """
    if not isinstance(product_name, str):
        product_name = ""

    data = {}

    # Extract brand
    brand_match = re.search(
        r'\b(Apple|Samsung|Xiaomi|Huawei|Sony|Ulefone|Lenovo|CHUWI|Fusion5|JIMTAB)\b',
        product_name,
        re.IGNORECASE,
    )
    data["Brand"] = brand_match.group(1) if brand_match else "Unknown"

    # Extract model
    model_match = re.search(
        r'(Redmi Pad\s?\w+|Galaxy TAB\s?\w+|iPad Air\s?\w+|iPad\s?\w+|MATEPAD\s?\w+|VistaTab\s?\w+|Tab M\s?\w+|Tab Plus)',
        product_name,
        re.IGNORECASE,
    )
    data["Model"] = model_match.group(0).strip() if model_match else "Unknown"

    # Extract storage. Capacities given in TB are collected as well, otherwise
    # a title such as "12GB 1TB" would report the RAM size as the storage.
    # Each entry is (size expressed in GB for comparison, label to report).
    capacities = []
    for amount, unit in re.findall(r'(\d+)\s?(Go|GB|TB\b)', product_name, re.IGNORECASE):
        amount = int(amount)
        if unit.upper() == "TB":
            capacities.append((amount * 1024, f"{amount}TB"))
        else:
            capacities.append((amount, f"{amount}GB"))

    if capacities:
        # Keep the largest capacity (assuming storage is larger than RAM)
        data["Storage"] = max(capacities, key=lambda capacity: capacity[0])[1]
    else:
        data["Storage"] = "Unknown"

    return data

def get_data(url):
    """
    Load ``url`` in the shared Selenium driver and return the page source.

    Waits up to 10 seconds for elements with class ``item-container`` to be
    present; these are the product cards that ``parse_tablets`` reads.

    Returns the HTML as a string, or None when the page could not be loaded
    or the wait failed (the error is printed), so that ``scrape_tablets`` can
    skip the page instead of aborting the whole run.

    NOTE: ``get_data`` is defined again in a later cell; when the notebook is
    run top to bottom that later definition replaces this one. This copy is
    kept consistent with it.
    """
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "item-container"))
        )
        return driver.page_source
    except Exception as e:
        print(f"Erreur lors de la récupération des données pour l'URL {url}: {e}")
        return None

In [15]:
# Set the USD -> MAD conversion rate (adjust to the current rate)
usd_to_mad_rate = 10.0  # Example value

# Fonction pour nettoyer les prix
import re

# Price-cleaning helper
def clean_price(price_str):
    """
    Convert a scraped price label into a float.

    Only the first numeric token of the text is used. Keeping every digit of
    the label would merge unrelated numbers into the price, e.g.
    "$299.99 (2 Offers)" would become 299.992.

    Parameters
    ----------
    price_str : str or None
        Raw text of a price element. Commas are treated as thousands
        separators and a dot as the decimal separator.

    Returns
    -------
    float
        The parsed price, or 0.0 when the text is empty, None, or contains
        no number.
    """
    if not price_str:
        return 0.0

    # First alternative: digits with optional comma grouping and decimals
    # ("1,299.99"); second alternative: a bare fractional part (".99").
    number = re.search(r'\d[\d,]*(?:\.\d+)?|\.\d+', price_str)
    if number is None:
        return 0.0

    return float(number.group(0).replace(',', ''))


In [16]:
# Fetch the HTML of a page with Selenium
def get_data(url):
    """
    Open ``url`` in the shared driver and hand back the page source.

    The call waits up to 10 seconds for elements with class
    ``item-container`` to be present. Any exception raised while loading or
    waiting is printed, and None is returned in that case.
    """
    try:
        driver.get(url)
        locator = (By.CLASS_NAME, "item-container")
        waiter = WebDriverWait(driver, 10)
        waiter.until(EC.presence_of_all_elements_located(locator))
        page_html = driver.page_source
    except Exception as err:
        print(f"Erreur lors de la récupération des données pour l'URL {url}: {err}")
        return None
    return page_html
    

In [17]:
# Parsing function for the tablet listing pages
def parse_tablets(html):
    """
    Turn the HTML of one listing page into a list of product records.

    Every ``div.item-container`` is read as one product. Prices are converted
    with ``usd_to_mad_rate`` and rounded to 2 decimals. An item that raises
    while being parsed is reported with a printed message and skipped.

    Parameters
    ----------
    html : str
        Page source of a listing page.

    Returns
    -------
    list[dict]
        One dict per product with the keys Brand, Model, Storage (from
        ``parse_product_name``), category, marketplace, link, priceInitial,
        pricePromo, promotionType and collectionTime. When the item shows no
        promotion, the displayed price is stored as ``priceInitial`` and
        ``pricePromo`` is the string 'N/A'.
    """
    soup = BeautifulSoup(html, 'html.parser')
    results = soup.find_all('div', class_='item-container')

    all_products = []
    # One timestamp for the whole page so all of its rows share it.
    collection_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    for item in results:
        try:
            # Product name and link
            name_tag = item.find('a', class_='item-title')
            product_name = name_tag.text.strip() if name_tag else 'N/A'
            link = name_tag['href'] if name_tag else 'N/A'

            # Current (promotional) price
            promo_price_tag = item.find('li', class_='price-current')
            promo_price = clean_price(promo_price_tag.text) if promo_price_tag else 0.0
            price_promo_mad = round(promo_price * usd_to_mad_rate, 2)

            # Previous price
            old_price_tag = item.find('li', class_='price-was')
            old_price = clean_price(old_price_tag.text) if old_price_tag else 0.0
            price_initial_mad = round(old_price * usd_to_mad_rate, 2)

            # Promotions: non-empty texts of the "price-save-percent" spans
            promo_texts = (
                promo_tag.text.strip()
                for promo_tag in item.find_all("span", class_="price-save-percent")
            )
            promotions = [text for text in promo_texts if text]
            promotion = ", ".join(promotions) if promotions else 'Aucune'

            if promotion == 'Aucune':
                # No discount: the displayed price is the regular price.
                price_initial_mad = price_promo_mad
                price_promo_mad = 'N/A'

            # Additional fields derived from the title
            structured_data = parse_product_name(product_name)

            # Build the record. 'category' used to be listed twice in this
            # literal ('Tablets', then 'Tablette'); only the last value ever
            # took effect, so the dead entry is gone. The key stays ahead of
            # 'marketplace' to keep the column order unchanged.
            product = {
                **structured_data,
                'category': 'Tablette',
                'marketplace': 'Newegg',
                'link': link,
                'priceInitial': price_initial_mad,
                'pricePromo': price_promo_mad,
                'promotionType': promotion,
                'collectionTime': collection_time,
            }
            all_products.append(product)
        except Exception as e:
            # Best effort: one malformed item must not abort the whole page.
            print(f"Error parsing product: {e}")

    return all_products

In [18]:
# Main scraping routine
def scrape_tablets():
    """
    Walk listing pages 1 through MAX_PAGES and gather the parsed products.

    Each page URL is START_URL followed by "/Page-<n>". A page for which
    ``get_data`` returns nothing is reported with a printed message and
    skipped. Returns the combined list of product records.
    """
    collected = []
    for page_number in range(1, MAX_PAGES + 1):
        page_html = get_data(f"{START_URL}/Page-{page_number}")
        if not page_html:
            print(f"Erreur lors de la récupération des données de la page {page_number}")
            continue
        collected += parse_tablets(page_html)

    return collected

In [19]:
def clean_all_products(all_products):
    """
    Discard every product record that has a field equal to "Unknown".

    Prints how many records were discarded and returns the remaining ones
    as a new list, in their original order.
    """
    def _is_complete(record):
        # A record is kept only if none of its values is "Unknown".
        for field_value in record.values():
            if field_value != "Unknown":
                continue
            return False
        return True

    kept = list(filter(_is_complete, all_products))
    removed_count = len(all_products) - len(kept)

    print(f"Removed {removed_count} rows with 'Unknown' values.")
    return kept

In [20]:
# Run the scraper and save the data
if __name__ == "__main__":
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    all_products = scrape_tablets()
    cleanData = clean_all_products(all_products)
    # NOTE(review): the Selenium driver is left open after this cell; call
    # driver.quit() once no further scraping is needed.

    # Test the cleaned rows, not the raw ones: when every scraped row is
    # filtered out there is nothing to write and success must not be reported.
    if cleanData:
        df = pd.DataFrame(cleanData)
        # The file is opened in append mode, so the header is written only
        # when the file is new or empty; otherwise every run would insert
        # another header line in the middle of the data.
        write_header = (
            not os.path.exists(OUTPUT_FILE) or os.path.getsize(OUTPUT_FILE) == 0
        )
        df.to_csv(OUTPUT_FILE, mode='a', header=write_header, index=False)
        print(f"Scraping terminé. Résultats sauvegardés dans {OUTPUT_FILE}")
    else:
        print("Aucun produit trouvé.")


Removed 289 rows with 'Unknown' values.
Scraping terminé. Résultats sauvegardés dans Data\Newegg_Tablets.csv
