In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import os
import re
import requests

In [2]:
# Selenium Setup
# Builds the single Chrome WebDriver instance (`driver`) that get_data() and
# scrape_newegg() use as a module-level global.
options = Options()
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
# NOTE(review): this flag presumably makes Chrome accept invalid TLS
# certificates — confirm it is really needed for this site.
options.add_argument("--ignore-certificate-errors")
# NOTE(review): hard-coded absolute Windows path; the notebook will not run
# on another machine until this is changed.
service = Service("C:/chromedriver.exe")  # Adjust the path to your ChromeDriver
# The browser is started as soon as this cell runs; scrape_newegg() calls
# driver.quit(), so this cell must be re-run before scraping a second time.
driver = webdriver.Chrome(service=service, options=options)

In [3]:
# Scraping parameters
# Site root; get_next_page() prefixes it to the "Next" link's href.
BASE_URL = "https://www.newegg.com"
# Listing page that scrape_newegg() derives every page URL from.
START_URL = "https://www.newegg.com/Cell-Phones-Unlocked/SubCategory/ID-2961"
# Number of listing pages scrape_newegg() visits (pages 1..MAX_PAGES).
MAX_PAGES = 1
# Directory and CSV file the final cell writes the cleaned rows to.
OUTPUT_DIR = "Data"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "Newegg_phones.csv")

In [4]:
def get_exchange_rate(timeout=10):
    """Fetch the current USD -> MAD exchange rate.

    Requests the exchangerate-api.com ``latest/USD`` endpoint and reads the
    ``rates -> MAD`` entry of the JSON response.

    Args:
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests.get`` may block indefinitely and hang the cell.

    Returns:
        float: Value of one USD in MAD, or the hard-coded fallback ``10.0``
        when the request fails, the server answers with an HTTP error
        status, or the payload holds no usable ``MAD`` rate.
    """
    api_url = "https://api.exchangerate-api.com/v4/latest/USD"
    try:
        response = requests.get(api_url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error body.
        response.raise_for_status()
        data = response.json()
        return float(data['rates']['MAD'])
    except Exception as e:
        # Deliberately broad: any failure (network, HTTP status, malformed
        # payload) falls back to the default so scraping can still proceed.
        print(f"Error fetching exchange rate: {e}")
        return 10.0  # Default fallback rate
# Fetched once when this cell runs; parse() reads this module-level rate to
# convert the scraped USD prices to MAD.
usd_to_mad_rate = get_exchange_rate()

In [5]:
# Extract structured fields from a product name
def parse_product_name(product_name):
    """Extract brand, model and storage from a free-text product name.

    Args:
        product_name: Listing title. Anything that is not a string (for
            example ``None``) is treated as an empty title.

    Returns:
        dict: Keys ``"Brand"``, ``"Model"`` and ``"Storage"``. A field that
        cannot be found is set to ``"Unknown"``. ``"Storage"`` is always
        expressed in GB (``"256GB"``; a ``1TB`` capacity becomes
        ``"1024GB"``).
    """
    if not isinstance(product_name, str):
        product_name = ""

    data = {
        "Brand": "Unknown",
        "Model": "Unknown",
        "Storage": "Unknown",
    }

    # Extract brand (returned with the casing found in the title)
    brand_match = re.search(
        r'\b(Iphone|Apple|Samsung|Xiaomi|Huawei|Sony|OnePlus|Honor|Vivo|Tecno|Itel|ZTE|Infinix|OPPO)\b',
        product_name,
        re.IGNORECASE,
    )
    if brand_match:
        data["Brand"] = brand_match.group(1)

    # Extract model.
    # "Redmi Note" must come before the generic "Redmi" alternative, otherwise
    # "Redmi Note 12" is captured as just "Redmi Note". The short Y<n>/A<n>
    # series names are anchored on a word boundary so they cannot match in the
    # middle of an unrelated token (e.g. the "A5" of "DATA5").
    model_match = re.search(
        r'(Redmi Note\s?\w+|Redmi\s?\w+|Galaxy\s?\w+|iPhone\s?\d+|\bY\d+|nubia V\s?\w+|Note\s?\w+|\bA\d+)',
        product_name,
        re.IGNORECASE,
    )
    if model_match:
        data["Model"] = model_match.group(0).strip()

    # Extract storage.
    # The trailing \b stops "8 Gold" from being read as "8 Go", and TB values
    # are converted to GB so they compare correctly against RAM sizes.
    capacities_gb = [
        int(amount) * (1024 if unit.upper() == "TB" else 1)
        for amount, unit in re.findall(r'(\d+)\s?(Go|GB|TB)\b', product_name, re.IGNORECASE)
    ]
    if capacities_gb:
        # Take the largest capacity (assuming storage is larger than RAM)
        data["Storage"] = f"{max(capacities_gb)}GB"

    return data

In [6]:
# Fetch a page's HTML with Selenium
def get_data(url):
    """Load ``url`` in the shared ``driver`` and return the page source.

    After navigating, waits (with a 10 second limit passed to
    ``WebDriverWait``) for elements whose class is ``item-container`` to be
    present. Any exception raised by the wait propagates to the caller.
    """
    driver.get(url)

    product_cards = (By.CLASS_NAME, "item-container")
    cards_present = EC.presence_of_all_elements_located(product_cards)
    waiter = WebDriverWait(driver, 10)
    waiter.until(cards_present)

    return driver.page_source

In [7]:
def clean_price(price_text):
    """Pull the first numeric amount out of a raw price string.

    Dollar signs, thousands separators (commas) and non-breaking spaces are
    dropped before searching, so ``"$1,299.99"`` yields ``1299.99``.
    Returns ``0.0`` when the text contains no digits.
    """
    stripped = price_text
    for junk in ("$", ",", "\xa0"):
        stripped = stripped.replace(junk, "")
    stripped = stripped.strip()

    # Only the first number counts (handles cases like "2097.00 –")
    found = re.search(r'\d+(\.\d+)?', stripped)
    if found is None:
        return 0.0
    return float(found.group())

# Main parsing function
def parse(html, category="Smartphones"):
    """Turn a listing page's HTML into a list of product records.

    Every ``div.item-container`` is read for its title link, current price
    (``li.price-current``), previous price (``li.price-was``) and discount
    badges (``span.price-save-percent``). Prices are converted with the
    module-level ``usd_to_mad_rate`` and rounded to two decimals.

    Args:
        html: Page source to parse.
        category: Value stored in each record's ``'Category'`` field. The
            label used to be hard-coded to ``'PC Laptops'``, which mislabels
            the rows of this phone scraper.

    Returns:
        list[dict]: One record per item with the keys produced by
        ``parse_product_name`` plus ``Marketplace``, ``Category``, ``Link``,
        ``priceInitial``, ``pricePromo``, ``promotiontype`` and
        ``collectionTime``. Items that raise while being read are reported
        with ``print`` and skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    results = soup.find_all('div', class_='item-container')

    all_products = []
    # One timestamp for the whole page so all its rows share it.
    collection_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    for item in results:
        try:
            # Product name and link come from the same title anchor
            name_tag = item.find('a', class_='item-title')
            product_name = name_tag.text.strip() if name_tag else 'N/A'
            link = name_tag['href'] if name_tag else 'N/A'

            # Current (possibly discounted) price
            promo_price_tag = item.find('li', class_='price-current')
            promo_price = clean_price(promo_price_tag.text) if promo_price_tag else 0.0
            price_promo_mad = round(promo_price * usd_to_mad_rate, 2)

            # Previous price, shown when the item is discounted
            old_price_tag = item.find('li', class_='price-was')
            old_price = clean_price(old_price_tag.text) if old_price_tag else 0.0
            price_initial_mad = round(old_price * usd_to_mad_rate, 2)

            # Collect every non-empty "price-save-percent" badge
            promotions = [
                promo_tag.text.strip()
                for promo_tag in item.find_all("span", class_="price-save-percent")
                if promo_tag.text.strip()
            ]

            promotion = ", ".join(promotions) if promotions else 'Aucune'
            if promotion == 'Aucune':
                # No discount: the current price is the regular price.
                price_initial_mad = price_promo_mad
                price_promo_mad = 'N/A'

            # Parse structured information
            structured_data = parse_product_name(product_name)

            product = {
                **structured_data,
                'Marketplace': 'Newegg',
                'Category': category,
                'Link': link,
                'priceInitial': price_initial_mad,
                'pricePromo': price_promo_mad,
                'promotiontype': promotion,
                'collectionTime': collection_time,
            }
            all_products.append(product)

        except Exception as e:
            # Best effort: one malformed item must not abort the whole page.
            print(f"Error parsing product: {e}")
            continue

    return all_products

In [8]:
def clean_all_products(all_products):
    """Return only the products that have no field equal to ``"Unknown"``.

    The input list is left untouched; the number of discarded rows is
    printed.
    """
    cleaned_products = []
    for record in all_products:
        for field_value in record.values():
            if field_value == "Unknown":
                break
        else:
            # Loop finished without hitting an "Unknown" field: keep the row.
            cleaned_products.append(record)

    removed = len(all_products) - len(cleaned_products)
    print(f"Removed {removed} rows with 'Unknown' values.")
    return cleaned_products

In [9]:
def get_next_page(soup):
    """Return the absolute URL of the next listing page, or ``None``.

    Looks for an ``<a title="Next">`` element in ``soup`` and resolves its
    ``href`` against ``BASE_URL``.

    Args:
        soup: Parsed page exposing ``find`` (e.g. a BeautifulSoup object).

    Returns:
        str | None: The next page URL, or ``None`` when there is no "Next"
        link or it carries no ``href``.
    """
    from urllib.parse import urljoin

    next_button = soup.find('a', {'title': 'Next'})
    if next_button is None or 'href' not in next_button.attrs:
        return None
    # urljoin copes with absolute hrefs and hrefs lacking a leading slash,
    # both of which plain string concatenation turns into a broken URL.
    return urljoin(BASE_URL, next_button['href'])

In [10]:
def scrape_newegg():
    """Scrape listing pages 1..MAX_PAGES and return all parsed products.

    Each page is fetched with ``get_data`` and parsed with ``parse``. The
    shared ``driver`` is always closed afterwards, including when a page
    fails to load, so no browser process is left behind. Because the driver
    is closed, the Selenium setup cell must be re-run before calling this
    function again.

    Returns:
        list[dict]: Product records from every visited page, in page order.
    """
    all_products = []
    # Pages are addressed with a "/Page-N" path segment; a "?Page-N" query
    # string carries no value and would presumably reload the first page.
    listing_root = START_URL.rstrip('/')

    try:
        for page in range(1, MAX_PAGES + 1):
            url = f"{listing_root}/Page-{page}"
            html = get_data(url)
            all_products.extend(parse(html))
    finally:
        # Release the browser even when a page load raises.
        driver.quit()

    return all_products


In [11]:
# Run the scraper, then drop every row that still has an "Unknown" field.
data = scrape_newegg()
cleanData = clean_all_products(data)

# Save the results
os.makedirs(OUTPUT_DIR, exist_ok=True)
df = pd.DataFrame(cleanData)

if df.empty:
    # Appending an empty frame would only add a junk line to the CSV.
    print("No products to save; CSV left untouched.")
else:
    # The file is opened in append mode, so the header row must be written
    # only once: when the file does not exist yet or is still empty.
    # Otherwise every run would insert another header line mid-file.
    write_header = not os.path.exists(OUTPUT_FILE) or os.path.getsize(OUTPUT_FILE) == 0
    df.to_csv(OUTPUT_FILE, mode='a', index=False, header=write_header)

    print(f"Scraping terminé. Résultats sauvegardés dans {OUTPUT_FILE}")

Removed 11 rows with 'Unknown' values.
Scraping terminé. Résultats sauvegardés dans Data\Newegg_phones.csv
