In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import os
import re

In [2]:
# Constants
BASE_URL = "https://www.marjanemall.ma"
START_URL = "https://www.marjanemall.ma/informatique-gaming/tablette/tablette-tactile"
MAX_PAGES = 100
OUTPUT_DIR = "Data"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "MarjaneMall_Tablettes.csv")

In [3]:
# Selenium Setup
options = Options()
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--ignore-certificate-errors")
service = Service("C:/chromedriver.exe")  
driver = webdriver.Chrome(service=service, options=options)

In [4]:
def parse_product_name(name):
    """
    Extract structured information from the product name.
    """
    data = {
        "Brand": None,
        "Model": None,
        "Storage": None,
    }
    product_name = name.title()
    # Extract brand
    brand_match = re.search(r'\b(Iphone|Apple|Samsung|Xiaomi|Huawei|Sony|Ulefone|Honor|Vivo|Tecno|Itel|ZTE|Infinix|OPPO)\b', product_name, re.IGNORECASE)
    data["Brand"] = brand_match.group(1) if brand_match else "Unknown"

    # Extract model
    model_match = re.search(r'(Redmi Pad\s?\w+|Galaxy TAB\s?\w+|iPad Air\s?\w+|iPad\s?\w+|Matepad\s?\w+|VistaTab\s?\w+|PAD\s?\w+|Medpad T\s?\w+)', product_name, re.IGNORECASE)
    data["Model"] = model_match.group(0).strip() if model_match else "Unknown"


    # Extract storage
    storage_match = re.search(r'(\d+)\s?Go', product_name, re.IGNORECASE)
    data["Storage"] = f"{storage_match.group(1)}GB" if storage_match else "Unknown"


    return data

In [5]:
def get_data(url):
    """
    Fetches the page source using Selenium.
    """
    driver.get(url)
    WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "product-item-link"))
    )
    return driver.page_source

In [6]:
def parse(html):
    """
    Parses product information from the HTML.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Debug: Print a preview of the page to confirm if products are present
    print(soup.prettify()[:1000])  # Show the first 1000 characters for debugging
    
    results = soup.find_all('div', {'class': 'product-item-details'})
    print(f"Found {len(results)} product entries.")
    
    if not results:
        print("No products found. Check the HTML structure or URL.")
        return []

    all_products = []
    collection_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    for item in results:
        try:
            # Product Name
            product_name_tag = item.find('a', {'class': 'product-item-link'})
            product_name = product_name_tag.text.strip() if product_name_tag else 'N/A'
            print(f"Product Name: {product_name}")  # Debugging product name

            # Product Link
            link_tag = product_name_tag
            link = link_tag['href'] if link_tag else 'N/A'
            if link and not link.startswith('http'):
                link = BASE_URL + link

            # Extract Initial Price
            old_price_tag = item.find('span', {'class': 'old-price'})
            if old_price_tag:
                price_wrapper_tag = old_price_tag.find('span', {'class': 'price-wrapper'})
                if price_wrapper_tag and 'data-price-amount' in price_wrapper_tag.attrs:
                    price_initial = float(price_wrapper_tag['data-price-amount'])
                else:
                    price_initial = 'N/A'
            else:
                price_initial = 'N/A'

            print(f"Initial Price: {price_initial}")
            
            # Promo Price (Optional)
            price_promo_tag = item.find('span', {'class': 'price-wrapper'})
            if price_promo_tag and 'data-price-amount' in price_promo_tag.attrs:
                price_promo = float(price_promo_tag['data-price-amount'])
            else:
                price_promo = 'N/A'
            
                
            promotions = []
            promo_tags = item.find_all("span", class_="octopia-discount percent")  # Chercher toutes les balises avec classe "tag"
            for promo_tag in promo_tags:
                if promo_tag.text.strip():
                    promotions.append(promo_tag.text.strip())

            promotion = ", ".join(promotions) if promotions else 'Aucune'

            if promotion == 'Aucune' :
                price_initial = price_promo
                price_promo = 'N/A' 

            structured_data = parse_product_name(product_name)

            # Product Object
            product = {
                **structured_data,
                'marketplace': 'Marjane Mall',
                'category': 'tablette',
                'link': link,
                'priceInitial': price_initial,
                'pricePromo': price_promo,
                'promotiontype' : promotion,
                'collectionTime': collection_time
            }
            all_products.append(product)

        except Exception as e:
            print(f"Error parsing product: {e}")
            print(f"Product HTML: {item.prettify()[:500]}")  # Debug the specific item
            continue

    return all_products

In [7]:
def save_to_csv(products):
    """
    Save product data to a single CSV file.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    if not os.path.exists(OUTPUT_FILE):
        pd.DataFrame(products).to_csv(OUTPUT_FILE, index=False)
    else:
        pd.DataFrame(products).to_csv(OUTPUT_FILE, mode='a', header=False, index=False)

In [8]:
def clean_all_products(all_products):
    cleaned_products = [
        product for product in all_products
        if all(value != "Unknown" for value in product.values())
    ]

    print(f"Removed {len(all_products) - len(cleaned_products)} rows with 'Unknown' values.")
    return cleaned_products

In [9]:
def get_next_page(soup):
    """
    Identifies the URL for the next page.
    """
    try:
        next_button = soup.find('li', {'class': 'item pages-item-next'})
        if next_button:
            link_tag = next_button.find('a')
            if link_tag and 'href' in link_tag.attrs:
                next_page = link_tag['href']
                if not next_page.startswith('http'):
                    next_page = BASE_URL + next_page
                return next_page
    except Exception as e:
        print(f"Error finding next page: {e}")
    return None

In [10]:
# Main Execution
if __name__ == "__main__":
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    current_url = START_URL
    all_products = []
    page_count = 0

    try:
        while current_url and page_count < MAX_PAGES:
            print(f"Fetching page {page_count + 1}: {current_url}")
            html = get_data(current_url)
            products = parse(html)
            all_products.extend(products)
            soup = BeautifulSoup(html, 'html.parser')
            current_url = get_next_page(soup)
            page_count += 1

        if all_products:
            cleaned_file = clean_all_products(all_products)
            save_to_csv(cleaned_file)
            print(f"Products saved to {OUTPUT_FILE}")
        else:
            print("No products found.")
    finally:
        driver.quit()


Fetching page 1: https://www.marjanemall.ma/informatique-gaming/tablette/tablette-tactile
<html lang="fr">
 <head>
  <meta charset="utf-8"/>
  <script async="" src="https://analytics.tiktok.com/i18n/pixel/static/identify_45dd5971.js" type="text/javascript">
  </script>
  <script async="" src="https://www.googletagmanager.com/gtag/destination?id=AW-11095620594&amp;l=dataLayer&amp;cx=c&amp;gtm=45he5240v883107650za204zb847259191" type="text/javascript">
  </script>
  <script async="true" data-owner="criteo-tag" src="https://sslwidget.criteo.com/event?a=110894&amp;v=5.31.1&amp;otl=1&amp;p0=e%3Dexd%26site_type%3Dd&amp;p1=e%3Dvl%26tms%3DCriteo%2520GTM%2520Enhanced%2520(Retail)%2520(GA4)%2520-%2520v14%26ca%3DTablette%2520tactile%253ETablette%253EInformatique%2520-%2520Gaming%26p%3D%255BSAM8806092268098%252CAAAAH57707%252CAAAAN07186%255D&amp;p2=e%3Ddis&amp;tld=marjanemall.ma&amp;dy=1&amp;fu=https%253A%252F%252Fwww.marjanemall.ma%252Finformatique-gaming%252Ftablette%252Ftablette-tactile&amp;cei