In [5]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

In [6]:
def scrape_products_from_page(url):
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    
    # Find all product name elements
    product_elements = soup.find_all('div', class_='field-product-item')
    #print(product_elements[0].prettify())
    #print(len(product_elements))
    
    products=[]
    # Extract and print the names
    
    for product in product_elements[:-1]:
        try:
            # Product ref
            ref_div = product.find("div", class_="product-reference").find("span")
            product_ref = ref_div.text.strip() if ref_div else None
            
            # Product Link
            link_tag = product.find("h2", class_="product_name").find("a")
            product_link = link_tag['href'] if link_tag else None
            
            # Product Full Name
            full_name = link_tag.text.strip() if link_tag else None
            
            # Image URL
            img_tag = product.find("img", class_="product_image")
            image_url = img_tag['src'].strip() if img_tag else None
            
            # Price
            price_tag = product.find("span", class_="price")
            price = price_tag.text.strip() if price_tag else None
            
            specs = {}
            #Additional data from product page
            if product_link:
                response = requests.get(product_link)
                if response.status_code == 200:                    
                    product_page = BeautifulSoup(response.text, 'html.parser')
                    data_sheet = product_page.find("dl", class_="data-sheet")
                    if data_sheet:
                        dt_tags = data_sheet.find_all("dt", class_="name")
                        dd_tags = data_sheet.find_all("dd", class_="value")

                        for dt, dd in zip(dt_tags, dd_tags):
                            key = dt.get_text(strip=True)
                            value = dd.get_text(strip=True)
                            specs[key] = value
            
            # Append product data
            product_dict = {
                "reference":product_ref,
                "lien": product_link,
                "nom": full_name,
                "image_url": image_url,
                "prix": price,
                "boutique":"SpaceNet"
            }

            # Add specs dictionary to product_data (expand keys dynamically)
            product_dict.update(specs)
            
            # Append to products list
            products.append(product_dict)
            
        
        except Exception as e:
            print(f"Error processing product: {e}")
    
    return products

In [7]:
base_url = "https://spacenet.tn/18-ordinateur-portable?page={}"
all_products = []

for page_num in range(1, 60):
    print(f"Scraping page {page_num}...")
    url = base_url.format(page_num)
    page_products = scrape_products_from_page(url)
    all_products.extend(page_products)
    time.sleep(1) #A small delay between requests to avoid hammering the server.

print(f"Total products scraped: {len(all_products)}")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 

In [8]:
list(all_products[0].keys())

['reference',
 'lien',
 'nom',
 'image_url',
 'prix',
 'boutique',
 "Système d'exploitation",
 'Mémoire',
 'Ports',
 'Connectivité sans-fil',
 'Garantie',
 "Taille de l'écran",
 'Type de Processeur',
 'Disque Dur',
 'Cache',
 'Carte Graphique',
 'processeur',
 'Couleur',
 'Tactile',
 'Gamer',
 'Réf Carte Graphique',
 'Gamme PC']

In [9]:
columns_to_keep = ['Référence', 'lien', 'nom', 'Marque', 'image_url', 'boutique', 'Taille de l\'écran',
                   'Résolution écran', 'Processeur', 'Référence processeur', 'Mémoire','Disque dur', 
                   'Type de disque dur', 'Carte graphique', "Système d'exploitation",'Gamer', 'prix']

# Filter the dictionary to keep only the desired columns
filtered_product_features = [{key: product.get(key) for key in columns_to_keep}
                             for product in all_products]

In [10]:
df = pd.DataFrame(filtered_product_features)
df.head()

Unnamed: 0,Référence,lien,nom,Marque,image_url,boutique,Taille de l'écran,Résolution écran,Processeur,Référence processeur,Mémoire,Disque dur,Type de disque dur,Carte graphique,Système d'exploitation,Gamer,prix
0,,https://spacenet.tn/pc-portable-tunisie/68134-...,Pc Portable Schneider SCL141CTP Intel Atom x5-...,,https://spacenet.tn/184485-home_default/pc-por...,SpaceNet,14.1 Pouces,,,,2 Go,,,,Windows 10,Non,"319,000 DT"
1,,https://spacenet.tn/pc-portable-tunisie/81195-...,Pc Portable Lenovo IdeaPad 1 15IJL7 Intel Cele...,,https://spacenet.tn/243251-home_default/pc-por...,SpaceNet,15.6 Pouces,,,,8 Go,,,,Free Dos,Non,"689,000 DT"
2,,https://spacenet.tn/pc-portable-tunisie/81186-...,Pc Portable Lenovo IdeaPad 1 15IJL7 Intel Cele...,,https://spacenet.tn/243240-home_default/pc-por...,SpaceNet,15.6 Pouces,,,,8 Go,,,,Free Dos,Non,"689,000 DT"
3,,https://spacenet.tn/pc-portable-tunisie/81832-...,Pc Portable Acer Extensa 15 Intel Celeron N510...,,https://spacenet.tn/246665-home_default/pc-por...,SpaceNet,15.6 Pouces,,,,4 Go,,,,Free Dos,Non,"699,000 DT"
4,,https://spacenet.tn/pc-portable-tunisie/78603-...,Pc Portable Lenovo V15 G2 IJL Intel Celeron N4...,,https://spacenet.tn/229972-home_default/pc-por...,SpaceNet,15.6 Pouces,,,,8 Go,,,,Free Dos,Non,"719,000 DT"


In [11]:
df.describe().T

Unnamed: 0,count,unique,top,freq
Référence,0,0,,
lien,2279,1169,https://spacenet.tn/pc-portable-tunisie/68134-...,2.0
nom,2279,1048,Pc Portable Lenovo IdeaPad 1 15IJL7 Intel Cele...,10.0
Marque,0,0,,
image_url,2279,1169,https://spacenet.tn/184485-home_default/pc-por...,2.0
boutique,2279,1,SpaceNet,2279.0
Taille de l'écran,2279,12,15.6 Pouces,1460.0
Résolution écran,0,0,,
Processeur,40,7,Intel Core i5-1335U,14.0
Référence processeur,0,0,,


In [12]:
# Save to CSV
df.to_csv("spacenet_products.csv", index=False, encoding='utf-8-sig')

print("Data saved to spacenet_products.csv")

Data saved to spacenet_products.csv
