In [5]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tqdm import tqdm
import time

In [6]:
def scrape_products_from_page(url):
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    
    # Find all product name elements
    product_elements = soup.find_all('div', class_='field-product-item')
    #print(product_elements[0].prettify())
    #print(len(product_elements))
    
    products=[]
    # Extract and print the names
    
    for product in product_elements[:-1]:
        try:
            # Product ref
            ref_div = product.find("div", class_="product-reference").find("span")
            product_ref = ref_div.text.strip() if ref_div else None
            
            # Product Link
            link_tag = product.find("h2", class_="product_name").find("a")
            product_link = link_tag['href'] if link_tag else None
            
            # Product Full Name
            full_name = link_tag.text.strip() if link_tag else None
            
            # Image URL
            img_tag = product.find("img", class_="product_image")
            image_url = img_tag['src'].strip() if img_tag else None
            
            # Price
            price_tag = product.find("span", class_="price")
            price = price_tag.text.strip() if price_tag else None
            
            specs = {}
            #Additional data from product page
            if product_link:
                response = requests.get(product_link)
                if response.status_code == 200:                    
                    product_page = BeautifulSoup(response.text, 'html.parser')
                    data_sheet = product_page.find("dl", class_="data-sheet")
                    if data_sheet:
                        dt_tags = data_sheet.find_all("dt", class_="name")
                        dd_tags = data_sheet.find_all("dd", class_="value")

                        for dt, dd in zip(dt_tags, dd_tags):
                            key = dt.get_text(strip=True)
                            value = dd.get_text(strip=True)
                            specs[key] = value
            
            # Append product data
            product_dict = {
                "reference":product_ref,
                "lien": product_link,
                "nom": full_name,
                "image_url": image_url,
                "prix": price,
                "boutique":"SpaceNet"
            }

            # Add specs dictionary to product_data (expand keys dynamically)
            product_dict.update(specs)
            
            # Append to products list
            products.append(product_dict)
            
        
        except Exception as e:
            print(f"Error processing product: {e}")
    
    return products

In [7]:
base_url = "https://spacenet.tn/18-ordinateur-portable"

response = requests.get(base_url)
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# Find all <li> elements inside the pagination
page_items = soup.select("ul.page-list li a")

# Extract text content and convert to integers if possible
page_numbers = []
for item in page_items:
    try:
        num = int(item.text.strip())
        page_numbers.append(num)
    except ValueError:
        continue

# Get the maximum page number
last_page = max(page_numbers) if page_numbers else 1

print("Last page number:", last_page)

Last page number: 59


In [13]:
base_url = "https://spacenet.tn/18-ordinateur-portable?page={}"
all_products = []

for page_num in tqdm(range(1, last_page+1), desc="Scraping pages"):
    #print(f"Scraping page {page_num}...")
    url = base_url.format(page_num)
    page_products = scrape_products_from_page(url)
    all_products.extend(page_products)
    time.sleep(1) #A small delay between requests to avoid hammering the server.

print(f"Total products scraped: {len(all_products)}")

Scraping pages: 100%|████████████████████████████████████████████████████████████████| 59/59 [1:00:01<00:00, 61.05s/it]

Total products scraped: 2289





In [14]:
list(all_products[0].keys())

['reference',
 'lien',
 'nom',
 'image_url',
 'prix',
 'boutique',
 'Garantie',
 'Norme Clavier',
 'Couleur',
 'Connectivité',
 'Gamer',
 'Longueur de câble']

In [15]:
columns_to_keep = ['reference', 'lien', 'nom', 'Gamme PC', 'image_url', 'boutique', 'Taille de l\'écran',
                   'Résolution écran', 'Type de Processeur', 'processeur', 'Mémoire','Disque Dur', 
                   'Type Disque Dur', 'Réf Carte Graphique', "Système d'exploitation",'Gamer', 'prix']

# Filter the dictionary to keep only the desired columns
filtered_product_features = [{key: product.get(key) for key in columns_to_keep}
                             for product in all_products]

In [16]:
df = pd.DataFrame(filtered_product_features)
df.head()

Unnamed: 0,reference,lien,nom,Gamme PC,image_url,boutique,Taille de l'écran,Résolution écran,Type de Processeur,processeur,Mémoire,Disque Dur,Type Disque Dur,Réf Carte Graphique,Système d'exploitation,Gamer,prix
0,CK-550-GKTM1,https://spacenet.tn/clavier-gamer/57300-clavie...,Clavier Gamer Mécanique Cooler Master CK550 V2...,,https://spacenet.tn/144096-home_default/clavie...,SpaceNet,,,,,,,,,,Oui,"149,000 DT"
1,82LX00CFFG,https://spacenet.tn/pc-portable-tunisie/81195-...,Pc Portable Lenovo IdeaPad 1 15IJL7 Intel Cele...,IdeaPad 1,https://spacenet.tn/243251-home_default/pc-por...,SpaceNet,15.6 Pouces,,Intel Celeron,"Intel Celeron N4500 (Up to 2,8 GHz Turbo max, ...",8 Go,256 Go SSD,,Intel UHD,Free Dos,Non,"709,000 DT"
2,82LX00CKFG,https://spacenet.tn/pc-portable-tunisie/81186-...,Pc Portable Lenovo IdeaPad 1 15IJL7 Intel Cele...,IdeaPad 1,https://spacenet.tn/243240-home_default/pc-por...,SpaceNet,15.6 Pouces,,Intel Celeron,"Intel® Celeron® N4500 (Up to 2,8 GHz Turbo max...",8 Go,256 Go SSD,,Intel UHD,Free Dos,Non,"709,000 DT"
3,82QY00PEFE,https://spacenet.tn/pc-portable-tunisie/78603-...,Pc Portable Lenovo V15 G2 IJL Intel Celeron N4...,V15 G2 IJL,https://spacenet.tn/229972-home_default/pc-por...,SpaceNet,15.6 Pouces,,Intel Celeron,"ntel Celeron N4500 (Up to 2.8 Ghz , 4 Mo de mé...",8 Go,256 Go SSD,,Intel UHD,Free Dos,Non,"719,000 DT"
4,A30XQEA,https://spacenet.tn/pc-portable-tunisie/79478-...,PC Portable HP 15-fd0298nk Intel Celeron N100 ...,15-fd0298nk,https://spacenet.tn/235037-home_default/pc-por...,SpaceNet,15.6 Pouces,,Intel Celeron,Intel Celeron N100 (up to 3.4 GHz with Intel® ...,4 Go,256 Go SSD,,Intel UHD,Free Dos,Non,"719,000 DT"


In [17]:
df.describe().T

Unnamed: 0,count,unique,top,freq
reference,2289,1174,CK-550-GKTM1,2.0
lien,2289,1174,https://spacenet.tn/clavier-gamer/57300-clavie...,2.0
nom,2289,1046,Pc Portable Lenovo IdeaPad 1 15IJL7 Intel Cele...,9.0
Gamme PC,2285,198,LOQ 15IRX9,79.0
image_url,2289,1174,https://spacenet.tn/144096-home_default/clavie...,2.0
boutique,2289,1,SpaceNet,2289.0
Taille de l'écran,2287,12,15.6 Pouces,1444.0
Résolution écran,0,0,,
Type de Processeur,2287,28,Intel Core i5,903.0
processeur,2065,259,"Intel Core i5-12450HX (up to 4.4 GHz, 12Mo de ...",53.0


In [18]:
# Save to CSV
df.to_csv("scraped_data/spacenet_products.csv", index=False, encoding='utf-8-sig')

print("Data saved to spacenet_products.csv")

Data saved to spacenet_products.csv
