In [1]:
import re
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


In [2]:
def _fetch_product_specs(product_link, timeout=30):
    """Return the spec table of a product detail page as a ``{label: value}`` dict.

    The first ``<table>`` on the page is read row by row; the first cell of a
    row becomes the key and the second cell the value. Rows with fewer than
    two cells are skipped.

    Any failure (network error, non-200 status, page without a table) yields
    an empty dict so that the caller can still keep the product's base fields.
    """
    try:
        response = requests.get(product_link, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching product page {product_link}: {e}")
        return {}

    if response.status_code != 200:
        return {}

    product_page = BeautifulSoup(response.text, 'html.parser')
    table = product_page.find('table')
    if table is None:
        # find() returns None when there is no match; without this guard the
        # AttributeError below would discard the whole product.
        return {}

    specs = {}
    for row in table.find_all('tr'):
        cells = row.find_all(['td', 'th'])
        if len(cells) >= 2:
            key = cells[0].get_text(strip=True)
            value = cells[1].get_text(strip=True)
            specs[key] = value
    return specs


def scrape_products_from_page(url, timeout=30):
    """Scrape every product listed on one catalogue page.

    Parameters
    ----------
    url : str
        URL of the listing page.
    timeout : float, optional
        Timeout in seconds passed to every ``requests.get`` call (listing
        page and each product detail page).

    Returns
    -------
    list[dict]
        One dict per ``form.product-item`` element, with the keys ``"lien"``,
        ``"nom"``, ``"image_url"``, ``"prix"`` and ``"boutique"``, plus one
        key per row of the spec table on the product's detail page (when that
        page could be fetched and has a table). Spec keys are merged last, so
        a spec label equal to a base key overrides it.

    Notes
    -----
    A failure while fetching the listing page itself propagates to the
    caller. An unexpected error while processing a single product is printed
    and that product is skipped (best effort).
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Each product card on the listing page is a <form class="product-item">
    product_elements = soup.find_all('form', class_='product-item')

    products = []
    for product in product_elements:
        try:
            # Link and display name both come from the same anchor
            link_tag = product.find("a", class_="product-item-link")
            product_link = link_tag.get('href') if link_tag else None
            full_name = link_tag.text.strip() if link_tag else None

            # Image URL (.get avoids a KeyError when the attribute is absent)
            img_tag = product.find("img", class_="product-image-photo")
            image_url = img_tag.get('src') if img_tag else None

            # Price, kept as the raw displayed text
            price_tag = product.find("span", class_="price")
            price = price_tag.text.strip() if price_tag else None

            # Additional data from the product detail page
            specs = _fetch_product_specs(product_link, timeout) if product_link else {}

            product_dict = {
                "lien": product_link,
                "nom": full_name,
                "image_url": image_url,
                "prix": price,
                "boutique":"Batam"
            }

            # Expand the spec labels into top-level keys
            product_dict.update(specs)

            products.append(product_dict)

        except Exception as e:
            print(f"Error processing product: {e}")

    return products

In [11]:
# URL template for the laptop listing; "{}" is the 1-based page number
base_url = "https://batam.com.tn/informatique/ordinateur-portable.html?p={}"

# Request page 1 explicitly: sending the raw template would put a literal "{}"
# in the query string.
response = requests.get(base_url.format(1), timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an error page
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# select() returns every matching span; the last one is taken as the total
total_products_span = soup.select('p#toolbar-amount span.toolbar-number')

# Extract the number as int; fall back to 0 when the toolbar was not found
# (check the list itself -- indexing an empty list would raise IndexError)
total_products = int(total_products_span[-1].text) if total_products_span else 0

print("Total products:", total_products)

Total products: 114


In [12]:
# Walk the listing pages until `total_products` items have been collected.
all_products = []
seen_links = set()  # product links already collected, used to detect a repeated page

page_num = 1
# The context manager closes the progress bar even if a request raises mid-run.
with tqdm(total=total_products, desc="Scraping products") as pbar:
    while len(all_products) < total_products:
        url = base_url.format(page_num)
        page_products = scrape_products_from_page(url)

        # Stop when a page brings nothing new. Without this guard the loop
        # never ends if a page is empty, or if skipped products keep the count
        # below total_products. The repeated-page check assumes the site may
        # serve an already-seen page for out-of-range page numbers -- TODO confirm.
        page_links = {product.get("lien") for product in page_products}
        if not page_products or page_links <= seen_links:
            print(f"No new products on page {page_num}; stopping early.")
            break

        all_products.extend(page_products)
        seen_links |= page_links
        # Update progress bar by number of new products scraped
        pbar.update(len(page_products))
        page_num += 1
        time.sleep(1)  # polite delay

print(f"Total products scraped: {len(all_products)} / {total_products}")

Scraping products: 100%|█████████████████████████████████████████████████████████████| 114/114 [01:44<00:00,  1.09it/s]

Total products scraped: 114 / 114





In [13]:
# Field names of the first scraped product (iterating a dict yields its keys)
list(all_products[0])

['lien',
 'nom',
 'image_url',
 'prix',
 'boutique',
 'Référence',
 'Marque',
 'Gamme pc',
 'Gamer',
 "Système d'exploitation",
 "Taille de l'écran",
 'Écran',
 'Résolution écran',
 'Écran tactile',
 'Processeur',
 'Type de processeur',
 'Référence processeur',
 'Fréquence processeur',
 'Mémoire',
 'Disque dur',
 'Type de disque dur',
 'Carte graphique',
 'Chipset graphique',
 'Connecteurs',
 'Couleur',
 'Garantie']

In [14]:
# Fields retained in the final dataset, in output column order
columns_to_keep = [
    'Référence',
    'lien',
    'nom',
    'Marque',
    'image_url',
    'boutique',
    'Taille de l\'écran',
    'Résolution écran',
    'Processeur',
    'Référence processeur',
    'Mémoire',
    'Disque dur',
    'Type de disque dur',
    'Carte graphique',
    "Système d'exploitation",
    'Gamer',
    'prix',
]

# Project every product onto those fields; a field the product lacks maps to None
filtered_product_features = [
    dict(zip(columns_to_keep, map(item.get, columns_to_keep)))
    for item in all_products
]

In [15]:
# One row per product, one column per retained field
df = pd.DataFrame(data=filtered_product_features)
df.head(n=5)

Unnamed: 0,Référence,lien,nom,Marque,image_url,boutique,Taille de l'écran,Résolution écran,Processeur,Référence processeur,Mémoire,Disque dur,Type de disque dur,Carte graphique,Système d'exploitation,Gamer,prix
0,X515KA-EJ008,https://batam.com.tn/asus-celeron-4-256-silver...,PC Portable ASUS Intel Celeron N4500 4Go 256G...,ASUS,https://batam.com.tn/media/catalog/product/cac...,Batam,"15,6''",1920 x 1080,Intel Celeron,Intel® Celeron® N4500,4 Go,256GO,SSD,Graphique Intégrée,FreeDos,Non,"629,000 DT"
1,82QY00PEFE,https://batam.com.tn/pc-lenovo-n4500-8gb-256-s...,PC Portable LENOVO V15 G2 IJL Intel Celeron N4...,LENOVO,https://batam.com.tn/media/catalog/product/cac...,Batam,"15,6''",1920 x 1080,Intel Celeron,Intel® Celeron® N4500,8 Go,256GO,SSD,Graphique Intégrée,FreeDos,Non,"729,000 DT"
2,82LX00CKFG,https://batam.com.tn/pc-lenovon-celeron-8-256b...,PC Portable LENOVO IdeaPad 1 15IJL7 Intel Cele...,LENOVO,https://batam.com.tn/media/catalog/product/cac...,Batam,"15,6''",1366 x 768,Intel Celeron,Intel® Celeron® N4500,8 Go,256GO,SSD,Graphique Intégrée,FreeDos,Non,"729,000 DT"
3,82LX00CFFG,https://batam.com.tn/pc-lenovon-celeron-8-256b...,PC Portable LENOVO IdeaPad 1 15IJL7 Intel Cele...,LENOVO,https://batam.com.tn/media/catalog/product/cac...,Batam,"15,6''",1366 x 768,Intel Celeron,Intel® Celeron® N4500,8 Go,256GO,SSD,Graphique Intégrée,FreeDos,Non,"729,000 DT"
4,82LX00CEFG,https://batam.com.tn/pc-lenovo-celeron-8-256-w...,PC Portable LENOVO IdeaPad 1 15IJL7 Intel Cele...,LENOVO,https://batam.com.tn/media/catalog/product/cac...,Batam,"15,6''",1366 x 768,Intel Celeron,Intel® Celeron® N4500,8 Go,256GO,SSD,Graphique Intégrée,Windows 11 Famille,Non,"735,000 DT"


In [16]:
# Summary statistics, transposed so that each dataset column is a row
df.describe().transpose()

Unnamed: 0,count,unique,top,freq
Référence,114,107,9D6W0EA,2
lien,114,114,https://batam.com.tn/asus-celeron-4-256-silver...,1
nom,114,110,PC Portable HP 15-fd0048nk i5 13è Gén 8Go 512G...,2
Marque,114,6,HP,33
image_url,114,114,https://batam.com.tn/media/catalog/product/cac...,1
boutique,114,1,Batam,114
Taille de l'écran,114,6,"15,6''",90
Résolution écran,102,8,1920 x 1080,75
Processeur,114,18,Intel Core i5,36
Référence processeur,105,38,Intel® Core™ i5-1334U,14


In [17]:
# Save to CSV
output_path = Path("scraped_data") / "batam_products.csv"
# Create the target directory first: to_csv raises OSError when it is missing
output_path.parent.mkdir(parents=True, exist_ok=True)
# utf-8-sig writes a BOM at the start of the file
df.to_csv(output_path, index=False, encoding='utf-8-sig')

print("Data saved to batam_products.csv")

Data saved to batam_products.csv
