In [1]:
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
def _parse_spec_table(product_page):
    """Collect the two-column rows of the first ``<table>`` on a product page.

    Args:
        product_page: Parsed BeautifulSoup document of a product detail page.

    Returns:
        dict mapping the text of each row's first cell to the text of its
        second cell. Empty when the page has no table at all.
    """
    # NOTE(review): this takes the *first* table on the page; presumably that
    # is the spec sheet — verify against the live page markup.
    table = product_page.find('table')
    if table is None:
        return {}

    specs = {}
    for row in table.find_all('tr'):
        cells = row.find_all(['td', 'th'])
        # Rows with fewer than two cells carry no key/value pair.
        if len(cells) >= 2:
            key = cells[0].get_text(strip=True)
            value = cells[1].get_text(strip=True)
            specs[key] = value
    return specs


def scrape_products_from_page(url, timeout=30):
    """Scrape every product listed on one catalogue page.

    For each product tile the link, name, image URL and price are read from
    the listing, then the product's own page is fetched and its spec table is
    merged into the record.

    Args:
        url: Address of the listing page to scrape.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        list of dicts, one per product, with the keys ``lien``, ``nom``,
        ``image_url``, ``prix`` and ``boutique`` plus one key per spec row.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched. Failures on individual product pages are reported and
            the product is kept with its listing data only.
    """
    listing_response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(listing_response.text, 'html.parser')

    # Each product tile on the listing page
    product_elements = soup.find_all('div', class_='product-item-info')

    products = []

    # NOTE(review): the last matched element is deliberately skipped, as in
    # the original code; presumably it is not a real product tile — confirm.
    for product in product_elements[:-1]:
        try:
            # Product link and full name come from the same anchor
            link_tag = product.find("a", class_="product-item-link")
            product_link = link_tag.get('href') if link_tag else None
            full_name = link_tag.text.strip() if link_tag else None

            # Image URL: fall back to ``src`` when ``data-src`` is not set,
            # instead of raising KeyError and losing the product
            img_tag = product.find("img", class_="product-image-photo")
            image_url = (img_tag.get('data-src') or img_tag.get('src')) if img_tag else None

            # Price
            price_tag = product.find("span", class_="price")
            price = price_tag.text.strip() if price_tag else None

            # Additional data from the product page; a failure here must not
            # discard what was already read from the listing
            specs = {}
            if product_link:
                try:
                    detail_response = requests.get(product_link, timeout=timeout)
                    if detail_response.status_code == 200:
                        product_page = BeautifulSoup(detail_response.text, 'html.parser')
                        specs = _parse_spec_table(product_page)
                except requests.RequestException as e:
                    print(f"Error fetching product page {product_link}: {e}")

            product_dict = {
                "lien": product_link,
                "nom": full_name,
                "image_url": image_url,
                "prix": price,
                "boutique": "graiet"
            }

            # Add spec keys dynamically, but never let a spec row overwrite
            # one of the core fields above
            for key, value in specs.items():
                product_dict.setdefault(key, value)

            products.append(product_dict)

        except Exception as e:
            print(f"Error processing product: {e}")

    return products

In [3]:
# Listing URL template; the placeholder is the 1-based page number.
base_url = "https://www.graiet.tn/informatique-et-gaming/pc-portable.html?p={}"

# Fetch the first page. The template must be formatted first, otherwise the
# literal "{}" is sent as the page parameter.
response = requests.get(base_url.format(1), timeout=30)
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# All counters shown in the listing toolbar; the last one is used as the total
total_products_span = soup.select("p#toolbar-amount span.toolbar-number")

# Check the list itself before indexing: an empty selection means the toolbar
# was not found, which must yield 0 rather than an IndexError
if total_products_span:
    total_products = int(total_products_span[-1].text)
else:
    total_products = 0

print("Total products:", total_products)

Total products: 163


In [4]:
# Walk the listing pages until the announced number of products is collected.
all_products = []
seen_links = set()  # product links already collected, to detect repeated pages

page_num = 1
# The context manager closes the progress bar even if scraping raises
with tqdm(total=total_products, desc="Scraping products") as pbar:
    while len(all_products) < total_products:
        url = base_url.format(page_num)
        page_products = scrape_products_from_page(url)

        # Keep only products not seen on an earlier page; records without a
        # link cannot be compared and are kept as they are
        new_products = []
        for product in page_products:
            link = product.get("lien")
            if link is not None and link in seen_links:
                continue
            seen_links.add(link)
            new_products.append(product)

        # An empty or fully repeated page means the listing is exhausted;
        # without this check the loop would never end when fewer products
        # than announced can be scraped
        if not new_products:
            break

        all_products.extend(new_products)
        # Update progress bar by number of new products scraped
        pbar.update(len(new_products))
        page_num += 1
        time.sleep(1)  # polite delay

print(f"Total products scraped: {len(all_products)} / {total_products}")

Scraping products: 100%|█████████████████████████████████████████████████████████████| 163/163 [04:27<00:00,  1.64s/it]

Total products scraped: 163 / 163





In [5]:
# Field names of the first scraped record (iterating a dict yields its keys)
list(all_products[0])

['lien',
 'nom',
 'image_url',
 'prix',
 'boutique',
 'Prix',
 'Référence',
 'Garantie',
 "Type d'ordinateur",
 'Taille Ecran',
 'Type Ecran',
 'Résolution',
 'Processeur',
 'Système D’exploitation',
 'Mémoire RAM',
 'Disque Dur',
 'Carte Graphique',
 'Bluetooth',
 'Ecran Tactile',
 'Pc Gamer',
 'Couleur']

In [6]:
# Output schema: the fields retained for every product, in column order
columns_to_keep = [
    'Référence',
    'lien',
    'nom',
    'Marque',
    'image_url',
    'boutique',
    'Taille Ecran',
    'Résolution',
    'Processeur',
    'Référence processeur',
    'Mémoire RAM',
    'Disque Dur',
    'Type de disque dur',
    'Carte Graphique',
    "Système D’exploitation",
    'Pc Gamer',
    'prix',
]

# Project each record onto the schema; dict.get supplies None for any field
# a product does not have
filtered_product_features = [
    dict(zip(columns_to_keep, map(record.get, columns_to_keep)))
    for record in all_products
]

In [7]:
# One row per product, one column per retained field
df = pd.DataFrame(data=filtered_product_features)
df.head(n=5)

Unnamed: 0,Référence,lien,nom,Marque,image_url,boutique,Taille Ecran,Résolution,Processeur,Référence processeur,Mémoire RAM,Disque Dur,Type de disque dur,Carte Graphique,Système D’exploitation,Pc Gamer,prix
0,82QY00PEFE,https://www.graiet.tn/pc-portable-lenovo-v15-c...,Pc Portable LENOVO V15 | Intel Celeron N4500 -...,,https://www.graiet.tn/media/catalog/product/ca...,graiet,15.6 “,Full HD 1080p,Hélio G99,,8 G,256 Go SSD,,Intel HD Graphics,Android 14,Non,"729,00 TND"
1,82VG00Q2FG-2Y,https://www.graiet.tn/lenovo-pc-portable-ideap...,Pc Portable LENOVO IDEAPAD 1 | AMD ATHLON - 7...,,https://www.graiet.tn/media/catalog/product/ca...,graiet,15.6 “,Full HD 1080p,Hélio G99,,8 G,256 Go SSD,,AMD Radeon ™ Intégrée,FreeDos,Non,"729,00 TND"
2,AB1T8EA,https://www.graiet.tn/hp-pc-portable-15-ab1t8e...,Pc Portable HP 15 | Intel Celeron N100 - 8 Go ...,,https://www.graiet.tn/media/catalog/product/ca...,graiet,15.6 “,(1600 x 720 ) Pixels,Celeron N100,,8 G,256 Go SSD,,Intel UHD Graphics,FreeDos,Non,"759,00 TND"
3,886L5EA,https://www.graiet.tn/pc-portable-hp-15-fd0030...,Pc Portable HP 15 | Intel N100 - 4 GO - 256 SS...,,https://www.graiet.tn/media/catalog/product/ca...,graiet,15.6 “,Full HD 1080p,Celeron N100,,4 G,256 Go SSD,,Intel UHD Graphics,Windows 11,Non,"779,00 TND"
4,V3520-I3-512SSD,https://www.graiet.tn/dell-pc-portable-vostro-...,PC Portable Dell Vostro 3520 | Intel i3 12e Gé...,,https://www.graiet.tn/media/catalog/product/ca...,graiet,15.6 “,(1600 x 720 ) Pixels,Intel Core i3-12è Gén,,8 G,512 Go SSD,,Intel UHD Graphics,Ubuntu,Non,"1 019,00 TND"


In [8]:
# Summary statistics, transposed so each column of df becomes a row
df.describe().transpose()

Unnamed: 0,count,unique,top,freq
Référence,161,158,9S7-16R831-2607,2.0
lien,163,163,https://www.graiet.tn/pc-portable-lenovo-v15-c...,1.0
nom,163,160,Pc Portable DELL VOSTRO 3520 | Intel i5 - 16 G...,2.0
Marque,0,0,,
image_url,163,163,https://www.graiet.tn/media/catalog/product/ca...,1.0
boutique,163,1,graiet,163.0
Taille Ecran,162,6,15.6 “,143.0
Résolution,162,6,Full HD 1080p,150.0
Processeur,161,21,Intel Core i5 12è Gén,27.0
Référence processeur,0,0,,


In [9]:
# Save to CSV
output_path = Path("scraped_data/graiet_products.csv")

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there)
output_path.parent.mkdir(parents=True, exist_ok=True)

# utf-8-sig writes a BOM at the start of the file
df.to_csv(output_path, index=False, encoding='utf-8-sig')

print("Data saved to graiet_products.csv")

Data saved to graiet_products.csv
