In [11]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

In [12]:
def _parse_listing_card(product):
    """Extract the fields available on a single listing-page product card.

    Args:
        product: The parsed ``div.item-product`` element of one product.

    Returns:
        dict: Keys ``id``, ``lien``, ``nom``, ``image_url``, ``prix`` and
        ``boutique``. Any field whose element or attribute is missing from
        the card is ``None``; ``boutique`` is always ``"tunisianet"``.
    """
    # Product ID: the reference text is stripped of brackets and whitespace.
    sku_tag = product.find("span", class_="product-reference")
    product_id = sku_tag.text.strip("[] \n") if sku_tag else None

    # Product link and full name both come from the anchor inside the title.
    # Each lookup is guarded separately so that a card without a title block
    # (or without an anchor in it) yields None instead of raising.
    title_tag = product.find("h2", class_="product-title")
    anchor = title_tag.find("a") if title_tag else None
    product_link = anchor.get("href") if anchor else None
    full_name = anchor.text.strip() if anchor else None

    # Image URL (.get avoids a KeyError when the tag has no src attribute).
    img_tag = product.find("img", class_="img-responsive")
    image_url = img_tag.get("src") if img_tag else None

    # Price, kept as the displayed text.
    price_tag = product.find("span", class_="price")
    price = price_tag.text.strip() if price_tag else None

    return {
        "id": product_id,
        "lien": product_link,
        "nom": full_name,
        "image_url": image_url,
        "prix": price,
        "boutique": "tunisianet"
    }


def _fetch_product_specs(product_link, timeout):
    """Download a product page and return its data-sheet rows as a dict.

    Args:
        product_link: URL of the product detail page.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        dict: Maps each ``dt.name`` text to the text of the ``dd.value``
        paired with it inside ``dl.data-sheet``. Empty when the request
        fails, the status is not 200, or the page has no data sheet.
    """
    try:
        response = requests.get(product_link, timeout=timeout)
    except requests.RequestException as e:
        # A failed detail page should not discard the listing-level data.
        print(f"Error fetching product page {product_link}: {e}")
        return {}
    if response.status_code != 200:
        return {}

    product_page = BeautifulSoup(response.text, 'html.parser')
    dl = product_page.find("dl", class_="data-sheet")
    if dl is None:
        return {}

    dts = dl.find_all('dt', class_='name')
    dds = dl.find_all('dd', class_='value')
    return {
        dt.get_text(strip=True): dd.get_text(strip=True)
        for dt, dd in zip(dts, dds)
    }


def scrape_products_from_page(url, timeout=30):
    """Scrape every product listed on one category page.

    For each ``div.item-product`` card the listing fields are read, then the
    product's own page is fetched to add its data-sheet rows as extra keys.

    Args:
        url: URL of the listing page.
        timeout: Timeout in seconds applied to every HTTP request, so a
            stalled connection cannot block the run indefinitely.

    Returns:
        list: One dict per product, with the keys ``id``, ``lien``, ``nom``,
        ``image_url``, ``prix``, ``boutique`` plus one key per data-sheet
        row. Empty when the listing page cannot be fetched or does not
        answer with status 200 (a message is printed in that case).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching listing page {url}: {e}")
        return []
    if response.status_code != 200:
        print(f"Skipping listing page {url}: HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    products = []
    for product in soup.find_all('div', class_='item-product'):
        try:
            product_dict = _parse_listing_card(product)

            # Additional data from the product page.
            if product_dict["lien"]:
                specs = _fetch_product_specs(product_dict["lien"], timeout)
                # setdefault keeps the listing fields authoritative: a
                # data-sheet label that happens to match a base key
                # (e.g. "id") must not overwrite it.
                for key, value in specs.items():
                    product_dict.setdefault(key, value)

            products.append(product_dict)

        except Exception as e:
            # Best effort: one malformed card must not abort the whole page.
            print(f"Error processing product: {e}")
    return products

In [13]:
# Category listing URL; the trailing "{}" placeholder receives the page number.
base_url = "https://www.tunisianet.com.tn/301-pc-portable-tunisie?srsltid=AfmBOoq-YUtG1Bv2afn6FC2AoTwJtHhvbHhNbdsV2ozo0Wf55rQ3lMoG&page={}"
all_products = []

# Walk listing pages 1 through 31 and accumulate every scraped product.
for page_num in range(1, 32):
    print(f"Scraping page {page_num}...")
    url = base_url.format(page_num)
    page_products = scrape_products_from_page(url)
    all_products += page_products
    # Wait one second between listing pages to avoid hammering the server.
    time.sleep(1)

print(f"Total products scraped: {len(all_products)}")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Total products scraped: 740


In [14]:
# Display the first scraped record to check which fields the scraper produced.
all_products[0]

{'id': '82LX00CFFG',
 'lien': 'https://www.tunisianet.com.tn/pc-portable-tunisie/81139-pc-portable-lenovo-ideapad-1-15ijl7-celeron-n4500-8-go-256-go-ssd-gris.html',
 'nom': 'Pc Portable Lenovo IdeaPad 1 15IJL7 / Celeron N4500 / 8 Go / 256 Go SSD / Gris',
 'image_url': 'https://www.tunisianet.com.tn/403274-home/pc-portable-lenovo-ideapad-1-15ijl7-celeron-n4500-8-go-256-go-ssd-gris.jpg',
 'prix': '709,000 DT',
 'boutique': 'tunisianet',
 "Système d'exploitation": 'FreeDos',
 'Processeur': 'Intel Celeron',
 'Réf processeur': 'Intel Celeron Dual Core N4500, 1.10 Ghz up to 2.8 Ghz , 4 Mo de mémoire cache',
 'Mémoire': '8 Go',
 'Disque Dur': '256 Go SSD',
 'Carte Graphique': 'Graphique Intégrée',
 'Réf Carte graphique': 'Intel UHD Graphics',
 'Taille Ecran': '14" - 15.6"',
 'Type Ecran': 'HD',
 'Ecran Tactile': 'Non',
 'Garantie': '1 an',
 'Couleur': 'Gris'}

In [15]:
# Field names of the first record (iterating a dict yields its keys in order).
list(all_products[0])

['id',
 'lien',
 'nom',
 'image_url',
 'prix',
 'boutique',
 "Système d'exploitation",
 'Processeur',
 'Réf processeur',
 'Mémoire',
 'Disque Dur',
 'Carte Graphique',
 'Réf Carte graphique',
 'Taille Ecran',
 'Type Ecran',
 'Ecran Tactile',
 'Garantie',
 'Couleur']

In [16]:
# Target column set, in output order.
# NOTE(review): 'Marque', 'Type de disque dur' and 'Gamer' do not appear among
# the keys of the first scraped record, so they come out as None for it --
# confirm whether this schema is intentionally shared with other sources.
columns_to_keep = [
    'id', 'lien', 'nom', 'Marque', 'image_url', 'boutique',
    'Taille Ecran', 'Type Ecran',
    'Processeur', 'Réf processeur',
    'Mémoire', 'Disque Dur', 'Type de disque dur',
    'Carte Graphique', "Système d'exploitation", 'Gamer',
    'prix',
]

# Project every product onto that column set; a missing field becomes None.
filtered_product_features = [
    {column: record.get(column) for column in columns_to_keep}
    for record in all_products
]

In [17]:
# One row per product; preview the first five rows.
df = pd.DataFrame(data=filtered_product_features)
df.head(n=5)

Unnamed: 0,id,lien,nom,Marque,image_url,boutique,Taille Ecran,Type Ecran,Processeur,Réf processeur,Mémoire,Disque Dur,Type de disque dur,Carte Graphique,Système d'exploitation,Gamer,prix
0,82LX00CFFG,https://www.tunisianet.com.tn/pc-portable-tuni...,Pc Portable Lenovo IdeaPad 1 15IJL7 / Celeron ...,,https://www.tunisianet.com.tn/403274-home/pc-p...,tunisianet,"14"" - 15.6""",HD,Intel Celeron,"Intel Celeron Dual Core N4500, 1.10 Ghz up to ...",8 Go,256 Go SSD,,Graphique Intégrée,FreeDos,,"709,000 DT"
1,82LX00CKFG,https://www.tunisianet.com.tn/pc-portable-tuni...,Pc Portable Lenovo IdeaPad 1 15IJL7 / Celeron ...,,https://www.tunisianet.com.tn/403280-home/pc-p...,tunisianet,"14"" - 15.6""",HD,Intel Celeron,"Intel Celeron Dual Core N4500, 1.10 Ghz up to ...",8 Go,256 Go SSD,,Graphique Intégrée,FreeDos,,"709,000 DT"
2,82LX00CKFG-SAC,https://www.tunisianet.com.tn/pc-portable-tuni...,Pc Portable Lenovo IdeaPad 1 15IJL7 / Celeron ...,,https://www.tunisianet.com.tn/404194-home/pc-p...,tunisianet,"14"" - 15.6""",HD,Intel Celeron,"Intel Celeron Dual Core N4500, 1.10 Ghz up to ...",8 Go,256 Go SSD,,Graphique Intégrée,FreeDos,,"719,000 DT"
3,A30XQEA,https://www.tunisianet.com.tn/pc-portable-tuni...,Pc Portable HP 15-fd0298nk / N100 / 4 Go / 256...,,https://www.tunisianet.com.tn/391207-home/pc-p...,tunisianet,"14"" - 15.6""",HD,Intel Celeron,"Intel Celeron N100 (jusqu’à 3,4 GHz, 6 Mo de m...",4 Go,256 Go SSD,,Graphique Intégrée,FreeDos,,"719,000 DT"
4,X515KA-EJ008,https://www.tunisianet.com.tn/pc-portable-tuni...,Pc Portable ASUS Vivobook 15 X515KA / Celeron ...,,https://www.tunisianet.com.tn/396840-home/pc-p...,tunisianet,"14"" - 15.6""",Full HD,Intel Celeron,"Intel Celeron Dual Core N4500, 1.10 Ghz up to ...",4 Go,256 Go SSD,,Graphique Intégrée,FreeDos,,"719,000 DT"


In [18]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().transpose()

Unnamed: 0,count,unique,top,freq
id,740,740,82LX00CFFG,1.0
lien,740,740,https://www.tunisianet.com.tn/pc-portable-tuni...,1.0
nom,740,727,Pc portable Dell Latitude 5550 / Ultra 7 155U ...,3.0
Marque,0,0,,
image_url,740,740,https://www.tunisianet.com.tn/403274-home/pc-p...,1.0
boutique,740,1,tunisianet,740.0
Taille Ecran,718,6,"14"" - 15.6""",521.0
Type Ecran,682,7,Full HD,548.0
Processeur,734,16,Intel Core i5 | Intel Core 5,217.0
Réf processeur,729,76,"Intel Core i7-13620H (4.9 GH, 24 Mo de mémoire...",102.0


In [19]:
# Export the table without the index column; 'utf-8-sig' writes a BOM first.
df.to_csv(
    "tunisianet_products.csv",
    encoding='utf-8-sig',
    index=False,
)

print("Data saved to tunisianet_products.csv")

Data saved to tunisianet_products.csv
