In [13]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

In [14]:
def _scrape_product_specs(product_link, timeout=30):
    """Fetch one product detail page and return its specification rows as a dict.

    Args:
        product_link: Absolute URL of the product detail page.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict mapping the text of the first cell of each row of the third
        ``<table>`` on the page to the text of the second cell. An empty dict
        is returned when the request fails, the status is not 200, or the page
        has fewer than three tables.
    """
    try:
        detail_response = requests.get(product_link, timeout=timeout)
    except requests.RequestException as e:
        # A failing detail page must not discard the listing data already scraped.
        print(f"Error fetching product page {product_link}: {e}")
        return {}

    if detail_response.status_code != 200:
        return {}

    product_page = BeautifulSoup(detail_response.text, 'html.parser')
    tables = product_page.find_all('table')
    # The specs are read from the third table; guard the index so a page with
    # a different layout yields no specs instead of raising IndexError.
    if len(tables) < 3:
        return {}

    specs = {}
    for row in tables[2].find_all('tr'):
        cells = row.find_all(['td', 'th'])
        if len(cells) >= 2:
            key = cells[0].get_text(strip=True)
            value = cells[1].get_text(strip=True)
            specs[key] = value
    return specs


def scrape_products_from_page(url, timeout=30):
    """Scrape every product card on one listing page, enriched with its spec table.

    Args:
        url: URL of the listing page to scrape.
        timeout: Seconds to wait for each HTTP request (listing page and every
            product detail page) before giving up.

    Returns:
        A list of dicts, one per product. Each dict has the keys ``id``,
        ``lien``, ``nom``, ``image_url``, ``prix`` and ``boutique``, followed by
        one key per specification row found on the product's detail page.
        An empty list is returned when the listing page does not answer 200.

    Raises:
        requests.RequestException: if the listing page itself cannot be fetched
            (connection error or timeout).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Error fetching listing page {url}: HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Each product card on the listing page is a div.product-item-info.
    product_elements = soup.find_all('div', class_='product-item-info')

    products = []
    # The last matched element is skipped, as in the original code —
    # presumably it is not a real product card; TODO confirm against the site.
    for product in product_elements[:-1]:
        try:
            # Product ID: SKU text with surrounding brackets/whitespace removed.
            sku_div = product.find("div", class_="skuDesktop")
            product_id = sku_div.text.strip("[] \n") if sku_div else None

            # Product link and full name both come from the same anchor.
            link_tag = product.find("a", class_="product-item-link")
            product_link = link_tag.get('href') if link_tag else None
            full_name = link_tag.text.strip() if link_tag else None

            # Image URL (None when the tag or its src attribute is missing).
            img_tag = product.find("img", class_="product-image-photo")
            image_url = img_tag.get('src') if img_tag else None

            # Price, kept as the raw displayed text.
            price_tag = product.find("span", class_="price")
            price = price_tag.text.strip() if price_tag else None

            # Additional data from the product detail page.
            specs = _scrape_product_specs(product_link, timeout) if product_link else {}

            product_dict = {
                "id": product_id,
                "lien": product_link,
                "nom": full_name,
                "image_url": image_url,
                "prix": price,
                "boutique": "Mytek"
            }

            # Expand spec keys into the record without letting a spec row
            # overwrite one of the base fields above.
            for key, value in specs.items():
                product_dict.setdefault(key, value)

            products.append(product_dict)

        except Exception as e:
            print(f"Error processing product: {e}")
    return products

In [15]:
# Listing URL template; the page number is substituted for "{}".
base_url = "https://www.mytek.tn/informatique/ordinateurs-portables/pc-portable.html?p={}"
all_products = []

# Visit listing pages 1 through 22 in order and pool their products.
for page_num in range(1, 23):
    url = base_url.format(page_num)
    print(f"Scraping page {page_num}...")
    page_products = scrape_products_from_page(url)
    all_products += page_products
    # Pause one second between pages so the server is not hammered.
    time.sleep(1)

print(f"Total products scraped: {len(all_products)}")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Total products scraped: 540


In [16]:
# Field names of the first scraped product (iterating a dict yields its keys).
list(all_products[0])

['id',
 'lien',
 'nom',
 'image_url',
 'prix',
 'boutique',
 'DISPONIBILITÉ',
 'Marque',
 'Gamme PC',
 'Gamer',
 "Système d'exploitation",
 "Taille de l'écran",
 'Ecran',
 'Ecran Tactile',
 "Résolution d'écrans",
 'Processeur',
 'Type Processeur',
 'Référence Processeur',
 'Fréquence Processeur',
 'Mémoire Cache',
 'Mémoire',
 'Disque Dur',
 'Type Disque Dur',
 'Carte Graphique',
 'Chipset Graphique',
 'Connecteurs',
 'Couleur',
 'Garantie']

In [17]:
# Columns retained for the final dataset, in output order.
columns_to_keep = [
    "id",
    "lien",
    "nom",
    "Gamme PC",
    "image_url",
    "boutique",
    "Taille de l'écran",
    "Résolution d'écrans",
    "Processeur",
    "Référence Processeur",
    "Mémoire",
    "Disque Dur",
    "Type Disque Dur",
    "Carte Graphique",
    "Système d'exploitation",
    "Gamer",
    "prix",
]

# Project every scraped product onto the retained columns; a column the
# product lacks is filled with None by dict.get.
filtered_product_features = [
    dict(zip(columns_to_keep, map(product.get, columns_to_keep)))
    for product in all_products
]

In [18]:
# Build the tabular dataset from the filtered records and preview the first rows.
df = pd.DataFrame(data=filtered_product_features)
df.head(n=5)

Unnamed: 0,id,lien,nom,Gamme PC,image_url,boutique,Taille de l'écran,Résolution d'écrans,Processeur,Référence Processeur,Mémoire,Disque Dur,Type Disque Dur,Carte Graphique,Système d'exploitation,Gamer,prix
0,82LX00CFFG,https://www.mytek.tn/pc-portable-lenovo-ideapa...,PC Portable LENOVO IdeaPad 1 15IJL7 Intel Cele...,IDEAPAD 1,https://mk-media.mytek.tn/media/catalog/produc...,Mytek,15.6 Pouces,1366 x 768 pixels,Intel Celeron,Intel® Celeron® N4500,8 Go,256 Go SSD,SSD,Graphique Intégrée,FreeDos,Non,"709,000 DT"
1,X515KA-EJ008,https://www.mytek.tn/pc-portable-asus-x515ka-i...,PC Portable ASUS X515KA Intel Celeron N4500 4G...,X515KA,https://mk-media.mytek.tn/media/catalog/produc...,Mytek,15.6 Pouces,1920 x 1080 pixels,Intel Celeron,Intel® Celeron® N4500,4 Go,256 Go SSD,SSD,Graphique Intégrée,FreeDos,Non,"719,000 DT"
2,82QY00PEFE,https://www.mytek.tn/pc-portable-lenovo-v15-g2...,PC Portable LENOVO V15 G2 IJL Intel Celeron N4...,V15 G2 IJL,https://mk-media.mytek.tn/media/catalog/produc...,Mytek,15.6 Pouces,1920 x 1080 pixels,Intel Celeron,Intel® Celeron® N4500,8 Go,256 Go SSD,SSD,Graphique Intégrée,FreeDos,Non,"719,000 DT"
3,A30XQEA,https://www.mytek.tn/pc-portable-hp-15-fd0298n...,PC Portable HP 15-fd0298nk Intel N100 4Go 256G...,15-fd0298nk,https://mk-media.mytek.tn/media/catalog/produc...,Mytek,15.6 Pouces,1366 x 768 pixels,Intel Celeron,Intel® N100,4 Go,256 Go SSD,SSD,Graphique Intégrée,FreeDos,Non,"719,000 DT"
4,X515KA-EJ008-8G,https://www.mytek.tn/pc-portable-asus-x515ka-i...,PC Portable ASUS X515KA Intel Celeron N4500 8G...,X515KA,https://mk-media.mytek.tn/media/catalog/produc...,Mytek,15.6 Pouces,1920 x 1080 pixels,Intel Celeron,Intel® Celeron® N4500,8 Go,256 Go SSD,SSD,Graphique Intégrée,FreeDos,Non,"739,000 DT"


In [19]:
# Summary statistics, transposed so each DataFrame column becomes a row.
df.describe().transpose()

Unnamed: 0,count,unique,top,freq
id,540,540,82LX00CFFG,1
lien,540,540,https://www.mytek.tn/pc-portable-lenovo-ideapa...,1
nom,540,539,Pc Portable DELL Inspiron 15 3535 AMD RYZEN 7 ...,2
Gamme PC,540,105,VOSTRO 3530,26
image_url,540,540,https://mk-media.mytek.tn/media/catalog/produc...,1
boutique,540,1,Mytek,540
Taille de l'écran,540,9,15.6 Pouces,422
Résolution d'écrans,540,6,1920 x 1080 pixels,425
Processeur,540,16,Intel Core i5,195
Référence Processeur,540,57,Intel® Core™ i7-13620H,83


In [20]:
# Write the dataset to CSV without the index column; utf-8-sig is the BOM-prefixed UTF-8 encoding.
df.to_csv(
    path_or_buf="mytek_products.csv",
    encoding='utf-8-sig',
    index=False,
)

print("Data saved to mytek_products.csv")

Data saved to mytek_products.csv


In [21]:
# Display the column labels of the DataFrame.
df.columns

Index(['id', 'lien', 'nom', 'Gamme PC', 'image_url', 'boutique',
       'Taille de l'écran', 'Résolution d'écrans', 'Processeur',
       'Référence Processeur', 'Mémoire', 'Disque Dur', 'Type Disque Dur',
       'Carte Graphique', 'Système d'exploitation', 'Gamer', 'prix'],
      dtype='object')