In [1]:
import re
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


In [2]:
def _scrape_product_specs(product_link, timeout=30):
    """Fetch a product detail page and return its specification rows as a dict.

    This is best-effort: any failure (network error, non-200 status, page
    without the expected table) yields an empty dict so that the caller can
    still keep the fields it already parsed from the listing page.

    Args:
        product_link: Absolute URL of the product detail page.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        dict mapping the text of the first cell of each table row to the
        text of its second cell; ``{}`` when nothing could be extracted.
    """
    try:
        detail_response = requests.get(product_link, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching product page {product_link}: {e}")
        return {}

    if detail_response.status_code != 200:
        return {}

    product_page = BeautifulSoup(detail_response.text, 'html.parser')
    tables = product_page.find_all('table')

    # The spec sheet is read from the third <table> of the page.
    # NOTE(review): this positional lookup depends on the site's current
    # layout — confirm it is still the right table if specs look wrong.
    if len(tables) < 3:
        print(f"No specification table found for {product_link}")
        return {}

    specs = {}
    for row in tables[2].find_all('tr'):
        cells = row.find_all(['td', 'th'])
        # Only rows with at least a label cell and a value cell are usable.
        if len(cells) >= 2:
            key = cells[0].get_text(strip=True)
            value = cells[1].get_text(strip=True)
            specs[key] = value
    return specs


def scrape_products_from_page(url, timeout=30):
    """Scrape every product tile of one listing page, enriched with its specs.

    For each ``div.product-item-info`` element the SKU, link, name, image URL
    and price are read from the listing HTML; the product's own page is then
    fetched to collect its specification table. A product whose detail page
    cannot be fetched or parsed is still returned, just without spec keys.

    Args:
        url: URL of the listing (category) page to scrape.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        list of dicts, one per product, with the keys ``id``, ``lien``,
        ``nom``, ``image_url``, ``prix``, ``boutique`` plus one key per
        specification row found on the product page. An empty list is
        returned when the listing page does not answer with HTTP 200.

    Raises:
        requests.RequestException: if the listing page request itself fails
            (connection error, timeout).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Make the failure visible instead of silently parsing an error page.
        print(f"Error fetching listing page {url}: HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all product tiles
    product_elements = soup.find_all('div', class_='product-item-info')

    products = []
    # The last matched element is deliberately skipped.
    # NOTE(review): presumably it is not a real product tile — TODO confirm,
    # otherwise one product per page is being lost here.
    for product in product_elements[:-1]:
        try:
            # Product ID (SKU is rendered wrapped in square brackets)
            sku_div = product.find("div", class_="skuDesktop")
            product_id = sku_div.text.strip("[] \n") if sku_div else None

            # Product link and full name come from the same anchor
            link_tag = product.find("a", class_="product-item-link")
            product_link = link_tag.get('href') if link_tag else None
            full_name = link_tag.text.strip() if link_tag else None

            # Image URL
            img_tag = product.find("img", class_="product-image-photo")
            image_url = img_tag.get('src') if img_tag else None

            # Price
            price_tag = product.find("span", class_="price")
            price = price_tag.text.strip() if price_tag else None

            # Additional data from the product page (best-effort, never raises
            # for network/layout problems so the listing fields are kept)
            specs = _scrape_product_specs(product_link, timeout) if product_link else {}

            product_dict = {
                "id": product_id,
                "lien": product_link,
                "nom": full_name,
                "image_url": image_url,
                "prix": price,
                "boutique": "Mytek"
            }

            # Add specs to the product record (keys are expanded dynamically)
            product_dict.update(specs)

            products.append(product_dict)

        except Exception as e:
            print(f"Error processing product: {e}")
    return products

In [16]:
# Discover how many listing pages exist by reading the pagination links.
# NOTE(review): this URL is the parent category page, while the scraping cell
# paginates ".../ordinateurs-portables/pc-portable.html" — confirm both
# listings have the same number of pages.
base_url = "https://www.mytek.tn/informatique/ordinateurs-portables.html"

# A timeout prevents the cell from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')

spans = soup.select("a.page span")

# Keep only the spans whose text is a page number; text labels are ignored
# so they cannot break the int() conversion.
page_numbers = [
    int(text)
    for text in (span.get_text(strip=True) for span in spans)
    if text.isdigit()
]

# A listing without pagination links has a single page.
last_page = max(page_numbers, default=1)

print("Last page number:", last_page)

Last page number: 27


In [17]:
# Listing URL template: "{}" is filled with the 1-based page number.
base_url = "https://www.mytek.tn/informatique/ordinateurs-portables/pc-portable.html?p={}"

# Accumulates the product records of every listing page.
all_products = []

for page_num in tqdm(range(1, last_page + 1), desc="Scraping pages"):
    all_products += scrape_products_from_page(base_url.format(page_num))
    # A small delay between listing pages to avoid hammering the server.
    time.sleep(1)

print(f"Total products scraped: {len(all_products)}")

Scraping pages: 100%|██████████████████████████████████████████████████████████████████| 27/27 [12:11<00:00, 27.08s/it]

Total products scraped: 646





In [18]:
# Inspect which fields the first scraped record carries (iterating a dict yields its keys).
list(all_products[0])

['id',
 'lien',
 'nom',
 'image_url',
 'prix',
 'boutique',
 'DISPONIBILITÉ',
 'gtin',
 'Marque',
 'Gamme PC',
 'Gamer',
 "Système d'exploitation",
 "Taille de l'écran",
 'Ecran',
 'Ecran Tactile',
 "Résolution d'écrans",
 'Processeur',
 'Type Processeur',
 'Référence Processeur',
 'Fréquence Processeur',
 'Mémoire Cache',
 'Mémoire',
 'Disque Dur',
 'Type Disque Dur',
 'Carte Graphique',
 'Chipset Graphique',
 'Connecteurs',
 'Couleur',
 'Garantie']

In [19]:
# Columns retained for the final dataset: listing fields plus selected specs.
columns_to_keep = [
    'id', 'lien', 'nom', 'Gamme PC', 'image_url', 'boutique',
    "Taille de l'écran", "Résolution d'écrans", 'Processeur',
    'Référence Processeur', 'Mémoire', 'Disque Dur', 'Type Disque Dur',
    'Carte Graphique', "Système d'exploitation", 'Gamer', 'prix',
]

# Project every scraped record onto those columns; a column missing from a
# record is filled with None (dict.get default).
filtered_product_features = [
    dict(zip(columns_to_keep, map(product.get, columns_to_keep)))
    for product in all_products
]

In [20]:
# Build the tabular dataset from the filtered records and preview its first rows.
df = pd.DataFrame(data=filtered_product_features)
df.head(n=5)

Unnamed: 0,id,lien,nom,Gamme PC,image_url,boutique,Taille de l'écran,Résolution d'écrans,Processeur,Référence Processeur,Mémoire,Disque Dur,Type Disque Dur,Carte Graphique,Système d'exploitation,Gamer,prix
0,82LX00CKFG,https://www.mytek.tn/pc-portable-lenovo-ideapa...,PC Portable LENOVO IdeaPad 1 15IJL7 Intel Cele...,IDEAPAD 1,https://mk-media.mytek.tn/media/catalog/produc...,Mytek,15.6 Pouces,1366 x 768 pixels,Intel Celeron,Intel® Celeron® N4500,8 Go,256 Go SSD,SSD,Graphique Intégrée,FreeDos,Non,"709,000 DT"
1,82LX00CFFG,https://www.mytek.tn/pc-portable-lenovo-ideapa...,PC Portable LENOVO IdeaPad 1 15IJL7 Intel Cele...,IDEAPAD 1,https://mk-media.mytek.tn/media/catalog/produc...,Mytek,15.6 Pouces,1366 x 768 pixels,Intel Celeron,Intel® Celeron® N4500,8 Go,256 Go SSD,SSD,Graphique Intégrée,FreeDos,Non,"709,000 DT"
2,A30XQEA,https://www.mytek.tn/pc-portable-hp-15-fd0298n...,PC Portable HP 15-fd0298nk Intel N100 4Go 256G...,15-fd0298nk,https://mk-media.mytek.tn/media/catalog/produc...,Mytek,15.6 Pouces,1366 x 768 pixels,Intel Celeron,Intel® N100,4 Go,256 Go SSD,SSD,Graphique Intégrée,FreeDos,Non,"719,000 DT"
3,X515KA-EJ008,https://www.mytek.tn/pc-portable-asus-x515ka-i...,PC Portable ASUS X515KA Intel Celeron N4500 4G...,X515KA,https://mk-media.mytek.tn/media/catalog/produc...,Mytek,15.6 Pouces,1920 x 1080 pixels,Intel Celeron,Intel® Celeron® N4500,4 Go,256 Go SSD,SSD,Graphique Intégrée,FreeDos,Non,"719,000 DT"
4,BU-82LX00CKFG-SAC,https://www.mytek.tn/pc-portable-lenovo-ideapa...,PC Portable LENOVO IdeaPad 1 15IJL7 Intel Cele...,IDEAPAD 1,https://mk-media.mytek.tn/media/catalog/produc...,Mytek,15.6 Pouces,1366 x 768 pixels,Intel Celeron,Intel® Celeron® N4500,8 Go,256 Go SSD,SSD,Graphique Intégrée,FreeDos,Non,"719,000 DT"


In [21]:
# Summary statistics, transposed so that each dataset column becomes a row.
df.describe().transpose()

Unnamed: 0,count,unique,top,freq
id,646,645,BU-9S7-15K111-1491-W11P,2
lien,646,645,https://www.mytek.tn/pc-portable-gamer-msi-cyb...,2
nom,646,643,Pc Portable DELL Inspiron 15 3535 AMD RYZEN 7 ...,3
Gamme PC,646,105,VIVOBOOK 15 X1504VA,53
image_url,646,645,https://mk-media.mytek.tn/media/catalog/produc...,2
boutique,646,1,Mytek,646
Taille de l'écran,646,9,15.6 Pouces,517
Résolution d'écrans,646,6,1920 x 1080 pixels,518
Processeur,646,17,Intel Core i5,212
Référence Processeur,646,62,Intel® Core™ i7-13620H,94


In [25]:
# Save to CSV
output_path = Path("scraped_data") / "mytek_products.csv"

# to_csv does not create missing folders, so make sure the target directory
# exists first (no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)

# 'utf-8-sig' prefixes the file with a UTF-8 byte-order mark.
df.to_csv(output_path, index=False, encoding='utf-8-sig')

print("Data saved to mytek_products.csv")

Data saved to mytek_products.csv
