In [15]:
import datetime
import os
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [7]:
# Fetch page 1 of the Brightstar "Processor" listing for interactive inspection.
url = 'https://brightstarcomp.com/collections/pc-components?sort_by=title-ascending&page=1&filter.v.price.gte=&filter.v.price.lte=&filter.p.product_type=Processor'
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing the error page.
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html')

In [None]:
# Show only the start of the document; printing the whole page floods the cell output.
print(soup.prettify()[:2000])

In [None]:
# Find the listing container first, then collect the product cards inside it.
listing_container = soup.find('div', class_='right-products')
products = listing_container.find_all('div', class_='product-item')

# Print the first card's markup to see which tags and classes hold the data.
first_product = products[0]
print(first_product)

In [None]:
def extract_product_details(product):
    """
    Extracts the URL, name, and price from a Brightstar product item.

    NOTE: a later cell defines a C-zone extractor under this same name; once
    that cell has run, this definition is replaced in the kernel.

    Args:
        product (BeautifulSoup object): A single product item.

    Returns:
        dict: The product's absolute 'url', its 'name', its 'price' text
            (None when the item has no price element), and the 'timestamp'
            at which the item was extracted.

    Raises:
        TypeError: If the item contains no <a> element.
        AttributeError: If the item contains no product-title element.
    """
    # Resolve the link against the site root so that site-relative and
    # already-absolute hrefs both yield a valid URL.
    href = product.find('a')['href']
    full_url = urljoin('https://brightstarcomp.com', href)

    # Extract the component name
    name = product.find('p', class_='product-title').text.strip()

    # Extract the price; an item without a price element gets None rather
    # than aborting the whole scrape.
    price_tag = product.find('p', class_='price')
    price = price_tag.text.strip() if price_tag is not None else None

    # Return the extracted details as a dictionary
    return {
        'url': full_url,
        'name': name,
        'price': price,
        'timestamp': datetime.datetime.now()
    }

""" 
Extract details for all products in each category.
For: Brightstar Computer Website
As of: 19th March 2025
"""

product_type = 'PC Accessories'
brightstar_output_dir = 'data/parts/brightstarcomp/'
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(brightstar_output_dir, exist_ok=True)

i = 1
product_list = []
while True:
    url = f'https://brightstarcomp.com/collections/pc-components?sort_by=title-ascending&page={i}&filter.v.price.gte=&filter.v.price.lte=&filter.p.product_type={product_type}'
    # A timeout keeps the loop from hanging forever on a stalled connection.
    page = requests.get(url, timeout=30)
    if page.status_code != 200:
        print(f"Failed to fetch {url}, status code: {page.status_code}")
        break

    soup = BeautifulSoup(page.text, 'html')

    # find() returns None when the container is absent; stop cleanly instead
    # of failing on None.find_all.
    products_container = soup.find('div', class_='right-products')
    if products_container is None:
        print(f"No products container found on page {i} for product type {product_type}")
        break

    products = products_container.find_all('div', class_='product-item')

    # A page with no product items marks the end of the listing.
    if len(products) == 0:
        break
    product_list += [extract_product_details(product) for product in products]
    i += 1

# Convert the list of dictionaries into a Pandas DataFrame
df = pd.DataFrame(product_list)
df['Note'] = None

# Print the DataFrame
display(df)

# Save the DataFrame to a CSV file
df.to_csv(f'{brightstar_output_dir}{product_type.lower().strip()}.csv', index=False)

Unnamed: 0,url,name,price,timestamp,Note
0,https://brightstarcomp.com/collections/pc-comp...,1st Player ARGB Silicon Sleeved Extension Cabl...,RM 130.00,2025-03-18 22:15:10.308476,
1,https://brightstarcomp.com/collections/pc-comp...,1st-player-steampunk-gold-series-psu-350mm-cable,RM 75.00,2025-03-18 22:15:10.308476,
2,https://brightstarcomp.com/collections/pc-comp...,ADATA XPG Prime ARGB Extension For VGA Sleeved...,RM 149.00,2025-03-18 22:15:10.308476,
3,https://brightstarcomp.com/collections/pc-comp...,Arctic A-RGB Controller with RF Remote Control...,RM 74.00,2025-03-18 22:15:10.308476,
4,https://brightstarcomp.com/collections/pc-comp...,Arctic Case PWM Sharing Fan Hub,RM 49.00,2025-03-18 22:15:10.308476,
...,...,...,...,...,...
62,https://brightstarcomp.com/collections/pc-comp...,Thermalright Intel LGA 1700 Air Cooler Refit B...,RM 15.00,2025-03-18 22:15:12.549726,
63,https://brightstarcomp.com/collections/pc-comp...,Thermalright Intel LGA1700 Bending Corrector F...,RM 35.00,2025-03-18 22:15:12.550726,
64,https://brightstarcomp.com/collections/pc-comp...,Thermaltake LCD Panel Kit for The Tower 500 - ...,RM 99.00,2025-03-18 22:15:12.550726,
65,https://brightstarcomp.com/collections/pc-comp...,Thermaltake The Tower 300 Chassis Stand Kit - ...,RM 129.00,2025-03-18 22:15:12.550726,


In [68]:
def extract_product_details(product):
    """
    Extracts the URL, name, and price from a C-zone product item.

    Args:
        product (BeautifulSoup object): A single product list item.

    Returns:
        dict or None: A dictionary with the product's 'url', 'name', 'price'
            (None when the item has no price element) and extraction
            'timestamp'; None when the link or name cannot be read, in which
            case a message is printed.
    """
    try:
        # Extract the component URL
        url = product.find('a', class_='product-item-link')['href']

        # Extract the component name
        name = product.find('strong', class_='product name product-item-name').text.strip()

        # Extract the price
        price_tag = product.find('span', class_='price-wrapper')
        price = price_tag['data-price-amount'] if price_tag is not None else None  # Handle missing price

        # Return the extracted details as a dictionary
        return {
            'url': url,
            'name': name,
            'price': price,
            'timestamp': datetime.datetime.now()
        }
    except (AttributeError, TypeError, KeyError) as e:
        # find() returns None for a missing element: reading .text on None
        # raises AttributeError, subscripting None raises TypeError, and a
        # tag lacking the requested attribute raises KeyError.
        print(f"Error extracting product details: {e}")
        return None

In [69]:
import os

""" 
Extract details for all products in each category.
For: C-zone Website
As of: 19th March 2025
"""
# NOTE(review): presumably the categories scraped in an earlier run; this
# list is not read anywhere in this cell.
components_done = ['cpu-processors', 'memory', 'motherboards','video-card-video-devices']
components = ['computer-cases', 'power-supplies', 'fan-pc-cooling', 'sound-cards', 'optical-drives']
output_dir = 'data/parts/czone/'
os.makedirs(output_dir, exist_ok=True)

for component in components:
    product_list = []
    seen_products = set()  # To track unique product URLs
    i = 1
    duplicate_found = False

    while not duplicate_found:
        # Scrape page; the timeout keeps a stalled connection from hanging the run.
        url = f'https://czone.my/czone/computer-components/core-components/{component}.html?p={i}'
        page = requests.get(url, timeout=30)
        if page.status_code != 200:
            print(f"Failed to fetch {url}, status code: {page.status_code}")
            break

        soup = BeautifulSoup(page.text, 'html')

        # Extract data
        products_container = soup.find('div', class_='products wrapper grid products-grid')
        if products_container is None:
            print(f"No products found on page {i} for component {component}")
            break

        products = products_container.find_all('li', class_='item product product-item')
        if len(products) == 0:
            break

        for product in products:
            product_details = extract_product_details(product)
            if not product_details:
                # Extraction failed for this item; skip it.
                continue

            product_url = product_details['url']  # Use URL as a unique identifier
            if product_url in seen_products:
                # A repeated URL is treated as the end of this component's
                # listing: stop this page and do not request further pages.
                print(f"Duplicate product detected: {product_url}. Stopping.")
                duplicate_found = True
                break
            seen_products.add(product_url)
            product_list.append(product_details)

        # Advance to the next page only when no duplicate was found.
        if not duplicate_found:
            i += 1

    df = pd.DataFrame(product_list)
    df['Note'] = None

    # Save the DataFrame to a CSV file
    df.to_csv(f'{output_dir}{component}.csv', index=False, encoding='utf-8')

Duplicate product detected: https://czone.my/czone/nzxt-h9-computer-case.html. Stopping.
Duplicate product detected: https://czone.my/czone/mag a750bn pcie5-1739103631.html. Stopping.
Duplicate product detected: https://czone.my/czone/idc-hub-fh-07-1739103466.html. Stopping.
Duplicate product detected: https://czone.my/czone/cdkhb-1737817497.html. Stopping.
Duplicate product detected: https://czone.my/czone/mr193bk-1739103713.html. Stopping.


In [67]:
# Scrape only page 1 of the C-zone computer-cases listing as a quick check.
url = 'https://czone.my/czone/computer-components/core-components/computer-cases.html?p=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
soup = BeautifulSoup(page.text, 'html')

products = soup.find('div', class_='products wrapper grid products-grid')
products = products.find_all('li', class_='item product product-item')
# extract_product_details returns None for items it cannot parse; drop those
# so only real records reach the DataFrame constructor.
products_list = [
    details
    for details in (extract_product_details(product) for product in products)
    if details
]
df = pd.DataFrame(products_list)

display(df)

Unnamed: 0,url,name,price,timestamp
0,https://czone.my/czone/phanteks-nv5s-computer-...,Phanteks NV5S Computer Case,449.0,2025-03-19 14:37:43.252984
1,https://czone.my/czone/deepcool-ch260-matx-com...,Deepcool CH260 MATX Computer Case,259.0,2025-03-19 14:37:43.254101
2,https://czone.my/czone/tecware-timber-m-tg-com...,Tecware Timber M TG Computer Case,179.0,2025-03-19 14:37:43.254101
3,https://czone.my/czone/pa401/bk/wood/tg-174192...,Asus ProArt PA401 Wood Edition Computer Case ...,560.0,2025-03-19 14:37:43.254101
4,https://czone.my/czone/pc-o11vpw-1739103813.html,Lian Li PC-O11 Vision Compact Computer Case,1920.0,2025-03-19 14:37:43.254610
5,https://czone.my/czone/twca-vxprm-bkar-1739107...,"Tecware VX Prism M Computer Case (mATX, TG, AR...",269.0,2025-03-19 14:37:43.254610
6,https://czone.my/czone/lian-li-a4-h2o-computer...,Lian Li A4-H2O Computer Case,679.0,2025-03-19 14:37:43.254610
7,https://czone.my/czone/twca-fus2a-bk-173910708...,"Tecware Fusion 2 Air Computer Case (mATX, TG, ...",229.0,2025-03-19 14:37:43.255006
8,https://czone.my/czone/twca-edm2-bkol-17391070...,"Tecware Edge M2 TG Computer Case (mATX, TG, AR...",169.0,2025-03-19 14:37:43.255006
9,https://czone.my/czone/gfg-p2storm-1739103401....,"Gaming Freak P2 Storm TG Computer Case (mATX, ...",189.0,2025-03-19 14:37:43.255006


In [26]:
# Request page 30 of the Brightstar "Processor" listing and count the product
# items it returns.
url = 'https://brightstarcomp.com/collections/pc-components?sort_by=title-ascending&page=30&filter.v.price.gte=&filter.v.price.lte=&filter.p.product_type=Processor'
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
soup = BeautifulSoup(page.text, 'html')

products = soup.find('div', class_='right-products')
products = products.find_all('div', class_='product-item')
len(products)

0

In [23]:
# Write the current `df` to cpu_test.csv without the index column.
# NOTE(review): `df` is reassigned by several cells in this notebook, so the
# file's content depends on which of them ran last in the kernel.
df.to_csv('cpu_test.csv', index=False)