In [1]:
import pandas as pd
import requests as req
import json
import time
import openpyxl as px
import html
import re
from bs4 import BeautifulSoup
import threading
import queue
from requests.adapters import HTTPAdapter
import os
import random
import logging
from pathlib import Path

# Project root: two directory levels above the directory the notebook is run from.
BASE_DIR = Path.cwd().resolve().parent.parent

# Locations of the notebook's input workbook and its log file, relative to BASE_DIR.
FILES = dict(
    products=BASE_DIR.joinpath('data', 'input', 'products-0-200000.xlsx'),
    logs=BASE_DIR.joinpath('tests', 'logs', 'crawl.log'),
)

In [2]:
def setup_logging():
    """Send INFO-and-above log records to the crawl log file.

    The log file location comes from ``FILES['logs']``. Its parent directory is
    created first, because the file handler opened by ``logging.basicConfig``
    does not create missing directories and would raise ``FileNotFoundError``.

    Note: ``logging.basicConfig`` does nothing when the root logger already has
    handlers, so re-running this cell in the same kernel keeps the first
    configuration.
    """
    log_path = Path(FILES['logs'])
    log_path.parent.mkdir(parents=True, exist_ok=True)

    logging.basicConfig(
        filename=log_path,
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    logging.info("Logging setup complete.")


setup_logging()

In [3]:
def _normalize_product_id(pid):
    """Return ``pid`` as a positive ``int``, or ``None`` when it is not a valid ID."""
    # bool is a subclass of int; True/False are never meaningful product IDs.
    if isinstance(pid, bool):
        return None
    if isinstance(pid, int):
        return pid if pid > 0 else None
    # float.is_integer() is False for NaN and +/-inf, so empty spreadsheet cells
    # (read as NaN) are rejected here instead of crashing inside int().
    if isinstance(pid, float) and pid.is_integer() and pid > 0:
        return int(pid)
    return None


def read_data():
    """Read product IDs from the input workbook, de-duplicate and validate them.

    The workbook path is taken from ``FILES['products']`` and must contain an
    ``id`` column. Values are kept only when they are positive whole numbers
    (``int`` or integral ``float``); everything else (NaN, text, fractions,
    zero, negatives, booleans) is logged as a warning and dropped.

    Returns:
        list[int]: Unique valid product IDs, in first-seen order.

    Raises:
        KeyError: If the workbook has no ``id`` column.
    """
    # Read the Excel file
    df = pd.read_excel(FILES['products'])
    logging.info("Read input file successfully")

    product_ids = df['id'].tolist()  # list product IDs
    logging.info(f"Extracted {len(product_ids)} product IDs from the input file")

    # dict.fromkeys removes duplicates while preserving the original order.
    product_ids = list(dict.fromkeys(product_ids))
    logging.info(f"Removed duplicates successfully, {len(product_ids)} unique product IDs remaining")

    # Ensure product IDs are in appropriate format
    valid_ids = []
    invalid_count = 0
    for pid in product_ids:
        normalized = _normalize_product_id(pid)
        if normalized is None:
            invalid_count += 1
            logging.warning(f"Invalid product ID found: {pid} (type: {type(pid)})")
        else:
            valid_ids.append(normalized)
    logging.info(f"Validation complete: {len(valid_ids)} valid IDs, {invalid_count} invalid IDs removed")
    return valid_ids

In [None]:
# Smoke-test the loader. Preview only the first few IDs: a bare call would echo
# the entire ID list into the cell output.
read_data()[:5]

In [2]:
# Load the raw workbook for ad-hoc inspection, using the configured input path
# rather than a machine-specific absolute path.
df = pd.read_excel(FILES['products'])

In [3]:
# Per-row duplicate flag for the 'id' column (True marks a value that already
# appeared in an earlier row, per pandas' default keep='first').
df['id'].duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
199995    False
199996    False
199997    False
199998    False
199999    False
Name: id, Length: 200000, dtype: bool

In [4]:
# Use the validated, de-duplicated IDs from read_data() instead of the raw
# column, so the crawl never receives NaN, non-integer or repeated IDs.
product_ids = read_data()

In [None]:
# Browser-like request headers sent with every product API call.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
    'Accept': 'application/json, */*',
    'Accept-Language': 'en-US,en;q=0.9,fr-FR;q=0.8,fr;q=0.7,vi;q=0.6',
    # Only advertise encodings that requests/urllib3 always decode. 'br' and
    # 'zstd' are decoded only when optional packages are installed; without them
    # a compressed body would reach response.json() undecoded.
    'Accept-Encoding': 'gzip, deflate',
    'Sec-Ch-Ua': '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Referer': 'https://tiki.vn/',
    'Origin': 'https://tiki.vn'
}
    
"""DoD: Sử dụng code Python, tải về thông tin của 200k sản phẩm (list product id bên dưới) của Tiki và lưu thành các file .json. 
Mỗi file có thông tin của khoảng 1000 sản phẩm. 
Các thông tin cần lấy bao gồm: id, name, url_key, price, description, images url. 
Yêu cầu chuẩn hoá nội dung trong "description" và tìm phương án rút ngắn thời gian lấy dữ liệu.
- List product_id: https://1drv.ms/u/s!AukvlU4z92FZgp4xIlzQ4giHVa5Lpw?e=qDXctn
- API get product detail: https://api.tiki.vn/product-detail/api/v1/products/138083218"""

# One shared HTTP session for all product requests, so connections are reused
# instead of being re-opened for every call.
session = req.Session()
# Mount an adapter with a larger connection pool for every https:// URL; the
# crawl cell issues requests from a thread pool through this same session.
session.mount('https://', HTTPAdapter(
    pool_connections=100,
    pool_maxsize=100
))
# Number of attempts per product (a count, not a delay).
# NOTE(review): get_product_data declares its own max_retries default, so this
# module-level value looks unused in the visible cells - confirm before relying on it.
max_retries = 3

def get_product_data(product_id, max_retries=3):
    """Fetch one product from the Tiki product-detail API and keep the needed fields.

    Transient failures (network errors and HTTP 429/500/502/503/504) are retried
    up to ``max_retries`` attempts with a growing, jittered pause between
    attempts. Any other HTTP status (e.g. 404) is treated as permanent and
    returns immediately without sleeping.

    Args:
        product_id: ID interpolated into the API URL.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        dict | None: A dict with the keys ``id``, ``name``, ``url_key``,
        ``price``, ``description`` and ``images_url``; ``None`` when the product
        could not be fetched or the payload was unusable. The function does not
        raise for network or payload problems, so one bad product cannot abort a
        whole batch.
    """
    url = f'https://api.tiki.vn/product-detail/api/v1/products/{product_id}'
    retryable_statuses = {429, 500, 502, 503, 504}
    print(f"Fetching data for product ID {product_id}")

    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        # Linear backoff plus random jitter so parallel workers do not retry in lockstep.
        pause = 2 * attempt + random.uniform(1, 3)

        try:
            response = session.get(url, headers=headers, timeout=20)
        except req.RequestException as exc:
            # Timeouts, connection resets, DNS failures, ... are worth another try.
            print(f"Request error for product ID {product_id}: {exc}, retrying...")
            if not is_last_attempt:
                time.sleep(pause)
            continue

        status = response.status_code
        if status == 200:
            try:
                data = response.json()
                selected_data = {
                    'id': data['id'],
                    'name': data['name'],
                    'url_key': data['url_key'],
                    'price': data['price'],
                    'description': data['description'],
                    # NOTE(review): stores the API's 'images' value unchanged; if only
                    # URL strings are wanted, extract them here - confirm payload shape.
                    'images_url': data['images']
                }
            except (ValueError, KeyError, TypeError) as exc:
                # ValueError: body is not valid JSON; KeyError/TypeError: unexpected shape.
                print(f"Unusable response for product ID {product_id}: {exc!r}")
                return None
            print(f"Fetched data for product ID {product_id}")
            return selected_data

        if status not in retryable_statuses:
            # Permanent client-side errors will not change on retry.
            print(f"Error fetching data for product ID {product_id}: {status}, not retrying")
            return None

        print(f"Transient error {status} for product ID {product_id}, retrying...")
        if not is_last_attempt:
            time.sleep(pause)

    print(f"Failed to fetch data for product ID {product_id} after {max_retries} attempts")
    return None

In [None]:
def test_retry(status_code=404, max_attempts=3, delay=1):
    """Simulate a retry loop against a fixed HTTP status code, with no network call.

    Args:
        status_code: Status every simulated attempt "receives". Defaults to 404,
            i.e. the call always fails.
        max_attempts: Number of attempts before giving up.
        delay: Seconds to sleep between attempts (no sleep after the last one).

    Returns:
        str | None: ``"Success"`` as soon as the status is 200, otherwise
        ``None`` after all attempts have been used.
    """
    for attempt in range(max_attempts):
        print(f"Attempt {attempt}")
        is_last_attempt = attempt >= max_attempts - 1

        if status_code == 200:
            return "Success"
        elif status_code in [429, 500]:
            print("429/500 case")
            if not is_last_attempt:
                time.sleep(delay)
        else:
            print("Other error case")
            if not is_last_attempt:
                time.sleep(delay)

    return None

if __name__ == "__main__":
    # Exercise the retry logic with the defaults (always a 404).
    result = test_retry()
    print(f"Test result: {result}")

In [None]:
def preprocessing(data):
    """Normalise ``data['description']`` in place to plain, single-spaced text.

    HTML entities are decoded, markup is stripped, whitespace runs are collapsed
    and characters that cannot be encoded as UTF-8 (lone surrogates) are dropped.
    A missing, ``None`` or empty description becomes ``""``.

    Args:
        data: Product dict; its ``description`` entry is overwritten.

    Returns:
        None. ``data`` is modified in place.
    """
    text = data.get('description')

    if text:
        # Decode entities first so entity-escaped markup is also stripped as markup.
        text = html.unescape(str(text))
        # Separate the text of adjacent elements with a space; without it
        # "<p>one</p><p>two</p>" collapses to "onetwo". Trade-off: text split by
        # inline tags inside a word also gets a space.
        text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')
        text = re.sub(r'\s+', ' ', text).strip()

        # Drop lone surrogate characters, which cannot be written as UTF-8.
        text = text.encode('utf-8', errors='ignore').decode('utf-8')
    else:
        text = ""

    data['description'] = text
    print(f"Preprocessed data for {data.get('id')}")

In [21]:
def saving(data, batch_number, output_dir="batches_test"):
    """Write one batch of products to ``<output_dir>/batch_<batch_number>.json``.

    Args:
        data: JSON-serialisable object (the list of product dicts for the batch).
        batch_number: Number used in the output file name.
        output_dir: Destination directory; created if it does not exist.

    Raises:
        TypeError: If ``data`` is not JSON-serialisable. Serialisation happens
            before the file is opened, so no truncated file is left behind.
    """
    json_string = json.dumps(data, ensure_ascii=False, indent=4)

    # open() does not create directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, f"batch_{batch_number}.json")

    # errors='ignore' drops characters that cannot be encoded as UTF-8 rather
    # than failing the whole batch.
    with open(filename, 'w', encoding='utf-8', errors='ignore') as f:
        f.write(json_string)

    print(f"Saved batch {batch_number} to {filename}")

In [None]:
# Use ThreadPoolExecutor for better performance
from concurrent.futures import ThreadPoolExecutor


def get_product_data_wrapper(product_id, batch_size=1000, limit=2000, max_workers=20):
    """Crawl products in batches, clean their descriptions and save one file per batch.

    Each batch is fetched concurrently with ``get_product_data``, failed
    products are dropped, every remaining item goes through ``preprocessing``
    and the batch is written with ``saving`` (batch numbers start at 1). A batch
    file is written even when every product in it failed.

    Args:
        product_id: The product IDs to crawl: an iterable of IDs, or a single
            ID. The singular name is kept for backward compatibility.
        batch_size: Number of product IDs per batch / output file.
        limit: Maximum number of IDs to process, counted from the start of the
            sequence. Defaults to 2000, the trial-run cap that used to be
            hard-coded; pass ``None`` to process every ID.
        max_workers: Number of threads fetching products in parallel.

    Returns:
        int: Number of products fetched successfully across all batches.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Use the IDs that were passed in, not a module-level global.
    if isinstance(product_id, (int, str)):
        ids = [product_id]
    else:
        ids = list(product_id)
    if limit is not None:
        ids = ids[:limit]

    batch_total = len(ids)  # Total number of products to process
    total_products = 0

    for batch_number, start in enumerate(range(0, batch_total, batch_size), start=1):
        batch_ids = ids[start:start + batch_size]

        # Leaving the with-block waits for every submitted fetch to finish.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(get_product_data, pid) for pid in batch_ids]

        # Collect results, skipping failures (None) and unexpected worker errors
        # so a single product cannot abort the whole batch.
        data = []
        for pid, future in zip(batch_ids, futures):
            try:
                item = future.result()
            except Exception as exc:
                print(f"Unexpected error for product ID {pid}: {exc!r}")
                continue
            if item is not None:
                data.append(item)

        total_products += len(data)
        print(f"Total products fetched in this batch: {len(data)} / {len(batch_ids)}")
        print(f"Batch {batch_number} fetched with {len(data)} products.")

        # Normalise descriptions in place before writing.
        for item in data:
            preprocessing(item)

        saving(data, batch_number)
        print(f"Batch {batch_number} processed and saved.")

    print(f"Total products collected: {total_products} / {batch_total}")
    return total_products

if __name__ == "__main__":
    get_product_data_wrapper(product_ids)
    print("All batches processed and saved.")