In [18]:
import pandas as pd
import requests as req
import json
import time
import openpyxl as px
import html
import re
from bs4 import BeautifulSoup
import threading
import queue

In [19]:
# Load the spreadsheet that lists the product IDs to download.
df = pd.read_excel(io='products-0-200000.xlsx')

In [20]:
# Plain Python list of every value in the 'id' column, in file order.
product_id = df.loc[:, 'id'].tolist()

# Browser-like request headers sent with every product-detail request.
# 'Accept-Encoding' is intentionally NOT overridden here: requests fills it in itself
# with only the encodings it can actually decode in this environment. Advertising
# 'br' / 'zstd' unconditionally makes the server free to answer with a body that
# cannot be decompressed (when the optional brotli / zstandard packages are missing),
# which then breaks response.json().
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9,fr-FR;q=0.8,fr;q=0.7,vi;q=0.6',
    'Cache-Control': 'max-age=0',
    'Sec-Ch-Ua': '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
}
    
"""DoD: Sử dụng code Python, tải về thông tin của 200k sản phẩm (list product id bên dưới) của Tiki và lưu thành các file .json. 
Mỗi file có thông tin của khoảng 1000 sản phẩm. 
Các thông tin cần lấy bao gồm: id, name, url_key, price, description, images url. 
Yêu cầu chuẩn hoá nội dung trong "description" và tìm phương án rút ngắn thời gian lấy dữ liệu.
- List product_id: https://1drv.ms/u/s!AukvlU4z92FZgp4xIlzQ4giHVa5Lpw?e=qDXctn
- API get product detail: https://api.tiki.vn/product-detail/api/v1/products/138083218"""

# One shared session so TCP/TLS connections are reused across requests.
session = req.Session()

# requests' default adapter keeps at most 10 pooled connections per host. The fetch
# stage runs up to 50 threads against the same host, so with the default the surplus
# connections are thrown away after every request and re-established on the next one.
# Size the pool to the thread count instead.
_pool_adapter = req.adapters.HTTPAdapter(pool_connections=50, pool_maxsize=50)
session.mount('https://', _pool_adapter)
session.mount('http://', _pool_adapter)

# Maximum number of attempts per product before a fetch gives up.
max_retries = 3

def get_product_data(product_id, q):
    """Fetch one product from the Tiki product-detail API and enqueue selected fields.

    Up to ``max_retries`` attempts are made. Network-level failures (timeouts,
    connection errors) and HTTP 429/500 responses are retried with exponential
    backoff (1s, 2s, ...); any other HTTP status aborts immediately.

    Args:
        product_id: Identifier interpolated into the API URL.
        q: Queue-like object; the selected fields are ``put`` on it on success.

    Returns:
        A dict with the keys ``id``, ``name``, ``url_key``, ``price``,
        ``description`` and ``images_url`` on success, otherwise ``None``.
    """
    url = f'https://api.tiki.vn/product-detail/api/v1/products/{product_id}'
    print(f"Fetching data for product ID {product_id}")

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1

        try:
            response = session.get(url, headers=headers, timeout=10)
        except req.exceptions.RequestException as exc:
            # Timeouts / connection resets are transient: retry instead of letting the
            # exception kill the worker thread and silently lose the product.
            print(f"Request failed for product ID {product_id}: {exc!r}")
            if not is_last_attempt:
                time.sleep(2 ** attempt)
            continue

        if response.status_code == 200:
            try:
                data = response.json()
                selected_data = {
                    'id': data['id'],
                    'name': data['name'],
                    'url_key': data['url_key'],
                    'price': data['price'],
                    'description': data['description'],
                    # NOTE(review): stores the API's 'images' value unchanged; if only
                    # the image URLs are wanted they still need to be extracted.
                    'images_url': data['images'],
                }
            except (ValueError, KeyError, TypeError) as exc:
                # Body was not valid JSON or lacked an expected field; retrying the
                # same URL is unlikely to help, so report and give up on this product.
                print(f"Malformed response for product ID {product_id}: {exc!r}")
                return None
            q.put(selected_data)
            print(f"Got data for {product_id} \n")
            return selected_data

        if response.status_code in [429, 500]:
            print(f"Rate limit exceeded for product ID {product_id}, retrying...")
            if not is_last_attempt:
                time.sleep(2 ** attempt)  # exponential backoff, skipped after the final attempt
            continue

        print(f"Error fetching data for product ID {product_id}: {response.status_code}")
        return None

    # Every attempt was consumed by retryable failures.
    return None

In [21]:
def preprocessing(q_in, q_out):
    """Take one product record from ``q_in``, clean its description, put it on ``q_out``.

    The description is HTML-unescaped, stripped of markup, whitespace-collapsed and
    purged of characters that cannot be encoded as UTF-8 (e.g. lone surrogates).
    A missing/empty description becomes ``""``.

    The record is fetched without blocking: if ``q_in`` is empty the function returns
    immediately. A blocking ``get()`` would hang forever whenever more workers are
    started than there are queued items, and the caller's ``join()`` would never return.

    Args:
        q_in: Queue of dicts that carry at least ``'description'`` and ``'id'``.
        q_out: Queue that receives the same dict with ``'description'`` normalised.
    """
    try:
        data = q_in.get_nowait()
    except queue.Empty:
        # Another worker already consumed the remaining items; nothing to do.
        return

    text = data['description']

    if text:
        text = html.unescape(text)
        text = BeautifulSoup(text, 'html.parser').get_text()
        text = re.sub(r'\s+', ' ', text).strip()

        # Drop characters that cannot be represented in UTF-8 (surrogates).
        text = text.encode('utf-8', errors='ignore').decode('utf-8')
    else:
        text = ""

    data['description'] = text
    q_out.put(data)
    print(f"Preprocessed data for {data['id']}")

In [22]:
# Hand-off queues between pipeline stages:
#   data_queue   - records produced by the fetch stage, awaiting preprocessing
#   result_queue - preprocessed records, awaiting saving
data_queue, result_queue = queue.Queue(), queue.Queue()

In [23]:
def saving(q_out, batch_number):
    """Drain ``q_out`` and write its items as one JSON array to a batch file.

    The file is ``batches_test/batch_<batch_number>.json`` relative to the current
    working directory; the directory is created if it does not exist yet.

    Args:
        q_out: Queue of JSON-serialisable records; it is emptied by this call.
        batch_number: Number used in the output file name.
    """
    import os  # local import: the notebook's import cell does not provide os

    batched_data = []
    while not q_out.empty():
        batched_data.append(q_out.get())

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    json_string = json.dumps(batched_data, ensure_ascii=False, indent=4)

    filename = f"batches_test/batch_{batch_number}.json"
    # Without this, open() raises FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding='utf-8', errors='ignore') as f:
        f.write(json_string)

    print(f"Saved batch {batch_number} to {filename}")

In [29]:
def _run_in_pool(task, arg_tuples, max_workers, label):
    """Run ``task(*args)`` for each tuple on a bounded thread pool and report failures."""
    from concurrent.futures import ThreadPoolExecutor

    if not arg_tuples:
        return
    # Leaving the with-block waits for every submitted task to finish.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(task, *args) for args in arg_tuples]
    for future in futures:
        error = future.exception()
        if error is not None:
            # A failed task must not abort the batch, but it must not vanish either.
            print(f"{label} task failed: {error!r}")


def batching(batch_size=1000, max_products=1000, max_threads=50):
    """Fetch, preprocess and save products in batches, one JSON file per batch.

    For each slice of ``product_id``: fetch every product concurrently into
    ``data_queue``, preprocess every fetched record into ``result_queue``, then write
    the batch with ``saving``.

    Args:
        batch_size: Number of product IDs per batch / output file.
        max_products: Upper bound on how many IDs (from the start of ``product_id``)
            are processed. The default of 1000 keeps the original single-batch
            trial run; pass ``None`` to process the whole list.
        max_threads: Maximum number of worker threads per stage.
    """
    total_products = 0
    available = len(product_id)
    limit = available if max_products is None else min(max_products, available)

    for start in range(0, limit, batch_size):
        batch = product_id[start:min(start + batch_size, limit)]
        batch_number = start // batch_size + 1
        print(f"Processing batch {batch_number}")

        # Fetch stage: a pool keeps all workers busy instead of waiting for the
        # slowest thread of each 50-thread wave before starting the next one.
        _run_in_pool(
            get_product_data,
            [(single_product, data_queue) for single_product in batch],
            max_threads,
            "Fetch",
        )

        # Preprocess stage: exactly one task per queued record. All producers have
        # finished, so qsize() is exact here. Spawning workers "while the queue is
        # not empty" would start more workers than items and leave the surplus
        # blocked on an empty queue.
        pending = data_queue.qsize()
        _run_in_pool(
            preprocessing,
            [(data_queue, result_queue)] * pending,
            max_threads,
            "Preprocessing",
        )

        print("Preprocessing done, now saving...")

        # Count the records of this batch before saving() drains the queue.
        current_batch_count = result_queue.qsize()

        saving(result_queue, batch_number)

        total_products += current_batch_count
        print(f"Total products processed so far: {total_products}")

    print(f"\nTotal products processed: {total_products}")

In [None]:
# Run the whole pipeline: fetch -> preprocess -> save, batch by batch.
batching()