In [1]:
import pandas as pd
import requests
from pymongo import MongoClient
from bs4 import BeautifulSoup
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

# Logging setup: records at INFO level and above are written to one log file.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename="tiki_api_log.log",
    filemode="w",  # open in write mode: the previous run's log is overwritten
)

# MongoDB setup: one client created at module level; the `products`
# collection handle below is what fetch_product_data() writes into.
client = MongoClient("mongodb://localhost:27017/")
db = client["tiki_data"]
collection = db["products"]

# Load the input CSV and pull its `id` column out as a plain Python list;
# each entry becomes one API request further down.
df = pd.read_csv("200k_tiki_products.csv")
product_ids = df.loc[:, "id"].tolist()

# Function to normalize description
def normalize_description(description):
    """Return the plain-text content of an HTML description.

    Args:
        description: HTML markup as a string. ``None`` or an empty string is
            accepted and yields ``""``.

    Returns:
        The text of the markup with the tags removed, as produced by
        ``BeautifulSoup(...).get_text()``.
    """
    # A missing description (None or "") has no text to extract. Returning
    # early also keeps None away from the BeautifulSoup constructor, which
    # expects a string or file-like object as markup.
    if not description:
        return ""
    soup = BeautifulSoup(description, 'html.parser')
    return soup.get_text()

# HTTP headers attached to every API request. The User-Agent is a desktop
# Chrome-on-Windows string (presumably so the API treats the client like a
# browser -- confirm against the API's requirements).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.3"
    ),
}

# Function to fetch data for a single product
def fetch_product_data(product_id, timeout=10):
    """Fetch one product from the Tiki API and store selected fields in MongoDB.

    On HTTP 200 the fields ``id``, ``name``, ``url_key``, ``price``,
    ``description`` (HTML reduced to plain text) and ``images`` are inserted
    into ``collection``. A 403 is logged as a warning and any other status as
    an error. Every exception is logged and swallowed so that one failing
    product does not abort the rest of the run.

    Args:
        product_id: Product identifier interpolated into the API URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection,
            which would tie up a worker thread forever.

    Returns:
        None. Outcomes are reported through the log and the database.
    """
    url = f"https://api.tiki.vn/product-detail/api/v1/products/{product_id}"
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        logging.info(f"Collecting data for product ID {product_id} - Status code: {response.status_code}")

        if response.status_code == 200:
            data = response.json()
            # dict.get() only falls back to its default when the key is
            # absent; an explicit null description would come back as None,
            # so coerce it to "" before stripping the HTML.
            raw_description = data.get("description") or ""
            product_data = {
                "id": data.get("id"),
                "name": data.get("name"),
                "url_key": data.get("url_key"),
                "price": data.get("price"),
                "description": normalize_description(raw_description),
                "images": data.get("images", [])
            }
            collection.insert_one(product_data)
            logging.info(f"Successfully stored data for product ID {product_id}")
        elif response.status_code == 403:
            logging.warning(f"Access denied for product ID {product_id}. Status code: 403")
        else:
            logging.error(f"Failed to collect data for product ID {product_id}. Status code: {response.status_code} - Response: {response.text}")

    except Exception as e:
        # Deliberately broad: this runs as a worker task, and network, JSON
        # or database failures for one product should only be logged.
        logging.error(f"Error occurred for product ID {product_id}: {str(e)}")

# Fan the per-product fetches out over a pool of 30 worker threads
with ThreadPoolExecutor(max_workers=30) as executor:
    # Schedule one task per product ID; each Future stands for one fetch_product_data call
    futures = [executor.submit(fetch_product_data, pid) for pid in product_ids]
    # Handle the futures in the order they finish, not the order they were submitted
    for future in as_completed(futures):
        try:
            # result() re-raises anything the task itself did not handle
            future.result()
        except Exception as exc:
            # Record the failure and keep processing the remaining futures
            logging.error(f"Error in concurrent task: {str(exc)}")