In [12]:
# Import essential modules

import os            # miscellaneous operating system interfaces
import requests      # send HTTP requests using Python
import time
import random
import numpy as np
import pandas as pd

from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


In [13]:
# Step 1: Start a headless Chrome session
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless=new')   # run Chrome without opening a browser window
chrome_options.add_argument('--no-sandbox')

# The driver binary is supplied through `service`. No positional argument is
# passed: in Selenium >= 4.10 the first positional parameter of
# webdriver.Chrome is `options`, so a leading 'chromedriver' string combined
# with `options=` raises TypeError.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options,
)


In [14]:
# Step 2: Create the output folder (nothing happens if it already exists)
# A raw string keeps every backslash literally, so separators must not be
# doubled: r'D:\\...' would contain two backslashes after the drive letter.
root_dir = r'D:\AIO2023\Works\Crawling\tiki'
os.makedirs(root_dir, exist_ok=True)

In [15]:
# Step 3: Get list of book links
# Visit each listing page and collect the `href` of every element carrying the
# product-item class.
n_pages = 2
book_items_class_name = 'product-item'

book_url_list = []
seen_urls = set()   # links already collected, used to skip repeats
for page_id in range(1, n_pages + 1):
    main_url = f'https://tiki.vn/nha-sach-tiki/c8322?page={page_id}'
    driver.get(main_url)
    time.sleep(1)   # fixed pause after navigation before reading the page

    tags = driver.find_elements(By.CLASS_NAME, book_items_class_name)
    for tag in tags:
        href = tag.get_attribute('href')
        # get_attribute returns None when the element has no href; such a
        # value cannot be opened later with driver.get, so it is skipped.
        # Links that were already collected are skipped as well.
        if not href or href in seen_urls:
            continue
        seen_urls.add(href)
        book_url_list.append(href)

In [16]:
# Step 4: Visit every book page and collect its details
def _read_field(xpath, transform=None):
    """Return the text of the first element matching `xpath` on the current page.

    Args:
        xpath: XPath expression passed to `driver.find_element`.
        transform: optional callable applied to the element text before it is
            returned.

    Returns:
        The (optionally transformed) text, or '' when the element is missing
        or the transform fails (e.g. the text has fewer words than expected).
    """
    try:
        text = driver.find_element(By.XPATH, xpath).text
        return transform(text) if transform is not None else text
    except Exception:
        # Best-effort scraping: a missing field becomes an empty string.
        # `Exception` (not a bare except) so Ctrl-C still stops the crawl.
        return ''


def _price_digits(text):
    """Keep the first whitespace-separated token and strip '.' separators."""
    return text.split()[0].replace('.', '')


# One (xpath, transform) pair per value, in the order the values are stored in
# each row: title, author, quantity sold, review numbers, original price,
# discount, discounted price.
FIELD_EXTRACTORS = [
    ('//h1[@class="title"]', None),
    ('//span[@class="brand-and-author no-after"]/h6/a', None),
    ('//div[@class="below-title"]/div/div[2]', lambda text: text.split()[-1]),
    ('//div[@class="below-title"]/div/div[1]/a', lambda text: text.split()[1]),
    ('//div[@class="price-and-icon "]/div[1]/div/div[2]', _price_digits),
    ('//div[@class="price-and-icon "]/div[1]/div/div[3]', lambda text: text.replace('-', '')),
    ('//div[@class="price-and-icon "]/div[1]/div/div[1]', _price_digits),
]

data_list = []
for url in tqdm(book_url_list, desc='Scraping books'):
    try:
        driver.get(url)
    except Exception as exc:
        # Keep one row per URL so rows stay aligned with book_url_list even
        # when a page cannot be opened.
        tqdm.write(f'Could not open {url!r}: {exc}')
        data_list.append([''] * len(FIELD_EXTRACTORS))
        continue
    time.sleep(1)   # fixed pause after navigation before reading the page

    data_list.append(
        [_read_field(xpath, transform) for xpath, transform in FIELD_EXTRACTORS]
    )

    

    

In [18]:
# Step 5: Attach each book's URL as the last value of its row
# Rows are rebuilt instead of appended to in place, so running this cell more
# than once does not add the URL again: anything beyond the scraped values is
# dropped before the URL is attached.
N_SCRAPED_FIELDS = 7   # values collected per book before the URL is added
data_list = [
    row[:N_SCRAPED_FIELDS] + [book_url_list[i]]
    for i, row in enumerate(data_list)
]

In [19]:
# Column names for the output table, one per value stored in a row
columns = [
    'title',
    'author',
    'quantity_sold',
    'review_numbers',
    'original_price',
    'discount',
    'discounted_price',
    'book_url',
]

In [20]:
# Step 6: Save the collected rows as a CSV file inside the output folder
# The path is built from `root_dir` rather than repeating the folder literal.
csv_path = os.path.join(root_dir, 'crawling_230712.csv')
books_df = pd.DataFrame(data_list, columns=columns)
# Header row and ',' separator are the to_csv defaults. The index is also
# written by default, as a first column without a name.
books_df.to_csv(csv_path, encoding='utf-8')