<a href="https://colab.research.google.com/github/kimhuongvu/Tiki-Web-Scrapping-by-Selenium/blob/main/Tiki_WebScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install selenium and other resources for crawling data
!pip install selenium
!apt-get update
!apt install chromium-chromedriver

### IMPORTS ###
import re
import time
import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

from google.colab import drive
drive.mount('/content/gdrive')

In [187]:
###############
### GLOBALS ###
###############

# Urls
TIKI            = 'https://tiki.vn'
MAIN_CATEGORIES = [
    {'Name': 'Điện Thoại - Máy Tính Bảng',
     'URL': 'https://tiki.vn/dien-thoai-may-tinh-bang/c1789?src=c.1789.hamburger_menu_fly_out_banner'},

    {'Name': 'Điện Tử - Điện Lạnh',
     'URL': 'https://tiki.vn/tivi-thiet-bi-nghe-nhin/c4221?src=c.4221.hamburger_menu_fly_out_banner'},

    {'Name': 'Phụ Kiện - Thiết Bị Số', 
     'URL': 'https://tiki.vn/thiet-bi-kts-phu-kien-so/c1815?src=c.1815.hamburger_menu_fly_out_banner'},

    {'Name': 'Laptop - Thiết bị IT', 
     'URL': 'https://tiki.vn/laptop-may-vi-tinh/c1846?src=c.1846.hamburger_menu_fly_out_banner'},

    {'Name': 'Máy Ảnh - Quay Phim', 
     'URL': 'https://tiki.vn/may-anh/c1801?src=c.1801.hamburger_menu_fly_out_banner'},

    {'Name': 'Điện Gia Dụng', 
     'URL': 'https://tiki.vn/dien-gia-dung/c1882?src=c.1882.hamburger_menu_fly_out_banner'},

    {'Name': 'Nhà Cửa Đời Sống', 
     'URL': 'https://tiki.vn/nha-cua-doi-song/c1883?src=c.1883.hamburger_menu_fly_out_banner'}

]

# Global driver to use throughout the script
DRIVER = None

In [188]:
close_driver()

In [189]:
initialize_driver()

Initiating driver...
Finished!


In [191]:
#################
### FUNCTIONS ###
#################

### Function to (re)start driver
def start_driver(force_restart=False):
    global DRIVER
    
    if DRIVER is not None:
        if force_restart:
            close_driver()
        else:
            raise RuntimeError('ERROR: cannot overwrite an active driver. Please close the driver before restarting.')
    
    # Setting up the driver
    options = webdriver.ChromeOptions()
    options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
    options.add_argument('-no-sandbox')
    options.add_argument('-disable-dev-shm-usage')

    DRIVER = webdriver.Chrome('chromedriver',options=options)

### Wrapper to close driver if its created
def close_driver():
    global DRIVER
    if DRIVER is not None:
        DRIVER.close()
    DRIVER = None

### Function to extract product info from the necessary html and json tags
def get_product_info_single(i):
    d = {'name':'',
         'price':'',
         'product_url':'',
         'image':'',
         'rating':'',
         'number_of_sales':'',
         'tiki_now':'',
         'freeship':'',
         'under_price':'',
         'discount':'',
         'installment':'',
         'gift':'',
         'advertising': '',
         'tikinow_member_deal':''}

    # name
    try:
        # name_elem = i.find_element_by_xpath(".//div[@class='name']/child::span")
        name_elem = i.find_element_by_class_name('name').find_element_by_tag_name('span')
        d['name'] = name_elem.get_attribute('innerHTML').strip()
    except NoSuchElementException:
        pass
        
    # price
    try:
        # price_elem = i.find_element_by_xpath(".//div[@class='price-discount__price']").get_attribute('innerHTML')
        price_elem = i.find_element_by_class_name('price-discount__price').get_attribute('innerHTML')
        d['price'] = re.sub(r'[\.\s₫]', '', price_elem)
    except (NoSuchElementException, ValueError):
        d['price'] = -1

    # link
    try:
        product_link     = i.get_attribute('href')
        d['product_url'] = 'https:'+product_link if product_link[:2]=='//' else product_link
    except NoSuchElementException:
        pass
    
    # thumbnail
    try:
        # thumbnail  = i.find_elements_by_xpath(".//div[@class='thumbnail']//child::img")[-1]
        thumbnail  = i.find_element_by_class_name('thumbnail').find_elements_by_tag_name('img')[-1]
        d['image'] = thumbnail.get_attribute('src')
    except NoSuchElementException:
        pass

    # review
    try:
        # elem_review   = i.find_element_by_xpath(".//div[@class='average']")
        elem_review   = i.find_element_by_class_name('average')
        d['rating']   = float(re.sub(r'\D','',elem_review.get_attribute('style')))/100*5
    except NoSuchElementException:
        d['rating']   = 0

    # tiki now
    try:
        # d['tiki_now'] = bool(i.find_element_by_xpath(".//div[@class='badge-service']/child::div[@class='item']"))
        d['tiki_now'] = bool(i.find_element_by_class_name('badge-service').find_element_by_class_name('item'))
    except NoSuchElementException:
        d['tiki_now'] = False
    
    # freeship
    try:
        # thumbnail_tag = i.find_element_by_xpath(".//div[@class='thumbnail']")
        thumbnail_tag = i.find_element_by_class_name('thumbnail')
        d['freeship'] = len(thumbnail_tag.find_elements_by_tag_name('img')) == 2
    except NoSuchElementException:
        d['freeship'] = False

    # under price
    try:
        # d['under_price'] = bool(i.find_element_by_xpath(".//div[@class='badge-under-price']/child::div[@class='item']"))
        d['under_price'] = bool(i.find_element_by_class_name('badge-under-price').find_element_by_class_name('item'))
    except NoSuchElementException:
        d['under_price'] = False

    # discount
    try:
        # discount = i.find_element_by_xpath(".//div[@class='price-discount__discount']").get_attribute('innerHTML')
        discount = i.find_element_by_class_name('price-discount__discount').get_attribute('innerHTML')
        d['discount'] = int(re.sub(r'[\-\%]','', discount))
    except (NoSuchElementException, ValueError):
        d['discount'] = 0

    # installment
    try:
        # d['installment'] = bool(i.find_element_by_xpath(".//div[@class='badge-benefits']//child::img[1]"))
        d['installment'] = bool(i.find_element_by_class_name('badge-benefits').find_element_by_tag_name('img'))
    except NoSuchElementException:
        d['installment'] = False
    
    # gift
    try:
        # d['gift'] = bool(i.find_element_by_xpath(".//div[@class='freegift-list']"))
        d['gift'] = bool(i.find_element_by_class_name('freegift-list'))
    except NoSuchElementException:
        d['gift'] = False
    
    # advertising
    try:
        d['advertising'] = i.find_element_by_class_name('name').find_element_by_tag_name('p').get_attribute('innerHTML')
    except NoSuchElementException:
        d['advertising'] = 'No Ad'
    

     # get unit of sold
    try:
        no_of_sales = i.find_element_by_class_name("styles__StyledQtySold-sc-732h27-2").text
        d["number_of_sales"] = no_of_sales.strip("Đã bán +")
    except NoSuchElementException:
        d["number_of_sales"] = False  

     # Tikinow member price deal badge
    try:
        tikinowmemdeal_thumbnail = i.find_element_by_class_name("badge-benefits").find_element_by_tag_name("img").get_attribute("src")
        d['tikinow_member_deal'] = tikinowmemdeal_thumbnail
        tikinowmemdeal_src = "https://salt.tikicdn.com/ts/upload/ba/9e/0d/9edd3b5458502375eb0cedf17acb732f.png"
        if d["tikinow_member_deal"] in tikinowmemdeal_src:
            d["tikinow_member_deal"] = "Yes"
        else:
            d["tikinow_member_deal"] = "No"
    except NoSuchElementException:
        d["tikinow_member_deal"] = False 
        
    return d

### Function to scrape all products from a page
def get_product_info_from_page(page_url):
    """ Extract info from all products of a specfic category on Tiki website
        Args:
            page_url: (string) url of the page to scrape
        Returns:
            data: list of dictionary of products info. If no products shown, return empty list.
    """
    global DRIVER

    data = []
    DRIVER.get(page_url) # Use the driver to get info from the product page
    time.sleep(3)


    try: 
        no_product_found = DRIVER.find_element_by_xpath("//div[@class='style__StyledNotFoundProductView-sc-1uz0b49-0']")
        print("EMPTY PAGE")
        return data
    except NoSuchElementException:
        no_product_found = False
    

    # FIND ALL PRODUCT ITEMS
    # products   = DRIVER.find_elements_by_xpath("//a[@class='product-item']")
    products   = DRIVER.find_elements_by_class_name('product-item')
    print(f'Found {len(products)} products')
    
    if not no_product_found and len(products):
        for i in products:
            product_dict = get_product_info_single(i)
            data.append(product_dict)
    return data

### Function to get product info from a main category
def get_product_info_from_category(cat_url, max_page=0):
    '''
    Scrape for multiple pages of products of a category.
    Uses get_product_info_from_page().

    Args:
        cat_url: (string) a url string of a category
        max_page: (int) an integer denoting the maximum number of pages to scrape.
                  Default value is 0 to scrape all pages.
    Returns: 
        products: a list in which every element is a dictionary of one product's information
    '''
    products = []

    page_n = 1
    main_url, url_opts = cat_url.split('?')
    cat_page_url = main_url + f'?page={page_n}&' + url_opts
    product_list = get_product_info_from_page(cat_page_url)

    while len(product_list)>0:
        products.extend(product_list)
        page_n += 1

        # stop_flag = False if max_page <= 0 else (page_n > max_page)
        stop_flag = max_page>0 and page_n>max_page # For stopping the scrape according to max_page
        if stop_flag:
            break

        cat_page_url = main_url + f'?page={page_n}&' + url_opts
        product_list = get_product_info_from_page(cat_page_url)
    
    return products

In [192]:
######################
### START SCRAPING ###
######################

prod_data = []
num_max_page = 10
start_driver(force_restart=True)

for element in MAIN_CATEGORIES:
  cat_url = element['URL']

  print(cat_url)
  prod_per_cat = get_product_info_from_category(cat_url, num_max_page)
  prod_data.extend(prod_per_cat)

close_driver()

df = pd.DataFrame(data=prod_data, columns=prod_data[0].keys())
df.to_csv('tiki_products.csv')

https://tiki.vn/dien-thoai-may-tinh-bang/c1789?src=c.1789.hamburger_menu_fly_out_banner




Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
https://tiki.vn/tivi-thiet-bi-nghe-nhin/c4221?src=c.4221.hamburger_menu_fly_out_banner
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
https://tiki.vn/thiet-bi-kts-phu-kien-so/c1815?src=c.1815.hamburger_menu_fly_out_banner
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
https://tiki.vn/laptop-may-vi-tinh/c1846?src=c.1846.hamburger_menu_fly_out_banner
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
Found 48 products
https://tiki.vn/may-anh

In [193]:
len(df)

3360

In [196]:
df.head()

Unnamed: 0,name,price,product_url,image,rating,number_of_sales,tiki_now,freeship,under_price,discount,installment,gift,advertising,oficial or credit,tikinow_member_deal
0,Điện Thoại Samsung Galaxy M12 (4GB/64GB) - Hàn...,3709000,https://tka.tiki.vn/pixel/pixel?data=djAwMc6l6...,https://salt.tikicdn.com/cache/200x200/ts/prod...,4.8,1000,True,False,False,5,True,False,Ad,,No
1,Điện Thoại iPhone 12 Mini 64GB - Hàng Chính Hãng,18990000,https://tiki.vn/dien-thoai-iphone-12-mini-64gb...,https://salt.tikicdn.com/cache/200x200/ts/prod...,0.0,False,True,True,False,0,False,False,False,,False
2,Điện Thoại Samsung Galaxy M12 (4GB/64GB) - Hàn...,3709000,https://tiki.vn/dien-thoai-samsung-galaxy-m12-...,https://salt.tikicdn.com/cache/200x200/ts/prod...,4.8,,True,False,False,5,True,False,False,,No
3,Điện Thoại Vivo Y20 (4GB/64GB) - Hàng Chính Hãng,3429000,https://tiki.vn/dien-thoai-vivo-y20-4gb-64gb-h...,https://salt.tikicdn.com/cache/200x200/ts/prod...,0.0,False,True,False,False,0,False,False,False,,False
4,Máy Tính Bảng Samsung Galaxy Tab S7 FE LTE T73...,10490000,https://tka.tiki.vn/pixel/pixel?data=djAwMZArA...,https://salt.tikicdn.com/cache/200x200/ts/prod...,5.0,43,True,False,False,25,False,False,Ad,,False


In [207]:
df.to_csv('/content/gdrive/MyDrive/Tiki_products.csv',index= False)