<a href="https://colab.research.google.com/github/hieutrinhds/web-scraping/blob/master/CS_FTMLE_Tki_Crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import re
import sqlite3
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [0]:
# Base URL of the Tiki storefront: the page scraped for the main categories and
# the prefix put in front of the hrefs found on category pages.
TIKI_URL = "https://tiki.vn"

In [0]:
# Open the SQLite database file 'tiki.db' and create the cursor that the
# functions and classes below use as a shared, notebook-wide global.
conn = sqlite3.connect('tiki.db')
cur = conn.cursor()

In [0]:
# Create table categories in the database
def create_categories_table(cursor=None):
    """Create the ``categories`` table if it does not exist yet.

    Columns: ``id`` (auto-incremented primary key), ``name``, ``url``,
    ``parent_id`` and ``create_at`` (defaults to the insertion timestamp).

    Args:
        cursor: Optional sqlite3 cursor to run the statement on. When omitted,
            the notebook-wide global ``cur`` is used.

    Returns:
        None. Failures are reported on stdout (best effort) instead of raised.
    """
    query = """
        CREATE TABLE IF NOT EXISTS categories (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name VARCHAR(255),
            url TEXT,
            parent_id INTEGER,
            create_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """
    try:
        # Resolve the global inside the try so that a missing `cur` is reported
        # the same way as any other failure.
        db_cursor = cur if cursor is None else cursor
        db_cursor.execute(query)
        # Commit so the schema change is durable whatever transaction mode the
        # connection is in.
        db_cursor.connection.commit()
    except Exception as err:
        # Include the error itself; without it a failure cannot be diagnosed.
        print('ERROR BY CREATE TABLE:', err)
create_categories_table()

In [0]:
# Insert a row of data
# Manual check of the parameterized INSERT using a placeholder row
# ('test', 'test_url', 1).
# NOTE(review): this cell runs on every execution of the notebook, so each run
# adds another placeholder row to `categories` -- presumably leftover scratch
# code; confirm whether it should stay.
query = """
    INSERT INTO categories (name, url, parent_id)
    VALUES (?, ?, ?);
    """
val = ('test', 'test_url', 1)
try:
    cur.execute(query, val)
    # Keep the id of the row that was just inserted.
    cat_id = cur.lastrowid
except Exception as err:
    print('ERROR BY INSERT:', err)

In [0]:
# Create a class Category
# attributes: name, url, parent_id
# instance method: save_into_db()
class Category:
    """One category of the crawl, storable as a row of the ``categories`` table.

    Attributes:
        cat_id: Row id in ``categories``; ``None`` until the category has been
            saved (or unless supplied by the caller).
        name: Display name of the category.
        url: URL of the category page.
        parent_id: ``cat_id`` of the parent category, ``None`` for a root.
    """

    def __init__(self, name, url, parent_id=None, cat_id=None):
        self.cat_id = cat_id
        self.name = name
        self.url = url
        self.parent_id = parent_id

    def __repr__(self):
        return f"ID; {self.cat_id}, Name: {self.name}, URL: {self.url}, Parent: {self.parent_id}"

    def save_into_db(self, cursor=None):
        """Insert this category into ``categories`` and record its new row id.

        Args:
            cursor: Optional sqlite3 cursor to insert through. When omitted,
                the notebook-wide global ``cur`` is used.

        Returns:
            None. On success ``self.cat_id`` is set to the id of the inserted
            row; on failure the error is printed and ``self.cat_id`` is left
            unchanged.
        """
        query = '''
            INSERT INTO categories (name, url, parent_id)
            VALUES (?, ?, ?);
            '''
        val = (self.name, self.url, self.parent_id)
        try:
            # Resolve the global inside the try so that a missing `cur` is
            # reported the same way as any other failure.
            db_cursor = cur if cursor is None else cursor
            db_cursor.execute(query, val)
            # Commit right away: otherwise the row only lives in an open
            # transaction and is lost if the connection goes away.
            db_cursor.connection.commit()
            # Only expose the id once the row is actually persisted.
            self.cat_id = db_cursor.lastrowid
        except Exception as err:
            print('ERROR BY INSERT:', err)


In [0]:
# Get the HTML content get_url()
def get_url(url, timeout=10):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook.

    Returns:
        A ``BeautifulSoup`` tree of the response body. If the request fails
        (network error, timeout or HTTP error status) the error is printed and
        an empty tree is returned, so callers' ``find_all`` simply yields
        nothing instead of crashing.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
        html = response.text
    except requests.RequestException as err:
        print('ERROR BY GET URL:', err)
        html = ''
    return BeautifulSoup(html, 'html.parser')

In [0]:
# get_main_categories()
def get_main_categories(save_db=False):
    """Scrape the top-level categories from the Tiki home page menu.

    Args:
        save_db: When true, each category is also inserted into the database
            through ``Category.save_into_db()``.

    Returns:
        A list of ``Category`` objects (no parent), in page order. Menu links
        that lack a text label or an ``href`` are skipped.
    """
    soup = get_url(TIKI_URL)

    result = []
    # NOTE(review): this class string looks build-generated (hashed suffix), so
    # it presumably changes when the site is redeployed -- check it first if
    # this function starts returning an empty list.
    for a_tag in soup.find_all('a', {'class': 'MenuItem__MenuLink-sc-181aa19-1 fKvTQu'}):
        label = a_tag.find('span', {'class': 'text'})
        url = a_tag.get('href')
        # Skip malformed menu entries instead of aborting the whole scrape.
        if label is None or not url:
            continue
        main_cat = Category(label.text, url)

        if save_db:
            main_cat.save_into_db()
        result.append(main_cat)
    return result

In [95]:
# Scrape the main categories from the home page, saving each one to the
# database, then display the resulting list.
main_categories = get_main_categories(save_db=True)
main_categories

[ID; 54, Name: Điện Thoại - Máy Tính Bảng, URL: https://tiki.vn/dien-thoai-may-tinh-bang/c1789?src=c.1789.hamburger_menu_fly_out_banner, Parent: None,
 ID; 55, Name: Điện Tử - Điện Lạnh, URL: https://tiki.vn/tivi-thiet-bi-nghe-nhin/c4221?src=c.4221.hamburger_menu_fly_out_banner, Parent: None,
 ID; 56, Name: Phụ Kiện - Thiết Bị Số, URL: https://tiki.vn/thiet-bi-kts-phu-kien-so/c1815?src=c.1815.hamburger_menu_fly_out_banner, Parent: None,
 ID; 57, Name: Laptop - Thiết bị IT, URL: https://tiki.vn/laptop-may-vi-tinh/c1846?src=c.1846.hamburger_menu_fly_out_banner, Parent: None,
 ID; 58, Name: Máy Ảnh - Quay Phim, URL: https://tiki.vn/may-anh/c1801?src=c.1801.hamburger_menu_fly_out_banner, Parent: None,
 ID; 59, Name: Điện Gia Dụng, URL: https://tiki.vn/dien-gia-dung/c1882?src=c.1882.hamburger_menu_fly_out_banner, Parent: None,
 ID; 60, Name: Nhà Cửa Đời Sống, URL: https://tiki.vn/nha-cua-doi-song/c1883?src=c.1883.hamburger_menu_fly_out_banner, Parent: None,
 ID; 61, Name: Hàng Tiêu Dùng - T

In [0]:
# get_sub_categories() given a parent Category
def get_sub_categories(parent_category, save_db=False):
    """Scrape the direct sub-categories listed on a category's page.

    Args:
        parent_category: ``Category`` whose ``url`` is fetched; its ``cat_id``
            becomes the ``parent_id`` of every sub-category found.
        save_db: When true, each sub-category is also inserted into the
            database through ``Category.save_into_db()``.

    Returns:
        A list of ``Category`` objects, in page order. Entries without a link
        are skipped. If anything fails, the error is printed and the
        categories collected so far are returned (best effort).
    """
    result = []

    try:
        soup = get_url(parent_category.url)
        for div in soup.find_all('div', {'class': 'list-group-item is-child'}):
            anchor = div.a
            href = anchor.get('href') if anchor is not None else None
            # One malformed entry must not abort the rest of the list.
            if not href:
                continue
            # The link text is padded with spaces/newlines on the page:
            # collapse every whitespace run and trim both ends.
            name = re.sub(r'\s+', ' ', anchor.text).strip()
            # urljoin handles both relative and already-absolute hrefs.
            child_url = urljoin(TIKI_URL, href)
            cat = Category(name, child_url, parent_category.cat_id)
            if save_db:
                cat.save_into_db()
            result.append(cat)
    except Exception as err:
        print('ERROR BY GET SUB CATEGORIES:', err)
    return result

In [98]:
# Scrape, save and display the sub-categories of the first main category.
get_sub_categories(main_categories[0], save_db=True)

[ID; 70, Name:  Máy tính bảng (55)
 , URL: https://tiki.vn/may-tinh-bang/c1794?src=c.1789.hamburger_menu_fly_out_banner, Parent: 54,
 ID; 71, Name:  Máy đọc sách (30)
 , URL: https://tiki.vn/may-doc-sach/c28856?src=c.1789.hamburger_menu_fly_out_banner, Parent: 54,
 ID; 72, Name:  Điện thoại Smartphone (190)
 , URL: https://tiki.vn/dien-thoai-smartphone/c1795?src=c.1789.hamburger_menu_fly_out_banner, Parent: 54,
 ID; 73, Name:  Điện thoại bàn (108)
 , URL: https://tiki.vn/dien-thoai-ban/c8061?src=c.1789.hamburger_menu_fly_out_banner, Parent: 54,
 ID; 74, Name:  Điện thoại phổ thông (91)
 , URL: https://tiki.vn/dien-thoai-pho-thong/c1796?src=c.1789.hamburger_menu_fly_out_banner, Parent: 54]

In [0]:
# get_all_categories() given a list of main categories
def get_all_categories(categories, save_db=False):
    """Walk the category tree depth-first below each given category.

    For every category its sub-categories are scraped with
    ``get_sub_categories``, printed, and then explored in turn; the recursion
    stops at categories that have no sub-categories.

    Args:
        categories: List of ``Category`` objects to expand.
        save_db: Forwarded to ``get_sub_categories``; when true, every
            sub-category found is inserted into the database. The parent id
            stored for a sub-category is its parent's ``cat_id``, so the given
            categories should already have been saved for the ids to be set.

    Returns:
        None.
    """
    if not categories:
        return
    for cat in categories:
        sub_categories = get_sub_categories(cat, save_db=save_db)
        print(sub_categories)
        get_all_categories(sub_categories, save_db=save_db)