<a href="https://colab.research.google.com/github/hieutrinhds/web-scraping/blob/master/Get_Tiki_categories.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from bs4 import BeautifulSoup
import requests
import sqlite3

TIKI_URL = 'https://tiki.vn'

In [0]:
conn = sqlite3.connect('tiki.db')
cur = conn.cursor()

In [0]:
# Create table categories in the database
def create_categories_table():
    query = """
        CREATE TABLE IF NOT EXISTS categories (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name VARCHAR(255),
            url TEXT, 
            parent_id INTEGER, 
            create_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """
    try:
        cur.execute(query)
    except Exception as err:
        print('ERROR BY CREATE TABLE', err)
create_categories_table()

In [0]:
# Insert a row of data
query = """
    INSERT INTO categories (name, url, parent_id)
    VALUES (?, ?, ?);
"""
val = ('test','test_url', 1)
try:
    cur.execute(query, val)
    cat_id = cur.lastrowid
except Exception as err:
    print('ERROR BY INSERT:', err)

In [0]:
cur.execute('SELECT * FROM categories;').fetchall()

In [0]:
cur.execute('DROP TABLE categories;')

<sqlite3.Cursor at 0x7f80d62161f0>

In [0]:
# Create a class Category
# attributes: name, url, parent_id
# instance method: save_into_db()
class Category:
    def __init__(self, name, url, parent_id=None, cat_id=None):
        self.cat_id = cat_id
        self.name = name
        self.url = url
        self.parent_id = parent_id

    def __repr__(self):
        return f"ID: {self.cat_id}, Name: {self.name}, URL: {self.url}, Parent: {self.parent_id}"

    def save_into_db(self):
        query = """
            INSERT INTO categories (name, url, parent_id)
            VALUES (?, ?, ?);
        """
        val = (self.name, self.url, self.parent_id)
        try:
            cur.execute(query, val)
            self.cat_id = cur.lastrowid
        except Exception as err:
            print('ERROR BY INSERT:', err)

cat1 = Category('Test', 'Test URL')
cat1.save_into_db()
print(cat1.cat_id)
cur.execute('SELECT * FROM categories;').fetchall()

3


[(1, 'Test', 'Test URL', None, '2020-05-29 04:27:51'),
 (2, 'Test', 'Test URL', None, '2020-05-29 04:29:47'),
 (3, 'Test', 'Test URL', None, '2020-05-29 04:30:09')]

In [0]:
print(cat1)

ID: 3, Name: Test, URL: Test URL, Parent: None


In [0]:
# Get the HTML content get_url()
def get_url(url):
    try:
        response = requests.get(url).text
        soup = BeautifulSoup(response, 'html.parser')
        return soup
    except Exception as err:
        print('ERROR BY REQUEST:', err)
get_url(TIKI_URL)

In [0]:
# get_main_categories()
def get_main_categories(save_db=False):
    soup = get_url(TIKI_URL)

    result = []
    for a in soup.find_all('a', {'class': 'MenuItem__MenuLink-sc-181aa19-1 fKvTQu'}):
        name = a.find('span', {'class': 'text'}).text
        url = a['href']
        main_cat = Category(name, url)

        if save_db:
            main_cat.save_into_db()
        result.append(main_cat)
    return result

In [0]:
main_categories = get_main_categories(save_db=True)
main_categories

[ID: 4, Name: Điện Thoại - Máy Tính Bảng, URL: https://tiki.vn/dien-thoai-may-tinh-bang/c1789?src=c.1789.hamburger_menu_fly_out_banner, Parent: None,
 ID: 5, Name: Điện Tử - Điện Lạnh, URL: https://tiki.vn/tivi-thiet-bi-nghe-nhin/c4221?src=c.4221.hamburger_menu_fly_out_banner, Parent: None,
 ID: 6, Name: Phụ Kiện - Thiết Bị Số, URL: https://tiki.vn/thiet-bi-kts-phu-kien-so/c1815?src=c.1815.hamburger_menu_fly_out_banner, Parent: None,
 ID: 7, Name: Laptop - Thiết bị IT, URL: https://tiki.vn/laptop-may-vi-tinh/c1846?src=c.1846.hamburger_menu_fly_out_banner, Parent: None,
 ID: 8, Name: Máy Ảnh - Quay Phim, URL: https://tiki.vn/may-anh/c1801?src=c.1801.hamburger_menu_fly_out_banner, Parent: None,
 ID: 9, Name: Điện Gia Dụng, URL: https://tiki.vn/dien-gia-dung/c1882?src=c.1882.hamburger_menu_fly_out_banner, Parent: None,
 ID: 10, Name: Nhà Cửa Đời Sống, URL: https://tiki.vn/nha-cua-doi-song/c1883?src=c.1883.hamburger_menu_fly_out_banner, Parent: None,
 ID: 11, Name: Hàng Tiêu Dùng - Thực Ph

In [0]:
# get_sub_categories() given a parent category
import re

def get_sub_categories(parent_category, save_db=False):
    url = parent_category.url
    result = []

    try:
        soup = get_url(url)
        div_containers = soup.find_all('div', {'class':'list-group-item is-child'})
        for div in div_containers:
            name = div.a.text
            name = re.sub('\s{2,}', ' ', name)
            url = TIKI_URL + div.a['href']
            cat = Category(name, url, parent_category.cat_id)
            if save_db:
                cat.save_into_db()
            result.append(cat)
    except Exception as err:
        print('ERROR BY GET SUB CATEGORIES:', err)
    return result

In [0]:
get_sub_categories(main_categories[0], save_db=True)

[ID: 25, Name:  Máy tính bảng (57)
 , URL: https://tiki.vn/may-tinh-bang/c1794?src=c.1789.hamburger_menu_fly_out_banner, Parent: 4,
 ID: 26, Name:  Máy đọc sách (30)
 , URL: https://tiki.vn/may-doc-sach/c28856?src=c.1789.hamburger_menu_fly_out_banner, Parent: 4,
 ID: 27, Name:  Điện thoại Smartphone (191)
 , URL: https://tiki.vn/dien-thoai-smartphone/c1795?src=c.1789.hamburger_menu_fly_out_banner, Parent: 4,
 ID: 28, Name:  Điện thoại bàn (108)
 , URL: https://tiki.vn/dien-thoai-ban/c8061?src=c.1789.hamburger_menu_fly_out_banner, Parent: 4,
 ID: 29, Name:  Điện thoại phổ thông (91)
 , URL: https://tiki.vn/dien-thoai-pho-thong/c1796?src=c.1789.hamburger_menu_fly_out_banner, Parent: 4]

In [0]:
# get_all_categories() given a list of main categories
def get_all_categories(categories):
    if len(categories) == 0:
        return
    for cat in categories:
        sub_categories = get_sub_categories(cat, save_db=False)
        print(sub_categories)
        get_all_categories(sub_categories)

In [0]:
get_all_categories(main_categories)

[ID: None, Name:  Máy tính bảng (57)
, URL: https://tiki.vn/may-tinh-bang/c1794?src=c.1789.hamburger_menu_fly_out_banner, Parent: 4, ID: None, Name:  Máy đọc sách (30)
, URL: https://tiki.vn/may-doc-sach/c28856?src=c.1789.hamburger_menu_fly_out_banner, Parent: 4, ID: None, Name:  Điện thoại Smartphone (191)
, URL: https://tiki.vn/dien-thoai-smartphone/c1795?src=c.1789.hamburger_menu_fly_out_banner, Parent: 4, ID: None, Name:  Điện thoại bàn (108)
, URL: https://tiki.vn/dien-thoai-ban/c8061?src=c.1789.hamburger_menu_fly_out_banner, Parent: 4, ID: None, Name:  Điện thoại phổ thông (91)
, URL: https://tiki.vn/dien-thoai-pho-thong/c1796?src=c.1789.hamburger_menu_fly_out_banner, Parent: 4]
[]
[]
[]
[]
[]
[ID: None, Name:  Máy giặt (734)
, URL: https://tiki.vn/may-giat/c3862?src=c.4221.hamburger_menu_fly_out_banner, Parent: 5, ID: None, Name:  Máy lạnh - Máy điều hòa (696)
, URL: https://tiki.vn/may-lanh-may-dieu-hoa/c3865?src=c.4221.hamburger_menu_fly_out_banner, Parent: 5, ID: None, Name: 

AttributeError: ignored

In [0]:
# To get the product, we need to get the lowest level categories and crawl from their urls (SELECT query)