In [4]:
# Import libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
from time import sleep
import re

## Get soup

In [5]:
def get_url(url, timeout=30):
    """Fetch a web page and return its parsed HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``html.parser`` backend, or ``None`` when the download or the
        parsing raised an exception.
    """
    # Browser-like User-Agent header sent with the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    try:
        # Send GET request. The timeout keeps a stalled connection from
        # blocking a long scraping run forever.
        r = requests.get(url, headers=headers, timeout=timeout)

        # Parse HTML text
        return BeautifulSoup(r.text, 'html.parser')

    except Exception as err:
        # Catch ``Exception`` rather than using a bare ``except`` so that
        # KeyboardInterrupt (stopping the kernel) is not swallowed.
        print('ERROR', err)
        return None

## Create database


In [3]:
# Open the SQLite database file 'tiki.db' and keep a module-level connection
# and cursor. The table-creation and insert helpers in this notebook use these
# two globals (`conn`, `cur`) directly rather than receiving them as arguments.

conn = sqlite3.connect('tiki.db')
cur = conn.cursor()

# **Category URLs**

## Category table

In [8]:
# Create "categories" table
def create_categories_table():
    """Make sure the ``categories`` table exists.

    Executes a ``CREATE TABLE IF NOT EXISTS`` statement through the
    module-level cursor ``cur`` and commits on the module-level connection
    ``conn``. Any exception is reported with ``print`` instead of being
    raised.
    """
    create_sql = """
        CREATE TABLE IF NOT EXISTS categories (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            category VARCHAR(255),
            category_url TEXT,
            parent_category VARCHAR(255)
        )
    """
    try:
        cur.execute(create_sql)
        conn.commit()
    except Exception as exc:
        print('ERROR BY CREATE TABLE', exc)

In [7]:
def add_to_category_table(row):
    """Insert one category record into the ``categories`` table.

    Uses the module-level cursor ``cur`` and connection ``conn``.

    Args:
        row: Mapping with the keys ``'category'``, ``'category_url'`` and
            ``'parent_category'``.

    Returns:
        The id of the inserted row (``cur.lastrowid``), or ``None`` when the
        insert failed; a failure is reported with ``print``.

    Raises:
        KeyError: If ``row`` lacks one of the three keys.
    """
    query = """
    INSERT INTO categories (category, category_url, parent_category)
    VALUES (?, ?, ?);
    """
    # Read the values from the ``row`` argument. The previous code read them
    # from an unrelated name ``i``, which is either undefined or a stale
    # global left over from another cell.
    val = (row['category'], row['category_url'], row['parent_category'])
    try:
        cur.execute(query, val)
        cat_id = cur.lastrowid
        print(cat_id)
        conn.commit()
        return cat_id

    except Exception as err:
        print('ERROR BY INSERT:', err)
        return None

## Scraping category URLs

In [83]:
# Get URL of 16 main categories:
def get_main_categories(url):
    """Collect the top-level categories listed on a page.

    Every ``<li>`` whose class matches ``.*MenuItem-.*`` is treated as one
    category: its link (with any ``?query`` part removed) becomes
    ``category_url`` and the text of its ``<span class="text">`` becomes
    ``category``.

    Args:
        url: Page to fetch with ``get_url``.

    Returns:
        List of dicts with the keys ``'category'``, ``'category_url'`` and
        ``'parent_category'`` (always ``''`` here). An empty list is returned
        when the page could not be fetched or parsed, so callers can safely
        take ``len()`` of the result.
    """
    mainsoup = get_url(url)
    if mainsoup is None:
        # get_url reports its own failure and hands back None.
        print('ERROR BY GET MAIN CATEGORIES')
        return []

    regex = re.compile('.*MenuItem-.*')
    data = []
    try:
        for item in mainsoup.find_all('li', {'class': regex}):
            data.append({
                'category': item.find('span', {'class': 'text'}).get_text(),
                # Drop the query string so page parameters can be appended later.
                'category_url': item.a['href'].split('?')[0],
                'parent_category': '',
            })
    except Exception as err:
        # All-or-nothing, as before: one malformed menu item discards the list.
        print('ERROR BY GET MAIN CATEGORIES', err)
        return []

    return data

In [84]:
# Get URL of sub categories
def get_sub_categories(parent_category, url):
    """Collect the direct sub-categories shown on a category page.

    Each ``<div class="list-group-item is-child">`` on the page is one
    sub-category. Every sub-category found is also written to the database
    through ``add_to_category_table``.

    Args:
        parent_category: Name stored as ``parent_category`` on each result.
        url: Category page to fetch with ``get_url``.

    Returns:
        List of dicts with the keys ``'category'``, ``'category_url'`` and
        ``'parent_category'``. If an error occurs part-way through, the
        entries gathered so far are returned; if the page cannot be fetched
        the list is empty.
    """
    # NOTE(review): relies on a module-level `tikiurl` (site base URL) that is
    # not defined in the visible notebook cells - confirm it is set before use.
    subsoup = get_url(url)

    data = []
    if subsoup is None:
        # get_url reports its own failure and hands back None.
        print('ERROR BY GET SUB CATEGORIES')
        return data

    try:
        sub_cats = subsoup.find_all('div', {'class':'list-group-item is-child'})

        for eachdiv in sub_cats:
            geturl = eachdiv.find('a', {'class':'list-group-item'})

            d = {
                # Collapse runs of whitespace; raw string avoids the invalid
                # "\s" escape warning of a plain string literal.
                'category': re.sub(r'\s{2,}', ' ', geturl.get_text()).strip("\n"),
                # Strip the query string, as get_main_categories does, so that
                # "?page=N" can be appended to the stored URL later.
                'category_url': tikiurl + geturl['href'].split('?')[0],
                'parent_category': parent_category,
            }

            # Add result to category table
            add_to_category_table(d)

            data.append(d)
    except Exception as err:
        print('ERROR BY GET SUB CATEGORIES', err)

    return data

In [85]:
def get_all_categories(categories, sumlist):
    """Walk the category tree depth-first, gathering every sub-category.

    Args:
        categories: Category dicts (keys ``'category'`` and
            ``'category_url'``) whose descendants should be collected.
        sumlist: Accumulator list; found sub-categories are appended to it
            in place.

    Returns:
        ``sumlist`` after it has been extended, or a new empty list when
        ``categories`` is empty.
    """
    # Recursion bottoms out on a level without categories.
    if len(categories) == 0:
        return []

    for parent in categories:
        children = get_sub_categories(parent['category'], parent['category_url'])
        sumlist.extend(children)

        # Progress output whenever the running total is a multiple of 100.
        collected = len(sumlist)
        if collected % 100 == 0:
            print(collected)

        # Descend into the children just found.
        get_all_categories(children, sumlist)

    return sumlist

## End-to-end scraping all categories

In [88]:
def scrape_all_categories(url):
    """Scrape the main categories of a page and all of their descendants.

    Creates the ``categories`` table if needed, reads the main categories
    from ``url`` and then walks every sub-category level with
    ``get_all_categories``.

    Args:
        url: Page that lists the main categories.

    Returns:
        List of the sub-category dicts collected. If the walk fails part-way
        through, the entries gathered up to that point are returned.
    """
    # The table helpers use the module-level `conn`/`cur`. The previous code
    # opened a second, local connection here that nothing used or closed.

    # Create category table
    create_categories_table()

    # Scrape 16 main categories; tolerate a None result from a failed fetch.
    main_categories = get_main_categories(url) or []
    print('Done scraping main categories, total: ', len(main_categories))

    # Scrape all sub categories. The accumulator is created up front so the
    # partial result is still available if the walk raises.
    all_categories = []
    try:
        get_all_categories(main_categories, all_categories)
    except Exception as err:
        print('Error', err)

    return all_categories

# **Product information**

## Product table

In [91]:
 def create_products_table():
     """Make sure the ``products`` table exists.

     Executes a ``CREATE TABLE IF NOT EXISTS`` statement through the
     module-level cursor ``cur`` and commits on the module-level connection
     ``conn``. ``cat_id`` is declared as a foreign key to ``categories(id)``.
     Any exception is reported with ``print`` instead of being raised.
     """
     create_sql = """
        CREATE TABLE IF NOT EXISTS products (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            cat_id INTEGER,
            product_category VARCHAR(255),
            product_brand VARCHAR(255),
            product_title VARCHAR(255),
            price VARCHAR(255),
            sku INTEGER,
            image_url TEXT,
            sale INTEGER,
            review INTEGER,
            rating INTEGER,
            product_url TEXT,
            FOREIGN KEY (cat_id) REFERENCES categories(id)
        )
    """
     try:
         cur.execute(create_sql)
         conn.commit()
     except Exception as exc:
         print('ERROR BY CREATE PRODUCTS TABLE', exc)
        

In [92]:
class Product:
    """One scraped product together with the logic to persist it.

    The instance attributes mirror the constructor arguments one-to-one.
    """

    def __init__(self, cat_id, category, brand, title, price, sku, image, sale, review, rating, url):
        """Store the scraped fields on the instance unchanged."""
        self.cat_id, self.category = cat_id, category
        self.brand, self.title = brand, title
        self.price, self.sku = price, sku
        self.image, self.sale = image, sale
        self.review, self.rating = review, rating
        self.url = url

    def save_into_db(self):
        """Insert this product as a new row of the ``products`` table.

        Uses the module-level cursor ``cur`` and connection ``conn``; an
        exception during the insert or commit is printed, not raised.
        """
        insert_sql = """
            INSERT INTO products (cat_id,
            product_category,
            product_brand,
            product_title,
            price,
            sku,
            image_url,
            sale,
            review,
            rating,
            product_url)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
        """
        # Values listed in the same order as the column list above.
        params = (
            self.cat_id,
            self.category,
            self.brand,
            self.title,
            self.price,
            self.sku,
            self.image,
            self.sale,
            self.review,
            self.rating,
            self.url,
        )
        try:
            cur.execute(insert_sql, params)
            conn.commit()
        except Exception as exc:
            print('ERROR BY INSERT:', exc)

# cat1 = Category('Test 3', 'Test URL 3')
# cat1.save_into_db()
# print(cat1.cat_id)
# cur.execute('SELECT * FROM categories;').fetchall()

## Scraping product information

In [90]:
def get_sale(div):
    """Return the discount shown on a product element.

    Looks for a ``<span class="sale-tag">`` inside ``div``. When present, its
    text with the first and last character removed is converted to ``int``;
    otherwise the string ``"No discount"`` is returned.
    """
    tag = div.find('span', {'class':'sale-tag'})
    if not tag:
        return "No discount"

    # presumably the text looks like "-25%", so [1:-1] leaves the digits - TODO confirm
    return int(tag.get_text()[1:-1])

def get_img_url(div):
    """Return the ``src`` attribute of the ``img`` found via ``div.img``."""
    return div.img['src']

def get_rating(div):
    """Return the rating shown on a product element.

    Looks for a ``<span class="rating-content">`` inside ``div``. When
    present, the ``style`` attribute of its inner ``span`` is sliced
    ``[6:-1]``, converted to ``int`` and divided by 20; otherwise the string
    ``"No rating"`` is returned.
    """
    content = div.find('span', {'class':'rating-content'})
    if not content:
        return "No rating"

    # presumably the style is a percentage width such as "width:80%",
    # so 80 / 20 gives a 0-5 score - TODO confirm
    percent = int(content.span['style'][6:-1])
    return percent/20

def get_review(div):
    """Return the number of reviews shown on a product element.

    Reads the text of ``<p class="review">`` inside ``div``, strips the
    surrounding parentheses and drops the last nine characters (the
    " nhận xét" suffix). What remains is either ``'Chưa có'`` (no reviews
    yet, giving 0) or the review count.

    Returns:
        The review count as ``int``; 0 when the element is missing or says
        there are no reviews yet.

    Raises:
        ValueError: If the remaining text is neither ``'Chưa có'`` nor an
            integer.
    """
    getreview = div.find('p', {'class': 'review'})
    if getreview is None:
        # No review element: treat as zero reviews, in line with get_sale and
        # get_rating which also tolerate a missing element.
        return 0

    reviews = getreview.get_text().strip('()')
    count_text = reviews[:-9]
    if count_text == 'Chưa có':
        return 0

    return int(count_text)

def _fetch_soup(url):
    """Download ``url`` and return it parsed with BeautifulSoup, or None on failure."""
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers)
        return BeautifulSoup(r.text, 'html.parser')
    except Exception as err:
        print('ERROR', err)
        return None


def get_url(div):
    """Return the product link of a listing element, or fetch a page.

    This definition replaces the page-fetching ``get_url(url)`` defined
    earlier in the notebook, yet ``scrape_category`` still calls ``get_url``
    with a page URL string. To keep both call styles working, a string
    argument is downloaded and parsed, while any other argument is treated as
    a product element.

    Args:
        div: Either a product element exposing ``div.a['href']`` or a page
            URL string.

    Returns:
        For a product element: the absolute product URL. Links starting with
        ``//t.ants.vn`` get an ``http:`` prefix; all others are appended to
        the module-level ``tikiurl``.
        For a string: the parsed page, or ``None`` when the download failed.
    """
    if isinstance(div, str):
        return _fetch_soup(div)

    getlink = div.a['href']
    if getlink[:11] == "//t.ants.vn":
        return "http:" + getlink

    # NOTE(review): `tikiurl` is not defined in the visible notebook cells -
    # confirm it is set before this function is called.
    return tikiurl + getlink

In [65]:
def scrape_category(catid, category, categoryurl, save_db=False, max_pages=9, delay=3):
    '''Scrape the product listing pages of one tiki.vn category.

    Pages ``?page=1`` up to ``?page=max_pages`` are requested in turn;
    scraping stops at the first page without any ``product-item`` element.

    Args:
        catid: Id of the category row; stored on every product as ``cat_id``.
        category: Category name. Not used by the scraper itself, because each
            product's category is read from its ``data-category`` attribute.
        categoryurl: Category link with the "?" and the part after it
            removed; ``?page=N`` is appended to it.
        save_db: When true, each product is also written to the database
            through ``Product.save_into_db``.
        max_pages: Highest page number to request (default 9).
        delay: Seconds to sleep before each page request (default 3).

    Returns:
        List of ``Product`` objects, one per product that could be parsed.
    '''
    products = []

    # NOTE(review): `get_url` is defined twice in this notebook (page fetcher
    # and product-link extractor). This function needs the former for the page
    # request and the latter for each product - confirm the definition in
    # scope serves both calls.
    for page in range(1, max_pages + 1):
        sleep(delay)

        try:
            soup = get_url(categoryurl + f'?page={page}')
            product_div = soup.find_all('div', {'class':'product-item'})
        except Exception as err:
            # A page that cannot be fetched or parsed is skipped.
            print('ERROR BY SCRAPE CATEGORY:', err)
            continue

        if len(product_div) == 0:
            break

        for div in product_div:
            # Parse each product on its own so that one malformed entry does
            # not discard the rest of the page.
            try:
                product = Product(
                    catid,
                    div['data-category'],
                    div['data-brand'],
                    div['data-title'],
                    div['data-price'],
                    div['product-sku'],  # NOTE(review): unlike the other attributes this has no "data-" prefix - confirm
                    get_img_url(div),
                    get_sale(div),
                    # The review count was previously stored in `reviews` but
                    # passed as the undefined name `review`.
                    get_review(div),
                    get_rating(div),   # can be "No rating"
                    get_url(div),
                )
            except Exception as err:
                print('ERROR BY SCRAPE CATEGORY:', err)
                continue

            if save_db:
                product.save_into_db()
            products.append(product)

    return products


## Execute

In [None]:
# Select the categories that never appear as another row's parent_category:
# the LEFT JOIN pairs each category A with its children B, and the WHERE
# clause keeps only the rows of A for which no child was found.
leaf_categories_sql = '''SELECT A.id, A.category, A.category_url, B.parent_category
FROM categories as A LEFT JOIN categories AS B on A.category = B.parent_category
WHERE B.parent_category IS NULL;'''
last_layer = pd.read_sql_query(leaf_categories_sql, conn)

In [None]:
# Scrape every category in `last_layer` and gather the products in one flat list.
all_products = []
next_report = 1000  # product count at which the next progress line is printed

for index, row in last_layer.iterrows():
    d = scrape_category(row['id'],row['category'], row['category_url'], save_db=True)
    # extend, not append: scrape_category returns a list of products, and
    # appending it would make all_products a list of lists whose length counts
    # categories instead of products.
    all_products.extend(d)
    # Report each time another 1000 products have been collected; an exact
    # "% 1000 == 0" check would almost never fire once whole lists are added.
    if len(all_products) >= next_report:
        print('Successfully scraped: ', len(all_products))
        next_report = (len(all_products) // 1000 + 1) * 1000