# Web scraping from Tiki.vn

## Crawl product data on TIKI

### Get product IDs

In [None]:
import json
import pickle
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [None]:
# Browser-like User-Agent header passed as ``headers=`` to the HTTP requests
# issued by the crawler functions in this notebook.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}

def crawl_product_id_with_html(url):
    """Collect product IDs by scraping paginated listing pages.

    Pages are requested one after another, starting at page 1, until a page
    contains no ``<a class="product-item">`` element.

    Args:
        url: Listing URL template containing one ``{}`` placeholder, which is
            filled with the page number.

    Returns:
        A list with one entry per product link found, in page order: the
        numeric product ID as a string, or ``"N/A"`` when the link has no
        ``href`` or the ``href`` does not end in ``-p<digits>.html``.
    """
    # Raw string: "\d" is not a valid str escape sequence. The dot is escaped
    # so that only a literal ".html" matches. Compiled once, outside the loops.
    product_id_pattern = re.compile(r'-p(\d+)\.html')
    product_list = []
    i = 1
    with requests.Session() as s:
        while True:
            print("Crawl page: ", i)
            response = s.get(url.format(i), headers=headers)
            # NOTE(review): no parser is named, so BeautifulSoup picks one
            # based on what is installed -- consider pinning it explicitly.
            soup = bs(response.content)

            product_box = soup.find_all('a', class_='product-item')
            if not product_box:
                # An empty page marks the end of the listing.
                break

            for product in product_box:
                # A missing href falls through to the "N/A" branch instead of
                # raising KeyError.
                matched = product_id_pattern.search(product.get('href') or '')
                if matched:
                    product_list.append(matched.group(1))
                    print(matched.group(1))
                else:
                    product_list.append("N/A")
            i += 1

    return product_list

def crawl_product_id_with_api(category):
    """Collect the product IDs of one category through the Tiki listing API.

    Pages (``limit=300``) are requested starting at page 1 until the API
    answers with an empty ``data`` list. A request that fails, or whose body
    cannot be decoded as JSON, is retried after a short pause until it
    succeeds.

    Args:
        category: Category ID substituted into the API URL.

    Returns:
        A list with the ``id`` value of every product returned for the
        category, in page order.

    Raises:
        KeyError: If a decoded response has no ``data`` key.
    """
    api_url = 'https://tiki.vn/api/v2/products?limit=300&category={}&page={}'
    product_list = []
    page = 1
    while True:
        while True:
            try:
                response = requests.get(api_url.format(category, page), headers=headers)
                product_dict = json.loads(response.content)
                break
            except (requests.exceptions.RequestException, ValueError):
                # Only transport and JSON-decoding failures are retried.
                # KeyboardInterrupt is not an Exception subclass, so Ctrl+C
                # propagates on its own and needs no special handling here.
                print("Connection error")
                # Pause so a failing endpoint is not hammered in a tight loop.
                time.sleep(1)

        products = product_dict['data']
        if not products:
            # An empty page marks the end of the category.
            break

        product_list.extend(product_data['id'] for product_data in products)
        page += 1

    return product_list

def save_pickle(data, filename):
    """Pickle ``data`` into the file ``filename`` and report the path written.

    Args:
        data: Any picklable object.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'wb') as out_file:
        pickle.Pickler(out_file).dump(data)
    print("Save file: ", filename)

def load_pickle(product_file):
    """Return the object stored in the pickle file ``product_file``.

    Unpickling can execute arbitrary code, so only load files that this
    notebook (or another trusted source) produced.

    Args:
        product_file: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    with open(product_file, "rb") as in_file:
        return pickle.Unpickler(in_file).load()

In [None]:
# Category IDs whose product listings are crawled through the API.
categories_summary = [
    1789, 4221, 1815, 1846, 1801, 1882,
    1883, 4384, 2549, 1520, 931, 915,
    6000, 8371, 1703, 1686, 27498, 976,
    27616, 1975, 8594, 17166, 8322, 11312,
]

# Gather the IDs of every category into one flat list (duplicates are kept).
product_list_all = []
for category in categories_summary:
    product_list = crawl_product_id_with_api(category)
    if not product_list:
        # Nothing was returned for this category: nothing to report or add.
        continue
    print("No. Product ID: ", len(product_list), 'with category ', category)
    product_list_all.extend(product_list)

print("No. Product ID: ", len(product_list_all))

In [None]:
# Restore the product IDs from the backup file and de-duplicate them.
# Running this cell replaces whatever product_list_all currently holds.
product_list_all = set(load_pickle('product_id_summary.pkl'))

In [None]:
# Number of entries currently held in product_list_all.
len(product_list_all)

In [None]:
# Number of distinct product IDs (equal to the count above once
# product_list_all has already been converted to a set).
len(set(product_list_all))

In [None]:
# Back up the de-duplicated product IDs to 'product_id_summary.pkl', the same
# file the reload cell reads, so they can be restored without re-crawling.
product_list_all = set(product_list_all)
save_pickle(product_list_all , 'product_id_summary.pkl')

### Get product info for each ID using the Tiki API

In [None]:
def crawl_product(product_id):
    """Fetch the detail record of one product from the Tiki product API.

    The request is repeated, with a short pause between attempts, until the
    server answers with HTTP 200.

    Args:
        product_id: Product ID substituted into the API URL.

    Returns:
        The JSON body of the successful response, decoded with ``json.loads``.

    Raises:
        ValueError: If the body of the HTTP 200 response is not valid JSON.
    """
    product_url = 'https://tiki.vn/api/v2/products/{}'.format(product_id)
    while True:
        try:
            response = requests.get(product_url, headers=headers)
        except requests.exceptions.RequestException:
            # KeyboardInterrupt is not an Exception subclass, so Ctrl+C
            # propagates on its own and needs no special handling here.
            print("Connection error")
        else:
            if response.status_code == 200:
                return json.loads(response.text)
            # NOTE(review): every non-200 answer is retried indefinitely, so a
            # permanent error for this ID never returns -- confirm whether
            # such IDs should be skipped instead.
            print("Unexpected status code: ", response.status_code)
        # Pause so a failing endpoint is not hammered in a tight loop.
        time.sleep(1)

# Crawl the detail record of every distinct product ID.
products_info = []
for index, product_id in enumerate(set(product_list_all)):
    products_info.append(crawl_product(product_id))
    # Checkpoint everything crawled so far every 100 products. The former
    # extra save at multiples of 5000 rewrote the very same file with the very
    # same data (5000 is a multiple of 100), so it has been dropped.
    if index % 100 == 0:
        print("Crawl {} products".format(index))
        save_pickle(products_info, 'products_info_summary_0_{}.pkl'.format(index))

### Convert product info from the Tiki API to a pandas DataFrame

In [None]:
# Build a DataFrame from the crawled product records (one list element per
# product) and export it as the raw product CSV.
info_df = pd.DataFrame(products_info)
info_df.to_csv('products_info_raw.csv')

## Crawl TIKI reviews

In [None]:
# Review API URL template: the first {} takes the product ID, the second the
# page number. "%7C" in the sort parameter is the URL-encoded "|" character.
review_api_url = 'https://tiki.vn/api/v2/reviews?product_id={}&sort=score%7Cdesc,id%7Cdesc,stars%7Call&page={}&limit=1000&include=comments'
def crawl_review_from_id(product_id):
    """Collect all reviews of one product through the Tiki review API.

    Pages are requested starting at page 1 until the API answers with an empty
    ``data`` list. A request that fails, or whose body cannot be decoded into
    a JSON object with a ``data`` entry, is retried after a short pause until
    it succeeds.

    Args:
        product_id: Product ID substituted into ``review_api_url``.

    Returns:
        A list with the elements of every page's ``data`` list, in page order.
    """
    review_list = []
    page = 1
    while True:
        while True:
            try:
                response = requests.get(review_api_url.format(product_id, page), headers=headers)
                review_data = json.loads(response.content)['data']
                break
            except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
                # Transport errors, undecodable bodies and payloads without a
                # usable 'data' entry are all retried, as before.
                # KeyboardInterrupt is not an Exception subclass, so Ctrl+C
                # propagates on its own and needs no special handling here.
                print("Connection error")
                # Pause so a failing endpoint is not hammered in a tight loop.
                time.sleep(1)

        if not review_data:
            # An empty page marks the end of the product's reviews.
            break
        # Extend in place: rebuilding the list with "+" on every page copies
        # everything collected so far.
        review_list.extend(review_data)
        page += 1
    return review_list
# Crawl the reviews of every distinct product ID into one flat list.
product_reviews = []
for index, product_id in enumerate(set(product_list_all)):
    # Extend in place: rebuilding the list with "+" for every product copies
    # all reviews collected so far.
    product_reviews.extend(crawl_review_from_id(product_id))
    # Checkpoint everything crawled so far every 100 products. The former
    # extra save at multiples of 5000 rewrote the very same file with the very
    # same data (5000 is a multiple of 100), so it has been dropped.
    if index % 100 == 0:
        print('Crawled {} reviews from {} products'.format(len(product_reviews), index))
        save_pickle(product_reviews, 'products_review_summary_0_{}.pkl'.format(index))
    

### Convert review data from the Tiki API to a pandas DataFrame

In [None]:
# Build a DataFrame from the crawled reviews (one list element per review)
# and export it as the raw review CSV.
review_df = pd.DataFrame(product_reviews)
review_df.to_csv('products_review_raw.csv')