# Project Introduction

In this project, we create a scraper program that visits a website and reads its page contents to collect certain information.

The main function of our scraper is to collect information about the products sold on Tiki, covering every page of every category. The collected information is stored in lists, which are then loaded into a Pandas dataframe.

The product information includes:

Product ID
Seller ID
Product title
Price
URL of the product image
Product Category

In [0]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

### The HTML Parser
This function will return the parsed HTML content from the given URL

In [0]:
# Parser function to retrieve and parse the HTML code of a website 
def parser(url):
    """Fetch a URL and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the response body, or ``None``
        when the request fails (the error is printed instead of raised).
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        plain = requests.get(url, timeout=30).text
    except requests.exceptions.RequestException as err:
        print('There was a problem: {}'.format(err))
        return None

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning about the guess.
    return BeautifulSoup(plain, 'html.parser')

### _Test the function_

In [0]:
# Smoke test: download and parse the Tiki homepage with parser().
url = 'https://tiki.vn/'
s = parser(url)
# Bare expression so the notebook shows the parsed document as the cell output.
s

### Collect Main Category links
This function will collect all category URLs from tiki.vn website based on its main category navigation bar.

In [0]:
def get_category_urls():
    """Get the URLs of all categories on Tiki.vn.

    Returns:
        A list of ``(category_name, category_url)`` tuples, one per usable
        entry of the homepage's main category navigation bar. The list is
        empty when the homepage could not be fetched or no entry matched.
    """
    # Get the homepage's html in BeautifulSoup format
    soup = parser("https://tiki.vn")

    # parser() yields None after a failed request; there is nothing to scan.
    if soup is None:
        return []

    category_list = []

    # Scrape through the main category navigation bar
    for item in soup.find_all('li', class_="MenuItem-tii3xq-0"):
        link = item.a
        if link is None:
            continue

        label = link.find('span', class_='text')
        href = link.get('href')

        # Skip entries missing either the text label or the target address
        # rather than aborting the whole collection on one odd menu item.
        if label is None or href is None:
            continue

        category_list.append((label.text, href))

    return category_list

### _Test the function_

In [0]:
# Collect the (category, url) pairs from the homepage navigation bar.
url_list = get_category_urls()
# Bare expression so the notebook shows the collected list as the cell output.
url_list

### The Product Scraper

The scraper will parse the product page at the given URL and return a list of products. If there are no products, it will return an empty list.

In [0]:
def scrape_products(cat, url):
    """Scrape product information of all products on one page.

    Args:
        cat: Category name stored as the last field of every product row.
        url: Address of the product listing page.

    Returns:
        A list of rows ``[data-id, data-seller-product-id, data-title,
        data-price, image src, cat]``, one per ``div.product-item`` found.
        The list is empty when the page has no products or could not be
        fetched.
    """
    # Get the parsed html code
    soup = parser(url)

    # parser() yields None after a failed request; treat it as an empty page.
    if soup is None:
        return []

    results = []

    # Iterate through all product items and store one row per product
    for product in soup.find_all('div', class_='product-item'):
        image = product.img

        results.append([
            product.get('data-id'),
            product.get('data-seller-product-id'),
            product.get('data-title'),
            product.get('data-price'),
            # A tile without an <img> (or without src) gets None instead of
            # raising and discarding every product on the page.
            image.get('src') if image is not None else None,
            cat,
        ])

    return results

### _Test the function_

In [0]:
# Scrape the first listing page of the first category;
# url_list[0] is a (category, url) pair, passed as (cat, url).
test_scraper = scrape_products(url_list[0][0], url_list[0][1])
# Bare expression so the notebook shows the scraped rows as the cell output.
test_scraper

### The Main Scraper

In [0]:
def scrape_all():
    """Scrape all products on Tiki!

    Walks every category returned by get_category_urls() and, for each one,
    requests page 1, 2, 3, ... until a page yields no products.

    Returns:
        A list holding the product rows of every scraped page, in the form
        produced by scrape_products().
    """
    print('INFO scrape_all(): Start craping')

    # Stores all product rows collected by the scraper
    list_all_products = []

    # Categories are visited from the last one to the first, so the rows come
    # back in the same order as when the links were popped off a stack.
    for cat, url_orig in reversed(get_category_urls()):

        # Join the page parameter with '&' only when the category URL already
        # carries a query string; otherwise it has to start one with '?'.
        separator = '&' if '?' in url_orig else '?'

        # Page counter and current URL are tracked per category, so no
        # inspection of the URL text is needed to tell categories from pages.
        page = 1
        url = url_orig

        while True:
            print('Scraping', cat + " page " + str(page))
            print(url)

            # Get the list of products of the current page
            list_current_products = scrape_products(cat, url)

            # A page without products means this category is exhausted
            if not list_current_products:
                break

            list_all_products += list_current_products

            # Build the next page URL of the same category
            page += 1
            url = url_orig + separator + "page=" + str(page)

            print('Add next page', page)

    # Return the final list of all products
    return list_all_products

### _Test the function_

In [0]:
# Run the full crawl over every category and keep the collected product rows.
tiki_products = scrape_all()

In [0]:
# Number of product rows collected by the crawl.
print(len(tiki_products))

In [0]:
# Load the scraped rows into a DataFrame; the column names follow the field
# order of the rows built by scrape_products().
df = pd.DataFrame(tiki_products, columns = ['product_id', 'seller_id', 'title', 'price', 'image_url', 'category'])
# NOTE(review): the result of this call is not assigned or printed, and it is
# not the last expression of the cell, so the sample is presumably never shown.
df.sample(10)
# Optional CSV export, left disabled.
#df.to_csv("tiki_all_products.csv")
# Last expression of the cell: summary statistics of the DataFrame.
df.describe()

# BONUS: Our 2nd Solution
We came up with another solution: collect the URLs of all categories first, then loop through each category URL one by one to get the products.

For convenience, we put all the code in a single cell.

In [0]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

'''*****************************************************************************
'''
def parser(url):
    """Parser function to retrieve and parse the HTML code of a website.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the response body, or ``None``
        when the request fails (the error is printed instead of raised).
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        plain = requests.get(url, timeout=30).text
    except requests.exceptions.RequestException as err:
        print('There was a problem: {}'.format(err))
        return None

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning about the guess.
    return BeautifulSoup(plain, 'html.parser')

      
'''******************************************************************************
'''
def get_category_urls():
    """Get the URLs of all categories on Tiki.vn.

    Returns:
        A list of ``(category_name, category_url)`` tuples, one per usable
        entry of the homepage's main category navigation bar. The list is
        empty when the homepage could not be fetched or no entry matched.
    """
    # Get the homepage's html in BeautifulSoup format
    soup = parser("https://tiki.vn")

    # parser() yields None after a failed request; there is nothing to scan.
    if soup is None:
        return []

    list_category_urls = []

    # Scrape through the main category navigation bar
    for item in soup.find_all('li', class_="MenuItem-tii3xq-0"):
        link = item.a
        if link is None:
            continue

        label = link.find('span', class_='text')
        href = link.get('href')

        # Skip entries missing either the text label or the target address
        # rather than aborting the whole collection on one odd menu item.
        if label is None or href is None:
            continue

        list_category_urls.append((label.text, href))

    return list_category_urls

  

'''******************************************************************************
'''
def scrape_all_products(cat_name, cat_url):
    """Collect the products of every listing page of one category.

    Pages are requested in ascending order starting at 1; the walk stops at
    the first page URL that test_prod_page_url() does not accept.

    Args:
        cat_name: Category name, used only in the progress message.
        cat_url: Category address; '&page=<n>' is appended to it.

    Returns:
        A list with the rows returned by scrape_page() for every page.
    """
    collected = []

    # Everything in a page URL except the trailing page number
    page_url_prefix = cat_url + '&page='
    page_number = 1

    while True:
        current_url = page_url_prefix + str(page_number)

        # Stop as soon as a page is not recognized as a product listing
        if not test_prod_page_url(current_url):
            break

        print('Scraping {} page {}'.format(cat_name, page_number))

        collected += scrape_page(current_url)
        page_number += 1

    return collected



'''******************************************************************************
'''
def test_prod_page_url(url_page):
    """Check whether a product page URL exists on Tiki.vn.

    Args:
        url_page: Address of the listing page to probe.

    Returns:
        True when the response body contains the results-count heading tag,
        False when it does not or when the request failed (the error is
        printed instead of raised).
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(url_page, timeout=30)
    except requests.exceptions.RequestException as err:
        print('There was a problem: {}'.format(err))
        # Report a failed probe as "no such page" with an explicit boolean.
        return False

    # Detect if this is a real product url by searching for the result html tag
    return "<h4 name=\"results-count\">" in res.text

    
    
'''******************************************************************************
'''
def scrape_page(url):
    """Scrape the products of one given product page.

    Args:
        url: Address of the product listing page.

    Returns:
        A list of rows ``[data-id, data-seller-product-id, data-title,
        data-price, image src, data-category]``, one per ``div.product-item``
        found. The list is empty when the page has no products or could not
        be fetched.
    """
    # Get the HTML document
    soup = parser(url)

    # parser() yields None after a failed request; treat it as an empty page.
    if soup is None:
        return []

    products = []

    for item in soup.find_all("div", class_='product-item'):
        image = item.img

        products.append([
            item.get('data-id'),
            item.get('data-seller-product-id'),
            item.get('data-title'),
            item.get('data-price'),
            # A tile without an <img> (or without src) gets None instead of
            # raising and discarding every product on the page.
            image.get('src') if image is not None else None,
            item.get('data-category'),
        ])

    return products



'''******************************************************************************
'''
def get_tiki_products():
    """Gather the products of every category on Tiki.vn.

    Returns:
        A list with the rows returned by scrape_all_products() for each
        (name, url) pair reported by get_category_urls(), in that order.
    """
    collected = []

    # Each entry is a (category name, category url) pair
    for cat_name, cat_url in get_category_urls():
        collected += scrape_all_products(cat_name, cat_url)

    return collected

 

### _Test the function_

In [0]:
# Run the second solution's crawl and keep the collected product rows.
tiki_products = get_tiki_products()

In [0]:
# Load the scraped rows into a DataFrame; the column names follow the field
# order of the rows built by scrape_page().
df = pd.DataFrame(tiki_products, columns = ['product_id', 'seller_id', 'title', 'price', 'image_url', 'category' ])
# Last expression of the cell: show 10 randomly sampled rows.
df.sample(10)