# Web Scraping Project
### Introduction

This project goal is to scrape a certain e-commerce website (in this case bukalapak.com) using python with some packages; `Selenium`, `BeautifulSoup` and `pandas`. 

`Selenium` will be used to do the automation of webdriver.
<br>`BeautifulSoup` will be used to parse the html.
<br>`pandas` will be used for the data manipulation.

Some information that will be scraped are:
    - product_name = name of the product 
    - product_category = category of the product
    - rating = product rating out of 5 stars
    - store_name = name of the store including the city of the store
    - price = price of the product
    
Data that will be collected consists of 5 different categories (lari, renang, badminton, tenis, pancing) with at least 500 items in each category.


In [3]:
# Install the following packages for the first time use only
pip install selenium
pip install bs4

Collecting bs4
  Using cached https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py): started
  Building wheel for bs4 (setup.py): finished with status 'done'
  Stored in directory: C:\Users\Arsil\AppData\Local\pip\Cache\wheels\a0\b0\b2\4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1
Note: you may need to restart the kernel to use updated packages.


In [8]:
#1. Import Packages needed for the project
import selenium #for automation
from selenium import webdriver #for canvas navigator
from selenium.webdriver.common.by import By #for finding elements
from selenium.webdriver.support.ui import WebDriverWait #to make the webdriver wait
from bs4 import BeautifulSoup #to make the scraping easier
import pandas as pd #to manipulate data

In [9]:
#2. Setting up the driver

#Path of the webdriver
path = "C:/Program Files (x86)/chromedriver.exe"

#Setting Webdriver
driver = webdriver.Chrome(executable_path = path)

  import sys


In [28]:
#3. Setting up the automated surf for web driving

wait = driver.implicitly_wait(10) #set the driver to wait

#define the website target and category target
website_address = "https://www.bukalapak.com/c/olahraga/"
subcategory = ['lari', 'renang', 'badminton', 'tenis', 'pancing']

#define the container for data storing
products = []

#defining the function to scrape surfed data to each container
def scrape_func(i):
    results = [] #container for the scraped data
    
    #to set the inside of the html
    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    #parsing the html
    soup = BeautifulSoup(html,"html.parser")
    
    #iteration for scraping every product displayed in the website
    for product in soup.find_all('div', class_= 'bl-product-card__description') :
        result = product.get_text().split("\n") #soup is scraping in JSON format, so get_text is used to scrape the text format
        result.append(i) #add the scraped data to the container
        results.append(result)
    return results #return the result of scraped data

#iteration for pagination on webdriver
for i in subcategory: #iteration for the subcategory
    for j in range(40): #iteration for pages needed to fulfil the minimum desired data
        
        '''
        because the website structure for the pagination is different between the first page and
        the next one, I use 2 conditions for the pagination
        '''
        if j > 0 : #for pagination when page number is greater than 1
            driver.get(website_address+i+'?page='+str(j+1))
            wait
            driver.execute_script("window.scrollTo(0, 700)")
            product = scrape_func(i)
            for k in product :
                products.append(k)
        else :  #for pagination of the first page
            driver.get(website_address+i+'?')
            wait
            driver.execute_script("window.scrollTo(0, 700)")
            product = scrape_func(i)
            for k in product :
                products.append(k)

#After the scraping is finished, close the webdriver to save memories and power            
driver.quit() 


  if __name__ == '__main__':


In [117]:
#explore the data scraped
products[:5]

[['',
  '            Tissue Handuk Mini Permen Travelling Liburan Kompress Towel Portable',
  '             ',
  '          Rp600',
  '            ',
  '              4.9',
  '              ',
  '          Terjual 19912',
  '         ',
  '          Depok',
  '         ',
  '            LBS OFFICIAL STORE',
  '            ',
  '            108rb Feedback Positif',
  '           ',
  'lari'],
 ['',
  '            celana mizuno voli celana badminton',
  '             ',
  '          Rp23.800',
  '            ',
  '              4.9',
  '              ',
  '          Terjual 16296',
  '         ',
  '          Jakarta Barat',
  '         ',
  '            IZDIHAAR SPORT',
  '            ',
  '            19rb Feedback Positif',
  '           ',
  'lari'],
 ['',
  '            Sepatu Sneaker Pria Wanita Slip On Sepatu Sport Olahraga Lari Running Jogging Sepatu Casual Cowok Cewek Terbaru Terlaris',
  '             ',
  '          Rp145.800',
  '            ',
  '              4.9',
  '     

The result from scraping is very 'dirty'. It has many information that we don't need, so we have to do some cleaning dor the data to make it easier for further analysis.

In [128]:
#4. Data Cleaning

#preparing the container for the cleaning result
products_cleaned = []

#Iteration for data cleaning
for product in products: 
    container = [] #reusable container for each product description
    for item in product:
        item = item.strip() #cleaning the whitespace
        if item != '' and item != 'Grosir' and item.startswith('Terjual') != True and item.endswith('Feedback Positif') != True and item.endswith('Feedback Negatif') != True :
            '''
            the if statement above is cleaning the excess newline, "Grosir" information
            sold item count, and item's feedback count
            '''
            container.append(item) #collecting the cleaned item
    if container[2].startswith('Rp') == True and container[3].endswith('%') == True:
        '''
        the if statement above is cleaning the discount information that display the
        price before discount and the value of the discount
        '''
        container.pop(2) #delete the price-before-discount data
        container.pop(2) #delete the discount value
    elif container[1].startswith('Rp') != True:
        #this statement clean the remaining abnormal data
        container[0] = container[0]+container[1]
        container.pop(1)
    elif len(container[2]) > 3:
        container.insert(2, "no rating")
    
    container[1] = container[1].replace("Rp", "") #Replacing the "Rp" so that we can turn it into int
    container[1] = container[1].replace(".","") #Replacing the "." so that we can turn it into int
    container[1] = int(container[1]) #changing the data type of the string into int
    
    if len(container[2]) == 3 : #changing the data that has rating into float
        container[2] = float(container[2])
    products_cleaned.append(container)
    
products_cleaned

[['Tissue Handuk Mini Permen Travelling Liburan Kompress Towel Portable',
  600,
  4.9,
  'Depok',
  'LBS OFFICIAL STORE',
  'lari'],
 ['celana mizuno voli celana badminton',
  23800,
  4.9,
  'Jakarta Barat',
  'IZDIHAAR SPORT',
  'lari'],
 ['Sepatu Sneaker Pria Wanita Slip On Sepatu Sport Olahraga Lari Running Jogging Sepatu Casual Cowok Cewek Terbaru Terlaris',
  145800,
  4.9,
  'Bandung',
  'Saudagar Muda',
  'lari'],
 ['celana voli fullprint kolor voly printing volleyball badminton',
  45000,
  4.8,
  'Jakarta Barat',
  'IZDIHAAR SPORT',
  'lari'],
 ['Penyangga Punggung Shoulder Back Support Belt Posture Corrector Korset - Hitam XL',
  45000,
  4.7,
  'Depok',
  'LBS OFFICIAL STORE',
  'lari'],
 ['Legging Panjang Baselayer Ukuran Standar dan Jumbo',
  18000,
  4.8,
  'Depok',
  'Leggingid official',
  'lari'],
 ['Tali medali ORIGINAL Tali medali sleret Merah Putih dan Hijau Putih',
  1650,
  '5',
  'Jakarta Utara',
  'Bintang Jaya',
  'lari'],
 ['CONE / CONES KERUCUT LENTUR UNTUK

In [132]:
#5. Data Storing

#prepare the container
product_name, product_category, product_rating, store_name, price = [],[],[],[],[]

#filling the container
for item in products_cleaned:
    product_name.append(item[0]) #adding the product name in the index of 0 from the beginning
    product_category.append(item[-1]) #adding the product category name in the index of 1 from behind
    product_rating.append(item[2]) #adding the product rating in the index of 2 from the beginning
    store_name.append(item[-2]) #adding the product name in the index of 2 from behind
    price.append(item[1]) #adding the product name in the index of 1 from the beginning
    
#zipping the lists into dictionary
listCols = ['product_name', 'product_category', 'product_rating', 'store_name', 'price']
dict_data = dict(zip(
            listCols,
                    (product_name, 
                     product_category, 
                     product_rating, 
                     store_name, 
                     price)
))

#saving the dataframe into csv
df = pd.DataFrame(data = dict_data)
df.to_csv('scraped_data.csv', index=False)

### This is the end of the project.

Thank you for your attention.
<br>
<br>
<br>
<br>
<br>
*below is some experiments I used for this project. you can check this out of you're interested about my trial and error. Thank you.

In [113]:
#It was used to check on the structure of the data

'''
sembilan = 0
sepuluh = 0
sebelas = 0
duabelas = 0
lebih = 0
lima = 0
enam = 0
tujuh = 0
delapan = 0
for product in products_cleaned :
    if len(product) <= 6 :
        enam+=1
    elif len(product) == 7:
        tujuh +=1
    elif len(product) == 8:
        delapan +=1
    elif len(product) == 9 :
        sembilan +=1
    elif len(product) == 10:
        sepuluh +=1
    elif len(product) == 11:
        sebelas +=1
    elif len(product) == 12:
        duabelas +=1
    else:
        lebih+=1#print(len(product))
        
print(enam, tujuh, delapan, sembilan, sepuluh, sebelas, duabelas, lebih)
enam+tujuh+delapan+sembilan+sepuluh+sebelas+duabelas+lebih
'''        

3880 0 0 0 0 0 0 0


3880

In [112]:
#this one for exploring the data for cleaning too
enam = []
tujuh = []
delapan = []
for product in products_cleaned :
    if len(product) <= 6 :
        enam.append(product)
    elif len(product) == 7:
        tujuh.append(product)
    elif len(product) == 8:
        delapan.append(product)
    elif len(product) == 9 :
        sembilan.append(product)
    elif len(product) == 10:
        sepuluh.append(product)
        
print(enam[0])    

['Tissue Handuk Mini Permen Travelling Liburan Kompress Towel Portable', 'Rp600', '4.9', 'Depok', 'LBS OFFICIAL STORE', 'lari']


In [74]:
#it was used to check for the abnormality
lah_tes = 0
for product in products_cleaned :
    if len(product) == 7 :
        pisah = product[2].split("Terjual")
        if pisah[0] == '':
            jumlah_tes+=1
        else:
            print(product)
jumlah_tes
        

['Besst seller sepatu tenis reebok', 'Grosir', 'Rp295.000', 'Bandung Barat', 'ADRS FOOTWEAR PROJECT', '8rb Feedback Positif', 'tenis']
['Raket Badminton Tanggung Baby Racket By Classic 2 Pcs', 'Grosir', 'Rp75.000', 'Depok', 'ABB Family', '11rb Feedback Positif', 'tenis']
['celana tenis-celana pingpong-celana murah-celana joging stiga gold peria dan wanita', 'Grosir', 'Rp75.000', 'Jember', 'IMPORTIR JERSEY  FUTSAL', '5rb Feedback Positif', 'tenis']
['GROSIR Raket Tenis Carbon Aluminium Alloy', 'Grosir', 'Rp224.000', 'Jakarta Barat', 'Ogah Repot', '21rb Feedback Positif', 'tenis']


42

In [29]:
#it was used to check on the minimum quantity for the data needed
renang = 0
tenis = 0
lari = 0
badminton = 0
pancing = 0
for product in products:
    if product[-1] == 'renang':
        renang+=1
    elif product[-1] == 'tenis':
        tenis += 1 
    elif product[-1] == 'badminton' :
        badminton += 1
    elif product[-1] == 'lari' :
        lari += 1
    else:
        pancing +=1
print('renang:', renang, 'lari:', lari, 'badminton:', badminton, 'tenis:', tenis, 'pancing:',pancing)
    

renang: 840 lari: 880 badminton: 800 tenis: 740 pancing: 620


In [None]:
#Alternative
'''
This is my lines of codes for scraping with Selenium Pagination, unfortunately, it doesn't work.
So I'm using my code above for pagination using link address.

#Running WebDriver
driver.get(website_adress)
wait = driver.implicitly_wait(10)
wait
#driver.implicitly_wait(10)
cursor = driver.find_element(By.LINK_TEXT, "Kategori barang")
cursor.click()

wait
#driver.implicitly_wait(10)
cursor = driver.find_element(By.LINK_TEXT, "Olahraga")
cursor.click()

wait
#driver.implicitly_wait(10)
driver.execute_script("window.scrollTo(0, 700)")

kategori = ['Lari', 'Badminton', 'Renang', 'Pancing', 'Tenis']
for i in kategori:
    wait
    #driver.implicitly_wait(10)
    cursor = driver.find_element(By.LINK_TEXT, i)
    cursor.click()
    
    #for j < 20:
     #   wait
      #  driver.execute_script("window.scrollTo(0, 6000)")
       # wait
        ##driver.implicitly_wait(30)
        #(driver.find_element_by_link_text(str(j+2))).click()
        #j+=1
    wait    
    #driver.implicitly_wait(30)
    driver.execute_script("window.scrollTo(0, 100)")
    cursor2 = driver.find_element(By.LINK_TEXT, "Olahraga")
    cursor2.click()
driver.quit()
#a.click()
#close welcome screen shopee
'''
