# Scrape CellphoneS

In [553]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# # This notebook was originally run on Linux using Google Chrome.

# !sudo cp ./chromedriver /usr/bin

In [633]:
def _parse_price(price_box, css_class):
    """Return the text of the <p class=css_class> price tag, or None if absent."""
    try:
        # The last two characters are dropped, presumably a trailing currency
        # suffix -- TODO confirm against the live page.
        return price_box.find('p', {'class': css_class}).text[:-2]
    except AttributeError:
        # Either the price box itself or the requested <p> was not found.
        return None


def _parse_comment_count(soup):
    """Return the comment count text from a parsed page, or None if absent."""
    try:
        # Fourth whitespace-separated token with its first character removed
        # (presumably an opening parenthesis -- TODO confirm).
        return soup.find('p', {'id': 'total_comment'}).text.split()[3][1:]
    except (AttributeError, IndexError):
        # Element missing, or its text has fewer than four tokens.
        return None


def get_details(item, max_retries=3):
    """Extract the details of an item.

    Relies on the module-level Selenium ``driver`` to load the item's page.

    Arguments:
        item -- a BeautifulSoup element containing url to the item's page.
        max_retries -- how many times the page is reloaded when the comment
            count cannot be read; previously the function retried without
            any bound by calling itself recursively.

    Return:
        a dictionary containing all details scraped for the specified item.
        'special_price', 'old_price' and 'comment_count' are None when the
        corresponding element cannot be found.
    """

    # extract the url
    url = item.find('div', 'item-product__box-name').a.get('href')

    # Load the page, reloading a bounded number of times while the comment
    # count is not yet readable.
    comment_count = None
    for _ in range(max_retries + 1):
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        comment_count = _parse_comment_count(soup)
        if comment_count is not None:
            break

    results = {'url': url}

    # name
    results['item_name'] = soup.find('div', {'class': 'box-name__box-product-name'}).h1.text.strip()

    # price
    price_box = soup.find('div', 'box-info__box-price')
    results['special_price'] = _parse_price(price_box, 'special-price')
    results['old_price'] = _parse_price(price_box, 'old-price')

    # versions with different prices
    for version in soup.find_all('a', 'item-linked'):
        results[version.strong.text] = version.span.text[:-2]

    # rating: one entry per star level, keyed like '<n>star'
    for level in soup.find_all('div', 'item-statistical'):
        star_key = level.find('p', 'number-star').strong.text + 'star'
        # The last nine characters are dropped, presumably a fixed text suffix
        # after the figure -- TODO confirm.
        results[star_key] = level.find('p', 'number-percent').text[:-9]

    # specifications: consecutive <th> cells are paired as (label, value)
    spec_modal = soup.find('div', {'id': 'technicalInfoModal'})
    header_cells = spec_modal.find_all('th') if spec_modal is not None else []
    for label, value in zip(header_cells[::2], header_cells[1::2]):
        results[label.text] = value.text

    # comment count (None when it never became readable)
    results['comment_count'] = comment_count

    return results
        

In [591]:
page_url = "https://cellphones.com.vn/laptop.html"

# Browser session shared by the rest of the notebook (get_details uses it too).
driver = webdriver.Chrome()

driver.get(page_url)
soup = BeautifulSoup(driver.page_source, 'html.parser')

# click to the "show more" button to get full list of laptops
# The loop ends when the button does not appear within 60 seconds or the
# click fails. Only Exception is caught, so Ctrl-C still interrupts the wait
# instead of being swallowed.
load = True
while load:
    try:
        WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'btn-show-more'))
            ).click()
    except Exception:
        load = False

In [638]:
# Parse the listing page as currently rendered in the browser.
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Collect every <div> carrying the "item-product" class: one per laptop.
results = soup.find_all('div', 'item-product')

f'{len(results)} laptops found'

'440 laptops found'

In [None]:
# Visit each listed laptop in turn and keep the dictionary of scraped details.
records = []

for item in results:
    records.append(get_details(item))
    print(records[-1]['item_name'], 'details scrapped')

In [635]:
# Union of every field name that appears in at least one record.
key_set = set()

for record in records:
    # Updating a set from a dict adds the dict's keys.
    key_set.update(record)

In [636]:
# save records to .csv file
# - the with-block closes the file even if writing fails part-way;
# - newline='' is what the csv module requires for files it writes to;
# - UTF-8 is set explicitly because the scraped text may contain non-ASCII
#   characters and the platform default encoding is not guaranteed;
# - the field names are sorted so the column order is the same on every run
#   (a set has no stable iteration order).
with open('records_cellphones.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, sorted(key_set))
    writer.writeheader()
    # Records lacking a field get an empty cell (DictWriter's default restval).
    writer.writerows(records)