# Scrape CellphoneS

In [1]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# # this notebook was originally run on linux using Google Chrome

# !sudo cp ./chromedriver /usr/bin

In [79]:
def _parse_comment_count(soup):
    """Return the comment count text from a parsed item page, or None if it cannot be read."""
    node = soup.find('p', {'id': 'total_comment'})
    try:
        # The count is the 4th whitespace-separated token with its first
        # character dropped (presumably an opening bracket -- confirm
        # against the live page).
        return node.text.split()[3][1:]
    except (AttributeError, IndexError):
        # Element missing, or its text has fewer than four tokens.
        return None


def _price_text(price_box, css_class):
    """Return the text of the <p class=css_class> price without its last two characters, or None."""
    try:
        # The last two characters are stripped (presumably the currency
        # suffix -- confirm against the live page).
        return price_box.find('p', {'class': css_class}).text[:-2]
    except AttributeError:
        # Either the price box or the requested paragraph is absent.
        return None


def get_details(url, max_retries=3):
    """Extract the details of an item.

    The page is loaded through the module-level Selenium ``driver`` and
    parsed with BeautifulSoup. When the comment count cannot be read, the
    page is reloaded and parsed again, at most ``max_retries`` extra times,
    instead of recursing without bound.

    Arguments:
        url -- url to the item's page.
        max_retries -- number of additional page loads attempted when the
            comment count is unavailable (default 3; negative values are
            treated as 0).

    Return:
        a dictionary containing all details scraped for the specified item:
        'url', 'item_name', 'special_price', 'old_price', one '<n>star' key
        per rating level found, one key per specification row found, and
        'comment_count'. Scalar fields missing from the page are None;
        missing rating levels or specification rows are simply absent.
    """
    attempts = max(max_retries, 0) + 1
    for _ in range(attempts):
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        comment_count = _parse_comment_count(soup)
        if comment_count is not None:
            break

    data = {'url': url}

    # name
    name_box = soup.find('div', {'class': 'box-name__box-product-name'})
    try:
        data['item_name'] = name_box.h1.text.strip()
    except AttributeError:
        # No name box, or no <h1> inside it.
        data['item_name'] = None

    # price
    price_box = soup.find('div', 'box-info__box-price')
    data['special_price'] = _price_text(price_box, 'special-price')
    data['old_price'] = _price_text(price_box, 'old-price')

    # rating: one '<n>star' entry per rating level block
    for level in soup.find_all('div', 'item-statistical'):
        try:
            star = level.find('p', 'number-star').strong.text
            # The trailing 9 characters of the text are dropped (presumably
            # a fixed label after the number -- confirm against the page).
            value = level.find('p', 'number-percent').text[:-9]
        except AttributeError:
            # Skip a rating block that lacks the expected children.
            continue
        data[star + 'star'] = value

    # specifications: the <th> cells are read as alternating label/value pairs
    spec_modal = soup.find('div', {'id': 'technicalInfoModal'})
    cells = spec_modal.find_all('th') if spec_modal is not None else []
    for label, value in zip(cells[::2], cells[1::2]):
        data[label.text] = value.text

    # comment count (None when it could not be read after all attempts)
    data['comment_count'] = comment_count

    return data
        

In [87]:
page_url = "https://cellphones.com.vn/laptop.html"

driver = webdriver.Chrome()

driver.get(page_url)
# Snapshot of the listing before it is expanded; the page is parsed again
# once every "show more" click has been made.
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Click the "show more" button repeatedly to get the full list of laptops.
# The loop ends when the button cannot be found within 60 seconds or the
# click fails. Only Exception is caught, so a keyboard interrupt or kernel
# stop during the wait is no longer swallowed.
while True:
    try:
        WebDriverWait(driver, 60).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'btn-show-more'))
        ).click()
    except Exception:
        break

In [None]:
# Parse the listing page as it currently stands in the browser.
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Scrape the list of laptops: every <div class="item-product"> is one entry.
results = soup.find_all('div', 'item-product')

# Last expression of the cell, so the notebook displays how many were found.
f'{len(results)} laptops found'

In [86]:
# URLs of the item pages to scrape; starts empty and is filled by the
# collection loop further down before get_details() is called on each entry.
url_list = []

In [None]:
records = []

# URLs already queued. The same link can be reached from more than one
# listing entry, and without this check it would be scraped once per
# occurrence.
seen_urls = set(url_list)

for item in results:
    link = item.find('a')
    href = link.get('href') if link is not None else None
    if not href:
        # Listing entry without a usable link: nothing to visit.
        continue

    driver.get(href)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Links marked 'item-linked' on the item page (treated by this notebook
    # as the different versions of the same item).
    versions_raw = soup.find_all('a', 'item-linked')

    # Debug aid: report pages that list exactly one linked version.
    if len(versions_raw) == 1:
        print(href)

    # Queue every linked version, or the item page itself when none is listed.
    if versions_raw:
        candidates = [version.get('href') for version in versions_raw]
    else:
        candidates = [href]

    for candidate in candidates:
        if candidate and candidate not in seen_urls:
            seen_urls.add(candidate)
            url_list.append(candidate)

for url in url_list:
    try:
        record = get_details(url)
    except Exception as error:
        # One page that fails to parse should not discard the rest of the
        # run; report it and move on.
        print(f'failed to scrape {url}: {error!r}')
        continue
    records.append(record)
    print(record['item_name'])

In [90]:
# Union of the keys of all records: every column that occurs in at least
# one scraped item.
key_set = {key for record in records for key in record}

In [91]:
# save records to .csv file
# Sorting the keys gives the CSV a reproducible column order; iterating the
# set directly yields an arbitrary order.
fieldnames = sorted(key_set)

# newline='' is what the csv module requires for files it writes to, and the
# explicit UTF-8 encoding keeps the output independent of the platform's
# default encoding. The context manager closes the file even if writing fails.
with open('records_cellphones.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames)
    writer.writeheader()
    # Keys missing from a record are written as empty cells (DictWriter's
    # default restval).
    writer.writerows(records)