In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib.request
import time
import warnings
warnings.filterwarnings('ignore')

# Get the URL of each product

In [3]:
# access the Lazada throgh auto web browser
driver = webdriver.Chrome()
url = 'https://www.lazada.vn/?creation=20-06-2019-REBRANDETA02-VN&exlaz=d_1:mm_150050845_51350203_2010350203::11:16495529661!138019133470!lazada!e!kwd-19342147066!c!!!!587236015401!&gclid=EAIaIQobChMI1_3wi-_1_gIV1JlmAh0hRgdYEAAYASAAEgLZrfD_BwE'
driver.set_window_size(730, 850)
driver.get(url)

# input searching key words
search_block = driver.find_element(by = By.CSS_SELECTOR, value = '.search-box__input--O34g')
search_block.send_keys(input("Enter searching topic: "))
search_block.send_keys(Keys.ENTER)

# get 10 page
url_lst = []

for i in range(10):
    # set explicit wait for maximum 20 seconds
    element = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, '.Bm3ON')))
    block = driver.find_elements(by = By.CSS_SELECTOR, value = '.Bm3ON .Ms6aG.MefHh .qmXQo .ICdUp ._95X4G a')
    url_lst.extend([x.get_attribute('href') for x in block])

    # get the next page button
    next_button = driver.find_element(by = By.CSS_SELECTOR, value = '.ant-pagination-next .ant-pagination-item-link')
    # click the next page
    next_button.click()
    time.sleep(3)
driver.close()

# create data frame from the collected data
print('Number of blocks: ',len(url_lst))
url_lst

Enter searching topic: Đồ gia dụng
Number of blocks:  400


['https://www.lazada.vn/products/ao-thun-nam-polo-theu-logo-mec-chat-vai-cotton-4-chieu-do-dan-cuc-ky-sang-trong-lich-lam-plomec512-i262908276.html',
 'https://www.lazada.vn/products/hang-co-sanao-tay-dai-mo-to-yamaha-mio-i-125-m3-ao-dua-xuong-doc-moi-ao-di-xe-dap-ao-chay-xe-may-xe-dap-leo-nui-quan-ao-the-thao-dua-xe-mo-to-duong-dia-hinh-xe-dap-xe-dap-i859066742.html',
 'https://www.lazada.vn/products/ao-the-thao-nam-coolmate-basics-tham-hut-nhanh-kho-i1668377561.html',
 'https://www.lazada.vn/products/asrv-ao-thun-nam-tay-ngan-dang-rong-hoa-tiet-ke-soc-waffle-thiet-ke-niche-cao-cap-ao-khoac-co-mu-de-phoi-i2048459106.html',
 'https://www.lazada.vn/products/ao-thun-nam-co-tron-dap-van-noi-theu-hoa-tiet-2-mau-trang-den-mau-moi-chat-vai-cotton-co-gian-ahfashion-i2241976321.html',
 'https://www.lazada.vn/products/ao-thun-ao-phong-beautiful-ao-phong-form-rong-tay-lo-phong-cach-han-quoc-phu-hop-ca-nam-nu-cap-doi-tokyo-shop-video-that-i1865683251.html',
 'https://www.lazada.vn/products/ao-thu

# Iterate through each product URL and extract data:
- Product name
- Price
- Discount
- Image
- Description

In [67]:
product_name, price, describe, image_url = [], [], [], []
driver = webdriver.Chrome()

# mark start time for scraping
start = time.time()
# loop over each product url to extract data
# Due to the limited memmory of system, I only scrap 50 products
for u in url_lst[:50]:
    driver.set_window_size(730, 850)
    driver.get(u)
    
    # scrolling
    previous_height = driver.execute_script('return document.body.scrollHeight')
    body = driver.find_element(By.TAG_NAME, value = 'body')
    while True:
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(1)
        new_height = driver.execute_script('return document.body.scrollHeight')
        if new_height == previous_height:
            break
        previous_height = new_height
    
    # set explicitly wait
    element = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.pdp-mod-specification .pdp-mod-section-title'))
        )

    # get product name
    product_name.append(driver.find_element(By.CSS_SELECTOR, 
                                            value = '.pdp-mod-product-badge-title').text)
    # get image url
    image_url.append(driver.find_element(By.CSS_SELECTOR, 
                                         value = '.pdp-mod-common-image').get_attribute('src'))
    # get product price
    p = driver.find_elements(By.CSS_SELECTOR, value = '.origin-block')
    if len(p) > 0:
        price.append(p[0].text)
    else:
        price.append(driver.find_element(By.CSS_SELECTOR, value = '.pdp-price').text)
    # get discription
    ds = driver.find_elements(By.CSS_SELECTOR, value = '.pdp-mod-specification .pdp-mod-section-title')
    describe.append(' '.join([x.get_attribute('innerHTML') for x in ds]))
    time.sleep(3)
driver.close()

# mark end time of scraping
end = time.time()
# calculate time consume for scraping 400 products 
print(f"Running time {end - start}")

Running time 1649.6687812805176


In [69]:
# create data frame based on scraped data
df = pd.DataFrame({
    'product_name' : product_name,
    'price' : price,
    'describe' : describe,
    'image' : image_url})
df[['price', 'discount']] = df['price'].str.split('-', expand=True)
df['price'] = df['price'].str.replace(' ₫', '')
df['discount'] = df['discount'].fillna(0)

In [70]:
df.shape

(50, 5)

In [71]:
df.head()

Unnamed: 0,product_name,price,describe,image,discount
0,ÁO THUN NAM POLO THÊU LOGO MEC CHẤT VẢI COTTON...,119.0,Đặc tính sản phẩm ÁO THUN NAM POLO THÊU LOGO M...,https://lzd-img-global.slatic.net/g/ff/kf/Se37...,34%
1,【Hàng Có Sẵn】áo Tay Dài Mô Tô Yamaha Mio I 125...,312.0,Đặc tính sản phẩm 【Hàng Có Sẵn】áo Tay Dài Mô T...,https://lzd-img-global.slatic.net/g/p/1ac0e62b...,38%
2,Áo thể thao nam Coolmate Basics thấm hút nhanh...,99.0,Đặc tính sản phẩm Áo thể thao nam Coolmate Bas...,https://lzd-img-global.slatic.net/g/p/4bede891...,2%
3,ASRV Áo Thun Nam Tay Ngắn Dáng Rộng Họa Tiết K...,199.0,Đặc tính sản phẩm ASRV Áo Thun Nam Tay Ngắn Dá...,https://lzd-img-global.slatic.net/g/p/b7ff0ac5...,60%
4,Áo thun nam cổ tròn dập vân nổi thêu họa tiết ...,99.0,Đặc tính sản phẩm Áo thun nam cổ tròn dập vân ...,https://lzd-img-global.slatic.net/g/ff/kf/S8e5...,63%


In [72]:
df.tail()

Unnamed: 0,product_name,price,describe,image,discount
45,Áo thun áo phông BEAUTIFUL- Áo phông form rộng...,59.0,Đặc tính sản phẩm Áo thun áo phông BEAUTIFUL- ...,https://lzd-img-global.slatic.net/g/p/6814f18d...,41%
46,Áo thun tay lỡ - PHỐI LAYER + IN NỔI CHỮ FLOWE...,200.0,Đặc tính sản phẩm Áo thun tay lỡ - PHỐI LAYER ...,https://lzd-img-global.slatic.net/g/p/28df7959...,52%
47,Áo thun nam POLO vải cá sấu cotton cao cấp ngắ...,109.0,Đặc tính sản phẩm Áo thun nam POLO vải cá sấu ...,https://lzd-img-global.slatic.net/g/p/42300e88...,29%
48,Áo Thun Tay Lỡ Form Rộng 1969Unisex Áo Phông U...,75.6,Đặc tính sản phẩm Áo Thun Tay Lỡ Form Rộng 196...,https://lzd-img-global.slatic.net/g/p/b72e3c9b...,57%
49,ASRV Áo thun nam Mới Mùa Hè kích thước lớn dán...,360.0,Đặc tính sản phẩm ASRV Áo thun nam Mới Mùa Hè ...,https://lzd-img-global.slatic.net/g/ff/kf/S3d0...,52%


In [73]:
# save data frame
df.to_csv('Lazada_male_fashion.csv', index=False)

In [75]:
# calculate the average length of product descriptions
print(np.array([len(x) for x in df.loc[:, 'describe']]))
print('\nAverage description length:', np.array([len(x) for x in df.loc[:, 'describe']]).mean())

[118 190  68 122 121 143 124  93 189 122 173 109 165 143 132  80 131 232
 104 103  82  71 107 249 133  72 144 265 251  76 107 113 107  92  85 106
 155 237  97 120 118 190  68 122 121 143 124  93 189 122]

Average description length: 132.42
