In [1]:
# Import library
from datetime import datetime
import numpy as np
from time import sleep
import random
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.common.by import By
import pandas as pd
import os

# Folder path
folder_path = os.path.join(os.path.dirname(os.getcwd()), "data")

# Add user agen
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"

# Setting options
options = webdriver.ChromeOptions()
options.add_argument(f"user-agent={user_agent}")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--start-maximized")
options.add_argument("--disable-popup-blocking")
options.add_argument("--no-sandbox")

In [2]:
# Read ids from crawled csv
productdata_filename = "mergedproduct_20240326_1926.csv"

existing_df = pd.read_csv(os.path.join(folder_path, "merged", productdata_filename))
existing_product_ids = existing_df['product_id'].tolist()
existing_data_product_ids = existing_df['data_product_id'].tolist()
existing_links = existing_df['link_item'].tolist()

In [3]:
# Get rating for comments:
def get_star(string):
    start_index = string.find(':')
    end_index = string.find('%')
    return int(string[start_index+1:end_index]) / 20

# Process data-product-id
def get_unique_data_productids(nested_list):
    unique_ids = set()
    for sublist in nested_list:
        unique_ids.update(sublist.split(','))
    return [id for id in unique_ids]

# Parse data_product_id
def parse_data_product_id(data_product_id_str):
    # Split string by ","
    id_list = data_product_id_str.split(',')
    # Get unique
    set_list = set()
    set_list.update(id_list)
    # Convert each element in the list to an integer and return
    return [id_ for id_ in set_list]

# ============================ GET INFOMATION OF ALL ITEMS
# Declare browser
driver = webdriver.Chrome(options=options)
sleep(random.randint(1,5))

crawled_ids = set()
df_list = []
# [1:40+1] ~ 1-40
# [41:80+1] ~ 41-80
# [81:120+1] ~ 81-120
# [120:160+1] ~ 121-160 
# [161:200+1] ~ 161-200
# [201:240+1] ~ 201-240

# [241:280+1] ~ 241-280
# [281:320+1] ~ 281-320

# [320:360+1] ~ 321-360 
# [361:400+1] ~ 361-400

# [401:500+1] ~ 401-500 
# [501:600+1] ~ 501-600

# [601:700+1] ~ 601-700
# [701:800+1] ~ 701-800

# [801:900+1] ~ 801-900
# [901:1000+1] ~ 901-1000

# [1001:1100+1] ~ 1001-1100
# [1101:1200+1] ~ 1101-1200

# [1201:1300+1] ~ 1201-1300
# [1301:1400+1] ~ 1301-1400

# [1401:1500+1] ~ 1401-1500
# [1501:1600+1] ~ 1501-1600

# [1601:1700+1] ~ 1601-1700
# [1701:1800+1] ~ 1701-1800

# [1801:1900+1] ~ 1801-1900
# [1901:2000+1] ~ 1901-2000
for i, row in existing_df[1901:2000+1].iterrows():
    
    # Get product page
    name_comment, content_comment, product_variant, datetime_comment, rating_comment = [], [], [], [], []
    driver.get(row['link_item'])
    sleep(random.randint(6,7))
    
    # Get data_product_id_list
    elems_data_productids_list = driver.find_elements(By.CSS_SELECTOR, '.attribute-option-item')
    uniq_data_productids_list = parse_data_product_id(",".join([elem.get_attribute('data-product-ids') for elem in elems_data_productids_list]))
    uniq_data_product_id_str = ",".join(uniq_data_productids_list)

    # Get comment_pagination_number
    elems_cmtpage_nums = driver.find_elements(By.CSS_SELECTOR, '.pagination_comment a')
    if elems_cmtpage_nums:
        commentpage_nums = [int(elem.get_attribute('rel')) for elem in elems_cmtpage_nums
                        if elem.get_attribute('rel').isdigit()]
        max_cmtpage = max(commentpage_nums) if commentpage_nums else 1
    else:
        max_cmtpage = 1

    # Decide whether to crawl
    if not set(uniq_data_productids_list).intersection(crawled_ids):
        # Get comment details
        for page_num in range(1, max_cmtpage + 1):
            try:
                sleep(random.randint(2,3))
                
                print("Crawl Page " + str(page_num))
                elems_name = driver.find_elements(By.CSS_SELECTOR , ".title_comment strong.txt_color_1")
                name_comment = [elem.text for elem in elems_name] + name_comment
                sleep(random.randint(1,2))

                elems_content = driver.find_elements(By.CSS_SELECTOR , ".item_comment .content_comment")
                content_comment = [elem.text for elem in elems_content] + content_comment
                sleep(random.randint(1,2))

                elems_product_variant = driver.find_elements(By.CSS_SELECTOR , ".item_comment .txt_999")
                product_variant = [elem.text for elem in elems_product_variant] + product_variant
                sleep(random.randint(1,2))

                elems_datetime = driver.find_elements(By.CSS_SELECTOR , ".item_comment .timer_comment")
                datetime_comment = [elem.text for elem in elems_datetime] + datetime_comment
                sleep(random.randint(1,2))

                elems_rating = driver.find_elements(By.CSS_SELECTOR , ".item_comment .number_start")
                rating_comment = [get_star(elem.get_attribute('style')) for elem in elems_rating] + rating_comment
                sleep(random.randint(1,2))
                
                next_pagination_cmt = driver.find_element(By.CSS_SELECTOR, "a.item_next_sort .icon_carret_down")
                next_pagination_cmt.click()

                print("Clicked on button next page!")
                sleep(random.randint(2,3))

            except ElementNotInteractableException:
                print("Element Not Interactable Exception!")
                break
            except NoSuchElementException:
                print("Next page button not found or not clickable!")
                break        

        # Add into a dataframe
        comment_data = pd.DataFrame(
            list(zip(name_comment, content_comment, product_variant, datetime_comment, rating_comment)), 
            columns = ['name_comment', 'content_comment','product_variant', 'datetime_comment', 'rating'])
        
        # Add column "link_item", "data_product_id_list", "data_product_id"
        comment_data.insert(0, "link_item", row['link_item'])
        comment_data.insert(1, "data_product_id_list", uniq_data_product_id_str)
        comment_data.insert(2, "data_product_id", row['data_product_id'])
        
        # For "data_product_id_list", convert string into list
        comment_data['data_product_id_list'] = comment_data['data_product_id_list'].apply(parse_data_product_id)
        df_list.append(comment_data)

        crawled_ids.update(uniq_data_productids_list)
        sleep(random.randint(6,7))
    else:
        continue

Crawl Page 1
Next page button not found or not clickable!
Crawl Page 1
Next page button not found or not clickable!
Crawl Page 1
Clicked on button next page!
Crawl Page 1
Next page button not found or not clickable!
Crawl Page 1
Clicked on button next page!
Crawl Page 1
Next page button not found or not clickable!
Crawl Page 1
Next page button not found or not clickable!
Crawl Page 1
Next page button not found or not clickable!
Crawl Page 1
Next page button not found or not clickable!
Crawl Page 1
Next page button not found or not clickable!
Crawl Page 1
Next page button not found or not clickable!
Crawl Page 1
Next page button not found or not clickable!
Crawl Page 1
Clicked on button next page!
Crawl Page 2
Clicked on button next page!
Crawl Page 1
Next page button not found or not clickable!
Crawl Page 1
Clicked on button next page!
Crawl Page 1
Next page button not found or not clickable!
Crawl Page 1
Next page button not found or not clickable!
Crawl Page 1
Next page button not fo

In [4]:
# Combine all comment crawled
combined_comment_data = pd.concat(df_list, ignore_index=True)
combined_comment_data.head(5)

  combined_comment_data = pd.concat(df_list, ignore_index=True)


Unnamed: 0,link_item,data_product_id_list,data_product_id,name_comment,content_comment,product_variant,datetime_comment,rating
0,https://hasaki.vn/san-pham/dau-goi-head-should...,"[89173, 89175, 89177, 89179]",89173,Phuong Ng,"Loại này dưỡng nhiều và có thể lên gàu nha, mì...",Dầu Gội Head & Shoulders Dưỡng Ẩm Da Đầu Khô 4...,09: 45 | 05/08/2023,4.0
1,https://hasaki.vn/san-pham/dau-goi-head-should...,"[89173, 89175, 89177, 89179]",89173,Trần Văn Sơn,"Được thật sự, mình bị viêm da tiết bã. Đang dù...",Dầu Gội Head & Shoulders Dưỡng Ẩm Da Đầu Khô 4...,23: 46 | 27/02/2023,5.0
2,https://hasaki.vn/san-pham/serum-duong-toc-ell...,"[92651, 92675, 92653, 92655, 92657, 92673, 926...",92647,Trần Thị Hà Phương,"Xài cũng mượt mà mùi hơi khó chịu, hôi mùi thu...",Serum Dưỡng Tóc Ellips Vitamin Phục Hồi Hư Tổn...,19: 25 | 12/12/2023,4.0
3,https://hasaki.vn/san-pham/serum-duong-toc-ell...,"[92651, 92675, 92653, 92655, 92657, 92673, 926...",92647,Trần Thị Hà Phương,"Màu vàng và nâu thơm, xài mượt tóc, mà phải xà...",Serum Dưỡng Tóc Ellips Vitamin Óng Mượt Vỉ 6 ...,19: 23 | 12/12/2023,5.0
4,https://hasaki.vn/san-pham/serum-duong-toc-ell...,"[92651, 92675, 92653, 92655, 92657, 92673, 926...",92647,Khánh Ly,Thơm lắm nha,Serum Dưỡng Tóc Ellips Vitamin Óng Mượt Vỉ 6 ...,14: 39 | 15/04/2023,5.0


In [5]:
# Save into csv
current_datetime = datetime.now().strftime("%Y%m%d_%H%M")
comment_data_file_name = f"comment_data_{current_datetime}.csv"
combined_comment_data.to_csv(os.path.join(folder_path, "comment", comment_data_file_name), encoding='utf-8-sig')