In [1]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import time
import random
import csv
import os
from tqdm import tqdm

In [2]:
# config
CONFIG = {
    "container": ".listproduct",  
    "item_tag": "a",             
    "see_more_btn": ".see-more-btn",
    "next_btn_id": "_pgNextPage",
    "disabled_class": "disabled"
}

# read data from file csv and access link
INPUT_FILE = 'list_link_product.csv'  # input
OUTPUT_FILE = 'reviews_rating.csv' # output
MAX_WORKERS = 4    
MAX_PAGES = 5

def init_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless") # Chạy ngầm
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # turn of img.
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option("prefs", prefs)
    
    # fake user-agent
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36")
    
    driver = webdriver.Chrome(options=options)
    return driver
    
    
def random_sleep(min_s=1.0, max_s=2.0):
    time.sleep(random.uniform(min_s, max_s))

def extract_links(driver, list_link_product):
    try : 
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, CONFIG["container"])))
        container = driver.find_element(By.CSS_SELECTOR, CONFIG["container"])
        elements = container.find_elements(By.TAG_NAME, CONFIG["item_tag"])
        for el in elements : 
            try :
                link = el.get_attribute("href")
                if link and "http" in link :
                    list_link_product.add(link)
            except StaleElementReferenceException:
                continue
    except Exception as e:
        return 
        
def process_see_more(driver, url, list_link) :
    driver.get(url)
    wait = WebDriverWait(driver, 10) 

    while True:
        try:
            btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, CONFIG["see_more_btn"])))
            # Scroll
            driver.execute_script("arguments[0].scrollIntoView(true);", btn)
            time.sleep(0.5) 
            driver.execute_script("arguments[0].click();", btn)
            random_sleep(1.5, 2.5) # load
            
        except (TimeoutException, NoSuchElementException):
            break
        except Exception as e:
                break
    # crawl data
    extract_links(driver, list_link)
    
def process_pagination(driver, url, list_link) :
    driver.get(url)
    wait = WebDriverWait(driver, 10) 
    
    while True:
        # crawl data
        extract_links(driver, list_link)
        try :
            # move page.
            next_li = driver.find_element(By.ID, CONFIG["next_btn_id"])
            if CONFIG["disabled_class"] in next_li.get_attribute("class"):
                break
                
            # Click next
            next_a = next_li.find_element(By.TAG_NAME, "a")
            driver.execute_script("arguments[0].click();", next_a)
            random_sleep(2, 3)
            
        except NoSuchElementException:
            break
        except Exception as e:
            break

def scrape_reviews(product_url):
    driver = init_driver()
    
    if "/danh-gia" not in product_url:
        if "?" in product_url:
             review_url = product_url.split("?")[0] + "/danh-gia"
        else:
             review_url = product_url + "/danh-gia"
    else:
        review_url = product_url

    
    all_reviews = []
    try : 
        driver.get(review_url)
        
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, "comment-list")))
        except TimeoutException:
            driver.quit()
            return []
        
        for page in range(1, MAX_PAGES + 1):
  
            # get current HTML
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # get list <li> comment (class="par")
            review_items = soup.find_all('li', class_='par')
            
            if not review_items:
                break
            for item in review_items:
                try:
                    # user
                    name_tag = item.find(class_='cmt-top-name')
                    user_name = name_tag.get_text(strip=True) if name_tag else "Anonymous"
                    
                    # comment
                    content_tag = item.find(class_='cmt-txt')
                    content = content_tag.get_text(strip=True) if content_tag else ""
                    
                    # rating
                    stars_container = item.find(class_='cmt-top-star')
                    if stars_container:
                        stars = len(stars_container.find_all('i', class_='iconcmt-starbuy'))
                    else:
                        stars = 0 
                    
                    product_id = product_url
                    
                    if content: 
                        all_reviews.append({
                            "user_name": user_name,
                            "product_id": product_url,
                            "rating": stars,
                            "comment": content
                        })
                except Exception as e:
                    continue  
    except Exception as e:
        pass
    finally:
        driver.quit() 

    return all_reviews

       

In [3]:
information_url = {
    "https://www.thegioididong.com/dtdd" : "SeeMore",
    "https://www.thegioididong.com/laptop" : "SeeMore",
    "https://www.thegioididong.com/may-tinh-bang" : "SeeMore",
    "https://www.thegioididong.com/pc-may-in" : "SeeMore",
    
    "https://www.thegioididong.com/dong-ho-thong-minh-da-tien-ich" : "SeeMore",
    "https://www.thegioididong.com/dong-ho-thong-minh-the-thao-chuyen-nghiep" : "SeeMore",
    "https://www.thegioididong.com/dong-ho-thong-minh-tre-em" : "SeeMore",
    "https://www.thegioididong.com/day-dong-ho" : "SeeMore",
    
    "https://www.thegioididong.com/camera-giam-sat" : "SeeMore",
    "https://www.thegioididong.com/tai-nghe" : "SeeMore",
    "https://www.thegioididong.com/sac-dtdd" : "SeeMore",
    "https://www.thegioididong.com/phu-kien/apple" : "SeeMore",
    "https://www.thegioididong.com/loa-laptop" : "SeeMore",
    "https://www.thegioididong.com/sac-cap" : "SeeMore",
    "https://www.thegioididong.com/chuong-trinh-phu-kien-laptop" : "SeeMore",

    "https://www.thegioididong.com/bang-ve-dien-tu" : "SeeMore",
    "https://www.thegioididong.com/mieng-lot-chuot" : "SeeMore",
    "https://www.thegioididong.com/den-dien-den-sac" : "SeeMore",
    "https://www.thegioididong.com/gia-treo-man-hinh" : "SeeMore",
    "https://www.thegioididong.com/phan-mem" : "SeeMore",
    "https://www.thegioididong.com/mieng-phu-ban-phim" : "SeeMore",
    "https://www.thegioididong.com/tui-dung-phu-kien" : "SeeMore",
    "https://www.thegioididong.com/tui-chong-soc" : "SeeMore",
    "https://www.thegioididong.com/thiet-bi-mang" : "SeeMore",
    "https://www.thegioididong.com/ban-phim" : "SeeMore",
    "https://www.thegioididong.com/chuot-may-tinh" : "SeeMore",
    "https://www.thegioididong.com/hub-chuyen-doi" : "SeeMore",

    # Pagination
    "https://www.thegioididong.com/dong-ho-deo-tay#c=7264&o=8&pi=0" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-nam" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-nu" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-casio" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-citizen" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-orient" : "Pagination",
    "https://www.thegioididong.com/khuyen-mai-dong-ho-chi-ban-online" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-lacoste?itm_source=trang-nganh-hang&itm_medium=filter" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-tommy" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-festina" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-coach" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-bulova" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-ferrari" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-candino" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-movado" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-certina" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-ernest-borel" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-citizen" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-edifice-casio" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-orient-star" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-casio-protrek" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-g-shock" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-MVW" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-elio" : "Pagination",
    "https://www.thegioididong.com/dong-ho-deo-tay-tre-em" : "Pagination",
    # Pagination
}

In [None]:
if __name__ == "__main__":
    list_link_product = set()
    # init driver
    driver = init_driver()
    try :
        for url, flag in information_url.items():
            if flag == "SeeMore" : 
                process_see_more(driver, url, list_link_product)
            else :
                process_pagination(driver, url, list_link_product)      
    finally :
        driver.quit()


    print(len(list_link_product))
    # save
    with open("list_link_product.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["product_link"])   # header
        for link in list_link_product:
            writer.writerow([link])

    # access link and read review 
    try:
        df_links = pd.read_csv(INPUT_FILE, header=None) 
        list_urls = df_links.iloc[:, 0].tolist() 
        list_urls = [x for x in list_urls if isinstance(x, str) and "http" in x]
    except Exception as e:
        list_urls = []
        exit()

    all_results = []

    if list_urls:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            future_to_url = {executor.submit(scrape_reviews, url): url for url in list_urls}
            
            
            for future in tqdm(as_completed(future_to_url), total=len(list_urls), desc="Crawling Reviews"):
                url = future_to_url[future]
                try:
                    data = future.result()
                    if data:
                        all_results.extend(data)
                except Exception as exc:
                    print(f"error link : {url} - {exc}")

    # create DataFrame and sort
    if all_results:
        df_final = pd.DataFrame(all_results)
        # sort username
        df_final = df_final.sort_values(by="user_name", ascending=True)
        # save file
        df_final.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
        print(df_final.head())
    else:
        print("can not crawl data.")



    

In [6]:
    try:
        df_links = pd.read_csv(INPUT_FILE, header=None) 
        list_urls = df_links.iloc[:, 0].tolist() 
        list_urls = [x for x in list_urls if isinstance(x, str) and "http" in x]
    except Exception as e:
        list_urls = []
        exit()

    all_results = []

    if list_urls:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            future_to_url = {executor.submit(scrape_reviews, url): url for url in list_urls}
            
            
            for future in tqdm(as_completed(future_to_url), total=len(list_urls), desc="Crawling Reviews"):
                url = future_to_url[future]
                try:
                    data = future.result()
                    if data:
                        all_results.extend(data)
                except Exception as exc:
                    print(f"error link : {url} - {exc}")

    # create DataFrame and sort
    if all_results:
        df_final = pd.DataFrame(all_results)
        # sort username
        df_final = df_final.sort_values(by="user_name", ascending=True)
        # save file
        df_final.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
        print(df_final.head())
    else:
        print("can not crawl data.")


Crawling Reviews: 100%|█████████████████████████████████████████████████████████| 5636/5636 [14:13:16<00:00,  9.08s/it]


      user_name                                         product_id  rating  \
14958            https://www.thegioididong.com/dong-ho-deo-tay/...       4   
40096            https://www.thegioididong.com/dong-ho-deo-tay/...       5   
70103            https://www.thegioididong.com/dong-ho-deo-tay/...       5   
40103            https://www.thegioididong.com/dong-ho-deo-tay/...       5   
14948            https://www.thegioididong.com/dong-ho-deo-tay/...       4   

                                             comment  
14958                              Đeo rất thoải mái  
40096                    Nhân viên nhiệt tình vui vẻ  
70103  đồng hồ đẹp, vừa tay, hợp với cả nam và nữ ạ.  
40103                    Nhân viên nhiệt tình vui vẻ  
14948                              Đeo rất thoải mái  


In [35]:
import pandas as pd
import numpy as np

# Read file
df = pd.read_csv('reviews_rating.csv')

# fill missing user names 
missing_mask = df['user_name'].isnull()
num_missing = missing_mask.sum()

if num_missing > 0:
    random_names = [
        "Minh Anh", "Thanh Đạt", "Hoàng Nam", "Ngọc Linh", "Đức Huy",
        "Phương Thảo", "Quang Kiệt", "Mai Trang", "Tuấn Khang", "Bảo Châu"
    ]
    df.loc[missing_mask, 'user_name'] = np.random.choice(random_names, size=num_missing)
else:
    print("There are no blank rows in the user_name column.")

# extract id and clean name ---
# regex captures: Group 1 (digits), Group 2 (name part)
# we use 'combine_first' to keep the original name if the regex doesn't match (i.e., no ID prefix)
extracted = df['user_name'].str.extract(r'^(\d+)\s*[-]*\s*(.*)$')
df['user_id'] = pd.to_numeric(extracted[0], errors='coerce') # Convert captured ID to number
df['user_name'] = extracted[1].combine_first(df['user_name']) # Use cleaned name or original

# manage id assignment ---
# create a map of names that already have an ID {Name: ID}
# we drop duplicates so each name appears only once in the map
existing_map = df.dropna(subset=['user_id']).drop_duplicates('user_name').set_index('user_name')['user_id']

# identify names that are completely missing an ID
all_unique_names = df['user_name'].unique()
# find names that are NOT in the existing_map index
names_needing_id = np.setdiff1d(all_unique_names, existing_map.index)

# generate new unique ids 
if len(names_needing_id) > 0:
    # get all IDs currently in use
    taken_ids = existing_map.values
    
    # generate available IDs (10000 to 100000)
    # np.setdiff1d removes taken_ids from the full range efficiently
    possible_ids = np.arange(10000, 100001)
    available_ids = np.setdiff1d(possible_ids, taken_ids)
    
    if len(available_ids) < len(names_needing_id):
        raise ValueError("Error: Not enough unique IDs available in the range 10000-100000.")

    # select random unique IDs for the new names
    new_ids = np.random.choice(available_ids, size=len(names_needing_id), replace=False)
    # create new mapping Series and combine with existing one
    new_map = pd.Series(new_ids, index=names_needing_id)
    final_map = pd.concat([existing_map, new_map])
else:
    final_map = existing_map

# apply and save
# map the final ids back to the dataframe
df['user_id'] = df['user_name'].map(final_map).astype(int)

# reorder columns (user_id first) and sort
df = df[['user_id'] + [c for c in df.columns if c != 'user_id']]
df = df.sort_values(by="user_id", ascending=True)

# save file
df.to_csv('rating.csv', index=False)



In [36]:
mask = df["user_name"] == "Lê Quang Phục"
print(df.loc[mask, "user_id"])

96    100971
97    100971
95    100971
98    100971
99    100971
Name: user_id, dtype: int64
