In [5]:
import urllib.request
from bs4 import BeautifulSoup # parse html
import re #regex
import csv
import os
import json
import pandas as pd
import urllib.request
import joblib #load, dump pkl
from underthesea import word_tokenize #word_tokenize of lines
import numpy as np
import transformers as ppb # load model BERT
from transformers import BertModel, BertTokenizer
import torch
from sklearn.model_selection import train_test_split
# scrap comment = selenium
from selenium import webdriver 
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
# import requests

# Craw comment of product tiki, lazada

In [22]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.common.exceptions import TimeoutException, WebDriverException

def load_url_selenium_lazada(url):
    # Create a Service object to specify the path to ChromeDriver
    service = Service(executable_path='chromedriver-win64/chromedriver.exe')
    driver = webdriver.Chrome(service=service)
    
    print("Loading url=", url)
    driver.get(url)
    list_review = []
    # just crawl 10 pages
    x = 0
    while x < 10:
        try:
            # Wait for reviews to load
            WebDriverWait(driver, 5).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.item")))
        except:
            print('No comments found')
            break

        product_reviews = driver.find_elements(By.CSS_SELECTOR, "[class='item']")
        # Get product reviews
        for product in product_reviews:
            review = product.find_element(By.CSS_SELECTOR, "[class='content']").text
            if review.strip():
                print(review, "\n")
                list_review.append(review)
        
        # Check if the next button is disabled or not
        if len(driver.find_elements(By.CSS_SELECTOR, "button.next-pagination-item.next[disabled]")) > 0:
            break
        else:
            try:
                button_next = WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "button.next-pagination-item.next")))
                driver.execute_script("arguments[0].click();", button_next)
                print("Next page")
                time.sleep(2)
                x += 1
            except TimeoutException:
                print('Next button not found or timeout')
                break
                
    driver.close()
    return list_review

def load_url_selenium_tiki(url):
    # Create a Service object to specify the path to ChromeDriver
    service = Service(executable_path='chromedriver-win64/chromedriver.exe')
    driver = webdriver.Chrome(service=service)
    
    print("Loading url=", url)
    driver.get(url)
    list_review = []
    # just crawl 10 pages
    x = 0
    while x < 10:
        try:
            # Wait for reviews to load
            WebDriverWait(driver, 5).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.review-comment")))
        except:
            print('No comments found')
            break

        product_reviews = driver.find_elements(By.CSS_SELECTOR, "[class='review-comment']")
        # Get product reviews
        for product in product_reviews:
            review = product.find_element(By.CSS_SELECTOR, "[class='review-comment__content']").text
            if review.strip():
                print(review, "\n")
                list_review.append(review)
        
        # Check if the next button is disabled or not
        try:
            button_next = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "[class = 'btn next']")))
            driver.execute_script("arguments[0].click();", button_next)
            print("Next page")
            time.sleep(2)
            x += 1
        except (TimeoutException, WebDriverException) as e:
            print('Failed to load next page:', e)
            break
                
    driver.close()
    return list_review


# Standard data, tokenizer

In [11]:

def standardize_data(row):
    # remove stopword
    # Remove . ? , at index final
    row = re.sub(r"[\.,\?]+$-", "", row)
    # Remove all . , " ... in sentences
    row = row.replace(",", " ").replace(".", " ") \
        .replace(";", " ").replace("“", " ") \
        .replace(":", " ").replace("”", " ") \
        .replace('"', " ").replace("'", " ") \
        .replace("!", " ").replace("?", " ") \
        .replace("-", " ").replace("?", " ")

    row = row.strip()
    return row

# Tokenizer
def tokenizer(row):
    return word_tokenize(row, format="text")

def analyze(result):
    bad = np.count_nonzero(result)
    good = len(result) - bad
    print("No of bad and neutral comments = ", bad)
    print("No of good comments = ", good)

    if good>bad:
        return "Good! You can buy it!"
    else:
        return "Bad! Please check it carefully!"


# Processing data

In [12]:
def processing_data(data):
    # 1. Standardize data
    data_frame = pd.DataFrame(data)
    print('data frame:', data_frame)
    data_frame[0] = data_frame[0].apply(standardize_data)

    # 2. Tokenizer
    data_frame[0] = data_frame[0].apply(tokenizer)

    # 3. Embedding
    X_val = data_frame[0]
    return X_val


# Load Pretrain model BERT

In [13]:
def load_pretrainModel(data):
    
    '''
    Load pretrain model/ tokenizers
    Return : features
    '''
    model = BertModel.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    #encode lines
    tokenized = data.apply((lambda x: tokenizer.encode(x, add_special_tokens = True)))

    # get lenght max of tokenized
    max_len = 0
    for i in tokenized.values:
        if len(i) > max_len:
            max_len = len(i)
    print('max len:', max_len)

    # if lenght of tokenized not equal max_len , so padding value 0
    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
    print('padded:', padded[1])
    print('len padded:', padded.shape)

    #get attention mask ( 0: not has word, 1: has word)
    attention_mask = np.where(padded ==0, 0,1)
    print('attention mask:', attention_mask[1])

    # Convert input to tensor
    padded = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)


    # Load model
    with torch.no_grad():
        last_hidden_states = model(padded, attention_mask =attention_mask)
    #     print('last hidden states:', last_hidden_states)

    features = last_hidden_states[0][:,0,:].numpy()
    print('features:', features)
    
    return features


# Predict

In [27]:
def predict(url):
    # 1. Load URL and print comments
    if url== "":
        url = "https://tiki.vn/dien-thoai-samsung-galaxy-m31-128gb-6gb-hang-chinh-hang-p58259141.html"
    #data = load_url_selenium_lazada(url)
    data = load_url_selenium_tiki(url)
    data = processing_data(data)
    features = load_pretrainModel(data)
    # 2. Load weights
    model = joblib.load('save_model.pkl')
    # 3. Result
    result = model.predict(features)
    print(result)
    print(analyze(result))
predict(url ='https://tiki.vn/dien-thoai-samsung-galaxy-a05s-4gb-128gb-da-kich-hoat-bao-hanh-dien-tu-hang-chinh-hang-p273258825.html')

Loading url= https://tiki.vn/dien-thoai-samsung-galaxy-a05s-4gb-128gb-da-kich-hoat-bao-hanh-dien-tu-hang-chinh-hang-p273258825.html
No comments found
data frame: Empty DataFrame
Columns: []
Index: []


KeyError: 0