In [None]:
import requests, json
import threading
from queue import Queue
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient
import math
import datetime
import random
from itertools import chain
import configparser
from urllib.parse import quote_plus

# Base URL of the Shopee Thailand storefront.
SHOPEE_URL = "https://shopee.co.th"

# Headers sent with every HTTP request; the User-Agent presents the client as Googlebot.
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}


### Read the database settings from config.ini
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it did parse, so fail here with a clear message instead of a later KeyError.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini not found or unreadable")

MONGO_DB = config["Database"]["MONGO_DB"]
MONGO_HOST = config["Database"]["HOST"]
MONGO_USER = config["Database"]["MONGO_USER"]
MONGO_PASS = config["Database"]["MONGO_PASS"]

# Database connection.
# User name and password are percent-escaped so that reserved characters
# (':', '@', '/', '%') cannot corrupt the connection URI. config.ini is
# therefore expected to hold the raw, unescaped credentials.
mongoURL = "mongodb://%s:%s@%s/%s?authMechanism=SCRAM-SHA-1" % (
    quote_plus(MONGO_USER),
    quote_plus(MONGO_PASS),
    MONGO_HOST,
    MONGO_DB,
)
dbName = MONGO_DB
client = MongoClient(mongoURL)
db = client[dbName]

# Collection names: source products, scraped reviews, and URLs that failed.
PRODUCT = "shopee_skincare"
REVIEW = "shopee_skincare_review"
ERROR = "shopee_skincare_review_error"

In [None]:
def review_url(itemid,shopid,page_size=6,max_workers=8,timeout=30):
    """Collect every review page of one product.

    Requests the first page of the ratings API to read the total number of
    ratings, derives one URL per page, and downloads the pages concurrently
    by running ``thread`` on a bounded worker pool.

    Args:
        itemid: Product id, interpolated into the API URL.
        shopid: Shop id, interpolated into the API URL.
        page_size: Ratings requested per page (the ``limit`` query value);
            must be positive.
        max_workers: Upper bound on simultaneous page downloads.
        timeout: Seconds to wait for the initial request.

    Returns:
        A list with one entry per page that ``thread`` put on the queue, in
        the order the pages finished. Empty when the rating total is missing
        or zero, or when the initial response could not be fetched or parsed
        (the URL is then recorded in the ERROR collection).
    """
    from concurrent.futures import ThreadPoolExecutor

    def page_url(offset):
        # Query-string layout is kept identical for every page; only the offset varies.
        return ("https://shopee.co.th/api/v2/item/get_ratings?filter=0&flag=1&itemid=" + str(itemid)
                + "&limit=" + str(page_size) + "&offset=" + str(offset) + "&shopid=" + str(shopid))

    first_url = page_url(0)
    try:
        resp = requests.get(first_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print("request error", exc)
        db[ERROR].insert_one({"url": first_url})
        return []

    try:
        post = json.loads(resp.text)
    except ValueError:
        print("json loads error")
        db[ERROR].insert_one({"url": first_url})
        return []

    try:
        total = int(post['data']['item_rating_summary']['rating_total'])  # total number of ratings
    except (KeyError, TypeError, ValueError):
        # Summary missing, null, or not numeric: treat as "no ratings".
        total = 0

    total_pages = math.ceil(total / page_size)
    if total_pages <= 0:
        return []
    links = [page_url(page_size * i) for i in range(total_pages)]

    q = Queue()
    # A bounded pool replaces one-thread-per-page, which could start thousands
    # of threads (and simultaneous requests) for a heavily reviewed product.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(thread, url, itemid, shopid, q) for url in links]

    # Leaving the with-block waits for every worker; report pages whose worker
    # raised, since such a page contributes nothing to the queue.
    for url, future in zip(links, futures):
        error = future.exception()
        if error is not None:
            print("review page failed:", url, repr(error))

    item_list = []
    while not q.empty():
        item_list.append(q.get())
    return item_list
        
    
             
    

In [None]:
def thread(url,itemid,shopid,q):
    """Worker body: crawl one review page and put the outcome on ``q``.

    Args:
        url: Review-page URL handed to ``crawl_review``.
        itemid: Product id forwarded to ``crawl_review``.
        shopid: Shop id forwarded to ``crawl_review``.
        q: Object with a ``put`` method that receives the crawl result.
    """
    q.put(crawl_review(url, itemid, shopid))

In [None]:
def crawl_review(url,itemid,shopid,timeout=30):
    """Download one page of the ratings API and convert it to review dicts.

    Args:
        url: Full ratings-API URL of the page to fetch.
        itemid: Product id stored on every returned review.
        shopid: Shop id stored on every returned review.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of dicts with the keys ``author_username``, ``comment``,
        ``time``, ``rating_star``, ``itemid``, ``shopid`` and ``cmtid``.
        Empty when the page holds no ratings or when the request or JSON
        decoding failed (the URL is then recorded in the ERROR collection).

    Raises:
        KeyError: If a rating entry lacks one of the fields read below.
    """
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print("request error", exc)
        db[ERROR].insert_one({"url": url})
        return []

    try:
        post = json.loads(resp.text)
    except ValueError:
        print("json loads error")
        db[ERROR].insert_one({"url": url})
        # Nothing to parse; returning here avoids using an undefined `post`.
        return []

    # Tolerate a missing or null "data" / "ratings" member instead of raising.
    data = post.get('data') if isinstance(post, dict) else None
    ratings = data.get('ratings') if isinstance(data, dict) else None

    reviews = []
    for rating in ratings or []:
        # Epoch seconds -> naive local datetime at whole-second precision.
        # Same value as the former ctime()/strptime() round trip, but without
        # depending on the process locale for month and weekday names.
        created = datetime.datetime.fromtimestamp(rating['ctime']).replace(microsecond=0)
        reviews.append({
            "author_username": rating['author_username'],
            "comment": rating['comment'],
            "time": created,
            "rating_star": rating['rating_star'],
            "itemid": itemid,
            "shopid": shopid,
            "cmtid": rating['cmtid'],
        })

    return reviews
    

In [None]:
# Query products whose rating_star is greater than 0, projecting itemid and shopid.
data = db[PRODUCT].find(
    {'rating_star': {'$gt': 0}},
    {'itemid': 1, 'shopid': 1},
)

In [None]:
# Materialise the query result into a list so it can be sized with len() and iterated.
data = list(data)

In [None]:
BATCH_SIZE = 50        # products whose reviews are written in one bulk insert
BATCH_PAUSE_SEC = 10   # pause between batches (presumably to throttle requests)

count = 0              # products processed in the current batch
bag = 0                # number of batches flushed so far
dataLen = len(data)
resultList = []        # per-page review lists gathered for the current batch
total = 0              # products processed overall

for index, d in enumerate(data, start=1):

    # review_url returns one list of reviews per page; extend in place rather
    # than rebuilding the accumulator on every product.
    resultList.extend(review_url(d['itemid'], d['shopid']))
    count += 1

    # Flush on every full batch and once more for the final, possibly partial,
    # batch so the trailing products are not dropped.
    is_last = index == dataLen
    if count == BATCH_SIZE or is_last:
        bag += 1
        # resultList is a list of page lists; flatten it into one list of review documents.
        reviews = list(chain.from_iterable(resultList))
        if reviews:  # insert_many rejects an empty document list
            db[REVIEW].insert_many(reviews)
        total += count
        print(total)
        resultList = []  # start the next batch empty
        count = 0
        if not is_last:
            time.sleep(BATCH_PAUSE_SEC)
        
    