In [None]:
import configparser
import json
import math
import random
import threading
import time
from queue import Queue
from urllib.parse import quote_plus

import numpy as np
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

In [None]:
SHOPEE_URL = "https://shopee.co.th"

# HTTP headers passed to requests when calling the Shopee API.
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}

# --- Database settings, read from the [Database] section of config.ini ---
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so fail here with a clear message instead of with an
# opaque KeyError on the lookups below.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini was not found or could not be read; "
        "it must provide a [Database] section"
    )

MONGO_DB = config["Database"]["MONGO_DB"]
MONGO_HOST = config["Database"]["HOST"]
MONGO_USER = config["Database"]["MONGO_USER"]
MONGO_PASS = config["Database"]["MONGO_PASS"]

# Credentials are percent-escaped before being placed in the URI: a raw '@',
# ':', '/' or '%' in the user name or password would otherwise corrupt it.
# NOTE(review): assumes config.ini stores the credentials un-escaped — confirm.
mongoURL = "mongodb://%s:%s@%s/%s?authMechanism=SCRAM-SHA-1" % (
    quote_plus(MONGO_USER),
    quote_plus(MONGO_PASS),
    MONGO_HOST,
    MONGO_DB,
)
dbName = MONGO_DB
client = MongoClient(mongoURL)
db = client[dbName]

# Collection names
PRODUCT = "shopee_skincare"
SELLER = "shopee_skincare_seller"
ERROR = "shopee_skincare_error"

In [None]:
# Crawl seller (shop) data
def crawl_seller(shopId, timeout=30):
    """Fetch one shop's profile from the Shopee shop API and flatten it.

    Args:
        shopId: Shop identifier. It is stringified into the request URL and
            stored unchanged under ``"shop_id"`` in the result.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled connection would block its worker thread
            (and whoever joins it) indefinitely.

    Returns:
        dict with the keys ``shop_id``, ``shop_name``, ``response_rate``,
        ``total_avg_star``, ``follower_count``, ``is_shopee_verified``,
        ``ctime``, ``preparation_time`` and ``is_official_shop``.
        Plain fields missing from the payload are ``None`` (the result of
        ``dict.get``). ``ctime`` and ``preparation_time`` are ``""`` when
        missing or unconvertible. ``shop_name`` and ``total_avg_star`` are
        ``""`` when the payload has no ``account`` object.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
        ValueError: if the body is not valid JSON, or if its ``data`` member
            is not an object (e.g. ``null``).
        KeyError: if the JSON body has no ``data`` member.
    """
    url = "https://shopee.co.th/api/v2/shop/get?is_brief=1&shopid="+str(shopId)
    resp = requests.get(url, headers=headers, timeout=timeout)
    post = json.loads(resp.text)
    shop_data = post['data']

    # A null "data" used to surface later as an opaque AttributeError on
    # NoneType; fail right away with a message that names the shop instead.
    if not isinstance(shop_data, dict):
        raise ValueError("Shopee returned no shop data for shopid=%s" % shopId)

    # Both account-derived fields share one guard so that a missing account
    # blanks them consistently instead of crashing on shop_name only.
    account = shop_data.get('account')
    if isinstance(account, dict):
        shop_name = account.get('username')
        total_avg_star = account.get('total_avg_star')
    else:
        shop_name = ""
        total_avg_star = ""

    # time.ctime(None) returns the *current* time, so a missing timestamp
    # must be checked explicitly rather than left to the except clause.
    raw_ctime = shop_data.get('ctime')
    try:
        ctime = time.ctime(raw_ctime) if raw_ctime is not None else ""
    except (TypeError, ValueError, OverflowError, OSError):
        ctime = ""

    raw_preparation = shop_data.get('preparation_time')
    try:
        preparation_time = getTime(raw_preparation) if raw_preparation is not None else ""
    except (TypeError, ValueError):
        preparation_time = ""

    return {
        "shop_id": shopId,
        "shop_name": shop_name,
        "response_rate": shop_data.get('response_rate'),  # chat response rate
        "total_avg_star": total_avg_star,  # average star rating
        "follower_count": shop_data.get('follower_count'),  # number of followers
        "is_shopee_verified": shop_data.get('is_shopee_verified'),  # Shopee-recommended seller?
        "ctime": ctime,  # time the shop joined
        "preparation_time": preparation_time,
        "is_official_shop": shop_data.get('is_official_shop'),  # official (mall) shop?
    }


In [None]:
# Convert the seller's preparation_time value into a readable duration
def getTime(seconds):
    """Format a duration given in seconds as ``H:MM:SS``.

    Hours are not capped at 24 and are not zero-padded; minutes and seconds
    are padded to two digits.
    """
    total_minutes, secs = divmod(seconds, 60)
    hours, mins = divmod(total_minutes, 60)
    return "%d:%02d:%02d" % (hours, mins, secs)


# In[49]:


def thread_crawl(shopId,q):
    """Crawl one shop and push the resulting record onto ``q``.

    Whatever ``crawl_seller(shopId)`` returns is enqueued with ``q.put``.
    If ``crawl_seller`` raises, the exception propagates and nothing is
    enqueued for that shop.
    """
    q.put(crawl_seller(shopId))



In [None]:
## Build the de-duplicated list of shop ids found in the product collection

collection = db[PRODUCT]
# The projection asks only for the "shopid" field of each product document.
data = collection.find({}, {"shopid": 1})

# np.unique both removes duplicates and sorts the ids.
shop_list = list(np.unique([doc['shopid'] for doc in data]))

# Counters consumed by the crawl loop
max_bag = 100  # shops crawled per batch
total = 0      # running progress counter
count = 0      # shops started in the current batch

In [None]:
# Crawl every shop in batches of max_bag threads and store each batch in SELLER.
#
# Batch state is created up front (not lazily inside the loop) so that the
# final flush below also works when shop_list is empty, and so that re-running
# this cell never inherits a half-filled batch from an interrupted run.
q = Queue()
threads = []
count = 0

for shopId in shop_list:
    # shop_list comes from np.unique, so normalise each id to a plain int.
    t = threading.Thread(target=thread_crawl, args=[int(shopId), q])
    threads.append(t)
    count = count + 1
    t.start()

    if count == max_bag:  # flush the threads every max_bag shops
        for t in threads:
            t.join()
        batch = list(q.queue)
        # A crawl that raised leaves nothing in the queue; if the whole batch
        # failed there is nothing to insert, and insert_many rejects an
        # empty list.
        if batch:
            db[SELLER].insert_many(batch)
        total = total + max_bag
        print(total)
        # Start the next batch from a clean slate. Resetting here (rather than
        # on the next iteration) keeps an already-stored batch from being
        # inserted a second time by the final flush when len(shop_list) is an
        # exact multiple of max_bag.
        q = Queue()
        threads = []
        count = 0
        time.sleep(3)

# Flush the last, partially filled batch.
for t in threads:
    t.join()
batch = list(q.queue)
if batch:
    db[SELLER].insert_many(batch)
print("Good job,yunhui! It's Finished.")

In [None]:
# Peek at the records currently held in the result queue `q`
# (copied into a list so the notebook displays them).
list(q.queue)