In [None]:
import requests, json
import threading
from queue import Queue
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient
import math
import random
import configparser

# Base site URL.
# NOTE(review): no other cell visible here references SHOPEE_URL; the functions
# below spell the host out literally.
SHOPEE_URL = "https://shopee.co.th"

# Default HTTP headers passed to the requests.get calls in this notebook.
# The User-Agent string identifies the client as Googlebot.
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}


### init and read config
# Connection settings come from the [Database] section of config.ini, resolved
# relative to the current working directory. ConfigParser.read() silently skips
# a missing file, in which case the lookups below raise KeyError.
config = configparser.ConfigParser()
config.read('config.ini')

MONGO_DB = config["Database"]["MONGO_DB"]
MONGO_HOST=config["Database"]["HOST"]
MONGO_USER = config["Database"]["MONGO_USER"]
MONGO_PASS = config["Database"]["MONGO_PASS"]

# db info
# NOTE(review): the user name and password are interpolated into the URI as-is.
# If they can contain reserved characters (e.g. '@', ':', '/'), they presumably
# need percent-escaping first -- confirm against how config.ini stores them.
mongoURL = "mongodb://%s:%s@%s/%s?authMechanism=SCRAM-SHA-1" % (MONGO_USER, MONGO_PASS, MONGO_HOST, MONGO_DB)
dbName = MONGO_DB
client = MongoClient(mongoURL)
db = client[dbName]

# collection name
PRODUCT = "shopee_skincare"            # crawled item documents
SELLER = "shopee_skincare_seller"      # seller documents
ERROR = "shopee_skincare_error"        # URLs whose crawl or insert failed


In [None]:
def randomHeader():
    """Return a request-header dict with a randomly chosen User-Agent.

    The value is picked with ``random.choice`` from the module-level ``agent``
    sequence.

    NOTE(review): ``agent`` is not defined in the part of the notebook visible
    here -- confirm it exists before this function is called.
    """
    return {"User-Agent": random.choice(agent)}

In [None]:
def insert_to_Db(collectionName,dataList,url):
    """Bulk-insert ``dataList`` into the collection ``collectionName``.

    Args:
        collectionName: name of the target collection in the module-level ``db``.
        dataList: iterable of documents (dicts) to insert.
        url: page URL the documents came from; recorded in the ``ERROR``
            collection when the insert fails.

    Returns:
        None. On success prints ``insert <n>`` with the real number of
        documents; on failure prints ``insert error`` and stores
        ``{"url": url}`` in ``db[ERROR]``.
    """
    try:
        # Materialise once so the reported count always matches what was sent.
        documents = list(dataList)
        db[collectionName].insert_many(documents)
        print("insert %d" % len(documents))
    except Exception:
        # Best-effort: remember the failing page, but (unlike a bare except)
        # let KeyboardInterrupt / SystemExit propagate.
        print("insert error")
        db[ERROR].insert_one({"url":url})
    

In [None]:
def crawl_page(url):
    """Crawl one search-result page and return the item dicts found on it.

    The page is fetched, every result tile's first ``<a href>`` is collected,
    and one thread per link runs ``thread(post_url, view_url, q)``, which is
    expected to put the item's data on the queue.

    Args:
        url: URL of the search-result page.

    Returns:
        list of whatever the worker threads put on the queue (one entry per
        item that was fetched successfully); empty when the page has no usable
        links.
    """
    r = requests.get(url,headers=headers,allow_redirects=True)
    soup = BeautifulSoup(r.text, 'html.parser')
    all_items = soup.find_all("div", class_="col-xs-2-4 shopee-search-item-result__item")

    q = Queue()
    threads = []
    for tile in all_items:
        anchor = tile.find('a')
        link = anchor.get('href') if anchor is not None else None
        if not link:
            # A tile without a link used to raise and abort the whole page.
            continue

        # The shop id and item id are taken from the last two dot-separated
        # pieces of the link; skip links that do not have that many pieces.
        parts = link.split(".")
        if len(parts) < 3:
            continue
        shopId = parts[-2]
        itemId = parts[-1]

        view_url = "https://shopee.co.th"+link
        post_url = "https://shopee.co.th/api/v2/item/get?itemid="+itemId+"&shopid="+shopId
        t = threading.Thread(target=thread, args=[post_url,view_url,q])
        threads.append(t)
        t.start()

    # Wait for all worker threads to finish
    for t in threads:
        t.join()

    # Safe to read the underlying deque directly: no producer is running anymore.
    return list(q.queue)
    
#     for link in links:
#         newLink = link.split(".")
#         shopId = newLink[len(newLink)-2]
#         itemId = newLink[len(newLink)-1]
#         post_url = "https://shopee.co.th/api/v2/item/get?itemid="+itemId+"&shopid="+shopId
#         item_list.append(shopee_item(post_url))
#         #seller_list.append(shopee_seller(shopId))
        
#     insert_to_Db(PRODUCT,item_list) # 把 item 輸入進mongodb
#     insert_to_Db(SELLER,seller_list) # 把 seller 輸入進mongodb

In [None]:
def thread(post_url, view_url,q):
    """Thread target: fetch one item via ``shopee_item`` and put the result on ``q``.

    Args:
        post_url: item API URL forwarded to ``shopee_item``.
        view_url: item page URL forwarded to ``shopee_item``.
        q: queue that receives the value returned by ``shopee_item``.
    """
    q.put(shopee_item(post_url, view_url))

In [None]:
# Crawl a single item
def _scaled_amount(raw_amount):
    """Return ``raw_amount / 100000``, or "" when the value is missing or not numeric."""
    try:
        return raw_amount / 100000
    except TypeError:
        return ""


def shopee_item(post_url,view_url):
    """Fetch one item from the item API and flatten it into a document dict.

    Args:
        post_url: item API URL; its JSON response must contain an ``item`` object.
        view_url: item page URL, stored unchanged under ``"url"``.

    Returns:
        dict with the item's core fields, its shop vouchers (``coupon``), its
        shipping options (``shipping_infos``) and one extra key per product
        attribute (attribute name -> attribute value). Missing prices and
        missing categories are stored as "".

    Raises:
        KeyError: the response has no ``item`` key.
        AttributeError: the ``item`` value is null.
    """
    resp = requests.get(post_url, headers=headers)
    post = json.loads(resp.text)
    items = post['item']

    shopId = items.get('shopid')    # seller id
    itemid = items.get('itemid')    # item id

    # Tolerate a missing rating object instead of crashing on None.get(...).
    item_rating = items.get('item_rating') or {}

    # Category 1 is the last entry, category 2 the one before it. Category 2
    # only exists when there are at least two entries; a negative index would
    # otherwise wrap around and repeat category 1.
    categories = items.get('categories') or []
    categories1 = categories[-1].get('display_name') if categories else ""
    categories2 = categories[-2].get('display_name') if len(categories) >= 2 else ""

    # Product attributes are optional; fall back to none on malformed data.
    try:
        attributes = get_attributes(items.get('attributes'))
    except Exception:
        attributes = []

    dic = {
        "url":view_url,
        "productName": items.get('name'),  # product name
        "itemid":itemid,  # item id
        "shopid":shopId,  # seller id
        "rating_star":item_rating.get('rating_star'),  # star rating
        "rating_count":item_rating.get('rating_count'),  # number of reviews
        "historical_sold":items.get('historical_sold'),  # units sold
        "price_before_discount":_scaled_amount(items.get('price_before_discount')),  # original price

        "price":_scaled_amount(items.get('price')),  # selling price
        "raw_discount":items.get('raw_discount'),  # discount percentage
        "liked_count":items.get('liked_count'),  # number of likes
        "categories1":categories1,  # category 1
        "categories2":categories2,  # category 2 ("" when there is none)
        "description":items.get('description'),  # product description
        "options":items.get('tier_variations'),  # options (array)
        "coupon":get_voucher(shopId),
        # Fetched exactly once; the lookup costs an extra HTTP request.
        "shipping_infos":get_shipping_infos(shopId, itemid),
        "lowest_price_guarantee":items.get('has_lowest_price_guarantee'),
        "shopee_verified":items.get('shopee_verified')
    }

    # Flatten the attributes into top-level keys (name -> value).
    for attr in attributes:
        dic[attr['name']] = attr['value']

    return dic

In [None]:
# Extract the product specifications (attributes) of an item
def get_attributes(att_list):
    """Reduce each attribute record to its ``name`` and ``value``.

    Args:
        att_list: iterable of dict-like attribute records, or ``None``.

    Returns:
        list of ``{"name": ..., "value": ...}`` dicts in input order; an empty
        list when ``att_list`` is ``None`` or empty.
    """
    return [
        {"name": att.get('name'), "value": att.get('value')}
        for att in (att_list or [])
    ]

In [None]:
def _voucher_amount(raw_value):
    """Return ``int(raw_value) / 100000``, or "" when the value is missing or not numeric."""
    try:
        return int(raw_value) / 100000
    except (TypeError, ValueError):
        return ""


# Fetch the coupon (voucher) data of a shop
def get_voucher(shopId):
    """Fetch the vouchers offered by a shop.

    Args:
        shopId: shop id placed in the voucher API query string.

    Returns:
        list with one dict per voucher, holding ``discount_percentage`` (as
        returned by the API), ``discount_value`` and ``min_spend`` (both divided
        by 100000, or "" when absent). Empty list when the response carries no
        vouchers.
    """
    url = "https://shopee.co.th/api/v2/voucher_wallet/get_shop_vouchers_by_shopid?shopid="+str(shopId)+"&with_claiming_status=false"
    resp = requests.get(url, headers=headers)
    post = json.loads(resp.text)
    # Either level may be missing or null; treat that as "no vouchers".
    voucher_list = (post.get('data') or {}).get("voucher_list") or []

    newList = []
    # The fields must be read per voucher, inside the loop. Reading them before
    # the loop variable exists left every voucher with empty values.
    for voucher in voucher_list:
        newList.append({
            "discount_percentage": voucher.get("discount_percentage"),
            "discount_value": _voucher_amount(voucher.get("discount_value")),
            "min_spend": _voucher_amount(voucher.get("min_spend"))
        })
    return newList

In [None]:
# Fetch the shipping (logistics) data of an item
def get_shipping_infos(shopId, itemId):
    """Fetch the shipping options of an item, annotated with promotion rules.

    Args:
        shopId: shop id placed in the API path.
        itemId: item id placed in the API path.

    Returns:
        list with one dict per shipping channel (``shipping_name``,
        ``channel_id``, ``shipping_fee``, ``discounted_shipping_fee``; fees
        divided by 100000). Channels covered by a promotion rule additionally
        get ``method``, ``method_discount`` and ``method_min_order_total``.
        Returns ``[]`` when the request or the parsing of the shipping options
        fails; this function is best-effort and does not raise for bad data.
    """
    url = "https://shopee.co.th/api/v0/shop/"+str(shopId)+"/item/"+str(itemId)+"/shipping_info_to_address/"
    try:
        resp = requests.get(url, headers=headers)
        post = json.loads(resp.text)
        shippingList = [
            {
                "shipping_name" : info.get('channel').get('display_name'),  # carrier name
                "channel_id" : info.get('channel').get('channelid'),  # carrier id
                "shipping_fee" : info.get('cost_info').get('estimated_shipping_fee')/100000,  # list shipping fee
                "discounted_shipping_fee" : info.get('cost_info').get('discounted_shipping_fee')/100000  # discounted shipping fee
            }
            for info in post['shipping_infos']
        ]
    except Exception:
        return []

    # Promotion rules are optional extras. A missing or malformed rule is
    # skipped so it cannot discard the shipping options parsed above.
    for rule in post.get('promotion_rules') or []:
        try:
            channel_ids = set(rule['channels'].keys())
            discount = rule['discount_delta']
            min_order_total = rule['extra_data']['min_order_total']
        except (KeyError, TypeError, AttributeError):
            continue
        for ship in shippingList:
            # Rule channel keys are strings, channel ids may be ints.
            if str(ship['channel_id']) in channel_ids:
                ship['method'] = True
                ship['method_discount'] = discount
                ship['method_min_order_total'] = min_order_total

    return shippingList

In [None]:
# Crawl seller data
def shopee_seller(shopId):
    """Fetch a shop's profile and return it as a flat dict.

    Args:
        shopId: shop id placed in the shop API query string.

    Returns:
        dict with ``shop_id``, ``shop_name``, ``response_rate``,
        ``total_avg_star``, ``follower_count``, ``is_shopee_verified``,
        ``ctime`` (formatted with ``time.ctime``) and ``preparation_time``
        (formatted with ``getTime``). ``ctime`` and ``preparation_time`` are ""
        when the API does not supply them.

    Raises:
        KeyError: the response has no ``data`` key.
    """
    url = "https://shopee.co.th/api/v2/shop/get?is_brief=1&shopid="+str(shopId)
    resp = requests.get(url, headers=headers)
    post = json.loads(resp.text)
    shop_data = post['data']

    account = shop_data.get('account') or {}
    joined_at = shop_data.get('ctime')
    preparation_seconds = shop_data.get('preparation_time')

    dic = {
        "shop_id":shopId,
        "shop_name":account.get('username'),
        "response_rate":shop_data.get('response_rate'),  # chat response rate
        "total_avg_star":account.get('total_avg_star'),  # average star rating
        "follower_count":shop_data.get('follower_count'),  # number of followers
        "is_shopee_verified":shop_data.get('is_shopee_verified'),  # Shopee-verified seller flag
        # time.ctime(None) would silently return the *current* time, so a
        # missing join time is stored as "" instead.
        "ctime":time.ctime(joined_at) if joined_at is not None else "",  # join time
        "preparation_time":getTime(preparation_seconds) if preparation_seconds is not None else ""
    }
    return dic

In [None]:
# Convert the preparation_time value of the seller data (seconds) into a clock string
def getTime(seconds):
    """Format a duration given in seconds as ``H:MM:SS``.

    Args:
        seconds: duration in seconds.

    Returns:
        str with unpadded hours and zero-padded minutes and seconds.
    """
    total_minutes, secs = divmod(seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    clock = "%d:%02d:%02d" % (hours, minutes, secs)
    return clock

In [None]:
def _category_url(min_price, max_price, page):
    """Build the category listing URL for one price range and result page."""
    return ("https://shopee.co.th/%E0%B8%9C%E0%B8%A5%E0%B8%B4%E0%B8%95%E0%B8%A0%E0%B8%B1%E0%B8%93%E0%B8%91%E0%B9%8C%E0%B8%94%E0%B8%B9%E0%B9%81%E0%B8%A5%E0%B8%9C%E0%B8%B4%E0%B8%A7%E0%B8%AB%E0%B8%99%E0%B9%89%E0%B8%B2-cat.52.1151?maxPrice="
            + str(max_price) + "&minPrice=" + str(min_price)
            + "&page=" + str(page) + "&sortBy=pop")


def shoppee_crawler(price=5910, price_range=5, price_end=10000, max_empty_ranges=5):
    """Crawl the category listing one price range at a time and store the items.

    For each range ``[price, price + price_range]`` the first listing page is
    fetched to read the page count, every page is crawled with ``crawl_page``,
    and the collected items are inserted into ``db[PRODUCT]``. Pages that yield
    no items are recorded in ``db[ERROR]``.

    Args:
        price: lower bound of the first price range.
        price_range: width of each price range; must be positive.
        price_end: crawling stops once the lower bound reaches this value.
        max_empty_ranges: crawling also stops after this many consecutive
            price ranges without any products.

    Raises:
        ValueError: ``price_range`` is not positive (the loop could never end).
    """
    if price_range <= 0:
        raise ValueError("price_range must be positive")

    start_page = 0      # listing pages are numbered from 0
    empty_streak = 0    # consecutive price ranges without products

    # `price < price_end` also terminates when the step does not land exactly
    # on a particular value, unlike an equality test.
    while price < price_end and empty_streak < max_empty_ranges:
        max_price = price + price_range
        print(price,"-",max_price)
        resp = requests.get(_category_url(price, max_price, start_page),headers = headers)
        soup = BeautifulSoup(resp.content,"lxml")

        try:
            # Total number of result pages for this price range
            totalPage = int(soup.find("span", class_="shopee-mini-page-controller__total").text)
        except (AttributeError, TypeError, ValueError):
            # No page counter on the page: treat the range as empty.
            totalPage = 0

        if totalPage > 0:
            empty_streak = 0
            resultList = []
            for page in range(start_page,totalPage):
                url = _category_url(price, max_price, page)
                result = crawl_page(url)  # returns the items crawled from this page

                if result:
                    resultList.extend(result)
                else:
                    db[ERROR].insert_one({"url":url})
                #time.sleep(5)  # pause here if the site starts blocking requests

            # Skip the insert when nothing was collected: there is nothing to store.
            if resultList:
                try:
                    print("insert")
                    db[PRODUCT].insert_many(resultList)
                except Exception as exc:
                    print("insert error", exc)

            time.sleep(5)
        else:
            # This price range has no products (or an unusable page count).
            empty_streak += 1

        # Move on to the next price range
        price = price + price_range


In [None]:
# Running this cell should be all that is needed to start the crawl.
# The call returns only after the crawler's loop has ended.
shoppee_crawler()
print("Finished.")

In [None]:
# Ad-hoc check: run shopee_item against one hard-coded item API URL.
# "hi" is a placeholder for the view_url argument, which is only stored
# under the "url" key of the returned dict.
post_url = 'https://shopee.co.th/api/v2/item/get?itemid=5110703644&shopid=166451491'
shopee_item(post_url,"hi")