In [23]:
import json
import time

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient, errors

# Base address of the Shopee storefront targeted by this notebook.
SHOPEE_URL = "https://shopee.co.th"

# HTTP headers attached to every request issued below.
headers = {
    "User-Agent": "Googlebot",
    "From": "phoebe777@gmail.com",
}

# MongoDB connection settings (blank placeholders; fill in before running).
mongoURL = ""
dbName = ""

# Names of the MongoDB collections that receive product and seller records.
PRODUCT = ""
SELLER = ""


In [24]:
def insert_to_Db(collectionName,dataList):
    """Insert a batch of documents into a MongoDB collection.

    The connection target comes from the module-level ``mongoURL`` and
    ``dbName`` settings.

    Args:
        collectionName: Name of the collection to write to.
        dataList: Documents to insert. ``None`` (or an empty batch) is a
            no-op, and ``None`` entries inside the batch are skipped.

    Returns:
        None. Connection failures are printed instead of raised, so a crawl
        keeps going when the database is temporarily unreachable.
    """
    documents = [doc for doc in (dataList or []) if doc is not None]
    if not documents:
        # insert_many refuses an empty batch, so there is nothing to send.
        return

    client = None
    try:
        client = MongoClient(mongoURL)
        client[dbName][collectionName].insert_many(documents)
    except errors.ConnectionFailure as err:
        print(err)
    finally:
        # Release the connection pool; the original leaked one client per call.
        if client is not None:
            client.close()

In [13]:
def crawl_page(url):
    """Crawl one search-result page and store its products and sellers.

    Downloads ``url``, collects the product links found in the result grid,
    looks each product and its seller up through ``shopee_item`` and
    ``shopee_seller``, and writes both batches to MongoDB with
    ``insert_to_Db``.

    Args:
        url: Address of a Shopee search/category result page.

    Returns:
        A ``(item_list, seller_list)`` tuple holding the records gathered
        from this page; either list may be empty.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(url, headers=headers, allow_redirects=True, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')
    all_items = soup.find_all("div", class_="col-xs-2-4 shopee-search-item-result__item")

    item_list = []
    seller_list = []

    for card in all_items:
        anchor = card.find('a')
        link = anchor.get('href') if anchor is not None else None
        if not link:
            # Result cell without a link: nothing to look up.
            continue

        # The last two dot-separated pieces of the link are used as shop id
        # and item id; any query string is dropped first so it cannot end up
        # inside the item id.
        parts = link.split("?", 1)[0].split(".")
        if len(parts) < 2:
            continue
        shopId, itemId = parts[-2], parts[-1]
        if not (shopId.isdigit() and itemId.isdigit()):
            continue

        post_url = SHOPEE_URL + "/api/v2/item/get?itemid=" + itemId + "&shopid=" + shopId
        item = shopee_item(post_url)
        if item is not None:
            item_list.append(item)
        seller = shopee_seller(shopId)
        if seller is not None:
            seller_list.append(seller)

    # Write the collected records to MongoDB, skipping empty batches.
    if item_list:
        insert_to_Db(PRODUCT, item_list)
    if seller_list:
        insert_to_Db(SELLER, seller_list)

    return item_list, seller_list

In [22]:
def shopee_item(url):
    """Fetch one product from the item API and flatten it into a record.

    Args:
        url: Full item-API URL, as built by ``crawl_page``.

    Returns:
        A dict describing the product (including its shop vouchers,
        attributes and shipping options), or ``None`` when the response has
        no ``item`` payload.
    """
    resp = requests.get(url, headers=headers, timeout=30)
    post = json.loads(resp.text)
    items = post.get('item')
    if not items:
        return None

    def scaled(amount):
        # Monetary fields are divided by 100000 throughout this notebook;
        # a missing amount stays None instead of raising TypeError.
        return amount / 100000 if amount is not None else None

    shopId = items.get('shopid')
    rating = items.get('item_rating') or {}
    rating_count = rating.get('rating_count') or []
    categories = items.get('categories') or []
    tier_variations = items.get('tier_variations') or []

    # NOTE(review): the original literal used the key "categories" twice, so
    # the leaf category was silently overwritten. The leaf now keeps the
    # "categories" key and the parent level is stored under "categories_2".
    return {
        "productName": items.get('name'),  # product name
        "itemid": items.get('itemid'),  # product id
        "shopid": shopId,  # seller id
        "rating_star": rating.get('rating_star'),  # star rating
        # First entry of rating_count (labelled as the review count originally).
        "rating_count": rating_count[0] if rating_count else None,
        "historical_sold": items.get('historical_sold'),  # units sold
        "price_before_discount": scaled(items.get('price_before_discount')),  # list price
        "price": scaled(items.get('price')),  # selling price
        "raw_discount": items.get('raw_discount'),  # discount percentage
        "liked_count": items.get('liked_count'),  # number of likes
        "categories": categories[-1].get('display_name') if categories else None,  # category 1
        "categories_2": categories[-2].get('display_name') if len(categories) > 1 else None,  # category 2 (optional)
        "description": items.get('description'),  # product description
        # Options of the first variation tier; empty when the product has none.
        "options": tier_variations[0].get('options') if tier_variations else [],
        "coupon": get_voucher(shopId),
        "attributes": get_attributes(items.get('attributes') or []),
        "shipping_infos": get_shipping_infos(shopId, items.get('itemid')),
    }

In [4]:
def get_attributes(att_list):
    """Reduce an item's specification entries to name/value pairs.

    Args:
        att_list: Iterable of dict-like attribute entries, or ``None``.

    Returns:
        A list of ``{"name": ..., "value": ...}`` dicts in input order; empty
        when ``att_list`` is ``None`` or has no entries.
    """
    # `or []` lets callers pass a missing attribute list straight through.
    return [
        {"name": att.get('name'), "value": att.get('value')}
        for att in (att_list or [])
    ]

In [5]:
def get_voucher(shopId):
    """Fetch the vouchers (coupons) offered by a shop.

    Args:
        shopId: Shop identifier; converted with ``str`` when building the URL.

    Returns:
        A list of dicts with ``discount_percentage``, ``discount_value`` and
        ``min_spend``. Amounts are divided by 100000; a missing amount is
        reported as ``None``. The list is empty when the response carries no
        vouchers.
    """
    url = SHOPEE_URL + "/api/v2/voucher_wallet/get_shop_vouchers_by_shopid?shopid=" + str(shopId) + "&with_claiming_status=false"
    resp = requests.get(url, headers=headers, timeout=30)
    post = json.loads(resp.text)

    # Both the "data" object and its "voucher_list" may be absent or null.
    voucher_list = (post.get('data') or {}).get("voucher_list") or []

    newList = []
    for voucher in voucher_list:
        discount_value = voucher.get("discount_value")
        min_spend = voucher.get("min_spend")
        newList.append({
            "discount_percentage": voucher.get("discount_percentage"),
            "discount_value": discount_value / 100000 if discount_value is not None else None,
            "min_spend": min_spend / 100000 if min_spend is not None else None,
        })
    return newList

In [None]:
def get_shipping_infos(shopId, itemId):
    """Fetch the shipping options available for one product.

    Args:
        shopId: Shop identifier.
        itemId: Product identifier.

    Returns:
        A list of dicts with ``shipping_name``, ``shipping_fee`` and
        ``discounted_shipping_fee``. Fees are divided by 100000; a missing
        fee is reported as ``None``. The list is empty when the response has
        no ``shipping_infos``.
    """
    url = SHOPEE_URL + "/api/v0/shop/" + str(shopId) + "/item/" + str(itemId) + "/shipping_info_to_address/"
    resp = requests.get(url, headers=headers, timeout=30)
    post = json.loads(resp.text)

    newList = []
    for info in post.get('shipping_infos') or []:
        # Either nested object may be missing; fall back to empty dicts so the
        # lookups below yield None instead of raising AttributeError.
        channel = info.get('channel') or {}
        cost_info = info.get('cost_info') or {}
        fee = cost_info.get('estimated_shipping_fee')
        discounted_fee = cost_info.get('discounted_shipping_fee')
        newList.append({
            "shipping_name": channel.get('display_name'),  # carrier name
            "shipping_fee": fee / 100000 if fee is not None else None,  # regular shipping fee
            "discounted_shipping_fee": discounted_fee / 100000 if discounted_fee is not None else None,  # discounted fee
        })
    return newList

In [6]:
def shopee_seller(shopId):
    """Fetch a seller's public profile from the shop API.

    Args:
        shopId: Shop identifier; converted with ``str`` when building the URL.

    Returns:
        A dict describing the seller, or ``None`` when the response carries
        no ``data`` payload.
    """
    url = SHOPEE_URL + "/api/v2/shop/get?is_brief=1&shopid=" + str(shopId)
    resp = requests.get(url, headers=headers, timeout=30)
    post = json.loads(resp.text)
    shop_data = post.get('data')
    if not shop_data:
        return None

    account = shop_data.get('account') or {}
    joined_at = shop_data.get('ctime')
    preparation = shop_data.get('preparation_time')

    # The original built this dict but never returned it.
    return {
        "shop_id": shopId,
        "shop_name": account.get('username'),
        "response_rate": shop_data.get('response_rate'),  # chat response rate
        "total_avg_star": account.get('total_avg_star'),  # average star rating
        "follower_count": shop_data.get('follower_count'),  # number of followers
        "is_shopee_verified": shop_data.get('is_shopee_verified'),  # Shopee-verified seller flag
        # time.ctime(None) would return the current time, so a missing
        # timestamp is kept as None rather than recorded as "now".
        "ctime": time.ctime(joined_at) if joined_at is not None else None,  # join time
        "preparation_time": getTime(preparation) if preparation is not None else None,
    }

In [7]:
def getTime(seconds):
    """Format a duration in seconds as ``H:MM:SS``.

    Used to convert the seller's ``preparation_time`` value. Hours are not
    capped, so durations of a day or more keep counting up (e.g. ``25:00:00``).
    """
    whole_minutes, secs = divmod(seconds, 60)
    hours, mins = divmod(whole_minutes, 60)
    parts = (hours, mins, secs)
    return "%d:%02d:%02d" % parts

In [10]:
def _category_page_url(min_price, max_price, page):
    """Return the cat.2077 listing URL for one price window and page number."""
    return (
        SHOPEE_URL
        + "/%E0%B8%A3%E0%B8%AD%E0%B8%87%E0%B9%80%E0%B8%97%E0%B9%89%E0%B8%B2%E0%B8%9C%E0%B8%B9%E0%B9%89%E0%B8%AB%E0%B8%8D%E0%B8%B4%E0%B8%87-cat.2077"
        + "?maxPrice=" + str(max_price)
        + "&minPrice=" + str(min_price)
        + "&page=" + str(page)
    )


def shoppee_crawler(price_range=10, max_empty_ranges=5):
    """Crawl the category listing one price window at a time.

    Starting at price 0, each window ``[price, price + price_range]`` is
    queried; every result page in the window is handed to ``crawl_page``.
    The crawl stops once ``max_empty_ranges`` consecutive windows contain no
    products.

    Args:
        price_range: Width of each price window (default 10).
        max_empty_ranges: Number of consecutive empty windows that ends the
            crawl (default 5).

    Raises:
        ValueError: If ``price_range`` is not positive (the crawl could never
            advance).
    """
    if price_range <= 0:
        raise ValueError("price_range must be positive")

    # NOTE(review): the original comment mentions dividing by 40 while the
    # code divided by 50; 50 is kept. Confirm which matches the site's page
    # size, and that the scraped element really holds a product count.
    page_size = 50

    price = 0
    empty_streak = 0

    while empty_streak < max_empty_ranges:
        low, high = price, price + price_range

        # Read the first page of the window to learn how many pages it has.
        resp = requests.get(_category_page_url(low, high, 1), headers=headers, timeout=30)
        # Same parser as crawl_page, so no extra lxml dependency is needed.
        soup = BeautifulSoup(resp.content, "html.parser")

        counter = soup.find("span", class_="shopee-mini-page-controller__total")
        # The counter element is absent when nothing is found; treat as zero.
        count_text = counter.get_text(strip=True) if counter is not None else ""
        count = int(count_text) if count_text else 0
        totalPage = -(-count // page_size)  # ceiling division without math.ceil

        if totalPage > 0:
            empty_streak = 0
            for page in range(1, totalPage + 1):
                crawl_page(_category_page_url(low, high, page))
                # Consider pausing here if the site starts throttling requests.
        else:
            empty_streak += 1

        price = high  # move on to the next price window

In [None]:
# Running this cell should start the crawl.
shoppee_crawler()