In [9]:
import cmd
import requests
import re
import json
import csv
from bs4 import BeautifulSoup
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, wait, ProcessPoolExecutor

# Base URL prepended to the site-relative product paths ("/product/...")
# collected from listing pages.
DOMAIN = "https://www.pinkoi.com"
# Style filter values the main crawl loop iterates over; each one is sent as
# the `style` query parameter of a listing-page request.
STYLE_LIST = ['cute','minimal','romantic','neutral','vintage','zakka','urban','zen','green']

In [10]:
def crawl_comments(product_info):
    """Fetch the reviews of one product through the site's review API.

    The product page is downloaded to discover the store id, then the review
    endpoint is queried with the product id and store id (at most 20 reviews).

    Args:
        product_info: dict with at least a "url" key holding the site-relative
            product path (e.g. "/product/<id>").

    Returns:
        A list of dicts with the keys 'id' (reviewer nickname), 'comment'
        (review text) and 'score'. An empty list is returned when the product
        id or the store id cannot be determined.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status from either request.
    """
    product_url = DOMAIN + product_info["url"]
    # A timeout keeps one stalled connection from hanging the whole crawl.
    page = requests.get(product_url, timeout=30)
    page.raise_for_status()
    html_text = page.text
    soup = BeautifulSoup(html_text, "html5lib")

    # Prefer the id embedded in this product's own URL; the first "/product/"
    # occurrence in the HTML is only a fallback because it may belong to a
    # different product linked from the page.
    tid_match = (re.search(r'/product/([^/?#]+)', product_info["url"])
                 or re.search(r'/product/(.{8})', html_text))

    store_link = soup.select_one("span.store-link-wrap > a")
    sid_match = None
    if store_link is not None:
        sid_match = re.search(r'/store/([^/?#]+)', store_link.attrs.get("href", ""))

    if tid_match is None or sid_match is None:
        print("no review ids found for " + product_url)
        return []

    # Let requests build and URL-encode the query string.
    api_response = requests.get(
        DOMAIN + "/apiv2/review/get",
        params={
            "tid": tid_match.group(1),
            "makeup_by_sid": sid_match.group(1),
            "limit": 20,
        },
        timeout=30,
    )
    api_response.raise_for_status()
    reviews = json.loads(api_response.text)

    return [
        {
            'id': review['owner_nick'],
            'comment': review['description'],
            'score': review['score'],
        }
        for review in reviews['result']
    ]

In [17]:
def crawl_list(category_url,style,page):
    """Crawl one listing page of a category/style and every product on it.

    Args:
        category_url: listing URL that already carries a query string; the
            style and page parameters are appended with '&'.
        style: style filter value sent as the `style` query parameter.
        page: page number sent as the `page` query parameter.

    Returns:
        None. Each product found is passed to crawl_product as a dict with
        the keys "url" (site-relative product path) and "style".

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
    """
    response = requests.get('%s&style=%s&page=%s'%(category_url,style,page), timeout=30)
    response.raise_for_status()
    html_text = response.text

    # The site renders this wrapper when the filter yields no products.
    if 'class="m-filter-empty-result-wrapper"' in html_text:
        return

    # Raw string (a bare '\?' is an invalid escape) and a bounded character
    # class instead of an optional greedy '.*', so each href is captured
    # separately even when several products share one line of HTML.
    urls = re.findall(r'div class="title"><a href="(/product/[^"?]+)\?category', html_text)
    for url in urls:
        crawl_product({"url": url, "style": style})

In [12]:
def _first_match(pattern, text, default):
    """Return the first capture group of `pattern` in `text`, or `default`."""
    found = re.search(pattern, text)
    return found.group(1) if found else default


def crawl_product(product_info):
    """Scrape one product page and append it as a row to the open CSV.

    Relies on the module-level `writer` (csv.DictWriter) and `f` (its file)
    created by the main cell; the row is flushed immediately after writing.

    Args:
        product_info: dict with the keys "url" (site-relative product path)
            and "style" (style filter the product was listed under).

    Returns:
        None. Prints "crawled <url>" once the row has been written.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
        AttributeError: when the title, price, brand or description element
            is missing from the page.
    """
    product_url = DOMAIN + product_info["url"]

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(product_url, timeout=30)
    response.raise_for_status()
    html_text = response.text
    soup = BeautifulSoup(html_text,"html5lib")

    # The view counter falls back to "0", so a missing <li> must not crash.
    view_node = soup.select_one('div.box > ul > li')
    view_text = view_node.text.replace(',','') if view_node is not None else ''

    product_dict = {}
    product_dict['style'] = product_info["style"]
    product_dict['title'] = soup.find('span',{"data-translate":"title"}).text
    product_dict['price'] = soup.select_one('span.amount').text
    product_dict['brand'] = soup.select_one('span.store-link-wrap > a').text
    # Each pattern is evaluated once; "no info" marks an absent breadcrumb.
    product_dict['category'] = _first_match(
        r'<a href="/browse\?category=\d+&subcategory=\d+">(.*?)</a>',
        html_text, "no info")
    product_dict['material'] = _first_match(
        r'<a href="/browse\?category=\d+&subcategory=\d+&material=\d+">(.*?)</a>',
        html_text, "no info")
    product_dict['description'] = soup.select_one('div.m-richtext').text
    product_dict['view_count'] = _first_match(r'被欣賞 (\d+)', view_text, "0")
    product_dict['comments'] = crawl_comments(product_info)

    writer.writerow(product_dict)
    f.flush()

    print("crawled "+product_url)

    

In [21]:
## main script: crawl every style/page of one category into a CSV file
start_time = datetime.now()
category_url = 'https://www.pinkoi.com/browse/?category=9&subcategory=904'
# Raw strings: '\?' and '\d' are invalid escapes in an ordinary string literal.
category_num = re.findall(r'\?category=(\d+)',category_url)
subcategory_num = re.findall(r'subcategory=(\d+)',category_url)

## get the number of result pages for every style in this category
# Built from the ids parsed above so it cannot drift from category_url.
url_style_count = "https://www.pinkoi.com/apiv2/match?category=%s&subcategory=%s"%(category_num[0],subcategory_num[0])
style_html_text = requests.get(url_style_count, timeout=30).text
js = json.loads(style_html_text)
style_terms = js["result"][0]["facets"]["style"]["terms"]

# Divisor used to turn a product count into a page count
# (presumably the listing page size -- TODO confirm against the site).
PRODUCTS_PER_PAGE = 60
style_count = dict.fromkeys(STYLE_LIST, 0)
for term_info in style_terms:
    # Ceiling division: a partially filled last page still counts as a page.
    style_count[term_info["term"]] = -(-term_info["count"] // PRODUCTS_PER_PAGE)

# newline='' is what the csv module expects for files it writes; explicit
# UTF-8 keeps non-ASCII product text from depending on the locale encoding.
with open('pinkoi_%s_%s.csv'%(category_num[0],subcategory_num[0]),'w',newline='',encoding='utf-8') as f:
    headers = ['title', 'price', 'category', 'material','brand','description','view_count','comments','style']
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()
    # Flush before any worker process is started: a forked worker inherits a
    # copy of this file object's buffer and would otherwise write the header
    # again on its own first flush.
    f.flush()
    # NOTE(review): the workers reach `writer`/`f` as module globals inside
    # crawl_product, which assumes the fork start method -- confirm platform.
    with ProcessPoolExecutor() as pool:
        futures = []
        for style in STYLE_LIST:
            for page in range(1,style_count[style]+1):
                futures.append(pool.submit(crawl_list,category_url,style,page))
        # Wait once, after everything is queued; waiting after each submit
        # would run the tasks one at a time.
        wait(futures)
        # Surface worker failures instead of dropping them silently.
        for future in futures:
            error = future.exception()
            if error is not None:
                print('crawl task failed: %r'%(error,))
end_time = datetime.now()
time_spent = str(end_time - start_time).split(".")[0]
print('spent %s'%time_spent)

crawled https://www.pinkoi.com/product/7rs4WBtj
crawled https://www.pinkoi.com/product/XVNXtM4U
crawled https://www.pinkoi.com/product/RG4vg5qH
crawled https://www.pinkoi.com/product/4gwU5KNp
crawled https://www.pinkoi.com/product/98XuXb88
crawled https://www.pinkoi.com/product/RYrrDbUx
crawled https://www.pinkoi.com/product/dbibbtUx
crawled https://www.pinkoi.com/product/kSZfzrXt
crawled https://www.pinkoi.com/product/xVgfDSLY
crawled https://www.pinkoi.com/product/DrDH7dvL
crawled https://www.pinkoi.com/product/LqGKDpEY
crawled https://www.pinkoi.com/product/ymeS4XkC
crawled https://www.pinkoi.com/product/aDsXr94p
crawled https://www.pinkoi.com/product/5JJuH7eQ
crawled https://www.pinkoi.com/product/QAnNCkVq
crawled https://www.pinkoi.com/product/19wUFcXj
crawled https://www.pinkoi.com/product/1xwIS9kI
crawled https://www.pinkoi.com/product/QDUxtEN3
crawled https://www.pinkoi.com/product/897tMmCv
crawled https://www.pinkoi.com/product/RswrAWJ8
crawled https://www.pinkoi.com/product/C

In [47]:
# Scratch cell, unrelated to the crawler: binds `a` to 1 and evaluates
# `a != 2` (displayed as True). Nothing else in the visible cells reads `a`.
a =1
a!=2

True