In [13]:
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from datetime import datetime
import hashlib
import json
from pymongo import MongoClient
import time
import os
import re
from dotenv import load_dotenv

## Config

In [14]:
# Read settings from the .env file one directory above the notebook.
load_dotenv('../.env')

# MongoDB connection settings; each falls back to the default shown when the
# corresponding environment variable is not set.
MONGODB_HOST = os.getenv("MONGODB_HOST", "localhost")
MONGODB_PORT = int(os.getenv("MONGODB_PORT", 27017))
MONGODB_DB = os.getenv("MONGODB_DB", "")
MONGODB_USER = os.getenv("MONGODB_USER", "")
MONGODB_PWD = os.getenv("MONGODB_PWD", "")

# Name of the collection that receives the scraped product records.
MONGODB_COLLECTION = 'timberland'

---

## Try

In [3]:
# Root URL of the site fetched by the exploratory cells in this section.
url_base = "http://www.timberland.com.tw"

In [9]:
# Download the page at url_base.
res = requests.get(url_base)

In [13]:
# Parse the response body with PyQuery and display the parsed document.
doc = pq(res.text)
doc

[<html.no-js.zh-TW.tw>]

In [50]:
# Select the product-category links nested inside two levels of .sub-menu.
doc_categories = doc('.sub-menu .sub-menu .menu-item-object-product_category a')

In [51]:
# Pair each category link's text with its href and display the list.
categories = [(i.text(), i.attr('href')) for i in doc_categories.items()]
categories

[('T 恤', 'http://www.timberland.com.tw/men-apparel-t-shirts/'),
 ('運動上衣', 'http://www.timberland.com.tw/men-apparel-sweatshirts/'),
 ('POLO 衫', 'http://www.timberland.com.tw/men-apparel-polo/'),
 ('外套', 'http://www.timberland.com.tw/men-apparel-outerwear/'),
 ('襯衫', 'http://www.timberland.com.tw/men-apparel-shirts/'),
 ('褲款', 'http://www.timberland.com.tw/men-apparel-pants-shorts/'),
 ('毛衣', 'http://www.timberland.com.tw/men-apparel-sweaters/'),
 ('健行鞋／靴', 'http://www.timberland.com.tw/men-footwear-hiking-shoes-boots/'),
 ('靴款', 'http://www.timberland.com.tw/men-footwear-boots/'),
 ('運動靴', 'http://www.timberland.com.tw/men-footwear-sneakerboots/'),
 ('帆船鞋', 'http://www.timberland.com.tw/men-footwear-boat-shoes/'),
 ('休閒鞋款', 'http://www.timberland.com.tw/men-footwear-shoes/'),
 ('涼鞋', 'http://www.timberland.com.tw/men-footwear-sandals/'),
 ('T 恤', 'http://www.timberland.com.tw/women-apparel-t-shirts/'),
 ('運動上衣', 'http://www.timberland.com.tw/women-apparel-sweatshirts/'),
 ('洋裝', 'http:

In [82]:
# Fetch the first listing page of one category and parse it with PyQuery.
# Note: this rebinds `doc`, which previously held the parsed url_base page.
cat_url = 'http://www.timberland.com.tw/men-apparel-t-shirts/'
first_page = requests.get(cat_url)
doc = pq(first_page.text)
doc

[<html.no-js.zh-TW.tw>]

In [85]:
# Select the product title links on the listing page and print their markup.
items = doc('.entry-header > h1 > a')
print(items)

<a href="http://www.timberland.com.tw/products/a1mbw001-warner-river-oversized-tee/" rel="bookmark">男士Warner River 寬版 T 恤</a>
<a href="http://www.timberland.com.tw/products/a1mbw037-warner-river-oversized-tee/" rel="bookmark">男士Warner River 寬版 T 恤</a>
<a href="http://www.timberland.com.tw/products/a1mbvm83-warner-river-oversized-tee/" rel="bookmark">男士Warner River 寬版 T 恤</a>
<a href="http://www.timberland.com.tw/products/a1mbv001-warner-river-oversized-tee/" rel="bookmark">男士Warner River 寬版 T 恤</a>
<a href="http://www.timberland.com.tw/products/a1mbvn23-warner-river-oversized-tee/" rel="bookmark">男士Warner River 寬版 T 恤</a>
<a href="http://www.timberland.com.tw/products/a1mbv037-warner-river-oversized-tee/" rel="bookmark">男士Warner River 寬版 T 恤</a>
<a href="http://www.timberland.com.tw/products/a1n3c001-wild-river-back-graphic-tee/" rel="bookmark">男士Wild River 背面圖案 T 恤</a>
<a href="http://www.timberland.com.tw/products/a1n3c052-wild-river-back-graphic-tee/" rel="bookmark">男士Wild River 背面圖

---

## Refactor

**Records schema**

```json
{
  "product_number": "A1MWJF74",
  "product_url": "http://www.timberland.com.tw/products/a1mwjf74-camden-falls-full-grain-boat-shoes/",
  "series_name": "女士Camden Falls 全粒面帆船鞋",
  "cate1": "女士",
  "cate2": "帆船鞋",
  "cate3": "女鞋",
  "color": "中咖啡色全粒面皮革",
  "description": "Sensorflex 彈性感應舒適系統\n鞋面以 LWG 銀級鞣革廠的優質皮革製成\n真皮鞋帶\n皮革沿條\n皮革襯裡\n皮革貼面 OrthoLite® 泡棉鞋床\n舒適貼地感 EVA 中底\n外底以 15% 循環利用橡膠鞋底製成\n尺碼：5.5-10，11 M,W\n5750WP/13000SRP",
  "features": "SensorFlex 三層緩震系統構成，為足部提供持久支撐､絕佳避震與靈活彈性穩定性，為您帶來嶄新的舒適感受。 OrthoLite® 鞋墊以輕盈舒適泡棉物料製造，此超透氣材質，不會損壞或喪失避震效能。加上水分傳遞功能以及抗菌處理，有效控制氣味。",
  "img_urls": [
    "http://www.timberland.com.tw/wp-content/uploads/product/s18/A1MWJ-1.jpg",
    "http://www.timberland.com.tw/wp-content/uploads/product/s18/A1MWJ-2.jpg",
    "http://www.timberland.com.tw/wp-content/uploads/product/s18/A1MWJ-3.jpg",
    "http://www.timberland.com.tw/wp-content/uploads/product/s18/A1MWJ-4.jpg",
    "http://www.timberland.com.tw/wp-content/uploads/product/s18/A1MWJ-5.jpg"
  ],
  "price": 5500,
  "data_time": "2018-05-27T15:38:11.967162"
}
```

In [87]:
def get_products_info(urls, normalize=True, session=None):
    '''Collect product records for every product page in ``urls``.

    Args:
        urls: Iterable of product page URLs.
        normalize: When true (default), each page is flattened into one record
            per color variant, with the page-level fields copied onto every
            variant. When false, one nested record per page is returned with
            its ``color_variants`` list left intact.
        session: Accepted for interface symmetry with the other helpers.
            NOTE: it is not forwarded to ``_get_product_info``; pages are
            fetched without it.

    Returns:
        A list of dicts: flat per-variant records when ``normalize`` is true,
        otherwise one nested record per URL.
    '''

    def _normalize_product(prod_layout):
        '''Flatten one nested page record into a list of per-color records.'''
        shared = {k: v for (k, v) in prod_layout.items() if k != 'color_variants'}
        return [{**color, **shared} for color in prod_layout['color_variants']]

    out_list = []
    for url in urls:
        product = _get_product_info(url)
        if normalize:
            out_list.extend(_normalize_product(product))
        else:
            # A nested record is a single dict; extending the list with it
            # would add its keys rather than the record itself.
            out_list.append(product)

    return out_list

In [35]:
def _get_product_info(prod_url, is_color_variant=False, session=None):
    
    def _get_color_variant(doc):
        doc_colors = doc('.color-select li a:not(.active)')
        # print(doc_colors)
        urls = [i.attr('href') for i in doc_colors.items()]
        
        color_variants = []
        for url in urls:
            color_variants.append(_get_product_info(url, is_color_variant=True))
            
        return color_variants
    
    def _get_color(doc):
        color = doc('div.color-variant > div.color > b').text()
        product_number = doc('div.color-variant > div.style > b').text()
        img_urls = [i.attr('href') for i in doc('.product-image-slider a').items()]
        return {'product_number': product_number, 'color': color, 'product_url': prod_url, 'img_urls': img_urls}
    
    
    if session:
        res = requests.get(prod_url, session=session)
    else:
        res = requests.get(prod_url)
    
    doc = pq(res.text)
    
    if is_color_variant:
        return _get_color(doc)
    else:        
        series_name = doc('h1.entry-title').text().strip()
        color_variants = [_get_color(doc)]
        color_variants.extend(_get_color_variant(doc))
        description = doc('div.description-content').text().strip()
        _cates = [i.text().strip() for i in doc('.separator+ a').items()]
        cate1 = _cates[1]
        cate2 = _cates[2]
        cate3 = _cates[3] if len(_cates) == 5 else None
        price = doc('span.product-price').text().strip()
        price = int(price.split('TWD')[-1].split('.')[0].replace(',', ''))
        features = doc('.feature-content').text().strip()
        
    return {
        'series_name': series_name,
        'color_variants': color_variants,
        'description': description,
        'cate1': cate1,
        'cate2': cate2,
        'cate3': cate3,
        'price': price,
        'features': features,
        'data_time': datetime.utcnow().isoformat(),
    }

In [49]:
def get_product_urls_by_cates(cate_urls, session=None):
    '''Collect the product URLs of every category in ``cate_urls``.

    Args:
        cate_urls: Iterable of category listing URLs.
        session: Optional session, passed through to ``_get_cate_pagination``
            for each category.

    Returns:
        A list with the product URLs of each category appended in the order
        the categories were given. URLs are not de-duplicated across
        categories.
    '''
    out_list = []
    for url in cate_urls:
        out_list.extend(_get_cate_pagination(url, session=session))

    return out_list

In [75]:
def _get_cate_pagination(cat_url, session=None, verbose=False):
    
    def _get_single_page_products(url, referer=cat_url, session=None):
        
        try:
            if session:
                res = requests.get(url, session=session, headers={'Referer': referer})
            else:
                res = requests.get(url, headers={'Referer': referer})
        except:
            print(f'Error in getting URL: {url}')
            raise
   
        doc = pq(res.text)
        doc_items = doc('.entry-header > h1 > a')
        items_url = [i.attr('href') for i in doc_items.items()]
        
        if verbose:
            print(f'Get {url} ...')
            print(doc_items.text())
        
        return items_url, _get_max_page(doc)
    
    def _get_max_page(doc):
        nums = doc('[class="page-numbers"]').text()
        max_num = int(max(nums)) if nums else None
        return max_num
    
    out_list, max_page = _get_single_page_products(cat_url)  # first_page
    if max_page:
        page_links = [cat_url + 'page/' + str(page_num) + '/' for page_num in range(2, max_page+1)]
    
        for page in page_links:
            out_list.extend(_get_single_page_products(page)[0])
    
    return list(set(out_list))

In [44]:
def _get_categories(url_base="http://www.timberland.com.tw", session=None):
    res = requests.get(url_base)
    doc = pq(res.text)
    doc_categories = doc('.sub-menu .sub-menu .menu-item-object-product_category a')
    categories = [(i.text(), i.attr('href')) for i in doc_categories.items()]
    
    for i, (key, val) in enumerate(categories):
        sex = val.rsplit('/', maxsplit=2)[-2].split('-')[0]
        key = '{sex}-{key}'.format(sex=sex, key=key.replace(' ', ''))
        categories[i] = (key, val)
    
    return dict(categories)

---

## Run

In [20]:
# Display the category-key -> URL mapping scraped from the default url_base.
_get_categories()

{'men-T恤': 'http://www.timberland.com.tw/men-apparel-t-shirts/',
 'men-運動上衣': 'http://www.timberland.com.tw/men-apparel-sweatshirts/',
 'men-POLO衫': 'http://www.timberland.com.tw/men-apparel-polo/',
 'men-外套': 'http://www.timberland.com.tw/men-apparel-outerwear/',
 'men-襯衫': 'http://www.timberland.com.tw/men-apparel-shirts/',
 'men-褲款': 'http://www.timberland.com.tw/men-apparel-pants-shorts/',
 'men-毛衣': 'http://www.timberland.com.tw/men-apparel-sweaters/',
 'men-健行鞋／靴': 'http://www.timberland.com.tw/men-footwear-hiking-shoes-boots/',
 'men-靴款': 'http://www.timberland.com.tw/men-footwear-boots/',
 'men-運動靴': 'http://www.timberland.com.tw/men-footwear-sneakerboots/',
 'men-帆船鞋': 'http://www.timberland.com.tw/men-footwear-boat-shoes/',
 'men-休閒鞋款': 'http://www.timberland.com.tw/men-footwear-shoes/',
 'men-涼鞋': 'http://www.timberland.com.tw/men-footwear-sandals/',
 'women-T恤': 'http://www.timberland.com.tw/women-apparel-t-shirts/',
 'women-運動上衣': 'http://www.timberland.com.tw/women-appare

In [25]:
# List every product URL in one category, echoing each page as it is fetched.
_get_cate_pagination('http://www.timberland.com.tw/men-apparel-t-shirts/', verbose=True)

Get http://www.timberland.com.tw/men-apparel-t-shirts/ ...
男士Warner River 寬版 T 恤 男士Warner River 寬版 T 恤 男士Warner River 寬版 T 恤 男士Warner River 寬版 T 恤 男士Warner River 寬版 T 恤 男士Warner River 寬版 T 恤 男士Wild River 背面圖案 T 恤 男士Wild River 背面圖案 T 恤 男士Wild River 背面圖案 T 恤 男士Wild River 背面圖案 T 恤 男士Wild River 背面圖案 T 恤 男士Wild River 背面圖案 T 恤
Get http://www.timberland.com.tw/men-apparel-t-shirts/page/2/ ...
男士Wild River 橫列標誌迷彩 T 恤 男士Wild River 橫列標誌迷彩 T 恤 男士Wild River 橫列標誌迷彩 T 恤 男士Wild River 橫列標誌迷彩 T 恤 男士Wild River 復古風 T 恤 男士Wild River 復古風 T 恤 男士Wild River 復古風 T 恤 男士Wild River 文字款品牌標誌基本款 T 恤 男士Wild River 文字款品牌標誌基本款 T 恤 男士Wild River 文字款品牌標誌基本款 T 恤 男士Wild River 文字款品牌標誌基本款 T 恤 男士Wild River 文字款品牌標誌基本款 T 恤
Get http://www.timberland.com.tw/men-apparel-t-shirts/page/3/ ...
男士Wild River 文字款品牌標誌基本款 T 恤 男士Kennebec River 修身版個性 T 恤 男士Kennebec River 修身版個性 T 恤 男士Kennebec River 修身版個性 T 恤 男士Great Brook 升級版「帆船鞋圖案」粗紡 T 恤 男士Great Brook 升級版「帆船鞋圖案」粗紡 T 恤 男士Great Brook 多圖案故事主題風格粗紡 T 恤 男士Great Brook 多圖案故事主題風格粗紡 T 恤 男士Great Brook 多

['http://www.timberland.com.tw/products/a1n75h44-salmon-brook-engineered-large-stripe-jaspe-tee/',
 'http://www.timberland.com.tw/products/a1n55j82-chinese-new-year-gold-tree-tee/',
 'http://www.timberland.com.tw/products/a1m2g001-kennebec-river-statement-slim-fit-tee/',
 'http://www.timberland.com.tw/products/a1mbw037-warner-river-oversized-tee/',
 'http://www.timberland.com.tw/products/a1m25l59-great-brook-multi-graphic-storytelling-slub-tee/',
 'http://www.timberland.com.tw/products/a1m1kj38-kennebec-riverbrand-carrier-tbl-tee/',
 'http://www.timberland.com.tw/products/a1mh6052-dunstan-river-slim-fit-tee/',
 'http://www.timberland.com.tw/products/a1m18k52-kennebec-riverbrand-tee/',
 'http://www.timberland.com.tw/products/a1m1lh43-salmon-brookslim-fit-tee/',
 'http://www.timberland.com.tw/products/a1m25j22-great-brook-multi-graphic-storytelling-slub-tee/',
 'http://www.timberland.com.tw/products/a1m2gj10-kennebec-river-statement-slim-fit-tee/',
 'http://www.timberland.com.tw/products

In [157]:
%%time
# Collect the product URLs of the first two categories and show how many were found.
with requests.Session() as sess:
    # Update rather than replace, so the Session's default headers are kept.
    sess.headers.update({'User-Agent': 'Mozilla/5.0'})
    want_cat = list(_get_categories(session=sess).values())[:2]
    product_list = get_product_urls_by_cates(want_cat, session=sess)

len(product_list)

CPU times: user 231 ms, sys: 20 ms, total: 251 ms
Wall time: 13.5 s


In [254]:
# Scrape a single product page and display its nested record (color variants included).
prod_url = 'http://www.timberland.com.tw/products/a1mwf403-camden-falls-full-grain-boat-shoes/'
_get_product_info(prod_url)

{'series_name': '女士Camden Falls 全粒面帆船鞋',
 'color_variants': [{'product_number': 'A1MWF403',
   'color': '海軍藍全粒面皮革',
   'product_url': 'http://www.timberland.com.tw/products/a1mwf403-camden-falls-full-grain-boat-shoes/',
   'img_urls': ['http://www.timberland.com.tw/wp-content/uploads/product/s18/A1MWF-1.jpg',
    'http://www.timberland.com.tw/wp-content/uploads/product/s18/A1MWF-2.jpg',
    'http://www.timberland.com.tw/wp-content/uploads/product/s18/A1MWF-3.jpg',
    'http://www.timberland.com.tw/wp-content/uploads/product/s18/A1MWF-4.jpg',
    'http://www.timberland.com.tw/wp-content/uploads/product/s18/A1MWF-5.jpg']},
  {'product_number': 'A1MWJF74',
   'color': '中咖啡色全粒面皮革',
   'product_url': 'http://www.timberland.com.tw/products/a1mwjf74-camden-falls-full-grain-boat-shoes/',
   'img_urls': ['http://www.timberland.com.tw/wp-content/uploads/product/s18/A1MWJ-1.jpg',
    'http://www.timberland.com.tw/wp-content/uploads/product/s18/A1MWJ-2.jpg',
    'http://www.timberland.com.tw/wp-co

In [88]:
%%time
# End-to-end trial on the last two categories: category URLs -> product URLs -> records.
with requests.Session() as sess:
    # Update rather than replace, so the Session's default headers are kept.
    sess.headers.update({'User-Agent': 'Mozilla/5.0'})
    want_cates = list(_get_categories(session=sess).values())[-2:]

    product_urls = get_product_urls_by_cates(want_cates, session=sess)
    print(f'Got {len(product_urls)} product urls')
    out = get_products_info(product_urls, session=sess)

len(out)

Got 20 product urls
CPU times: user 1.16 s, sys: 119 ms, total: 1.27 s
Wall time: 46.9 s


In [89]:
# Display the records returned by get_products_info.
out

[{'product_number': 'A1O8Y015',
  'color': '黑色正絨面皮革',
  'product_url': 'http://www.timberland.com.tw/products/a1o8y015-londyn-chelsea/',
  'img_urls': ['http://www.timberland.com.tw/wp-content/uploads/product/s18/A1O8Y-1.jpg',
   'http://www.timberland.com.tw/wp-content/uploads/product/s18/A1O8Y-2.jpg',
   'http://www.timberland.com.tw/wp-content/uploads/product/s18/A1O8Y-3.jpg',
   'http://www.timberland.com.tw/wp-content/uploads/product/s18/A1O8Y-4.jpg',
   'http://www.timberland.com.tw/wp-content/uploads/product/s18/A1O8Y-5.jpg'],
  'series_name': '女士Londyn 騎士短靴',
  'description': '鞋面以 LWG 銀級鞣革廠的優質皮革製成\n50% 循環利用 PET 網布襯裡\nOrthoLite® 鞋床\n34% 循環利用橡膠外底\n1 公分內增高鞋墊\n易於穿脫的設計\n尺碼：5.5-10，11 M,W\n5300WP/12000SRP',
  'cate1': '女士',
  'cate2': '女鞋',
  'cate3': '休閒鞋款',
  'price': 4900,
  'features': 'OrthoLite® 鞋墊以輕盈舒適泡棉物料製造，此超透氣材質，不會損壞或喪失避震效能。加上水分傳遞功能以及抗菌處理，有效控制氣味。 Recycled Rubber 橡膠鞋底使用橡膠廢料製造，包含34% 可回收橡膠，提供的抓地力與耐用性足以滿足我們的高標準。',
  'data_time': '2018-05-27T15:38:11.967162'},
 {'product_number':

## IO

In [115]:
def upsert_timberland(db, collection, product_number='product_number', sleep_sec=None):
    '''Scrape every category and upsert one record per product URL.

    Args:
        db: Database-like object; ``db[collection]`` must expose
            ``replace_one(filter, replacement, upsert=...)``.
        collection: Name of the target collection.
        product_number: Name of the record field used as the upsert key.
        sleep_sec: Optional pause in seconds after each product URL is
            processed. ``None`` or ``0`` disables the pause.
    '''
    with requests.Session() as sess:
        # Update rather than replace, so the Session's default headers are kept.
        sess.headers.update({'User-Agent': 'Mozilla/5.0'})
        want_cates = list(_get_categories(session=sess).values())
        product_urls = get_product_urls_by_cates(want_cates, session=sess)
        print(f'Got {len(product_urls)} product urls')

        for idx, product_url in enumerate(product_urls):
            # Only the first record is stored: it describes the color shown on
            # product_url itself (the page's own color is listed first).
            out = get_products_info([product_url], session=sess)[0]
            if idx % 20 == 0:
                # Progress line for every 20th product URL.
                print(f'Upsert {out["product_number"]}: {out["product_url"]} ...')
            db[collection].replace_one({product_number: out[product_number]}, out, upsert=True)

            if sleep_sec:
                time.sleep(sleep_sec)

In [61]:
# Credentials, when configured, are given to MongoClient directly;
# Database.authenticate() is not available in PyMongo 4.
client_kwargs = {'host': MONGODB_HOST, 'port': MONGODB_PORT}
if MONGODB_USER:
    # Authenticate against the working database instead of the default source.
    client_kwargs.update(username=MONGODB_USER, password=MONGODB_PWD, authSource=MONGODB_DB)

client = MongoClient(**client_kwargs)
db = client[MONGODB_DB]
if MONGODB_USER:
    # MongoClient connects lazily; one round trip surfaces bad credentials here.
    client.admin.command('ping')
    print('Authenticated!')

Authenticated!


In [None]:
%%time
# Full crawl: upsert the scraped records into MONGODB_COLLECTION, keyed on 'product_number'.
upsert_timberland(db, MONGODB_COLLECTION, product_number='product_number')

Got 406 product urls
Upsert A1N75H44: http://www.timberland.com.tw/products/a1n75h44-salmon-brook-engineered-large-stripe-jaspe-tee/ ...
Upsert A1N3C037: http://www.timberland.com.tw/products/a1n3c037-wild-river-back-graphic-tee/ ...
Upsert A1M1WJ38: http://www.timberland.com.tw/products/a1m1wj38-kennebec-river-multi-graphic-slim-fit-tee/ ...
Upsert A1MBM100: http://www.timberland.com.tw/products/a1mbm100-wild-river-linear-camo-logo-tee/ ...
