In [None]:
## Script Index: S5
## Author: Liu Yue
## Date: 03/10/2016 - 05/10/2016
## Python Version: Anaconda 4.2.0, python 3.5

""" 
    Data Scraping:
    This script gets the technical specifications of phones/tablets from their web page links
"""

In [28]:
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import re

## Reference date used for device-age calculations, taken from the other dataset:
## Talking Data collected 7 days of phone user behaviour in 05/2016 (May 2016).
## The main scraping loop passes these two values to zol_age_month().
tk_year = 2016 
tk_month = 5
    
def dm_record_set(index, zol_list):
    """Create a fresh device-model record from one row of ``zol_list``.

    Args:
        index: position of the row inside ``zol_list``.
        zol_list: list of rows; each row is a list of fields.

    Returns:
        A new list holding the same fields as ``zol_list[index]``.

    Raises:
        IndexError: if ``index`` is out of range.
    """
    # Return a shallow copy rather than the stored row itself: callers append
    # scraped columns to the record, and aliasing the row would silently grow
    # zol_list (and duplicate columns when the scraping cell is re-run).
    return list(zol_list[index])

def dm_default():
    """Return the placeholder values used before a device model is scraped.

    Returns:
        Tuple ``(price, page_name, age_month, dm_time_text)`` equal to
        ``(0, '', 999, '')``.
    """
    price, page_name = 0, ''
    age_month, dm_time_text = 999, ''
    return price, page_name, age_month, dm_time_text

def bsObjFormation(url, timeout=30):
    """Download ``url`` and parse it into a BeautifulSoup object.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait on the connection before giving up, so a
            single stalled request cannot hang a long scraping run.

    Returns:
        The parsed page, or ``None`` when the page could not be fetched or
        parsed (invalid URL, HTTP/network error, timeout, ...). Callers are
        expected to check for ``None``.
    """
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    headers = {'User-Agent': user_agent}
    try:
        request = urllib.request.Request(url, None, headers)
        # The context manager closes the HTTP response even if reading or
        # parsing fails, so sockets are not leaked across many requests.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return BeautifulSoup(response.read(), "html.parser")
    except Exception:
        # Deliberate best-effort: any failure is reported to the caller as None.
        return None

def price_page_bsObj(zol_index):
    """Fetch and parse the pricing page of a device model.

    Args:
        zol_index: zol index of the device model, as a string.

    Returns:
        Whatever bsObjFormation() returns for the pricing URL, or ``None``
        if that call raises.
    """
    page_url = "http://detail.zol.com.cn/cell_phone/index" + zol_index + ".shtml"
    soup = None
    try:
        soup = bsObjFormation(page_url)
    except Exception:  # may be time out
        soup = None
    return soup

def param_page_bsObj(zol_index):
    """Fetch and parse the parameter (specification) page of a device model.

    The URL contains the zol index divided by 1000 and rounded up, followed
    by the index itself.

    Args:
        zol_index: zol index of the device model, as a string of digits.

    Returns:
        Whatever bsObjFormation() returns for the parameter URL, or ``None``
        if that call raises.
    """
    whole, remainder = divmod(int(zol_index), 1000)
    bucket = str(whole + 1 if remainder else whole)  # ceiling of index / 1000
    param_url = "http://detail.zol.com.cn/" + bucket + "/" + zol_index + "/param.shtml"
    soup = None
    try:
        soup = bsObjFormation(param_url)
    except Exception:  # may be time out
        soup = None
    return soup
    
def zol_price(bsObj):
    """Extract the listed price from the pricing page of a device model.

    The class name of the smallest division containing the price is dynamic
    (it can be "price price-normal" / "price price-na" / "price price-sp-num"),
    so the larger "product-price-info" division is searched with a regex.

    Args:
        bsObj: parsed pricing page (an object offering ``findAll``), or None.

    Returns:
        The digits that follow the '￥' sign, as a string (e.g. ``'1099'``),
        or the default value ``0`` when the page carries no numeric price.
    """
    no_price = 0  # documented default for "no numeric answer"
    try:
        price_blocks = bsObj.findAll("div", class_="product-price-info")
        match = re.search(r'￥(\d+)', price_blocks[0].text)
    except (AttributeError, IndexError, TypeError):
        # No page, no price division, or a division without usable text.
        return no_price
    return match.group(1) if match else no_price

# Sentinel age (in months) used when the release date cannot be determined.
_UNKNOWN_AGE_MONTH = 999


def zol_age_month(bsObj, ref_month, ref_year):
    """Compute the age in months of a device model from its parameter page.

    Args:
        bsObj: parsed parameter page (an object offering ``findAll``), or None.
        ref_month: month of the reference date.
        ref_year: year of the reference date.

    Returns:
        Months between the release date found in the element with id
        "newPmVal_1" and the reference date, or 999 when the page or the
        date is missing.
    """
    try:
        dm_time_text = bsObj.findAll(id="newPmVal_1")[0].text
        # Use the reference date passed by the caller, not module globals.
        return age_calc(dm_time_text, ref_month, ref_year)
    except Exception:
        # Best-effort: an unusable page yields the "unknown age" sentinel.
        return _UNKNOWN_AGE_MONTH


def age_calc (dm_time_text, ref_month, ref_year):
    """Compute the age in months from a date written as xx年 or xx年xx月.

    Args:
        dm_time_text: release-date text containing a year and optionally a month.
        ref_month: month of the reference date.
        ref_year: year of the reference date.

    Returns:
        ``12 * (ref_year - year) + ref_month - month``. The month defaults
        to 1 when the text gives only a year; 999 is returned when no year
        can be found or the text is not a string.
    """
    if not isinstance(dm_time_text, str):
        return _UNKNOWN_AGE_MONTH
    year_match = re.search(r'(\d+)年', dm_time_text)
    if year_match is None:
        return _UNKNOWN_AGE_MONTH
    month_match = re.search(r'(\d+)月', dm_time_text)
    dm_month = int(month_match.group(1)) if month_match else 1
    dm_year = int(year_match.group(1))
    return 12 * (ref_year - dm_year) + ref_month - dm_month

def zol_tech_param(bsObj, record=None):
    """Append the technical specifications found on a parameter page to a record.

    Args:
        bsObj: parsed parameter page (an object offering ``find``).
        record: list that receives the specification columns. When omitted,
            the module-level ``dm_record`` is used, which is what existing
            callers rely on.

    Returns:
        ``None`` when the specifications were recorded, ``False`` when they
        could not be read. Both values are falsy, so callers must test
        ``result is False`` to detect a failure.
    """
    if record is None:
        # Backward-compatible fallback to the record of the running loop.
        record = globals().get('dm_record')
        if record is None:
            return False
    columns_before = len(record)
    try:
        spec_section = bsObj.find(id="mParam")
        lst = spec_section.findAll("ul")[0].findAll("p")
        record_param(lst, record)
    except Exception:
        # Drop anything partially written, then pad with blank values so the
        # row keeps the same column layout as successfully scraped rows.
        del record[columns_before:]
        record_param([], record)
        return False
    return None
        
def record_param (lst, dm_record):
    """Append the technical specifications of a device model to ``dm_record``.

    Each element of ``lst`` exposes a ``text`` of the form "label：value"
    (full-width colon). One value per tracked label is appended, in a fixed
    order: screen size, screen resolution, back camera, front camera,
    number of cores, RAM, battery capacity. A label missing from the page
    contributes an empty string, so exactly seven columns are always added.

    Args:
        lst: iterable of elements with a ``text`` attribute.
        dm_record: list that receives the seven values (modified in place).
    """
    param_labels = (
        '主屏尺寸',    # screen size
        '主屏分辨率',  # screen resolution
        '后置摄像头',  # back camera
        '前置摄像头',  # front camera (listed once; a duplicate would repeat the column)
        '核心数',      # number of cores
        '内存',        # RAM
        '电池容量',    # battery capacity
    )
    device_param_dic = {}
    for line in lst:
        # Split on the first separator only, so a value containing '：' is
        # kept whole; lines without a separator are ignored instead of
        # aborting the whole record.
        label, separator, value = line.text.replace('\n', '').partition('：')
        if separator:
            device_param_dic[label] = value
    for label in param_labels:
        dm_record.append(device_param_dic.get(label, ''))

def add_param (param_key, chinese_lookup, record):
    """Append one technical specification to ``record``.

    Args:
        param_key: label to look up (the Chinese parameter name).
        chinese_lookup: mapping from parameter label to its value.
        record: list that receives the value.

    The value stored under ``param_key`` is appended; if the lookup fails
    for any reason, an empty string is appended instead.
    """
    try:
        found = chinese_lookup[param_key]
    except Exception:
        found = ''
    record.append(found)
   

In [29]:
""" Loading data from Script S4 """
""" This is the 'FINAL' data used
    906 / 1217 device models are registered with zol (74% device_model)
    53057 / 58462 device_id are covered (98% device_id)
"""

import csv
import re
## Load tk data: [[device_model_index, phone_brand, count_device_id, tk_name, zol_name, zol_link], ...]
with open('S4_zol phone match device model.csv', newline='', encoding='utf-8-sig') as f:
    lst_man_added = list(csv.reader(f))
print('Sample original data', lst_man_added[:2]) 

## Keep only [device_model_index, zol_index]; zol_index is the number that follows
## 'index' in the zol link (e.g. '/cell_phone/index376347.shtml' -> '376347').
zol_list = []
skipped_rows = []
for line in lst_man_added:
    # Rows that are too short or whose link carries no index cannot be scraped;
    # collect them instead of crashing on a failed regex match.
    link_match = re.search(r'index(\d+)', line[5]) if len(line) > 5 else None
    if link_match is None:
        skipped_rows.append(line)
        continue
    zol_index = link_match.group(1)
    zol_list.append([line[0], zol_index])
if skipped_rows:
    print('\nNumber of rows skipped (no zol index in link):', len(skipped_rows))
print('\nNumber of zol divice models:', len(zol_list))
print('\nRelevant fields: device_model_index, zol_index')
print('Sample data', zol_list[:2])

Sample original data [['13', '??', '3', 'X50TS', '??X50TS', '/cell_phone/index376347.shtml'], ['20', '??', '1', 'X10i', '?????X10i', '/cell_phone/index201421.shtml']]

Number of zol divice models: 1461

Relevant fields: device_model_index, zol_index
Sample data [['13', '376347'], ['20', '201421']]


In [34]:
## Containers for recording
zol_phones = []
error_url = []
error_zol_list = []

for i in range(len(zol_list)):
    # Initial trial: specification of device models, for all identified tk device_model.
    # The name `dm_record` must stay module-level: zol_tech_param() appends the
    # specification columns to the global record of that name.
    # The record is copied so that zol_list itself is never modified and
    # re-running this cell does not append the columns a second time.
    dm_record = list(dm_record_set(i, zol_list))
    device_model_index = dm_record[0]
    zol_index = dm_record[1]
    price_rmb, page_name, age_month, dm_time_text = dm_default()

    ## To get price
    bsObj = price_page_bsObj(zol_index)
    if bsObj is not None:
        scraped_price = zol_price(bsObj)
        if scraped_price is not None:  # keep the default when no price was found
            price_rmb = scraped_price
        # A page without a <title> must not abort the whole run.
        if bsObj.title is not None and bsObj.title.string is not None:
            page_name = bsObj.title.string
    else:
        error_url.append(["load_price_error", device_model_index, zol_index])

    ## To get other parameters
    bsObj = param_page_bsObj(zol_index)
    columns_before = len(dm_record)
    if bsObj is not None:
        scraped_age = zol_age_month(bsObj, tk_month, tk_year)
        if scraped_age is not None:  # keep the default when no date was found
            age_month = scraped_age
        # zol_tech_param() returns None on success and False on failure; both
        # are falsy, so the failure has to be detected with an identity test.
        param_unrecorded = zol_tech_param(bsObj) is False
        if param_unrecorded:
            error_url.append(["param_unrecorded", device_model_index, zol_index])
    else:
        error_url.append(["load_param_error", device_model_index, zol_index])
    if len(dm_record) == columns_before:
        # No specification was recorded: pad with blank values so every CSV
        # row keeps the same column layout.
        record_param([], dm_record)

    dm_record.append(age_month)
    dm_record.append(price_rmb)
    dm_record.append(page_name)
    zol_phones.append(dm_record)

## Write the data to CSV files
with open('S5_zol phone device specidication.csv','w',newline='',encoding='utf-8-sig') as f:
    a = csv.writer(f)
    a.writerows(zol_phones)

print('Number of error url:', len(error_url))
if zol_phones:
    print('Sample output:', zol_phones[-1])

Number of error url: 13
Sample output: ['1209', '380854', '4.5英寸', '854x480像素', '500万像素', '200万像素', '四核', '1GB', '3000mAh', 25, '1099', '【亿通P3】报价_参数_图片_论坛_ETON P3亿通手机报价-ZOL中关村在线']
