In [26]:
## Script Index: S2
## Author: Liu Yue
## Date: 30/09/2016
## Python Version: Anaconda 4.2.0, python 3.5

""" 
    Data Scraping:
    This script gets the names of and links to all device models
"""

from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import csv
import re

## Function: Form BeautifulSoup object with exception handling
def bsObjFormation(url):
    """Download ``url`` and parse the response into a BeautifulSoup object.

    The request carries a Firefox User-Agent header instead of urllib's
    default one.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built with the "html.parser" backend, or
        ``False`` when the URL is malformed or the page cannot be downloaded.
        Callers compare the result against ``False``.
    """
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    headers = {'User-Agent': user_agent}
    try:
        # Building the Request raises ValueError for a malformed URL; it does
        # no network I/O, so it cannot time out.
        request = urllib.request.Request(url, None, headers)
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request) as htmlfile:
            html = htmlfile.read()
    except (OSError, ValueError):
        # OSError covers urllib.error.URLError (its subclass) and TimeoutError.
        return False
    return BeautifulSoup(html, "html.parser")

## Read tk_brands_found: tk_brand_name, brand_name_displayed, zol_link
## Decoding with 'utf-8-sig' drops a leading Byte Order Mark if one is present,
## so the first cell needs no manual clean-up and an empty file no longer
## raises IndexError on lst[0][0].
with open('S1_zol phone brand link.csv', newline='', encoding='utf-8-sig') as f:
    lst = list(csv.reader(f))
brands_found = lst

## Containers for recording
##   zol_phones       : rows of [tk_brand_name, phone_name, link] from the first pass
##   error_url        : rows of ["error", tk_brand_name, phone_page_url] for pages that failed
##   error_zol_phones : rows of [tk_brand_name, phone_name, link] from the retry pass
zol_phones, error_url, error_zol_phones = [], [], []

## Set url construction tool
## A listing page url is: prefix + <brand index> + mid + <page index> + suffix
suffix = '.html'
mid = '_list_1_0_1_1_0_'
prefix = 'http://detail.zol.com.cn/cell_phone_index/subcate57_'


## Initial trial: all names and links of zol device models, for all tk_brands on zol
## If a page cannot be fetched, record the breaking point in error_url and
## move on to the next brand
for brand_row in brands_found:
    tk_brand_name = brand_row[0]
    # The brand index is the 4th "_"-separated token of the second CSV column
    zol_brand_index = brand_row[1].split("_")[3]
    page_index = 1

    while True:
        phone_page_url = prefix + zol_brand_index + mid + str(page_index) + suffix
        try:
            bsObj = bsObjFormation(phone_page_url)
        except Exception:  # any failure while fetching counts as a failed page
            bsObj = False
        if bsObj is False:
            error_url.append(["error", tk_brand_name, phone_page_url])
            break

        for intro in bsObj.find_all("div", class_="pro-intro"):
            # Skip entries without the expected <h3><a> structure instead of
            # letting an AttributeError abort the whole run
            line = intro.h3.a if intro.h3 is not None else None
            if line is None:
                continue
            zol_phones.append([tk_brand_name, line.text, line["href"]])

        # No "next page" anchor means this was the brand's last listing page
        if bsObj.find("a", class_="small-page-next") is None:
            break
        page_index += 1

## Handle the breaking points: all names and links of zol device models, for all tk_brands on zol
## Each failed url is parsed with regular expressions (raw strings, so "\w" is
## not treated as a string escape) to recover where the first pass stopped
page_index_pattern = re.compile(r'([\w.-]+)_([\w.-]+)\.html')
brand_index_pattern = re.compile(r'_([\w.-]+)_list')

# Iterate over a snapshot: pages that fail again are appended to error_url
# below and are recorded only, not retried a second time
for seed_page in list(error_url):
    tk_brand_name = seed_page[1]
    phone_page_url = seed_page[2]
    # Group 2 is the token between the last "_" and ".html", i.e. the page number
    page_index = int(page_index_pattern.search(phone_page_url).group(2))
    # Group 1 is the token right before "_list", i.e. the brand index
    zol_brand_index = brand_index_pattern.search(phone_page_url).group(1)

    while True:
        phone_page_url = prefix + zol_brand_index + mid + str(page_index) + suffix
        try:
            bsObj = bsObjFormation(phone_page_url)
        except Exception:  # any failure while fetching counts as a failed page
            bsObj = False
        if bsObj is False:
            error_url.append(["error", tk_brand_name, phone_page_url])
            break

        for intro in bsObj.find_all("div", class_="pro-intro"):
            # Skip entries without the expected <h3><a> structure instead of
            # letting an AttributeError abort the whole run
            line = intro.h3.a if intro.h3 is not None else None
            if line is None:
                continue
            error_zol_phones.append([tk_brand_name, line.text, line["href"]])

        # No "next page" anchor means this was the brand's last listing page
        if bsObj.find("a", class_="small-page-next") is None:
            break
        page_index += 1

# Merge the first-pass rows with the rows recovered by the retry pass
zol_phones_final = list(zol_phones)
zol_phones_final.extend(error_zol_phones)

## Save every [tk_brand_name, phone_name, link] row to the CSV file
with open('S2_zol phone model link.csv', 'w', newline='', encoding='utf-8-sig') as f:
    csv.writer(f).writerows(zol_phones_final)

In [2]:
## How to get back the list
import csv
## Read zol_phone_model: tk_brand_name,zol_phone_name,zol_link
## The file is written with 'utf-8-sig'; reading it with the same codec drops
## the Byte Order Mark while decoding, so no manual clean-up of the first cell
## is needed and an empty file no longer raises IndexError on lst[0][0].
with open('S2_zol phone model link.csv', newline='', encoding='utf-8-sig') as f:
    lst = list(csv.reader(f))

zol_phone_model = lst

In [3]:
""" 
    Get: names and links of tablet models on zol 
"""

# Scrape data (name, link) for tablets
# NOTE: prefix and suffix are re-bound here and replace the phone url parts
prefix = "http://detail.zol.com.cn/tablepc/"
suffix = ".html"
index_range = range(1,136)  # listing pages 1..135

zol_tablets = []
tablet_error_url = []  # urls of listing pages that could not be downloaded
for index in index_range:
    tablet_url = prefix + str(index) + suffix
    # Fetch first, then test the result of THIS page; bsObjFormation signals
    # failure with False
    try:
        bsObj = bsObjFormation(tablet_url)
    except Exception:  # any failure while fetching counts as a failed page
        bsObj = False
    if bsObj is False:
        tablet_error_url.append(tablet_url)
        continue

    for intro in bsObj.find_all("div", class_="pro-intro"):
        # Skip entries without the expected <h3><a> structure
        line = intro.h3.a if intro.h3 is not None else None
        if line is None:
            continue
        zol_tablets.append([line.text, line["href"]])

## Write the data to CSV files
with open('S2_zol tablet model link.csv','w',newline='',encoding='utf-8-sig') as f:
    a = csv.writer(f)
    a.writerows(zol_tablets)