# “网络数据采集” 课程

# 第3章 Web页面爬取

## 案例1

### 名称：从百度贴吧下载多页话题内容

### 内容：

先了解一下百度贴吧：http://tieba.baidu.com/f?
我们定义几个函数：  

- loadPage(url) 用于获取网页
- writePage(html,filename) 用于将已获得的网页存储为本地文件
- tiebaCrawler(url,beginpPage,endPage,keyword)用于调度，提供需要抓取的页面URLs
- main：程序主控模块，完成基本命令行交互接口

In [None]:
"""A case of crawler is used to fetch the content of baidu's tieba url, in according to user's input keywords.

"""
import urllib.request
import urllib.parse

def loadPage(url):
    """
        Function: Fetch a URL and return the page content as text.
        url: the wanted webpage url.
        Returns: the response body decoded as UTF-8.
        Raises: UnicodeDecodeError if the body is not valid UTF-8; errors
            raised by urllib while opening the URL propagate unchanged.
    """
    headers = {'Accept': 'text/html','User-Agent':'Mozilla/5.0',}
    print('To send http request to %s' % url)
    request = urllib.request.Request(url,headers=headers)

    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read, even if decoding fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')

def writePage(html,filename):
    """
        Function: Store a fetched page in a local file, then echo it.
        html: the page content as text; it is saved UTF-8 encoded.
        filename: path of the local file, overwritten if it exists.
    """
    print('To write html into a local file %s ...' % filename)
    with open(filename,'wb') as out:
        payload = html.encode('utf-8')
        out.write(payload)

    # Progress marker, a separator line, then the raw bytes for debugging.
    for line in ('Work done.', '-'*10, "for cosole debug:"):
        print(line)
    print(payload)

def tiebaCrawler(url,beginpPage,endPage,keyword):
    """
        Function: The scheduler of the tieba crawler; fetches every wanted
            page in turn and stores each one in its own local file.
        url: the tieba query url (already containing the kw parameter)
        beginpPage: first page number to fetch (inclusive)
        endPage: last page number to fetch (inclusive)
        keyword: the wanted keyword, used as the prefix of the file names
    """
    for page in range(beginpPage,endPage+1):
        # The pn query value grows by 50 per page (page 1 -> 0, page 2 -> 50).
        pn = (page - 1) * 50
        queryurl = url + '&pn=' + str(pn)
        # One file per page: a single shared file name would make every page
        # overwrite the previous one, leaving only the last page on disk.
        filename = '%s_tieba_%d.html' % (keyword, page)
        writePage(loadPage(queryurl),filename)
        
if __name__ == '__main__':
    # Interactive entry point: ask for the forum name and the page range.
    kw = input("Pl input the wanted tieba's name:")
    beginPage = int(input('The beginning page number:'))
    endPage = int(input('The ending page number:'))
    # Example of a Baidu Tieba query url:
    # http://tieba.baidu.com/f?ie=utf-8&kw=%E5%8C%97%E8%88%AA&fr=search&red_tag=i2305631770
    url = 'http://tieba.baidu.com/f?'
    # urlencode percent-escapes the forum name so it is safe inside the url.
    key = urllib.parse.urlencode(dict(kw=kw))
    queryurl = '%s%s' % (url, key)
    tiebaCrawler(queryurl,beginPage,endPage,kw)

## 案例结论

本案例演示了使用urllib访问百度贴吧，并根据用户兴趣，下载相关web页面的过程。

本案例可以作为课后作业布置给学生。

## 案例2

### 名称

爬取新浪财经网（http://finance.sina.com.cn/stock/）各股票公司的每日公告（爬取股票分析所需语料）

###  内容

本案例用于爬取新浪财经股票公司公告。

基本流程如下：

1. 输入开始日期、结束日期作为查询条件，程序会计算两者之间有多少天，并生成其中每一天的日期，再以每天的日期作为查询条件提交到 http://vip.stock.finance.sina.com.cn/corp/view/vCB_BulletinGather.php?gg_date=&ftype=0 进行查询。

2. 程序采用了多线程技术加速爬取过程。每个线程都将首先执行spiderOneGroupDays；

3. 调用 spiderOneDay

4. 调用 spiderOnePage

5. 调用 spiderOnePiece

In [None]:
## coding:utf-8
# 爬取新浪财经网股票公司每日公告
# 提供日期即可  eg: 2017-02-21
import os
import math
import time
import datetime
import requests
import threading
from lxml import etree


# 爬取一条公告并保存
def spiderOnePiece(iurl,headers,datetime,filename):
    """Download one announcement and save its text inside the date folder.

    Args:
        iurl: URL of the announcement page.
        headers: HTTP headers passed to ``requests.get``.
        datetime: Name of the folder the file is written into. Note that this
            parameter shadows the ``datetime`` module inside the function.
        filename: Wanted file name; characters that are illegal in file names
            are removed before use.

    Returns:
        None. Nothing is written when the page has no
        ``//*[@id="content"]/pre`` node or when that node has no text.
    """
    # Remove characters that are not allowed in file names.
    invalid = '*\\/:"<>|?'
    filename = filename.translate({ord(c): None for c in invalid})
    print("    公告链接为：",iurl)
    response=requests.get(iurl,headers=headers).content
    page=etree.HTML(response)
    nodes=page.xpath('//*[@id="content"]/pre')
    if not nodes:
        return
    content=nodes[0].text
    if content is None:
        # The <pre> node exists but carries no text: nothing to save.
        return
    print(content[:20])
    # The file is opened in text mode, so write the str itself and let
    # ``encoding`` do the conversion; writing ``content.encode(...)`` (bytes)
    # to a text-mode file raises TypeError on Python 3.
    with open(os.path.join(datetime,filename),'w',encoding='utf-8') as f:
        f.write(content)

# 爬取一页
def spiderOnePage(url,headers,datetime):
    """Crawl one result page and save every announcement listed on it.

    Args:
        url: URL of the result page (date and page number already filled in).
        headers: HTTP headers passed to ``requests.get``.
        datetime: Date string; also the name of the folder that receives the
            announcements. It shadows the ``datetime`` module in this scope.

    Returns:
        1 when the page listed announcements (the caller should go on with the
        next page), 0 when there is nothing more to crawl.
    """
    website='http://vip.stock.finance.sina.com.cn'

    response=requests.get(url,headers=headers).content
    page=etree.HTML(response)
    trList=page.xpath(r'//*[@id="wrap"]/div[@class="Container"]/table/tbody/tr')

    print("当前页面共有公告{}条".format(len(trList) ))
    # A single row is the "no matching records" placeholder; zero rows means
    # the table was not found at all. Both must stop the pagination: returning
    # 1 for an empty page would make the caller request pages forever.
    if len(trList)<=1:
        return 0

    # Create the date folder; exist_ok avoids a check-then-create race.
    os.makedirs(datetime,exist_ok=True)

    for item in trList:
        links=item.xpath('th/a[1]')
        type_cells=item.xpath('td[1]')
        if not links or not type_cells:
            # Row without a link or a type cell: skip it instead of aborting
            # the whole page with an IndexError.
            continue
        title=links[0].text or ''              # announcement title
        href=website+links[0].attrib['href']   # absolute announcement url
        atype=type_cells[0].text or ''         # announcement type
        print("准备爬取公告{}".format(title))
        spiderOnePiece(href,headers,datetime,title+'_'+atype+'.txt')
    return 1

# 爬取一天
def spiderOneDay(url,headers,datetime,log_path='log'):
    """Crawl all result pages of one day and log the outcome of each page.

    Pages are requested starting from page 1 until ``spiderOnePage`` returns a
    false value or raises; that last page is logged as failed.

    Args:
        url: URL template containing the ``#datetime#`` placeholder and ending
            with the page parameter, to which the page number is appended.
        headers: HTTP headers forwarded to ``spiderOnePage``.
        datetime: Date string substituted into the url; also the log file
            name. It shadows the ``datetime`` module in this scope.
        log_path: Folder that receives ``<datetime>.txt``; created if missing.
    """
    url=url.replace('#datetime#',datetime)  # fill in the date
    if log_path:
        # The default 'log' folder may not exist yet; opening the log file
        # below would otherwise fail with FileNotFoundError.
        os.makedirs(log_path,exist_ok=True)
    index=1  # first page
    with open(os.path.join(log_path,datetime+'.txt'),'a') as f:
        while True:
            t_url=url+str(index)
            try:
                flag=spiderOnePage(t_url,headers,datetime)
            except Exception as e:
                print('err:',e)
                flag=0
            if not flag:
                # Either no more records or an error: stop after logging.
                print('%s page_%d load fail,end.' %(datetime,index))
                f.write('%s_page_%d load failed.\n' %(datetime,index))
                f.flush()
                break
            print('%s page_%d load success,continue.' %(datetime,index))
            f.write('%s_page_%d load success.\n' %(datetime,index))
            f.flush()
            index+=1
    

# 爬取一组天股票公司的数据
def spiderOneGroupDays(url,headers,date_group,log_path):
    """Crawl every date of one group, one after another.

    A failure on one date is printed and does not stop the remaining dates.

    url: url template forwarded to spiderOneDay.
    headers: HTTP headers forwarded to spiderOneDay.
    date_group: iterable of date strings to crawl.
    log_path: folder for the per-day log files.
    """
    for day in date_group:
        try:
            spiderOneDay(url,headers,day,log_path)
            print('%s has load success.over.' %day)
        except Exception as err:
            # Report the problem and move on to the next date.
            print('err:',err)


# 获取指定起始日期[包含]--结束日期[包含]之间的日期  
def getBetweenDay(begin_date,end_date):
    """List every date from begin_date to end_date, both inclusive.

    begin_date, end_date: date strings in 'YYYY-MM-DD' format.
    An end date later than today is replaced by today's date.
    Returns a list of 'YYYY-MM-DD' strings in ascending order; the list is
    empty when the begin date is later than the (possibly clamped) end date.
    """
    fmt='%Y-%m-%d'
    first=datetime.datetime.strptime(begin_date,fmt)
    # Midnight of the current local day.
    today=datetime.datetime.combine(datetime.date.today(),datetime.time.min)
    last=datetime.datetime.strptime(end_date,fmt)
    if last>today:
        last=today
    # Both values are midnights, so the difference is a whole number of days.
    span=(last-first).days
    return [(first+datetime.timedelta(days=offset)).strftime(fmt)
            for offset in range(span+1)]

# 将date_list 平均分成threadNum组  最后一组可能较少
def split_date_list(date_list,threadNum):
    """Split date_list into at most threadNum consecutive groups.

    Every group has ceil(len(date_list) / threadNum) items except possibly the
    last one, which may be shorter.

    Args:
        date_list: Sequence to split.
        threadNum: Maximum number of groups; must be at least 1.

    Returns:
        A list of slices of date_list; an empty list for an empty input.

    Raises:
        ValueError: If threadNum is smaller than 1.
    """
    if threadNum<1:
        raise ValueError('threadNum must be at least 1')
    if not date_list:
        # A group length of 0 would make range() fail with "arg 3 must not
        # be zero", so handle the empty input explicitly.
        return []
    # Ceiling division in integer arithmetic.
    length=-(-len(date_list)//threadNum)
    return [date_list[m:m+length] for m in range(0,len(date_list),length)]

def main():
    """Ask for a date range and crawl its announcements with worker threads.

    The dates between the two inputs (both inclusive) are split into at most
    four groups, and each group is crawled by its own thread running
    ``spiderOneGroupDays``. Log files go to ``company_announcements/log``.
    """
    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8", 
        "Accept-Encoding": "gzip, deflate, sdch", 
        "Host": "vip.stock.finance.sina.com.cn", 
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 
        "Upgrade-Insecure-Requests": "1", 
        "Connection": "keep-alive", 
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
    }

    # '#datetime#' is replaced by the date; the page number is appended.
    url='http://vip.stock.finance.sina.com.cn/corp/view/vCB_BulletinGather.php?gg_date=#datetime#&page='

    # Create the base folder and the log folder in one race-free call.
    base_dir = "company_announcements"
    log_path = os.path.join(base_dir,'log')
    os.makedirs(log_path,exist_ok=True)

    begin_date = input("请输入需查询的开始日期（例如：2017-01-01）：")
    end_date = input("请输入需查询的结束日期（例如：2017-01-31）：")
    # All dates from begin_date to end_date, both inclusive (computed once).
    date_list = getBetweenDay(begin_date,end_date)
    print('%s 到 %s，共 %d 天。' % (begin_date,end_date,len(date_list)))

    if not date_list:
        # E.g. the begin date is later than the end date: there is nothing to
        # split into groups and no thread to start.
        print('No dates to crawl, nothing to do.')
        return

    cut_date_list=split_date_list(date_list,4)
    print(cut_date_list)

    threads=[
        threading.Thread(target=spiderOneGroupDays,args=(url,headers,dgroup,log_path,))
        for dgroup in cut_date_list
    ]

    # Start all workers.
    for t in threads:
        t.start()

    # Block the main thread until every worker has finished.
    for t in threads:
        t.join()
    print('all load success...')

if __name__ == '__main__':
    main()

In [2]:
import os

# Make sure the data folder and its log sub-folder exist, parent first.
base_dir = "company_announcements"
log_path = os.path.join(base_dir,'log')
for folder in (base_dir, log_path):
    if not os.path.exists(folder):
        os.mkdir(folder)