In [None]:
import requests
import time
from lxml import etree
from requests.exceptions import RequestException
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xlrd
from PIL import Image
import json
import pickle
import numpy as np
import io
import os
import xlwt
import warnings
warnings.filterwarnings("ignore")

In [None]:
#处理图片数字
def analyze_img():
    """Build and persist the digit templates used to decode price images.

    Downloads one fixed reference sprite, converts it to 1-bit black and
    white, cuts it into ten 30x30 tiles (left to right) and stores each
    tile's pixel data under the digit found at the same index of the
    hard-coded ``num`` list. The mapping is pickled to ``num_dict.num``
    (protocol 1) in the current working directory and also returned.

    Returns:
        dict: digit (int) -> ``np.matrix`` holding the tile's pixel values
        divided by 255.

    Raises:
        requests.exceptions.RequestException: if the download fails, times
            out, or the server answers with an HTTP error status.
    """
    response = requests.get(
        'http://static8.ziroom.com/phoenix/pc/images/price/6b8a3fdb72f557032810e060e3cd52b3s.png',
        timeout=10,
    )
    # Fail with a clear HTTP error instead of an obscure decoding error
    # inside Image.open when the server returns an error page.
    response.raise_for_status()
    im = Image.open(io.BytesIO(response.content))
    im = im.convert('1')      # convert to a 1-bit black-and-white image
    # Digit assigned to each tile, by tile index.
    # NOTE(review): presumably the digits as drawn left-to-right in this
    # particular reference sprite - confirm against the image.
    num = [2,3,6,4,7,8,9,5,0,1]
    num_dict = {}
    for i in range(10):
        # crop() cuts the i-th 30x30 tile; getdata() yields its pixel values
        data = im.crop((i*30,0,(i+1)*30,30)).getdata()
        data = np.matrix(data,dtype='int')/255
        num_dict[num[i]] = data
    # Context manager guarantees the file is closed even if dump() raises.
    with open('num_dict.num','wb') as fp:
        pickle.dump(num_dict,fp,protocol=1)
    return num_dict

In [None]:
#获取指定url的图片，并解析出图片中的数字
def img_ocr(img_url):
    """Decode the ten digits drawn in a price sprite image.

    The image at ``img_url`` is downloaded, converted to 1-bit black and
    white and cut into ten 30x30 tiles. Each tile is compared pixel by pixel
    with the templates stored in ``num_dict.num``.

    Args:
        img_url: URL of the sprite image to decode.

    Returns:
        list: exactly ten ints, one per tile, in tile order. A tile that
        matches no template is reported as 8 (fallback kept from the
        original implementation).

    Raises:
        requests.exceptions.RequestException: if the download fails, times
            out, or the server answers with an HTTP error status.
        OSError: if ``num_dict.num`` cannot be opened.
    """
    response = requests.get(img_url, timeout=10)
    response.raise_for_status()
    im = Image.open(io.BytesIO(response.content))
    im = im.convert('1')
    # NOTE(security): pickle.load can execute arbitrary code. This is only
    # acceptable because num_dict.num is written locally by analyze_img();
    # never point this at a file from an untrusted source.
    with open('num_dict.num','rb') as fp:
        num_dict = pickle.load(fp)
    result = []
    for i in range(10):
        data = im.crop((i*30,0,(i+1)*30,30)).getdata()
        data = np.matrix(data,dtype='int')/255
        for digit, template in num_dict.items():
            if (template == data).all():
                result.append(digit)
                # Stop at the first match so every tile contributes exactly
                # one entry; callers index the result by tile position.
                break
        else:
            # No template matched this tile.
            result.append(8)
    return result

In [None]:
#获取价格信息
def get_room_price(room_id,house_id):
    """Return the price of a room as a string of digits.

    Requests the ``detail/info`` JSON endpoint for the given ids and reads
    ``data.price`` from the response. Element 0 of that value is used as a
    protocol-relative URL of the digit sprite, and element 2 as the sequence
    of indices into the digits decoded from that sprite.

    Args:
        room_id: value substituted for ``id`` in the query string.
        house_id: value substituted for ``house_id`` in the query string.

    Returns:
        str: the selected digits concatenated in order.
    """
    request_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Host": "www.ziroom.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
        "Pragma": "no-cache"
    }
    info_url = "http://www.ziroom.com/detail/info?id={}&house_id={}".format(room_id, house_id)
    info_response = requests.get(info_url, headers=request_headers)
    price_info = json.loads(info_response.text)['data']['price']
    digit_positions = price_info[2]
    # The sprite URL comes without a scheme, so prepend one.
    sprite_digits = img_ocr('http:' + price_info[0])
    return "".join(str(sprite_digits[pos]) for pos in digit_positions)

In [None]:
#获取单页源码
def get_one_page(page, timeout=10):
    """Fetch a URL and return its body text, or None on any failure.

    Args:
        page: URL to request.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole crawl.

    Returns:
        str or None: the response text when the status code is 200;
        ``None`` for any other status or when the request raises a
        ``RequestException`` (including timeouts).
    """
    headers = {
        'Referer':'http://hz.ziroom.com/',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0(WindowsNT6.3;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36'
    }
    try:
        res = requests.get(page, headers=headers, timeout=timeout)
        if res.status_code == 200:
            return res.text
    except RequestException:
        return None
    # Non-200 responses are reported the same way as request errors.
    return None

In [None]:
#解析单个网页
def parse_one_page(sourcehtml):
    """Extract every listing from one result page and append it to the output.

    For each ``li`` under ``ul#houseList`` the title, area, floor, total
    floors, room layout and nearby-transit text are read from the list page.
    The listing's detail page is then fetched for the district, location and
    the ids needed by ``get_room_price``. One comma-separated line per
    listing is passed to ``write_data``.

    Args:
        sourcehtml: HTML text of a result page. ``None`` or an empty string
            (what ``get_one_page`` yields on failure) is skipped.

    Returns:
        None
    """
    # get_one_page() returns None on failure; etree.HTML cannot parse that.
    if not sourcehtml:
        print("parse_one_page: empty page source, nothing parsed")
        return
    contentTree = etree.HTML(sourcehtml)
    results = contentTree.xpath('//ul[@id="houseList"]/li')
    for result in results:
        # Listing title with its first five characters (a prefix) removed
        raw_title = result.xpath("./div/h3/a/text()")[0]
        title = raw_title[5:] if len(raw_title) > 5 else ""
        # Join the span texts with spaces, drop the first space, split again.
        # NOTE(review): presumably this glues the first two spans (number and
        # unit of the area) back together - confirm against the page markup.
        house = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ","",1).split(" ")
        area = house[0]    # usable area
        floor_parts = house[1].split('/')
        floor = floor_parts[0]     # floor of the listing
        floors = floor_parts[1].strip('层')      # total number of floors
        chamber = house[2]     # room layout (bedrooms / living rooms)
        # The nearest-station text is missing for some listings
        nearby_texts = result.xpath("./div/div/p[2]/span/text()")
        nearby = nearby_texts[0] if nearby_texts else ''
        loc_html = "http:" + result.xpath("./div/h3/a/@href")[0]    # detail page URL
        page_text = get_one_page(loc_html)    # fetch the detail page
        if not page_text:
            # Skip this listing instead of aborting the whole crawl.
            print("parse_one_page: detail page unavailable, skipped " + loc_html)
            continue
        page_contentTree = etree.HTML(page_text)
        # District and location within the district
        loc = page_contentTree.xpath('//div[@class="room_name"]/p/span/text()')[0].strip().split()
        district = loc[0]
        location = loc[1]
        # Room id and house id, passed to get_room_price to obtain the price
        room_results = page_contentTree.xpath('//div[@class="hide"]/input[4]/@value')[0]
        house_results = page_contentTree.xpath('//div[@class="hide"]/input[5]/@value')[0]
        time.sleep(1)
        price = get_room_price(room_results,house_results)
        # Merge the fields into one line and append it to the output file
        data = ','.join([title,area,floor,floors,chamber,nearby,district,location,price]) + '\n'
        write_data(data)

In [None]:
#获取每个分类起始页面的最大页数
def get_pages(url):
    """Return how many result pages a listing URL has.

    The page is fetched, then the texts of ``div.pages > span`` are read.
    With one to three spans the first is parsed; with exactly four the
    second is parsed; in both cases the characters ``共`` and ``页`` are
    stripped before converting to int. Any other span count yields 1.

    Args:
        url: URL of the first page of a listing category.

    Returns:
        int: the page count; 0 when the page could not be fetched, so that
        callers iterating ``range(1, pages + 1)`` simply do nothing.
    """
    html = get_one_page(url)    # first page of the category
    time.sleep(0.5)
    if not html:
        # get_one_page() returns None on failure; etree.HTML cannot parse it.
        print("get_pages: could not fetch " + str(url))
        return 0
    contentTree = etree.HTML(html)
    # Evaluate the XPath once instead of once per branch.
    page_spans = contentTree.xpath('//div[@class="pages"]/span/text()')
    if 0 < len(page_spans) < 4:
        return int(page_spans[0].strip('共页'))
    if len(page_spans) == 4:
        return int(page_spans[1].strip('共页'))
    return 1

In [None]:
#写入文件函数
def write_data(data, path='shenzhen_test.txt'):
    """Append text to the output file.

    Args:
        data: string to append; written as-is, so callers supply their own
            line terminator.
        path: file to append to, opened in UTF-8 append mode. Defaults to
            ``shenzhen_test.txt`` in the current working directory.

    Returns:
        None
    """
    with open(path, 'a', encoding='utf-8') as f:
        f.write(data)

In [None]:
#当区域的页数超过50页时，进一步通过拆分价格获取完整租房信息
def big_page(next_url):
    for i in range(0,8001,500):
        if i != 8000:
            split_url2 = 'http://www.ziroom.com/z/nl/r{}TO{}-z3-{}'.format(i,i+500,next_url)
        else:
            split_url2 = 'http://www.ziroom.com/z/nl/r{}TO{}-z3-{}'.format(i,i+100000,next_url)
        big_pages = get_pages(split_url2)
        print(str(i) + '-' + str(i+500) + ':',big_pages)
        for page in range(1,big_pages+1):            
            real_url = split_url2 + "?p=" + str(page)
            print(real_url)
            sourcehtml = get_one_page(real_url)
            parse_one_page(sourcehtml)

In [None]:
#从起始页开始爬取每个行政区的租房信息
def get_original_data():
    """Crawl every district and area reachable from the start page.

    The start page's filter list is read; its first entry (the "all"
    category) and the first span of every district are skipped. For each
    remaining area the page count is fetched: areas with 50 or more pages
    are delegated to ``big_page``, all others are crawled page by page with
    ``get_one_page`` and ``parse_one_page``. Progress is printed throughout.

    Returns:
        None
    """
    start_url = 'http://sz.ziroom.com/z/nl/z3.html'
    start_tree = etree.HTML(get_one_page(start_url))
    district_nodes = start_tree.xpath('//ul[@class="clearfix filterList"]/li')
    # The first node is the "all" category
    for district_node in district_nodes[1:]:
        district = district_node.xpath('./span/a/text()')[0]
        area_nodes = district_node.xpath('./div/span')
        for area_node in area_nodes[1:]:
            area_href = area_node.xpath('./a/@href')[0]
            split_url = 'http:' + area_href
            pages = get_pages(split_url)
            location = area_node.xpath('./a/text()')[0]
            print(pages)
            if pages < 50:
                for page_no in range(1, pages + 1):
                    real_url = split_url + "?p=" + str(page_no)
                    print(real_url)
                    parse_one_page(get_one_page(real_url))
            else:
                # Keep only the part of the link after the first '-'
                big_page(area_href.split('-', 1)[1])
            print("区域 " + location + " 采集成功")
        print("-"*20 + district + "区采集成功" + "-"*20)

In [None]:
#对多个网页进行解析
def main():
    """Run the crawl by delegating to ``get_original_data``."""
    get_original_data()

In [None]:
# Entry point: runs only when executed as the main module.
if __name__ == '__main__':
    # Write num_dict.num first; img_ocr() reads it during the crawl.
    analyze_img()
    main()