## 腾讯招聘技术类职位爬取

In [56]:
import requests
from lxml import etree
import csv
import time


def get_html(url, timeout=10):
    """Send a GET request to *url* with a desktop-browser User-Agent.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server (passed straight to
            ``requests.get``). Defaults to 10.

    Returns:
        The response object returned by ``requests.get``. The HTTP status
        code is not checked here; callers read ``.text`` directly.

    Raises:
        requests.exceptions.Timeout: if the server does not respond within
            *timeout* seconds.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
    }
    # requests applies no timeout by default, so a stalled server would
    # block the whole crawl forever; always bound the wait.
    res = requests.get(url, headers=headers, timeout=timeout)
    return res
    

def get_position_link(page_url):
    """Collect the detail-page links found on one listing page.

    Args:
        page_url: URL of the listing page to fetch.

    Returns:
        A list of absolute URLs, one per ``<a href>`` found inside the
        table rows whose class is ``even`` or ``odd``.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    base_prefix = 'https://hr.tencent.com/'
    res = get_html(page_url)
    html = etree.HTML(res.text)
    hrefs = html.xpath('//tr[@class="even" or @class="odd"]//a/@href')
    # Return a real list rather than a one-shot ``map`` iterator so the
    # result can be iterated more than once, indexed, or measured.
    # ``urljoin`` also resolves root-relative or already-absolute hrefs
    # correctly, which plain string concatenation does not.
    return [urljoin(base_prefix, href) for href in hrefs]
        
def get_position_detail(detail_url):
    """Scrape one position-detail page into a dict of job fields.

    Args:
        detail_url: URL of the detail page to fetch.

    Returns:
        A dict with the keys '岗位名称', '工作地点', '职位类型', '招聘人数',
        '工作职责' and '工作要求'. The last two hold the lists of ``<li>``
        text nodes taken from the two ``ul.squareli`` elements.

    Raises:
        IndexError: if no text is found under ``tr.h``.
        ValueError: if the page does not yield exactly three "bottomline"
            text cells or exactly two ``ul.squareli`` lists.
    """
    tree = etree.HTML(get_html(detail_url).text)

    # The first text cell of the header row is used as the position title.
    title = tree.xpath('//tr[@class="h"]/td/text()')[0]

    # Exactly three text nodes are expected here, unpacked in page order.
    place, category, headcount = tree.xpath(
        '//tr[contains(@class, "bottomline")]/td/text()'
    )

    # Exactly two bullet lists are expected: the first is stored under
    # '工作职责', the second under '工作要求'.
    duties_node, demands_node = tree.xpath('//ul[@class="squareli"]')
    duties = duties_node.xpath('./li/text()')
    demands = demands_node.xpath('./li/text()')

    return {
        '岗位名称': title,
        '工作地点': place,
        '职位类型': category,
        '招聘人数': headcount,
        '工作职责': duties,
        '工作要求': demands,
    }
    
def write2csv(position_list, filename='tencent_position.csv'):
    """Save the scraped positions to a UTF-8 CSV file.

    Each position is a dict, so ``csv.DictWriter`` is used with a fixed
    column order.

    Args:
        position_list: Iterable of dicts keyed by the column names listed
            in ``headers`` below.
        filename: Path of the CSV file to (over)write. Defaults to
            ``'tencent_position.csv'`` in the current directory.
    """
    headers = ['岗位名称', '工作地点', '职位类型', '招聘人数', '工作职责', '工作要求']

    # The csv module writes its own '\r\n' line endings; the file must be
    # opened with newline='' or platforms that translate '\n' (Windows)
    # emit '\r\r\n', which shows up as a blank row after every record.
    with open(filename, 'w', encoding='utf-8', newline='') as tencent:
        writer = csv.DictWriter(tencent, headers)
        writer.writeheader()
        writer.writerows(position_list)
    
    
        
if __name__ == '__main__':
    # Listing-page URL template; the %d placeholder is the `start` offset.
    base_url = 'https://hr.tencent.com/position.php?lid=&tid=87&keywords=python&start=%d#a'
    position_links_list = []
    position_list = []

    # Stage 1: walk five listing pages (offsets 0, 10, ..., 40) and gather
    # every detail-page link, pausing one second between requests.
    for page_no, offset in enumerate(range(0, 50, 10), start=1):
        url = base_url % offset
        print('爬取第%d页：%s' % (page_no, url))
        position_links_list.extend(get_position_link(url))
        time.sleep(1)

    # Stage 2: fetch and parse each detail page, pausing half a second
    # between requests.
    for seq, position_link in enumerate(position_links_list, start=1):
        print('解析第%d个职位: %s' % (seq, position_link))
        position_list.append(get_position_detail(position_link))
        time.sleep(0.5)

    # Stage 3: write everything collected to the CSV file.
    write2csv(position_list)
    

爬取第1页：https://hr.tencent.com/position.php?lid=&tid=87&keywords=python&start=0#a
爬取第2页：https://hr.tencent.com/position.php?lid=&tid=87&keywords=python&start=10#a
爬取第3页：https://hr.tencent.com/position.php?lid=&tid=87&keywords=python&start=20#a
爬取第4页：https://hr.tencent.com/position.php?lid=&tid=87&keywords=python&start=30#a
爬取第5页：https://hr.tencent.com/position.php?lid=&tid=87&keywords=python&start=40#a
解析第1个职位: https://hr.tencent.com/position_detail.php?id=33901&keywords=python&tid=87&lid=0
解析第2个职位: https://hr.tencent.com/position_detail.php?id=36868&keywords=python&tid=87&lid=0
解析第3个职位: https://hr.tencent.com/position_detail.php?id=28571&keywords=python&tid=87&lid=0
解析第4个职位: https://hr.tencent.com/position_detail.php?id=35186&keywords=python&tid=87&lid=0
解析第5个职位: https://hr.tencent.com/position_detail.php?id=32215&keywords=python&tid=87&lid=0
解析第6个职位: https://hr.tencent.com/position_detail.php?id=35183&keywords=python&tid=87&lid=0
解析第7个职位: https://hr.tencent.com/position_detail.php?id=3