In [1]:
import time
import requests
from bs4 import BeautifulSoup
import json
import re
import time
import faker
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from dateutil import parser
from concurrent.futures import ThreadPoolExecutor

In [2]:
# Faker instance; its chrome() provider supplies the User-Agent header value used below.
fake = faker.Faker(locale='en')

In [3]:
# HTTP headers sent with every request to the careers API.
# The User-Agent is a randomly generated Chrome string from Faker, so it
# differs between kernel sessions.
tencent_headers = {
    'User-Agent': fake.chrome(),
    'accept-language': 'en-US,en;q=0.9',
    # Fixed typo: the valid Pragma directive is 'no-cache' (was 'np-cache').
    'pragma': 'no-cache',
    'content-type': 'application/json',
    'referer': 'https://careers.tencent.com/en-us/search.html'
}

In [4]:
def clear_string(x):
    """Strip markup-like tags from ``x`` and normalise its spacing.

    Steps, in order:
      1. every ``<...>`` span (non-greedy, single line) becomes one space;
      2. each newline gets a trailing space appended;
      3. runs of spaces collapse to a single space;
      4. leading/trailing whitespace is stripped.
    """
    without_tags = re.sub('<.*?>', ' ', x)
    padded_newlines = without_tags.replace('\n', '\n ')
    single_spaced = re.sub(' +', ' ', padded_newlines)
    return single_spaced.strip()

In [5]:
# Listing endpoint: every filter in the query string is left empty, so only
# language/area are fixed; paging parameters are supplied separately.
API_URL = (
    "http://careers.tencent.com/tencentcareer/api/post/Query"
    "?countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId="
    "&attrId=&keyword=&language=en-us&area=us"
)
# Detail endpoint: the posting id is passed as a request parameter.
VACANCY_API_URL = (
    "http://careers.tencent.com/tencentcareer/api/post/ByPostId"
    "?language=en-us"
)

In [6]:
# Paging parameters for the listing request: a single page of up to 5000
# entries, stamped with the current epoch time in milliseconds.
search_params = dict(
    pageSize=5000,
    pageIndex=1,
    timestamp=int(time.time() * 1000),
)

In [7]:
# Fetch the listing page. A timeout keeps a stalled connection from hanging
# the notebook, and raise_for_status() surfaces HTTP errors directly instead
# of as an opaque JSON/KeyError further down.
search_response = requests.get(
    API_URL,
    headers=tencent_headers,
    params=search_params,
    timeout=30,
)
search_response.raise_for_status()
jobs_infos = search_response.json()['Data']['Posts']
len(jobs_infos)

1933

In [8]:
def get_vacancy_dict(job_info: dict):
    """Fetch the detail record for one posting and flatten it into a dict.

    Parameters
    ----------
    job_info : dict
        One entry of the listing response. ``PostId`` is read to build the
        detail request; ``PostURL`` (when present) is used in the error message.

    Returns
    -------
    dict or None
        A dict with the keys ``title``, ``internal_id``, ``url``,
        ``description`` (always ``None``), ``responsibilities``,
        ``qualifications``, ``location``, ``category``, ``company`` and
        ``publish_date``; ``None`` if anything goes wrong for this posting.

    Notes
    -----
    This is deliberately best-effort: any failure for a single posting is
    reported with ``print`` and turned into ``None`` so that one bad entry does
    not abort a bulk ``map`` over all postings. Unlike a bare ``except:``,
    ``KeyboardInterrupt``/``SystemExit`` are *not* swallowed.
    """
    try:
        # Built inside the try so an entry without 'PostId' is skipped
        # instead of raising out of the worker.
        vacancy_info_params = {
            'postId': job_info['PostId'],
            'timestamp': int(time.time() * 1000)
        }
        response = requests.get(
            VACANCY_API_URL,
            headers=tencent_headers,
            params=vacancy_info_params,
            timeout=30,  # do not let one stalled request block a worker forever
        )
        response.raise_for_status()
        detailed_info = response.json()['Data']
        return {
            'title': detailed_info['RecruitPostName'],
            'internal_id': detailed_info['RecruitPostId'],
            'url': detailed_info['PostURL'],
            'description': None,
            'responsibilities': clear_string(detailed_info['Responsibility']),
            'qualifications': clear_string(detailed_info['Requirement']),
            'location': detailed_info['LocationName'],
            'category': detailed_info['CategoryName'],
            'company': 'Tencent',
            'publish_date': parser.parse(detailed_info['LastUpdateTime'])
        }
    except Exception as exc:
        # The handler itself must never raise: fall back from PostURL to
        # PostId, and finally to the raw object if it is not a dict at all.
        if isinstance(job_info, dict):
            post_ref = job_info.get('PostURL', job_info.get('PostId'))
        else:
            post_ref = job_info
        print(f'Error while parsing {post_ref}: {exc!r}')
        return None

In [9]:
# Fetch every posting's details on a pool of 10 worker threads; tqdm wraps
# the lazy result iterator to show progress, and list() drains it in input order.
with ThreadPoolExecutor(max_workers=10) as executor:
    vacancy_results = executor.map(get_vacancy_dict, jobs_infos)
    progress = tqdm(vacancy_results, total=len(jobs_infos))
    job_dicts = list(progress)

 99%|█████████▉| 1921/1933 [07:39<00:02,  5.17it/s]

Error while parsing http://careers.tencent.com/jobdesc.html?postId=1372556351879782400
Error while parsing http://careers.tencent.com/jobdesc.html?postId=1407545929359171584
Error while parsing http://careers.tencent.com/jobdesc.html?postId=1341365418496892928


 99%|█████████▉| 1923/1933 [07:40<00:02,  4.26it/s]

Error while parsing http://careers.tencent.com/jobdesc.html?postId=1349959821117366272
Error while parsing http://careers.tencent.com/jobdesc.html?postId=1407612467235790848
Error while parsing http://careers.tencent.com/jobdesc.html?postId=0
Error while parsing http://careers.tencent.com/jobdesc.html?postId=1379980118021120000


100%|█████████▉| 1924/1933 [07:41<00:02,  3.04it/s]

Error while parsing http://careers.tencent.com/jobdesc.html?postId=1379980116385341440
Error while parsing http://careers.tencent.com/jobdesc.html?postId=1123175324201259008


100%|██████████| 1933/1933 [07:42<00:00,  4.18it/s]

Error while parsing http://careers.tencent.com/jobdesc.html?postId=0





In [10]:
# Keep only the postings that were parsed successfully (failed ones are None)
# and preview five random rows.
snapshot = pd.DataFrame([job for job in job_dicts if job is not None])
snapshot.sample(5)

Unnamed: 0,title,internal_id,url,description,responsibilities,qualifications,location,category,company,publish_date
68,Senior Optical System Engineer / Engineering Lead,5296954002,https://boards.greenhouse.io/tencent/jobs/6196...,,Responsibility: \n 1.\tResearch and developmen...,Minimum Qualifications: \n •\tBachelor or abov...,Palo Alto,Technology,Tencent,2022-06-24
1434,52644-教育云KA销售（北上深）（上海）,90177,http://careers.tencent.com/jobdesc.html?postId...,,1、负责面向教育企业行业，腾讯云计算服务等相关产品的商务工作，包括大客户的关系维护，深度挖掘...,1、本科以上学历，通信和计算机等工科专业，5年以上工作经验，具备服务 TO B客户的大客户销...,Shanghai,"Sales,Service & Support",Tencent,2022-06-29
1823,43452-游戏研发项目管理,76046,http://careers.tencent.com/jobdesc.html?postId...,,与投资公司核心团队保持紧密沟通，充分评估了解其研发项目，及时发现并跟踪解决游戏项目问题，有效...,五年以上游戏行业工作经验，至少三年游戏研发项目管理经验，完整参与从产品立项之初到游戏发布的全...,Shanghai,Product,Tencent,2022-05-27
1592,40931-自动驾驶前端开发工程师（CSIG全资子公司）,91338,http://careers.tencent.com/jobdesc.html?postId...,,1.负责仿真孪生产品前端需求分析、新功能开发、问题修复、性能优化等\n 2.参与仿真孪生编辑...,1.熟练掌握HTML、CSS、JavaScript语言以及ES6规范等;\n 2.熟练使用V...,Wuhan,Technology,Tencent,2022-06-21
1080,48547-腾讯游戏数据科学家,86872,http://careers.tencent.com/jobdesc.html?postId...,,1. 立足于腾讯游戏海量数据，利用数据统计分析、数据挖掘、建模、实验测试等方式，构建服务于大...,1. 计算机科学、数据科学、机器学习、统计学、应用数学等领域硕士及以上学历优先；\n 2. ...,Shenzhen,Technology,Tencent,2022-07-14


In [11]:
# Sanity check: column dtypes and non-null counts of the assembled frame.
snapshot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1923 entries, 0 to 1922
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   title             1923 non-null   object        
 1   internal_id       1923 non-null   int64         
 2   url               1923 non-null   object        
 3   description       0 non-null      object        
 4   responsibilities  1923 non-null   object        
 5   qualifications    1923 non-null   object        
 6   location          1923 non-null   object        
 7   category          1923 non-null   object        
 8   company           1923 non-null   object        
 9   publish_date      1923 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 150.4+ KB


In [12]:
# Day-month-year stamp of the run, used to name the exported files.
current_date = f'{datetime.now():%d-%m-%Y}'
current_date

'10-08-2022'

In [13]:
# Persist the snapshot twice under the run date: as CSV and as an Excel
# workbook written with the xlsxwriter engine.
snapshot.to_csv(path_or_buf=f'../data/tencent/{current_date}.csv')
snapshot.to_excel(
    excel_writer=f'../data/tencent/{current_date}.xlsx',
    engine='xlsxwriter',
)