## **PARSING TENCENT VACANCIES**

In [1]:
############################################
###     Research Trending Vacancies      ###
###     Sber Dep. Research&Innovation    ### 
###   Ivanov Arseny, Sergey Bratchikov   ###
###       A. Efimov, D. Asonov           ###
############################################

In [2]:
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from pathlib import Path

import faker
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser
from tqdm import tqdm

In [3]:
# Faker instance; its chrome() generator supplies the User-Agent header below.
fake = faker.Faker(locale='en')

In [4]:
# HTTP headers sent with the requests to the Tencent careers API.
tencent_headers = {
    'User-Agent': fake.chrome(),  # generated Chrome User-Agent string
    'accept-language': 'en-US,en;q=0.9',
    # 'no-cache' is the only directive defined for Pragma; the former
    # 'np-cache' value was a typo and carried no meaning for servers/proxies.
    'pragma': 'no-cache',
    'content-type': 'application/json',
    'referer': 'https://careers.tencent.com/en-us/search.html'
}

In [5]:
def clear_string(x):
    """Strip HTML tags and normalise spacing in a raw vacancy text field.

    Args:
        x: Text that may contain HTML tags and line breaks.

    Returns:
        The text with every ``<...>`` tag replaced by a space, one space
        inserted after each newline, runs of spaces collapsed to a single
        space, and leading/trailing whitespace removed.
    """
    # Tags become spaces (not empty strings) so adjacent words do not fuse.
    without_tags = re.sub('<.*?>', ' ', x)
    # Every line break is followed by a space; extra spaces are collapsed next.
    spaced_newlines = without_tags.replace('\n', '\n ')
    return re.sub(' +', ' ', spaced_newlines).strip()

In [8]:
# Common prefix of the two Tencent careers endpoints used in this notebook.
# NOTE(review): plain http here while the referer header uses https — confirm
# whether the API should be called over https as well.
_API_ROOT = "http://careers.tencent.com/tencentcareer/api/post"

# Listing query: every filter is left blank, only language and area are set.
_LISTING_QUERY = (
    "countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId="
    "&attrId=&keyword=&language=en-us&area=us"
)

# Endpoint that returns the list of posts.
API_URL = f"{_API_ROOT}/Query?{_LISTING_QUERY}"
# Endpoint that returns one post; postId and timestamp are added per request.
VACANCY_API_URL = f"{_API_ROOT}/ByPostId?language=en-us"

In [9]:
# Query parameters for the listing request: one page of up to 5000 posts,
# stamped with the current time in milliseconds.
_now_ms = int(time.time() * 1000)
search_params = dict(pageSize=5000, pageIndex=1, timestamp=_now_ms)

In [10]:
# Download the vacancy listing. The timeout keeps the cell from hanging forever
# on a stalled connection (requests waits indefinitely by default).
listing_response = requests.get(API_URL, headers=tencent_headers, params=search_params, timeout=30)
# Surface HTTP errors here instead of as a confusing JSON/KeyError below.
listing_response.raise_for_status()
jobs_infos = listing_response.json()['Data']['Posts']
len(jobs_infos)

2020

In [11]:
def get_vacancy_dict(job_info: dict, timeout: float = 30):
    """Fetch the detail record of one listed vacancy and flatten it into a row.

    Args:
        job_info: One entry of the listing response; must contain 'PostId'.
        timeout: Seconds to wait for the detail endpoint before giving up.

    Returns:
        A dict with the keys title, internal_id, url, description,
        responsibilities, qualifications, location, category, company and
        publish_date, or None when the request or the parsing fails (a
        message with the reason is printed in that case).

    Raises:
        KeyError: If ``job_info`` has no 'PostId' entry.
    """
    vacancy_info_params = {
        'postId': job_info['PostId'],
        'timestamp': int(time.time() * 1000)
    }
    try:
        response = requests.get(
            VACANCY_API_URL,
            headers=tencent_headers,
            params=vacancy_info_params,
            timeout=timeout,  # without it a stalled connection blocks a worker forever
        )
        response.raise_for_status()
        detailed_info = response.json()['Data']
        return {
            'title': detailed_info['RecruitPostName'],
            'internal_id': detailed_info['RecruitPostId'],
            'url': detailed_info['PostURL'],
            'description': None,  # always None: no description field is read from the response
            'responsibilities': clear_string(detailed_info['Responsibility']),
            'qualifications': clear_string(detailed_info['Requirement']),
            'location': detailed_info['LocationName'],
            'category': detailed_info['CategoryName'],
            'company': 'Tencent',
            'publish_date': parser.parse(detailed_info['LastUpdateTime'])
        }
    except Exception as error:
        # Best-effort scrape: one bad vacancy must not abort the whole run, but
        # only ordinary errors are swallowed (a bare except would also trap
        # KeyboardInterrupt/SystemExit) and the cause is reported.
        # .get() keeps the handler itself from raising when 'PostURL' is absent.
        vacancy_ref = job_info.get('PostURL', job_info['PostId'])
        print(f'Error while parsing {vacancy_ref}: {error!r}')
        return None

In [12]:
# Fetch vacancy details with 10 worker threads; tqdm shows progress as the
# results are consumed in listing order.
with ThreadPoolExecutor(max_workers=10) as pool:
    fetched = pool.map(get_vacancy_dict, jobs_infos)
    progress = tqdm(fetched, total=len(jobs_infos))
    job_dicts = list(progress)

 99%|█████████▉| 2009/2020 [06:24<00:02,  4.69it/s]

Error while parsing http://careers.tencent.com/jobdesc.html?postId=1372556351879782400
Error while parsing http://careers.tencent.com/jobdesc.html?postId=1123175324201259008


100%|█████████▉| 2013/2020 [06:25<00:01,  5.41it/s]

Error while parsing http://careers.tencent.com/jobdesc.html?postId=1407545929359171584
Error while parsing http://careers.tencent.com/jobdesc.html?postId=1407612467235790848
Error while parsing http://careers.tencent.com/jobdesc.html?postId=1341365418496892928


100%|█████████▉| 2016/2020 [06:25<00:00,  6.52it/s]

Error while parsing http://careers.tencent.com/jobdesc.html?postId=1349959821117366272


100%|█████████▉| 2017/2020 [06:25<00:00,  5.73it/s]

Error while parsing http://careers.tencent.com/jobdesc.html?postId=0


100%|█████████▉| 2018/2020 [06:25<00:00,  5.08it/s]

Error while parsing http://careers.tencent.com/jobdesc.html?postId=1379980118021120000


100%|██████████| 2020/2020 [06:26<00:00,  5.23it/s]

Error while parsing http://careers.tencent.com/jobdesc.html?postId=0
Error while parsing http://careers.tencent.com/jobdesc.html?postId=1379980116385341440





In [16]:
# Vacancies that failed to download come back as None; keep only real rows.
parsed_rows = [row for row in job_dicts if row is not None]
snapshot = pd.DataFrame(parsed_rows)
snapshot.sample(5)

Unnamed: 0,title,internal_id,url,description,responsibilities,qualifications,location,category,company,publish_date
856,48549-Senior Product Manager - Gaming Community,90871,http://careers.tencent.com/jobdesc.html?postId=0,,- Establish product vision and strategy for th...,- Bachelor’s degree or equivalent years of pro...,Shenzhen,Product,Tencent,2022-06-18
1510,43476-External Development Business Manager,90877,http://careers.tencent.com/jobdesc.html?postId=0,,1- Maintain good relationship with important b...,1- Have acquired the bachelor or master’s degr...,Shenzhen,"Sales,Service & Support",Tencent,2022-05-26
320,15571-《UE4次世代PVP射击手游》客户端开发工程师（深圳/广州）,82206,http://careers.tencent.com/jobdesc.html?postId...,,负责游戏中的模块设计与开发，根据需求制定合理技术方案并实现；\n 负责游戏中的关键技术预研，...,熟悉UE4或者Unity开发，2年以上游戏开发经验；\n 熟练使用C/C++，熟悉Types...,Shenzhen,Technology,Tencent,2022-06-30
1854,17229-和平精英-游戏服务器开发工程师（深圳）,90418,http://careers.tencent.com/jobdesc.html?postId=0,,负责游戏服务器端设计及开发，参与分布式后台系统开发、解决外网的各种bug；\n 能与客户端策...,3年以上后台开发工作经验；\n 3年以上分布式系统开发经验，对海量用户进行并行系统设计及系统...,Shenzhen,Technology,Tencent,2022-04-19
825,41073-iOS SDK 高级开发工程师（CSIG全资子公司）,79607,http://careers.tencent.com/jobdesc.html?postId...,,负责腾讯云AI核身SDK产品的迭代开发与持续改进工作；\n 负责对接产品需求，参与技术方案的...,计算机或相关专业本科及以上学历，3年以上iOS项目开发经验；\n 熟练掌握Objective...,Xi'an,Technology,Tencent,2022-06-20


In [17]:
# Summarise column dtypes and non-null counts of the scraped table.
snapshot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2010 entries, 0 to 2009
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   title             2010 non-null   object        
 1   internal_id       2010 non-null   int64         
 2   url               2010 non-null   object        
 3   description       0 non-null      object        
 4   responsibilities  2010 non-null   object        
 5   qualifications    2010 non-null   object        
 6   location          2010 non-null   object        
 7   category          2010 non-null   object        
 8   company           2010 non-null   object        
 9   publish_date      2010 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 157.2+ KB


In [18]:
# Today's date as DD-MM-YYYY; used as the snapshot file name.
current_date = f"{datetime.now():%d-%m-%Y}"
current_date

'03-07-2022'

In [20]:
# Create the output folder first so the export cannot fail with a missing
# directory after the long scraping run.
output_path = Path('../data/tencent') / f'{current_date}.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
snapshot.to_csv(output_path)