In [1]:
import time
import requests
from bs4 import BeautifulSoup
import json
import re
import time
import faker
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from dateutil import parser

In [86]:
from concurrent.futures import ThreadPoolExecutor

In [6]:
# Shared Faker instance (English locale); used below to generate the
# User-Agent value placed in the request headers.
fake = faker.Faker(locale='en')

In [13]:
# Headers sent with every request to career.huawei.com.
# They imitate a same-origin XHR issued by a Chrome browser from the
# social-recruitment page.
huawei_headers = {
    # Chrome user-agent string produced by Faker; it is generated once, when
    # this cell runs, and then reused for every request.
    'User-Agent': fake.chrome(),
    'accept-language': 'en-US,en;q=0.9',
    # Fixed typo: the only directive defined for Pragma is "no-cache"
    # (the original sent "np-cache", which is not a recognised value).
    'pragma': 'no-cache',
    'Host': 'career.huawei.com',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'referer': 'https://career.huawei.com/reccampportal/portal5/social-recruitment.html?jobFamilyCode=&countryCode='
}

In [77]:
# Job-list endpoint. The trailing "/page/1000/1" is presumably
# "<page size>/<page number>": the pageVO block returned by the API reports
# pageSize 1000 and curPage 1 for this URL.
API_URL = (
    "https://career.huawei.com/socRecruitment/services/portal3/portalnew"
    "/getJobList/page/1000/1"
)

# Job-detail endpoint; the numeric job id is appended directly after "jobId=".
VACANCY_API_URL = (
    "https://career.huawei.com/socRecruitment/services/portal/portalpub"
    "/getJobDetail?jobId="
)

In [83]:
def clear_string(x):
    """Strip HTML-like tags from ``x`` and normalise its spacing.

    Steps, in order:
      1. every ``<...>`` span that sits on a single line is replaced by a space;
      2. a space is inserted after every newline;
      3. runs of spaces are collapsed to one space;
      4. leading and trailing whitespace is stripped.

    Parameters
    ----------
    x : str or None
        Raw text, possibly containing markup.

    Returns
    -------
    str or None
        The cleaned text, or ``None`` when ``x`` is falsy (``None`` or ``''``).
    """
    if not x:
        return None

    # '.' does not match '\n', so a tag is only removed when its opening '<'
    # and closing '>' are on the same line.
    without_tags = re.sub('<.*?>', ' ', x)
    # Pad each newline with a space; the collapse below then leaves exactly
    # one space after every line break.
    padded_newlines = without_tags.replace('\n', '\n ')
    return re.sub(' +', ' ', padded_newlines).strip()

In [59]:
# Query-string parameters for the job-list request.
# The three empty strings leave the corresponding filters unset.
search_params = dict(
    jobFamilyCode='',
    deptCode='',
    keywords='',
    searchType=1,
    orderBy='P_COUNT_DESC',
    jobType=1,
)

In [65]:
# Fetch the job list in a single request.
# timeout: without one, requests waits indefinitely on a stalled connection.
jobs_list_response = requests.get(
    API_URL,
    params=search_params,
    headers=huawei_headers,
    timeout=30,
)
# Fail here on a 4xx/5xx answer instead of trying to parse an error page as JSON.
jobs_list_response.raise_for_status()
jobs_list_result = jobs_list_response.json()

# Paging metadata returned by the API: check that totalPages is 1, i.e. that
# everything fitted into this one page.
jobs_list_result['pageVO']

{'startIndex': 1,
 'endIndex': 1000,
 'totalRows': 781,
 'curPage': 1,
 'pageSize': 1000,
 'resultMode': 0,
 'orderBy': None,
 'filterStr': '781',
 'totalPages': 1,
 'mysqlStartIndex': 0,
 'mysqlEndIndex': 1000}

In [87]:
def get_vacancy(job_info: dict, timeout: float = 30) -> dict:
    """Download one vacancy's detail record and flatten it into a snapshot row.

    Parameters
    ----------
    job_info : dict
        One entry of the job-list response; only its ``'jobId'`` key is read.
    timeout : float, optional
        Seconds to wait for the detail endpoint before giving up (default 30).

    Returns
    -------
    dict
        Keys: ``title``, ``internal_id``, ``url``, ``description``,
        ``responsibilities``, ``qualifications``, ``location``, ``type``,
        ``company`` and ``publish_date``. The three free-text fields are
        passed through ``clear_string``; ``publish_date`` is the parsed
        ``creationDate`` of the detail record.

    Raises
    ------
    requests.HTTPError
        If the detail endpoint answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    KeyError
        If an expected field is missing from the detail record.
    """
    vacancy_url = VACANCY_API_URL + str(job_info['jobId'])

    # This function runs inside worker threads: a request without a timeout
    # could block a worker forever and stall the whole scrape.
    response = requests.get(vacancy_url, headers=huawei_headers, timeout=timeout)
    # Surface HTTP errors explicitly rather than failing later on a KeyError
    # or a JSON decoding error caused by an error page.
    response.raise_for_status()
    job = response.json()

    return {
        'title': job['jobname'],
        'internal_id': job_info['jobId'],
        'url': vacancy_url,
        'description': clear_string(job['jobDesc']),
        'responsibilities': clear_string(job['mainBusiness']),
        'qualifications': clear_string(job['jobRequire']),
        'location': job['jobArea'],
        'type': job['jobFamilyName'],
        'company': 'Huawei',
        'publish_date': parser.parse(job['creationDate']),
    }

In [90]:
# Download the vacancy details with 10 concurrent workers.
# executor.map yields results in the same order as the input list, so
# jobs_dicts lines up with jobs_list_result['result']; tqdm only adds the
# progress bar (total is passed because the map iterator has no length).
with ThreadPoolExecutor(max_workers=10) as executor:
    jobs_dicts = list(
        tqdm(
            executor.map(get_vacancy, jobs_list_result['result']),
            total=len(jobs_list_result['result']),
        )
    )

100%|██████████| 781/781 [00:49<00:00, 15.79it/s]


In [91]:
# One row per vacancy; the columns are the keys of the dicts built by get_vacancy.
snapshot = pd.DataFrame(data=jobs_dicts)

# Show five random rows as a quick sanity check (no seed, so the rows differ per run).
snapshot.sample(n=5)

Unnamed: 0,title,internal_id,url,description,responsibilities,qualifications,location,type,company,publish_date
240,【专业类】试制验证工程师,237312,https://career.huawei.com/socRecruitment/servi...,,1、基于试制计划，组织试制准备活动，管理产品工程设计、制造工程文件、制造要素等试制资源的齐套...,知识：\n 1、\t了解ICT产品知识、设计原理、工艺路线、制造应用场景等专业知识。\n 2...,中国/东莞,制造族,Huawei,2022-06-04 00:00:00+08:00
768,高级解决方案架构师,137369,https://career.huawei.com/socRecruitment/servi...,,"""1、负责行业数字化转型咨询解决方案offering，通过洞察主动发现新机会点，前瞻性地识别...","业务技能要求：\n ""1、5+年以上行业数字化转型咨询、规划和实施经验；\n 2、对企业架构...",中国/北京,云服务族,Huawei,2020-08-03 00:00:00+08:00
406,系统工程研究员,232428,https://career.huawei.com/socRecruitment/servi...,,1、洞察全球科技与产业发展趋势，形成未来信息领域的战略假设，组织公司内外部科学家/专家进行交...,1、系统工程、运筹学、控制论等相关专业硕士以上学历，中英文流利，英文能作为工作语言。\n 2...,中国/深圳,研发族,Huawei,2022-05-06 00:00:00+08:00
377,工程师A,233700,https://career.huawei.com/socRecruitment/servi...,,标准职责\n 供应中心面向区域中大型代表处/国家端到端订单履行方案的执行人，为所负责国家/代...,业务技能要求：\n 技能： 1、订单需求管理：备发货条件管理；订单计划制定与变更管理；订单状...,Hungary/Budapest,供应链族,Huawei,2022-05-12 00:00:00+08:00
569,General Software Engineer,227923,https://career.huawei.com/socRecruitment/servi...,,"Responsible for demand analysis, Including of ...",Business Skills Requirements:\n 1) familiar wi...,Turkey/Istanbul,研发族,Huawei,2022-04-11 00:00:00+08:00


In [92]:
# Column dtypes and non-null counts: a quick check for fields that came back empty.
snapshot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781 entries, 0 to 780
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype                                
---  ------            --------------  -----                                
 0   title             781 non-null    object                               
 1   internal_id       781 non-null    int64                                
 2   url               781 non-null    object                               
 3   description       0 non-null      object                               
 4   responsibilities  781 non-null    object                               
 5   qualifications    781 non-null    object                               
 6   location          781 non-null    object                               
 7   type              781 non-null    object                               
 8   company           781 non-null    object                               
 9   publish_date      781 non-null    datetime6

In [93]:
# Local date of this run, formatted day-month-year; it becomes the snapshot
# file name. NOTE(review): a day-first name does not sort chronologically.
current_date = datetime.today().strftime('%d-%m-%Y')
current_date

'05-07-2022'

In [94]:
# Persist today's snapshot as CSV. The index is left at its default, so the
# row index is written as the first (unnamed) column.
snapshot.to_csv(path_or_buf=f'../data/huawei//{current_date}.csv')