In [1]:
import time
import requests
from bs4 import BeautifulSoup
import json
import re
import time
import faker
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from dateutil import parser

In [2]:
from concurrent.futures import ThreadPoolExecutor

In [3]:
# Shared Faker instance (English locale); used below to generate a browser User-Agent string.
fake = faker.Faker('en')

In [4]:
# Request headers sent with every call to career.huawei.com.
# The User-Agent is a randomly generated Chrome string, so it changes on each kernel run.
huawei_headers = {
    'User-Agent': fake.chrome(),
    'accept-language': 'en-US,en;q=0.9',
    # 'no-cache' is the only directive defined for Pragma; the previous value 'np-cache' was a typo.
    'pragma': 'no-cache',
    'Host': 'career.huawei.com',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'referer': 'https://career.huawei.com/reccampportal/portal5/social-recruitment.html?jobFamilyCode=&countryCode='
}

In [5]:
# Common prefix of the two recruitment service endpoints used in this notebook.
_SERVICES_ROOT = "https://career.huawei.com/socRecruitment/services"

# Job list endpoint; the trailing path segments are ".../page/1000/1".
# NOTE(review): presumably page size 1000 and page number 1 (matches the 'pageSize'/'curPage'
# values echoed back in 'pageVO') — confirm.
API_URL = _SERVICES_ROOT + "/portal3/portalnew/getJobList/page/1000/1"

# Job detail endpoint; the job id is appended directly after "jobId=".
VACANCY_API_URL = _SERVICES_ROOT + "/portal/portalpub/getJobDetail?jobId="

In [6]:
def clear_string(x):
    """Strip HTML-like tags from a text field and normalise its spacing.

    Every ``<...>`` tag is replaced by a space, a space is inserted after
    each newline, runs of spaces are collapsed to a single space, and the
    result is stripped of leading/trailing whitespace.

    Returns ``None`` for falsy input (``None`` or an empty string).
    """
    if not x:
        return None
    without_tags = re.sub('<.*?>', ' ', x)
    newline_padded = without_tags.replace('\n', '\n ')
    single_spaced = re.sub(' +', ' ', newline_padded)
    return single_spaced.strip()

In [7]:
# Query-string parameters for the job list request.
# The three text filters are left empty; the remaining values are passed through as-is.
search_params = dict(
    jobFamilyCode='',
    deptCode='',
    keywords='',
    searchType=1,
    orderBy='P_COUNT_DESC',
    jobType=1,
)

In [8]:
# Fetch the job listing in a single request.
# A timeout is set because requests waits indefinitely by default.
jobs_list_response = requests.get(
    API_URL,
    params=search_params,
    headers=huawei_headers,
    timeout=30,
)
# Fail loudly on a 4xx/5xx answer instead of trying to JSON-decode an error page.
jobs_list_response.raise_for_status()
jobs_list_result = jobs_list_response.json()

# Paging metadata echoed back by the API (total rows, page size, total pages).
jobs_list_result['pageVO']

{'startIndex': 1,
 'endIndex': 1000,
 'totalRows': 773,
 'curPage': 1,
 'pageSize': 1000,
 'resultMode': 0,
 'orderBy': None,
 'filterStr': '773',
 'totalPages': 1,
 'mysqlStartIndex': 0,
 'mysqlEndIndex': 1000}

In [9]:
def get_vacancy(job_info: dict, timeout: float = 30) -> dict:
    """Fetch one vacancy's detail record and flatten it into a snapshot row.

    Args:
        job_info: One entry of the job list response; only its ``'jobId'``
            key is read.
        timeout: Seconds to wait for the detail endpoint before giving up.
            Without a timeout a stalled connection would block its worker
            thread forever.

    Returns:
        A dict with the keys ``title``, ``internal_id``, ``url``,
        ``description``, ``responsibilities``, ``qualifications``,
        ``location``, ``type``, ``company`` and ``publish_date``.
        The three free-text fields are passed through ``clear_string``;
        ``publish_date`` is parsed with ``dateutil.parser.parse``.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    job_id = job_info['jobId']
    # Build the detail URL once; it is both requested and stored in the row.
    vacancy_url = VACANCY_API_URL + str(job_id)

    response = requests.get(vacancy_url, headers=huawei_headers, timeout=timeout)
    # Surface HTTP errors explicitly rather than failing later on JSON decoding.
    response.raise_for_status()
    job = response.json()

    job_dict = {
        'title': job['jobname'],
        'internal_id': job_id,
        'url': vacancy_url,
        'description': clear_string(job['jobDesc']),
        'responsibilities': clear_string(job['mainBusiness']),
        'qualifications': clear_string(job['jobRequire']),
        'location': job['jobArea'],
        'type': job['jobFamilyName'],
        'company': 'Huawei',
        'publish_date': parser.parse(job['creationDate'])
    }

    return job_dict

In [10]:
# Download every vacancy's detail record on a pool of 10 worker threads.
# tqdm wraps the result iterator so progress is shown as records arrive.
job_entries = jobs_list_result['result']
with ThreadPoolExecutor(max_workers=10) as executor:
    fetched_rows = executor.map(get_vacancy, job_entries)
    jobs_dicts = list(tqdm(fetched_rows, total=len(job_entries)))

100%|██████████| 773/773 [00:43<00:00, 17.57it/s]


In [11]:
# One row per vacancy; columns come from the keys of the dicts built by get_vacancy.
snapshot = pd.DataFrame(data=jobs_dicts)
# Show a few random rows as a quick sanity check.
snapshot.sample(n=5)

Unnamed: 0,title,internal_id,url,description,responsibilities,qualifications,location,type,company,publish_date
728,华为公有云产品经理,169454,https://career.huawei.com/socRecruitment/servi...,,1. 公有云产品经理作为华为公有云解决方案和产品定义的Owner，对服务和产品的竞争力、体验...,业务技能要求：\n 1. 具有优秀的沟通技巧、团队合作经验、敬业精神和学习能力。能实现规划到...,中国/西安;中国/成都;中国/杭州;中国/深圳;中国/北京,云服务族,Huawei,2021-05-18 00:00:00+08:00
38,Senior Channel Manager,245061,https://career.huawei.com/socRecruitment/servi...,,Channel Account Manager must engage and influe...,Business Skills Requirements:\n · Around 5-10 ...,Sweden/Stockholm,销售族,Huawei,2022-08-01 00:00:00+08:00
766,财经经理（财务数据架构师）,124941,https://career.huawei.com/socRecruitment/servi...,,1、负责提供收入应收各领域的数据需求落地解决方案，以及财经主数据及核心基础数据的管理工作；\...,1、工作经验要求：\n 1）至少4年以上IT领域工作经验；\n 2）有数字化转型或运营经验者...,中国/深圳,财经族,Huawei,2020-03-26 00:00:00+08:00
262,【专业类岗位】销售与运作计划高级工程师,238974,https://career.huawei.com/socRecruitment/servi...,,标准职责\n 根据要货预测、产品供应能力等，制定S&OP建议计划、主计划/物料需求计划/加工...,业务技能要求：\n •\t具备制定产品计划和存货分析的技能\n 专业知识要求：\n •\t熟...,中国/东莞,供应链族,Huawei,2022-06-14 00:00:00+08:00
40,Research Engineer,245041,https://career.huawei.com/socRecruitment/servi...,,•\tResearch and Develop proof of concept innov...,Business Skills Requirements:\n •\tThe followi...,Turkey/Istanbul,研发族,Huawei,2022-08-01 00:00:00+08:00


In [12]:
# Print column dtypes and non-null counts for the scraped snapshot.
snapshot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 773 entries, 0 to 772
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype                                
---  ------            --------------  -----                                
 0   title             773 non-null    object                               
 1   internal_id       773 non-null    int64                                
 2   url               773 non-null    object                               
 3   description       0 non-null      object                               
 4   responsibilities  773 non-null    object                               
 5   qualifications    773 non-null    object                               
 6   location          773 non-null    object                               
 7   type              773 non-null    object                               
 8   company           773 non-null    object                               
 9   publish_date      773 non-null    datetime64[ns, tzoffset(None, 28800)]

In [13]:
# Drop the timezone from each publish date while keeping its wall-clock value.
# NOTE(review): presumably needed because the Excel export below cannot store
# timezone-aware datetimes — confirm.
naive_publish_dates = snapshot['publish_date'].apply(lambda stamp: stamp.replace(tzinfo=None))
snapshot['publish_date'] = naive_publish_dates
snapshot['publish_date'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 773 entries, 0 to 772
Series name: publish_date
Non-Null Count  Dtype         
--------------  -----         
773 non-null    datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 6.2 KB


In [14]:
# Today's local date as DD-MM-YYYY; used as the file name of the exported snapshot.
current_date = f"{datetime.now():%d-%m-%Y}"
current_date

'10-08-2022'

In [15]:
# Write the snapshot as CSV and XLSX under ../data/huawei/, named after today's date.
# Both paths derive from one stem so they cannot drift apart; the CSV path
# previously contained a stray double slash ('huawei//').
output_stem = f'../data/huawei/{current_date}'
snapshot.to_csv(f'{output_stem}.csv')
snapshot.to_excel(f'{output_stem}.xlsx', engine='xlsxwriter')