## **PARSING APPLE VACANCY**

In [1]:
############################################
###     Research Trending Vacancies      ###
###     Sber Dep. Research&Innovation    ### 
###   Ivanov Arseny, Sergey Bratchikov   ###
###       A. Efimov, D. Asonov           ###
############################################

In [2]:
import time
import requests
from bs4 import BeautifulSoup
import json
import re
import time
import faker
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from dateutil import parser

In [3]:
from concurrent.futures import ThreadPoolExecutor

In [4]:
fake = faker.Faker(locale='en')

In [5]:
apple_headers = {
    'User-Agent': fake.chrome(),
    'accept-language': 'en-US,en;q=0.9',
    'pragma': 'np-cache',
    'countrycode': 'USA',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'origin': 'http://jobs.apple.com',
    'Host': 'jobs.apple.com',
    'referer': 'http://jobs.apple.com/en-us/search'
}

In [6]:
MAIN_URL = "http://jobs.apple.com"
HTML_URL = "http://jobs.apple.com/en-us/search"
CSRF_TOKEN = "http://jobs.apple.com/api/csrfToken"
API_URL = "http://jobs.apple.com/api/role/search"

## Через парсинг страниц
Сначала собираем ссылки и начальную инфу для детального парсинга каждой ваки

In [7]:
clear_string = lambda x: re.sub(' +', ' ', re.sub('<.*?>', ' ', x).replace('\n', '\n ')).strip()

In [8]:
jobs_list = []
null_counter = 0
for page in tqdm(range(0, 1000)):
    search_params = {
        'page': page
    }
    html_result = requests.get(HTML_URL, headers=apple_headers, params=search_params)
    soup = BeautifulSoup(html_result.text)

    results_table = soup.find('div', attrs={'class': 'results__table'})
    if results_table:
        null_counter = 0

        jobs = results_table.find('table').find_all('tbody')
        for job_block in jobs:
            job_info = job_block.findNext('td')

            name = job_info.find('a').text
            url = job_info.find('a')['href']
            internal_id = job_info.find('a')['id'].split('-')[-1]
            publish_date = parser.parse(job_info.find('span').findNext().text)

            jobs_list.append((url, internal_id, name, publish_date))

        time.sleep(0.2) # sleep for 200 ms
    else:
        null_counter += 1
        if null_counter == 3:
            break
        else:
            continue

 46%|████▌     | 456/1000 [24:11<28:51,  3.18s/it]  


In [9]:
len(jobs_list)

9064

In [11]:
jobs_list[100]

('/en-us/details/114438110/nl-creative?team=APPST',
 '114438110',
 'NL-Creative',
 datetime.datetime(2022, 7, 3, 0, 0))

#### Детальный парсинг страниц вакансий

In [12]:
def get_vacancy(params):
    url, internal_id, name, publish_date = params

    job_website = requests.get(MAIN_URL + url, headers=apple_headers)
    soup = BeautifulSoup(job_website.text)

    try:
        description = clear_string(soup.find('div', id='jd-job-summary').span.text)
        qualifications = clear_string(soup.find('div', id='jd-key-qualifications').text)
        responsibilities = clear_string(soup.find('div', id='jd-description').text)
    except:
        print(f'Error while parsing {job_website.url}')
        return None

    job_dict = {
        'title': name,
        'internal_id' : internal_id,
        'url': MAIN_URL + url,
        'description': description,
        'responsibilities': responsibilities,
        'qualifications': qualifications,
        'company': 'Apple',
        'publish_date': publish_date
    }

    return job_dict

In [13]:
with ThreadPoolExecutor(max_workers=10) as executor:
    job_dicts = list(tqdm(executor.map(get_vacancy, jobs_list), total=len(jobs_list)))

  1%|          | 95/9064 [00:31<56:48,  2.63it/s]  

Error while parsing https://jobs-prz.apple.com/en-us/details/200396220/administrative-assistant?team=SFTWR


  1%|          | 96/9064 [02:30<53:43:45, 21.57s/it]

Error while parsing https://www.apple.com/careers/us/maintenance.html


  1%|          | 103/9064 [02:32<19:44:18,  7.93s/it]

Error while parsing https://www.apple.com/careers/us/maintenance.html


  6%|▋         | 574/9064 [03:22<34:36,  4.09it/s]   

Error while parsing https://jobs-prz.apple.com/en-us/details/200395855/administration-assistant?team=HRDWR


  9%|▉         | 849/9064 [04:49<28:53,  4.74it/s]  

Error while parsing https://jobs-prz.apple.com/en-us/details/200396023/administrative-assistant-hardware-teams?team=HRDWR


 11%|█▏        | 1042/9064 [05:56<1:50:58,  1.20it/s]

Error while parsing https://jobs-prz.apple.com/en-us/details/200371813/administrative-assistant?team=HRDWR


 13%|█▎        | 1159/9064 [06:26<29:16,  4.50it/s]  

Error while parsing https://jobs-prz.apple.com/en-us/details/200390116/administrative-assistant?team=SFTWR
Error while parsing https://jobs-prz.apple.com/en-us/details/200394781/administrative-assistant-tdg?team=HRDWR


 20%|█▉        | 1774/9064 [09:35<46:20,  2.62it/s]  

Error while parsing https://jobs-prz.apple.com/en-us/details/200326298/administrative-assistant?team=SFTWR


 29%|██▉       | 2638/9064 [13:47<21:41,  4.94it/s]  

Error while parsing https://jobs-prz.apple.com/en-us/details/200370189/administrative-assistant?team=CORSV


 29%|██▉       | 2660/9064 [13:53<22:13,  4.80it/s]

Error while parsing https://jobs-prz.apple.com/en-us/details/200388008/administrative-assistant-digital-marketing-and-media?team=MKTG


 30%|██▉       | 2712/9064 [14:09<43:06,  2.46it/s]

Error while parsing https://jobs-prz.apple.com/en-us/details/200385592/platform-specialist-ad-platforms-partner-development-greater-china?team=SFTWR


 35%|███▍      | 3148/9064 [16:18<37:21,  2.64it/s]  

Error while parsing https://jobs-prz.apple.com/en-us/details/200385074/administrative-assistant-apps-program?team=NA


 37%|███▋      | 3370/9064 [17:21<49:48,  1.91it/s]

Error while parsing https://jobs-prz.apple.com/en-us/details/200384270/administrative-assistant-apps-team?team=SFTWR


 39%|███▊      | 3510/9064 [18:02<18:23,  5.03it/s]  

Error while parsing https://jobs-prz.apple.com/en-us/details/200367040/administrative-assistant?team=SFTWR


 46%|████▋     | 4209/9064 [21:24<23:59,  3.37it/s]  

Error while parsing https://jobs-prz.apple.com/en-us/details/200377865/administrative-assistant?team=HRDWR


 48%|████▊     | 4322/9064 [21:55<23:51,  3.31it/s]

Error while parsing https://jobs-prz.apple.com/en-us/details/200377842/administrative-assistant-special-projects-group?team=CORSV


 50%|████▉     | 4523/9064 [23:01<51:01,  1.48it/s]

Error while parsing https://jobs-prz.apple.com/en-us/details/200367123/administrative-assistant-ai-ml?team=SFTWR


 61%|██████    | 5486/9064 [27:46<29:54,  1.99it/s]  

Error while parsing https://jobs-prz.apple.com/en-us/details/200366854/administrative-assistant?team=CORSV


 69%|██████▊   | 6225/9064 [31:27<13:57,  3.39it/s]  

Error while parsing https://jobs-prz.apple.com/en-us/details/200354990/administrative-assistant?team=SFTWR


 81%|████████  | 7345/9064 [37:06<05:42,  5.02it/s]

Error while parsing https://jobs-prz.apple.com/en-us/details/200331546/administrative-assistant-security-engineering-architecture?team=SFTWR


100%|██████████| 9064/9064 [46:07<00:00,  3.28it/s]


In [14]:
snapshot = pd.DataFrame(filter(lambda x: x is not None, job_dicts))
print(len(snapshot))
snapshot.sample(5)

9043


Unnamed: 0,title,internal_id,url,description,responsibilities,qualifications,company,publish_date
918,Apple Shop Leader - Istanbul (European side - ...,200359842,http://jobs.apple.com/en-us/details/200359842/...,"Imagine, what you could do here? \n At Apple, ...","We connect customers to Apple's solutions, and...","You are passionate, approachable and know how ...",Apple,2022-06-27
6876,Contract Recruiting Coordinator,200344894,http://jobs.apple.com/en-us/details/200344894/...,Do you have a passion for helping others succe...,The Contract Recruiting Coordinator provides s...,"Able to work 10am-7pm ET, Monday through Frida...",Apple,2022-02-14
7834,mmWave IC Design Engineer,200199535,http://jobs.apple.com/en-us/details/200199535/...,Would you like to part of Apple’s fast-growing...,"As a mmWave Design Engineer at Apple, you will...",We would like you to have experience with mmWa...,Apple,2021-11-10
7195,"Nanoimprint Lithography Process Engineer, Disp...",200337515,http://jobs.apple.com/en-us/details/200337515/...,Do something different! Apple is seeking a har...,- Take total ownership to drive nanoimprint li...,Proficiency and hands-on experience in Nanoimp...,Apple,2022-01-27
3611,"AI/ML - Sr Client Engineer, Siri",200342698,http://jobs.apple.com/en-us/details/200342698/...,Would you like to play a part in the next revo...,Join Apple's Siri team as a Senior Client Engi...,Minimum 5 years of industry experience in soft...,Apple,2022-05-21


In [15]:
snapshot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9043 entries, 0 to 9042
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   title             9043 non-null   object        
 1   internal_id       9043 non-null   object        
 2   url               9043 non-null   object        
 3   description       9043 non-null   object        
 4   responsibilities  9043 non-null   object        
 5   qualifications    9043 non-null   object        
 6   company           9043 non-null   object        
 7   publish_date      9043 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(7)
memory usage: 565.3+ KB


In [16]:
current_date = datetime.now().strftime('%d-%m-%Y')
current_date

'03-07-2022'

In [17]:
snapshot.to_csv(f'../data/apple/{current_date}.csv')
snapshot.to_csv(f'../data/apple/{current_date}.tsv', sep='\t')

#### Проверка на единичной вакансии

In [202]:
job_website = requests.get(MAIN_URL + jobs_list[0][0], headers=apple_headers)
job_website

<Response [200]>

In [204]:
soup = BeautifulSoup(job_website.text)

In [214]:
description = soup.find('div', id='jd-job-summary').span.text
clear_string(description)

'Do you love how it feels to help others? After customers purchase our products, you’re the one who helps them get more out of their new Apple technology. Your day in the Apple Store is filled with a range of focused support and service tasks. Whether you’re helping customers get started with the Mac or finding answers to their questions about other Apple devices, you’re ready to share knowledge and provide exceptional assistance. You gain satisfaction from bringing resolution and insight to each customer, elevating his or her relationship with Apple to the next level.\n \n Both full-time and part-time jobs are available.'

In [212]:
soup.find('div', id='jd-key-qualifications').text

'Ability to assess customers’ support needs when they arrive, then provide solutions or refer them to other team membersFlexibility to regularly rotate through different technical specialities and skill setsAbility to thrive on change as products evolve'

In [213]:
soup.find('div', id='jd-description').text

'As a Technical Specialist, you help new owners get started and current ones get quick, efficient support — developing strong, positive relationships with Apple. When a customer needs assistance, you quickly assess their situation. Sometimes you take care of customers with advice or a solution on the spot, using your knowledge of current Apple technology to help with iPod, iPhone and iPad devices. At other times, you refer customers to support team members who get them up and running again. You even provide personal training for new customers, helping them acquire the basic skills they need to get started on photo, video and music projects. The entire store team benefits from your commitment to providing the best care for customers. By helping Apple maintain strong relationships with customers, you are instrumental to our success.\n\nDiscover even more benefits of doing what you love. Apple’s most important resource, our soul, is our people. Apple benefits help further the well-being o

## Через API бэкенда
ВОозникают проблемы, поэтому выбран обычный парсинг

In [14]:
session = requests.Session()
session.headers = apple_headers

In [84]:
result = session.get(CSRF_TOKEN)
result

<Response [200]>

In [85]:
session.headers['X-Apple-CSRF-Token'] = result.headers['X-Apple-CSRF-Token']

In [86]:
session.headers

{'User-Agent': 'Mozilla/5.0 (Macintosh; PPC Mac OS X 10_9_7) AppleWebKit/535.0 (KHTML, like Gecko) Chrome/55.0.854.0 Safari/535.0',
 'accept-language': 'en-US,en;q=0.9',
 'pragma': 'np-cache',
 'countrycode': 'USA',
 'sec-fetch-mode': 'cors',
 'sec-fetch-site': 'same-origin',
 'origin': 'https://jobs.apple.com',
 'Host': 'jobs.apple.com',
 'referer': 'https://jobs.apple.com/en-us/search',
 'X-Apple-CSRF-Token': '2da62a5db304cb69f52f5ca8ddec1f1eb8aff5bca75024853a4959f5ba64c62a'}

In [89]:
search_payload = {
    'page': 1,
    'query': '',
    'filters': {"range":{"standardWeeklyHours":{"start":None,"end":None}}},
    'locals': 'en-us',
    'sort': 'relevance',
}

In [90]:
result = session.post(API_URL, data=search_payload, allow_redirects=True)
result

<Response [403]>

In [None]:
result.text

In [None]:
print(result.text)