## **PARSING GOOGLE VACANCY**

In [1]:
############################################
###     Research Trending Vacancies      ###
###     Sber Dep. Research&Innovation    ### 
###   Ivanov Arseny, Sergey Bratchikov   ###
###       A. Efimov, D. Asonov           ###
############################################

In [2]:
import time

import requests
import json
import re
import time
import faker
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from dateutil import parser

In [3]:
fake = faker.Faker(locale='en')

In [4]:
headers = {
    'User-Agent': fake.chrome(),
    'accept-language': 'en-US,en;q=0.9',
    'pragma': 'np-cache',
    'referer': 'http://careers.google.com/jobs/results/'
}

In [5]:
API_URL = "http://careers.google.com/api/v3/search/"
VACANCY_URL = 'http://careers.google.com/jobs/results/'

In [6]:
search_params = {
    'page': 1,
    # 'company': 'Google', # One of Alphabet holding's company
    'q': '',
    'sort_by': 'date',
    'page_size': 100 # must be 20, 50, 75, 100
}

In [7]:
result = requests.get(API_URL, params=search_params, headers=headers)
result

<Response [200]>

In [8]:
total_count = result.json()['count']
max_pages = total_count // 100
print(total_count, max_pages)

7261 72


In [9]:
clear_string = lambda x: re.sub(' +', ' ', re.sub('<.*?>', ' ', x).replace("&nbsp;", "").replace("&amp;", "&").replace('\n', '\n ')).strip()
title_to_url = lambda x: "-".join(re.sub('\s+', ' ', re.sub(r'[^a-zA-Z\s]', '', x.lower())).split(' '))

In [10]:
search_results = []
null_counter = 0
for page in tqdm(range(max_pages)):
    search_params['page'] = page

    result = requests.get(API_URL, params=search_params, headers=headers).json()
    found_jobs = result['jobs']

    if len(found_jobs) == 0:
        null_counter += 1
        if null_counter == 3:
            break
    else:
        null_counter = 0

    for job_json in found_jobs:
        internal_id = job_json['id'].split('/')[-1]
        result_dict = {
            'title': job_json['title'],
            'internal_id': internal_id,
            'url': VACANCY_URL + internal_id + '-' + title_to_url(job_json['title']),
            'description': clear_string(job_json['description']),
            'responsibilities': clear_string(job_json['responsibilities']),
            'qualifications': clear_string(job_json['qualifications']),
            'company': job_json['company_name'],
            # 'country_code': job_json['country_code'],
            'publish_date': parser.parse(job_json['publish_date'])
        }
        search_results.append(result_dict)

    time.sleep(0.2) # sleep for 200 ms

 74%|███████▎  | 53/72 [01:49<00:39,  2.06s/it]


In [11]:
snapshot = pd.DataFrame(search_results)
print(len(snapshot))
snapshot.sample(5)

5100


Unnamed: 0,title,internal_id,url,description,responsibilities,qualifications,company,publish_date
4729,"Chrome Enterprise Customer Engineer (English, ...",72377254216114886,http://careers.google.com/jobs/results/7237725...,"As a Chrome Enterprise Customer Engineer, you'...","Work with companies, partners, and government ...",Minimum qualifications: \n 3 years of experien...,Google,2022-04-27 15:02:28.468000+00:00
5082,"Vice President, Strategy, Geo",142182423656833734,http://careers.google.com/jobs/results/1421824...,"As Vice President for Geo, you will be part of...",Serve as a key thought partner to the Geo orga...,Minimum qualifications: \n Bachelor’s degree i...,Google,2022-04-18 19:31:47.614000+00:00
1233,"Technical Program Manager, Cloud Web",79188482351604422,http://careers.google.com/jobs/results/7918848...,A problem isn’t truly solved until it’s solved...,"Plan requirements, establish project plans, bu...",Minimum qualifications: \n Experience in proje...,Google,2022-06-22 15:32:01.607000+00:00
1288,"Director, Agency, Google Customer Solutions (E...",72543128025735878,http://careers.google.com/jobs/results/7254312...,"As Director, Agency, Google Customer Solutions...",Set and lead the strategy for all German Agenc...,Minimum qualifications: \n Experience leading ...,Google,2022-06-21 20:31:49.218000+00:00
3089,"Product Manager, Content Safety Defaults and S...",96268531395371718,http://careers.google.com/jobs/results/9626853...,"In this role, you will be providing strategic ...",Make critical contributions to the Trust strat...,Minimum qualifications: \n Bachelor's degree i...,Google,2022-05-27 18:31:56.237000+00:00


In [12]:
snapshot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5100 entries, 0 to 5099
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype                  
---  ------            --------------  -----                  
 0   title             5100 non-null   object                 
 1   internal_id       5100 non-null   object                 
 2   url               5100 non-null   object                 
 3   description       5100 non-null   object                 
 4   responsibilities  5100 non-null   object                 
 5   qualifications    5100 non-null   object                 
 6   company           5100 non-null   object                 
 7   publish_date      5100 non-null   datetime64[ns, tzutc()]
dtypes: datetime64[ns, tzutc()](1), object(7)
memory usage: 318.9+ KB


In [13]:
snapshot['company'].value_counts()

Google          4766
YouTube          239
Fitbit            93
Google Fiber       2
Name: company, dtype: int64

In [14]:
current_date = datetime.now().strftime('%d-%m-%Y')
current_date

'03-07-2022'

In [15]:
snapshot.to_csv(f'../data/google/{current_date}.csv')
snapshot.to_csv(f'../data/google/{current_date}.tsv', sep='\t')