## **PARSING GOOGLE VACANCY**

In [None]:
############################################
###     Research Trending Vacancies      ###
###     Sber Dep. Research&Innovation    ### 
###   Ivanov Arseny, Sergey Bratchikov   ###
###       A. Efimov, D. Asonov           ###
############################################

In [1]:
import time

import requests
import json
import re
import time
import faker
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from dateutil import parser

In [2]:
fake = faker.Faker(locale='en')

In [3]:
headers = {
    'User-Agent': fake.chrome(),
    'accept-language': 'en-US,en;q=0.9',
    'pragma': 'np-cache',
    'referer': 'http://careers.google.com/jobs/results/'
}

In [5]:
API_URL = "http://careers.google.com/api/v3/search/"
VACANCY_URL = 'http://careers.google.com/jobs/results/'

In [9]:
search_params = {
    'page': 1,
    # 'company': 'Google', # One of Alphabet holding's company
    'q': '',
    'sort_by': 'date',
    'page_size': 100 # must be 20, 50, 75, 100
}

In [10]:
result = requests.get(API_URL, params=search_params, headers=headers)
result

<Response [200]>

In [14]:
total_count = result.json()['count']
max_pages = total_count // 100
print(total_count, max_pages)

7483 74


In [15]:
clear_string = lambda x: re.sub(' +', ' ', re.sub('<.*?>', '', x).replace("&nbsp;", "").replace("&amp;", "&").replace('\n', '\n ')).strip()
title_to_url = lambda x: "-".join(re.sub('\s+', ' ', re.sub(r'[^a-zA-Z\s]', '', x.lower())).split(' '))

In [19]:
search_results = []
null_counter = 0
for page in tqdm(range(max_pages)):
    search_params['page'] = page

    result = requests.get(API_URL, params=search_params, headers=headers).json()
    found_jobs = result['jobs']

    if len(found_jobs) == 0:
        null_counter += 1
        if null_counter == 3:
            break
    else:
        null_counter = 0

    for job_json in found_jobs:
        internal_id = job_json['id'].split('/')[-1]
        result_dict = {
            'title': job_json['title'],
            'internal_id': internal_id,
            'url': VACANCY_URL + internal_id + '-' + title_to_url(job_json['title']),
            'description': clear_string(job_json['description']),
            'responsibilities': clear_string(job_json['responsibilities']),
            'qualifications': clear_string(job_json['qualifications']),
            'company': job_json['company_name'],
            # 'country_code': job_json['country_code'],
            'publish_date': parser.parse(job_json['publish_date'])
        }
        search_results.append(result_dict)

    time.sleep(0.2) # sleep for 200 ms

 72%|███████▏  | 53/74 [01:52<00:44,  2.13s/it]


In [20]:
snapshot = pd.DataFrame(search_results)
print(len(snapshot))
snapshot.sample(5)

5100


Unnamed: 0,title,internal_id,url,description,responsibilities,qualifications,company,publish_date
1152,"Strategy and Sales Operations Analyst, Google ...",108237416348689094,http://careers.google.com/jobs/results/1082374...,"Google Cloud teams work with schools, companie...",Work with the cross-functional leadership team...,Minimum qualifications:\n Bachelor's degree or...,Google,2022-06-15 18:01:52.949000+00:00
1673,"Release Engineer, Fitbit",130713460171252422,http://careers.google.com/jobs/results/1307134...,How do you release updates and changes to the ...,Work directly with team members that need to r...,Minimum qualifications:\n Bachelor's degree in...,Fitbit,2022-06-09 19:32:02.254000+00:00
2493,Account Strategy Lead (Spanish),99377076252877510,http://careers.google.com/jobs/results/9937707...,Businesses that partner with Google come in al...,Lead partnerships with Google Shopping clients...,Minimum qualifications:\n Bachelor's degree or...,Google,2022-06-01 06:01:56.140000+00:00
4870,"Technical Writer, Systems Infrastructure Platform",114474748219400902,http://careers.google.com/jobs/results/1144747...,Technical writers communicate complex informat...,Manage multiple competing priorities in a fast...,Minimum qualifications:\n Bachelor's degree or...,Google,2022-04-20 19:31:55.761000+00:00
2027,Regional Workplace Services Manager,84308472331412166,http://careers.google.com/jobs/results/8430847...,"As a Regional Workplace Services Manager, you ...",Lead initiatives to develop services to suppor...,Minimum qualifications:\n Bachelor's degree or...,Google,2022-06-07 05:02:20.098000+00:00


In [21]:
snapshot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5100 entries, 0 to 5099
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype                  
---  ------            --------------  -----                  
 0   title             5100 non-null   object                 
 1   internal_id       5100 non-null   object                 
 2   url               5100 non-null   object                 
 3   description       5100 non-null   object                 
 4   responsibilities  5100 non-null   object                 
 5   qualifications    5100 non-null   object                 
 6   company           5100 non-null   object                 
 7   publish_date      5100 non-null   datetime64[ns, tzutc()]
dtypes: datetime64[ns, tzutc()](1), object(7)
memory usage: 318.9+ KB


In [22]:
snapshot['company'].value_counts()

Google     4769
YouTube     245
Fitbit       86
Name: company, dtype: int64

In [23]:
current_date = datetime.now().strftime('%d-%m-%Y')
current_date

'24-06-2022'

In [25]:
snapshot.to_csv(f'../data/google/{current_date}.csv')
snapshot.to_csv(f'../data/google/{current_date}.tsv', sep='\t')