In [1]:
import requests
import pandas as pd

In [2]:
def get_area_id_by_name(area_name):
    """Return the hh.ru area id of the area named ``area_name``.

    The returned id is the value vacancy requests expect in their ``area``
    parameter. The area tree served by ``https://api.hh.ru/areas`` is walked
    in pre-order (a parent is checked before its children) at any nesting
    depth, and the id of the first exact name match is returned.

    Usage: msc_id = get_area_id_by_name('Москва')

    Args:
        area_name: exact area name to look for.

    Returns:
        The ``id`` value of the first matching area.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        RuntimeError: if no area with that name is found.
    """
    url = 'https://api.hh.ru/areas'
    areas_key = 'areas'
    id_key = 'id'
    name_key = 'name'

    response = requests.get(url)
    # Fail here with a clear HTTP error instead of later on an error payload.
    response.raise_for_status()

    # Explicit stack; children are pushed reversed so they pop in API order.
    pending = list(reversed(response.json()))
    while pending:
        area = pending.pop()
        if area[name_key] == area_name:
            return area[id_key]
        # Leaf areas may carry an empty or missing child list.
        pending.extend(reversed(area.get(areas_key) or []))

    raise RuntimeError(f"Cannot find provided area {area_name}")

In [3]:
def get_vacancies_count(position, area_id, search_parameters=None):
    """Return the number of vacancies found for ``position`` in ``area_id``.

    Usage: count = get_vacancies_count('Data Analyst', get_area_id_by_name('Москва'))

    Args:
        position: search text sent as the ``text`` parameter.
        area_id: area id sent as the ``area`` parameter.
        search_parameters: optional dict of extra query parameters; its keys
            override the base ones on conflict. Available parameters:
            https://github.com/hhru/api/blob/master/docs/vacancies.md#поиск-по-вакансиям

    Returns:
        The ``found`` field of the API answer.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    url = 'https://api.hh.ru/vacancies'

    # Only the 'found' field of the answer is read; the returned items are ignored.
    parameters = {
        'text': position,
        'area': area_id,
        'per_page': 2,
        'page': 1
    }
    # None (not a shared mutable {}) is the default; treat it as "no extras".
    all_parameters = {**parameters, **(search_parameters or {})}
    response = requests.get(url, params=all_parameters)
    # Surface HTTP errors directly rather than as a KeyError on 'found'.
    response.raise_for_status()
    return response.json()['found']

# Example: count the 'Data Analyst' vacancies for the area named 'Минск'
# with the employment=full filter. The last expression is the cell output.
minsk_id = get_area_id_by_name('Минск')
get_vacancies_count('Data Analyst', minsk_id, {"employment": "full"})

86

In [4]:
def download_vacancies(position, area_id, search_parameters=None):
    """Download all vacancies for ``position`` in ``area_id``.

    Usage: vacancies = download_vacancies('Data Analyst', get_area_id_by_name('Москва'))

    Args:
        position: search text sent as the ``text`` parameter.
        area_id: area id sent as the ``area`` parameter.
        search_parameters: optional dict of extra query parameters; its keys
            override the base ones on conflict, so passing 'per_page' or
            'page' here would interfere with the paging done below.
            Available parameters:
            https://github.com/hhru/api/blob/master/docs/vacancies.md#поиск-по-вакансиям

    Returns:
        A list with the ``items`` of every fetched page (vacancy dicts as
        decoded from JSON); an empty list when nothing is found.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    # Normalize once so the same dict is used for the count and for every page.
    search_parameters = search_parameters or {}

    vacancies_count = get_vacancies_count(position, area_id, search_parameters)
    if vacancies_count == 0:
        return []

    url = 'https://api.hh.ru/vacancies'
    vacancies = []
    vacancies_per_page = 100
    # Ceiling division: a partially filled last page still has to be fetched
    # (round() dropped it whenever the remainder was below half a page).
    pages_count = (vacancies_count + vacancies_per_page - 1) // vacancies_per_page

    for page in range(pages_count):
        print(f"Parsing {page} page...")
        parameters = {
            'text': position,
            'area': area_id,
            'per_page': vacancies_per_page,
            'page': page
        }
        all_parameters = {**parameters, **search_parameters}
        response = requests.get(url, params=all_parameters)
        # Stop on an HTTP error instead of failing with a KeyError on 'items'.
        response.raise_for_status()
        vacancies += response.json()['items']

    return vacancies

# Example: download the 'Data Analyst' vacancies for the area named 'Минск'
# with the employment=full filter and print the sixth one.
# NOTE: vacancies[5] raises IndexError when fewer than six vacancies come back.
minsk_id = get_area_id_by_name('Минск')
vacancies = download_vacancies('Data Analyst', minsk_id, {"employment": "full"})
print(vacancies[5])

Parsing 0 page...
{'id': '38783919', 'premium': False, 'name': 'Data Analyst', 'department': None, 'has_test': False, 'response_letter_required': False, 'area': {'id': '1002', 'name': 'Минск', 'url': 'https://api.hh.ru/areas/1002'}, 'salary': None, 'type': {'id': 'open', 'name': 'Открытая'}, 'address': {'city': 'Минск', 'street': 'Зыбицкая улица', 'building': '10', 'description': None, 'lat': 53.906337, 'lng': 27.556603, 'raw': 'Минск, Зыбицкая улица, 10', 'metro': {'station_name': 'Немига', 'line_name': 'Автозаводская', 'station_id': '63.422', 'line_id': '63', 'lat': 53.905615, 'lng': 27.55415}, 'metro_stations': [{'station_name': 'Немига', 'line_name': 'Автозаводская', 'station_id': '63.422', 'line_id': '63', 'lat': 53.905615, 'lng': 27.55415}], 'id': '1733385'}, 'response_url': None, 'sort_point_distance': None, 'employer': {'id': '3603119', 'name': 'Апподил', 'url': 'https://api.hh.ru/employers/3603119', 'alternate_url': 'https://hh.ru/employer/3603119', 'logo_urls': {'original': '

In [5]:
moscow_id = get_area_id_by_name('Москва')
minsk_id = get_area_id_by_name('Минск')

print(get_vacancies_count('data analyst', moscow_id))

# Each vacancy has such fields (they may be NULL) as
# ['name'] : str,
# ['salary']['from'] : int,
# ['salary']['to'] : int,
# ['salary']['gross'] : boolean,
# ['salary']['currency'] : str (RUR, BYN, USD, EUR, etc),
# ['snippet']['requirement'] : str
# ['snippet']['responsibility'] : str
# The full list of fields is described here: https://github.com/hhru/api/blob/master/docs/vacancies.md#ответ-1
vacancies = download_vacancies('Data analyst', moscow_id)

# 'salary' is None for vacancies that do not publish one, and the result list
# may be empty, so guard both before reading the lower salary bound.
first_salary = vacancies[0]['salary'] if vacancies else None
print(first_salary['from'] if first_salary else None)

886
Parsing 0 page...
Parsing 1 page...
Parsing 2 page...
Parsing 3 page...
Parsing 4 page...
Parsing 5 page...
Parsing 6 page...
Parsing 7 page...
Parsing 8 page...
100000


In [6]:
# Show only the first few vacancies; displaying the whole list floods the
# notebook output with every downloaded record.
vacancies[:3]

[{'id': '38812932',
  'premium': False,
  'name': 'Analyst/Consultant',
  'department': None,
  'has_test': False,
  'response_letter_required': False,
  'area': {'id': '1', 'name': 'Москва', 'url': 'https://api.hh.ru/areas/1'},
  'salary': {'from': 100000, 'to': 250000, 'currency': 'RUR', 'gross': True},
  'type': {'id': 'open', 'name': 'Открытая'},
  'address': None,
  'response_url': None,
  'sort_point_distance': None,
  'employer': {'id': '4460664',
   'name': 'Senteo Inc',
   'url': 'https://api.hh.ru/employers/4460664',
   'alternate_url': 'https://hh.ru/employer/4460664',
   'logo_urls': {'original': 'https://hhcdn.ru/employer-logo-original/709016.png',
    '90': 'https://hhcdn.ru/employer-logo/3277180.png',
    '240': 'https://hhcdn.ru/employer-logo/3277181.png'},
   'vacancies_url': 'https://api.hh.ru/vacancies?employer_id=4460664',
   'trusted': True},
  'published_at': '2020-08-25T11:31:05+0300',
  'created_at': '2020-08-25T11:31:05+0300',
  'archived': False,
  'apply_alte

In [7]:
# Start from an empty frame that only declares the columns
# to be extracted from the vacancy JSON.
df = pd.DataFrame(
    columns=[
        'Job_title',
        'Salary',
        'Organization',
        'Metro_station',
        'Job_description',
    ],
)

In [11]:
# Fill in dataframe.
# Rows are collected in a plain list and the frame is built once:
# DataFrame.append in a loop copies the frame on every iteration, is not
# available in pandas >= 2.0, and duplicated rows when the cell was re-run.
rows = []
for vacancy in vacancies:
    # Any of these nested objects may be None in the API answer.
    department = vacancy['department']
    snippet = vacancy['snippet']
    address = vacancy['address']
    metro = address['metro'] if address else None

    rows.append({
        'Job_title': vacancy['name'],
        'Salary': vacancy['salary'],
        # NOTE(review): 'Organization' is filled from the vacancy's department,
        # not from vacancy['employer']['name'] — confirm this is intended.
        'Organization': department['name'] if department else None,
        'Metro_station': metro['station_name'] if metro else None,
        'Job_description': snippet['requirement'] if snippet else None,
    })

# Passing columns keeps the column order and yields the same empty frame
# layout when no vacancies were downloaded.
df = pd.DataFrame(
    rows,
    columns=['Job_title', 'Salary', 'Organization', 'Metro_station', 'Job_description'],
)

In [12]:
# Preview the first rows of the assembled frame.
df.head()

Unnamed: 0,Job_title,Salary,Organization,Metro_station,Job_description
0,Analyst/Consultant,"{'from': 100000, 'to': 250000, 'currency': 'RU...",,,Deep knowledge of Excel and working with <high...
1,Data Analyst (Digital),"{'from': 2700, 'to': 3400, 'currency': 'EUR', ...",,,Being able to formalize and visualize the resu...
2,Data Analyst в Пикабу (удаленно),"{'from': 70000, 'to': 130000, 'currency': 'RUR...",,,Уверенный SQL. Базовый Python + Pandas. Матста...
3,Руководитель департамента data аналитики,"{'from': 280000, 'to': 300000, 'currency': 'RU...",,Кропоткинская,Умение ставить и контролировать качество выпол...
4,"Intern, Data Science",,,Тульская,Знание методов статистического анализа: аппара...


In [13]:
# Summarize the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 886 entries, 0 to 885
Data columns (total 5 columns):
Job_title          886 non-null object
Salary             134 non-null object
Organization       174 non-null object
Metro_station      361 non-null object
Job_description    883 non-null object
dtypes: object(5)
memory usage: 34.7+ KB
