In [1]:
import re
from pprint import pprint
from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as bs

In [2]:
# Browser-like User-Agent string; it is placed into the request headers below.
# NOTE(review): the trailing ',gzip(gfe)' does not look like part of a genuine
# browser User-Agent (possibly copied from a proxy-modified header) — confirm.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54,gzip(gfe)'

In [3]:
# HTTP headers passed to requests.get when the search pages are fetched.
headers = {
    'User-Agent': user_agent
}

In [4]:
# First search-results page to scrape: the "programmist_python" vacancy
# listing on the ekaterinburg.hh.ru subdomain.
url = 'https://ekaterinburg.hh.ru/vacancies/programmist_python?hhtmFromLabel=rainbow_profession&hhtmFrom=main&customDomain=1'

In [24]:
def parse_vacancy_hh(dom_vacancy):
    """Build one vacancy record from a single search-result card.

    Args:
        dom_vacancy: Parsed card element; only its ``find`` method is used
            (in this notebook it is a BeautifulSoup tag).

    Returns:
        dict with the keys ``vacancy_name``, ``vacancy_salary`` (raw salary
        text, or ``None`` when the card shows no salary), ``min_salary``,
        ``max_salary``, ``currency_salary``, ``vacancy_link`` and
        ``vacancy_source``. Name and link are ``None`` when the card
        contains no ``<a>`` element.
    """
    # NOTE(review): this pause runs once per card even though no request is
    # made here; it is kept because it is the only delay in the scraping loop.
    sleep(0.1)

    # The first link in the card supplies both the title and the URL, so it is
    # looked up once and guarded against being absent.
    anchor = dom_vacancy.find('a')
    vacancy_name = anchor.text if anchor is not None else None
    vacancy_link = anchor.get('href') if anchor is not None else None

    # attrs has to be a dict mapping attribute name to the expected value.
    salary_tag = dom_vacancy.find('span', {'class': 'bloko-header-section-3'})
    if salary_tag is not None:
        vacancy_salary = salary_tag.text
        min_salary, max_salary, currency_salary = clean_salary(vacancy_salary)
    else:
        vacancy_salary = None
        min_salary, max_salary, currency_salary = None, None, None

    return {
        'vacancy_name': vacancy_name,
        'vacancy_salary': vacancy_salary,
        'min_salary': min_salary,
        'max_salary': max_salary,
        'currency_salary': currency_salary,
        'vacancy_link': vacancy_link,
        'vacancy_source': 'hh.ru',
    }

In [25]:
def parse_hh(url_page, headers, result=None, index_page=1):
    """Collect vacancy records from a search-results page and all following pages.

    Pages are fetched one after another by following the "next page" link
    until a page has no such link or a request returns a non-200 status.

    Args:
        url_page: URL of the first results page to fetch.
        headers: HTTP headers passed to ``requests.get``.
        result: Optional list to append records to. A new list is created
            when omitted, so separate calls never share state.
        index_page: Number shown in the progress output for the first page.

    Returns:
        The list of vacancy dicts produced by ``parse_vacancy_hh`` (the same
        object as ``result`` when one was passed in).
    """
    if result is None:
        result = []

    # Iterate instead of recursing so the number of pages is not bounded by
    # the interpreter's recursion limit.
    while url_page:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url_page, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f'Request failed with status {response.status_code}: {response.url}')
            break
        print(f'Page №{index_page}, link: {response.url}')

        dom = bs(response.content, 'html.parser')
        for vacancy in dom.find_all('div', {'class': 'vacancy-serp-item__layout'}):
            result.append(parse_vacancy_hh(vacancy))

        link_next_page = dom.find('a', {'data-qa': 'pager-next'})
        # Resolve the pager href against the page just fetched so the function
        # is not tied to a single hard-coded host.
        url_page = urljoin(response.url, link_next_page['href']) if link_next_page else None
        index_page += 1

    print('Parsing completed')
    return result

In [35]:
def clean_salary(vacancy_salary_text, min_salary=None, max_salary=None, currency_salary=None):
    """Split a salary string into minimum, maximum and currency.

    Recognised forms are ``'от N ...'`` (minimum), ``'до N ...'`` (maximum),
    their combination, and ``'N – M ...'`` (range). Digit groups separated by
    any whitespace, including the narrow no-break space U+202F, are joined
    into a single number first.

    Args:
        vacancy_salary_text: Raw salary text, e.g. ``'от 60\\u202f000 руб.'``.
        min_salary: Value returned as the minimum when the text has none.
        max_salary: Value returned as the maximum when the text has none.
        currency_salary: Value returned as the currency when the text is empty.

    Returns:
        Tuple ``(min_salary, max_salary, currency_salary)``. Amounts are
        ``int``; the currency is the last whitespace-separated token of the
        text. Tokens that are not plain decimal numbers are never converted,
        so unexpected text leaves the corresponding amount at its default.
    """
    # Glue thousands groups together ('100\u202f000' -> '100000') rather than
    # dropping them, so the full amount survives the split below.
    tokens = re.sub(r'(?<=\d)\s+(?=\d)', '', vacancy_salary_text).split()
    if not tokens:
        return min_salary, max_salary, currency_salary

    range_dashes = ('\u2013', '\u2014', '-')  # en dash, em dash, hyphen
    for i in range(len(tokens) - 1):
        marker, following = tokens[i], tokens[i + 1]
        if not following.isdecimal():
            continue
        if marker == 'от':
            min_salary = int(following)
        elif marker == 'до':
            max_salary = int(following)
        elif marker in range_dashes and i > 0 and tokens[i - 1].isdecimal():
            # i > 0 prevents a leading dash from wrapping around to tokens[-1].
            min_salary = int(tokens[i - 1])
            max_salary = int(following)

    currency_salary = tokens[-1]
    return min_salary, max_salary, currency_salary

In [36]:
# Pass a fresh list explicitly so that re-running this cell always starts from
# an empty result and never extends records collected by an earlier call.
result = parse_hh(url, headers, result=[])

Page №1, link: https://ekaterinburg.hh.ru/vacancies/programmist_python?hhtmFromLabel=rainbow_profession&hhtmFrom=main&customDomain=1
Page №2, link: https://ekaterinburg.hh.ru/vacancies/programmist_python?hhtmFromLabel=rainbow_profession&hhtmFrom=main&customDomain=1&page=1
Page №3, link: https://ekaterinburg.hh.ru/vacancies/programmist_python?hhtmFromLabel=rainbow_profession&hhtmFrom=main&customDomain=1&page=2
Parsing completed


In [37]:
# Number of vacancy records currently held in `result`.
len(result)

381

In [38]:
# Show only the first few records; displaying the whole list floods the
# notebook output with hundreds of dicts.
result[:5]

[{'vacancy_name': 'Middle Python разработчик',
  'vacancy_salary': '100\u202f000 – 200\u202f000 руб.',
  'min_salary': 100000,
  'max_salary': 200000,
  'currency_salary': 'руб.',
  'vacancy_link': 'https://ekaterinburg.hh.ru/vacancy/73444689?query=%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%81%D1%82+Python',
  'vacancy_source': 'hh.ru'},
 {'vacancy_name': 'Python разработчик',
  'vacancy_salary': '100\u202f000 – 200\u202f000 руб.',
  'min_salary': 100000,
  'max_salary': 200000,
  'currency_salary': 'руб.',
  'vacancy_link': 'https://ekaterinburg.hh.ru/vacancy/73712164?query=%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%81%D1%82+Python',
  'vacancy_source': 'hh.ru'},
 {'vacancy_name': 'Backend-разработчик на Python',
  'vacancy_salary': 'от 60\u202f000 руб.',
  'min_salary': 60000,
  'max_salary': None,
  'currency_salary': 'руб.',
  'vacancy_link': 'https://ekaterinburg.hh.ru/vacancy/73112663?query=%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B8%D1%81