In [10]:
import pandas as pd
import json
import requests
from tqdm import tqdm
import time
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials
from concurrent.futures import ThreadPoolExecutor, as_completed

# Search text sent to the hh.ru vacancies API as the `text` query parameter;
# it is also reused in the progress-bar label and the output CSV file name.
job_title = "'Аналитик данных' and 'data analyst'"
# Region id sent to the API as the `area` query parameter.
# NOTE(review): 1 is presumably Moscow in hh.ru's area dictionary — confirm.
area = 1


def getPage(page=0):
    """Fetch one page of vacancy search results from the hh.ru API.

    The search text and region come from the module-level ``job_title`` and
    ``area``; up to 100 vacancies are requested per page.

    Args:
        page: Page index sent as the ``page`` query parameter (default 0).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    params = {
        'text': job_title,
        'area': area,
        'page': page,
        'per_page': 100
    }
    # The context manager releases the connection even when raise_for_status()
    # or json() raises; the timeout keeps a stalled request from blocking a
    # worker thread forever.
    with requests.get('https://api.hh.ru/vacancies', params=params,
                      timeout=30) as response:
        response.raise_for_status()
        return response.json()


def parse_page(page):
    """Return the ``'items'`` entry of the search response for ``page``."""
    return getPage(page)['items']


# Download result pages 0-19 concurrently on a pool of 10 worker threads.
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(parse_page, page) for page in range(0, 20)]
    # total= lets tqdm draw a real progress bar instead of a bare counter.
    progress = tqdm(as_completed(futures), total=len(futures),
                    desc='Парсим по запросу ' + job_title)
    for future in progress:
        # Re-raise a worker's exception as soon as that worker finishes.
        future.result()

# Assemble the rows in page (submission) order so the resulting table does
# not depend on which request happened to finish first.
data = []
for future in futures:
    data.extend(future.result())

# Flatten the nested vacancy dicts into a table; json_normalize joins nested
# keys with '.', which is replaced by '_' to get plain column names.
df = pd.json_normalize(data).rename(columns=lambda c: c.replace('.', '_'))

# Columns kept for the export, in output order.
export_columns = [
    'id', 'name',
    'published_at',
    'url',
    'area_name',
    'salary_from',
    'salary_to',
    'salary_currency',
    'employer_name',
    'employer_url',
    'snippet_requirement',
    'snippet_responsibility',
    'address_raw', 'address_lat',
    'address_lng', 'address_metro_station_name',
]

# reindex() instead of df[[...]]: a column that is absent from the response
# (e.g. no vacancy carries an address, or the search returned nothing) is
# created empty rather than raising KeyError.
df = df.reindex(columns=export_columns)


def highlighttext(column):
    """Remove ``<highlighttext>`` markup tags from a string column.

    Only the opening and closing ``<highlighttext>`` tags are deleted; other
    occurrences of ``<``, ``>`` and ``/`` in the text (e.g. "A/B", "CI/CD")
    are left intact.

    Args:
        column: A pandas Series of strings; missing values stay missing.

    Returns:
        A new Series with the tags removed.
    """
    # Raw string: the pattern contains no escapes that Python itself should
    # interpret, and '/?' matches both the opening and the closing tag.
    return column.str.replace(r'</?highlighttext>', '', regex=True)


# Clean both snippet columns with highlighttext().
for snippet_column in ('snippet_responsibility', 'snippet_requirement'):
    df[snippet_column] = highlighttext(df[snippet_column])

# Build the output file names. The timestamped variant is computed as well,
# but the CSV is written under the plain name.
timestr = time.strftime("%Y%m%d-%H%M%S")
csv_name = f"{job_title}.csv"
csv_name_with_timestamp = f"{job_title}{timestr}.csv"
df.to_csv(csv_name, index=False)
print('Парсинг выполнен в', pd.Timestamp.now())

# The Google Sheets upload below is disabled by keeping it inside a string
# literal. NOTE(review): it calls `gspread`, which is not among the imports
# visible in this cell — confirm before re-enabling.
"""# Выгрузка в Google Sheets
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

my_mail = 'your_mail'  # почта
path_to_credentials = 'crdentials.json'

# Authorization
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    'your_keyfile_name.json', scope)
gs = gspread.authorize(credentials)

table_name = 'HH'
sheet = gs.create('HH')

sheet.share(my_mail, perm_type='user', role='writer')

spreadsheet_name = 'HH'
sheet = 'HH'
d2g.upload(df, table_name, sheet, credentials=credentials, row_names=True)
print('Парсинг выполнен в', pd.Timestamp.now())
"""

Парсим по запросу 'Аналитик данных' and 'data analyst': 20it [00:01, 19.01it/s]


Парсинг выполнен в 2023-03-09 21:24:23.686556


"# Выгрузка в Google Sheets\nscope = ['https://spreadsheets.google.com/feeds',\n         'https://www.googleapis.com/auth/drive']\n\nmy_mail = 'your_mail'  # почта\npath_to_credentials = 'crdentials.json'\n\n# Authorization\ncredentials = ServiceAccountCredentials.from_json_keyfile_name(\n    'your_keyfile_name.json', scope)\ngs = gspread.authorize(credentials)\n\ntable_name = 'HH'\nsheet = gs.create('HH')\n\nsheet.share(my_mail, perm_type='user', role='writer')\n\nspreadsheet_name = 'HH'\nsheet = 'HH'\nd2g.upload(df, table_name, sheet, credentials=credentials, row_names=True)\nprint('Парсинг выполнен в', pd.Timestamp.now())\n"