In [7]:
import pandas as pd
import json
import requests
from tqdm import tqdm
import time
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials
from concurrent.futures import ThreadPoolExecutor, as_completed
import gspread

job_title = "'Аналитик данных' and 'data analyst'"
area = 1


def get_vacancies(page):
    params = {
        'text': job_title,
        'area': area,
        'page': page,
        'per_page': 100
    }
    req = requests.get('https://api.hh.ru/vacancies', params)
    req.raise_for_status()
    data = req.json()
    req.close()
    return data['items']


with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(get_vacancies, page) for page in tqdm(range(0, 20))]
    data = []
    for future in as_completed(futures):
        data.extend(future.result())

df = pd.json_normalize(data).rename(columns=lambda c: c.replace('.', '_'))

df = df[['id', 'name',
         'published_at',
         'url',
         'area_name',
         'salary_from',
         'salary_to',
         'salary_currency',
         'employer_name',
         'employer_url',
         'snippet_requirement',
         'snippet_responsibility',
         'address_raw', 'address_lat',
         'address_lng', 'address_metro_station_name']]

df['url'] = df['url'].str.replace('https://api.hh.ru/vacancies/', 'https://hh.ru/vacancy/')
df['url'] = df['url'].str.split('?').str[0]




def highlighttext(column):
    return column.str.replace('[<\>\/]', '', regex=True).str.replace('highlighttext', '', regex=True)


df.snippet_responsibility = highlighttext(df.snippet_responsibility)
df.snippet_requirement = highlighttext(df.snippet_requirement)

timestr = time.strftime("%Y%m%d-%H%M%S")
csv_name = job_title + ".csv"
csv_name_with_timestamp = job_title + timestr + ".csv"
df.to_csv(csv_name, index=False)
print('Парсинг выполнен' + ' в', pd.Timestamp.now())
# Выгрузка в Google Sheets
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

my_mail = 'kirill.st.ks@gmail.com'  # почта
path_to_credentials = 'crdentials.json'

try:
    # Authorization
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        'key.json', scope)
    gs = gspread.authorize(credentials)

    table_name = 'HH'
    sheet = gs.create('HH')

    sheet.share(my_mail, perm_type='user', role='writer')

    spreadsheet_name = 'HH'
    sheet = 'HH'
    d2g.upload(df, table_name, sheet, credentials=credentials, row_names=True)
    print('Выполнено')

except FileNotFoundError:
    print('json не найден!')



100%|██████████| 20/20 [00:00<00:00, 1333.18it/s]


Парсинг выполнен в 2023-03-22 18:10:55.031684
json не найден!


  df['url'] = df['url'].str.replace('https://api.hh.ru/vacancies/', 'https://hh.ru/vacancy/')
