In [1]:
import json
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import sleep
import time
from bs4 import BeautifulSoup
import re

In [2]:
URL = "https://api.hh.ru/vacancies"

In [3]:
job_titles_list = ["data scientist", "data analyst"]

In [4]:
base_params_template = {
    'area': 113,           # –†–µ–≥–∏–æ–Ω: –†–æ—Å—Å–∏—è
    'per_page': 100,       
    'order_by': 'publication_time' 
}

In [5]:
all_vacancies_data = []

In [6]:
MAX_PAGES = 4

In [13]:
def download_vacancy(base_params, page_num):
    """
    –í—ã–ø–æ–ª–Ω—è–µ—Ç HTTP-–∑–∞–ø—Ä–æ—Å –∫ API HH.ru —Å –æ–±–Ω–æ–≤–ª–µ–Ω–∏–µ–º –Ω–æ–º–µ—Ä–∞ —Å—Ç—Ä–∞–Ω–∏—Ü—ã.
    """
    # 1. –ö–û–ü–ò–†–£–ï–ú –±–∞–∑–æ–≤—ã–π —à–∞–±–ª–æ–Ω –∏ –æ–±–Ω–æ–≤–ª—è–µ–º –Ω–æ–º–µ—Ä —Å—Ç—Ä–∞–Ω–∏—Ü—ã (–ö–õ–Æ–ß–ï–í–û–ô –®–ê–ì!)
    current_params = base_params.copy() 
    current_params["page"] = page_num 
    
    try:
        response = requests.get(URL, params=current_params)
        response.raise_for_status() # –í—ã–∑—ã–≤–∞–µ—Ç –∏—Å–∫–ª—é—á–µ–Ω–∏–µ –¥–ª—è 4xx/5xx –æ—à–∏–±–æ–∫
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"–û—à–∏–±–∫–∞ HTTP –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ {page_num}: {e}")
        return None
    except json.JSONDecodeError:
        print(f"–û—à–∏–±–∫–∞ –¥–µ–∫–æ–¥–∏—Ä–æ–≤–∞–Ω–∏—è JSON –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ {page_num}")
        return None

In [15]:
def get_full_details(vacancy_id):
    """
    –ü–æ–ª—É—á–∞–µ—Ç –ø–æ–ª–Ω–æ–µ –æ–ø–∏—Å–∞–Ω–∏–µ –≤–∞–∫–∞–Ω—Å–∏–∏ –ø–æ –µ–µ ID.
    """
    detail_url = f"https://api.hh.ru/vacancies/{vacancy_id}"
    try:
        response = requests.get(detail_url)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"–û—à–∏–±–∫–∞ –ø–æ–ª—É—á–µ–Ω–∏—è –¥–µ—Ç–∞–ª–µ–π –¥–ª—è ID {vacancy_id}: {e}")
        return None

In [17]:
# 3. –§–£–ù–ö–¶–ò–Ø –û–ë–†–ê–ë–û–¢–ö–ò –î–ê–ù–ù–´–• (–°–ø–ª—é—â–∏–≤–∞–Ω–∏–µ)
def process_vacancy(vacancy, search_term):
    salary_data = vacancy.get('salary')
    min_salary = None
    if salary_data and isinstance(salary_data, dict) and salary_data.get('from'):
        min_salary = salary_data['from']
        
    # 2. –†–∞–±–æ—Ç–æ–¥–∞—Ç–µ–ª—å: –≤–ª–æ–∂–µ–Ω–Ω–æ–µ –ø–æ–ª–µ
    employer_info = vacancy.get('employer', {})
    employer_name = employer_info.get('name', 'N/A')
    
    # 3. –ì–æ—Ä–æ–¥: –≤–ª–æ–∂–µ–Ω–Ω–æ–µ –ø–æ–ª–µ
    area_info = vacancy.get('area', {})
    city_name = area_info.get('name', 'N/A')
    
    # 4. –¢—Ä–µ–±–æ–≤–∞–Ω–∏—è –∏ –æ–±—è–∑–∞–Ω–Ω–æ—Å—Ç–∏ (–∏–∑–≤–ª–µ–∫–∞–µ–º —á–∏—Å—Ç—ã–π —Ç–µ–∫—Å—Ç)
    snippet = vacancy.get('snippet', {})
    requirement = snippet.get('requirement', '')
    responsibility = snippet.get('responsibility', '')
    
    # 5. –û–ø—ã—Ç: –≤–ª–æ–∂–µ–Ω–Ω–æ–µ –ø–æ–ª–µ
    experience_info = vacancy.get('experience', {})
    experience_name = experience_info.get('name', 'N/A')

    # 1. –ü–æ–ª—É—á–∞–µ–º ID –≤–∞–∫–∞–Ω—Å–∏–∏
    vacancy_id = vacancy.get('id')
    full_description = "N/A"
    
    if vacancy_id:
        # 2. –î–µ–ª–∞–µ–º –Ω–æ–≤—ã–π –∑–∞–ø—Ä–æ—Å –∑–∞ –¥–µ—Ç–∞–ª—è–º–∏
        full_data = get_full_details(vacancy_id)
        
        # 3. –ò–∑–≤–ª–µ–∫–∞–µ–º –ø–æ–ª–µ 'description' (—Ç–∞–º HTML-–∫–æ–¥)
        if full_data and full_data.get('description'):
            full_description = full_data['description']
        
        # –û–ß–ï–ù–¨ –í–ê–ñ–ù–û: –î–æ–±–∞–≤–ª—è–µ–º –Ω–µ–±–æ–ª—å—à—É—é –ø–∞—É–∑—É –ø–æ—Å–ª–µ –∫–∞–∂–¥–æ–≥–æ –∑–∞–ø—Ä–æ—Å–∞ –¥–µ—Ç–∞–ª–µ–π
        time.sleep(0.1)

    # –°–æ–∑–¥–∞–Ω–∏–µ –ø–ª–æ—Å–∫–æ–≥–æ —Å–ª–æ–≤–∞—Ä—è
    flat_row = {
        'full_description': full_description,
        'search_term': search_term, # –ö–∞–∫—É—é –¥–æ–ª–∂–Ω–æ—Å—Ç—å –º—ã –∏—Å–∫–∞–ª–∏
        'vacancy_id': vacancy.get('id'),
        'vacancy_name': vacancy.get('name'),
        'city_name': city_name,
        'min_salary': min_salary,
        'employer_name': employer_name,
        'published_at': vacancy.get('published_at'),
        'experience': experience_name,
        'schedule': vacancy.get('schedule', {}).get('name'),
        'employment': vacancy.get('employment', {}).get('name'),
        'requirement': requirement,
        'responsibility': responsibility,
        # –î–æ–±–∞–≤—å—Ç–µ –¥—Ä—É–≥–∏–µ –ø–æ–ª—è –ø–æ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ—Å—Ç–∏
    }
    return flat_row

In [19]:
# 4. –û–°–ù–û–í–ù–û–ô –¶–ò–ö–õ –ü–ê–†–°–ò–ù–ì–ê
for job in job_titles_list:
    print(f"--- –ù–∞—á–∏–Ω–∞–µ–º –ø–∞—Ä—Å–∏–Ω–≥ –¥–ª—è: '{job}' ---")
    
    # 1. –û–ø—Ä–µ–¥–µ–ª—è–µ–º –ø–∞—Ä–∞–º–µ—Ç—Ä—ã –¥–ª—è —Ç–µ–∫—É—â–µ–π –ø—Ä–æ—Ñ–µ—Å—Å–∏–∏ (–¥–æ–±–∞–≤–ª—è–µ–º 'text')
    current_search_params = base_params_template.copy()
    current_search_params['text'] = job
    
    for page in range(MAX_PAGES):
        vac_data = download_vacancy(current_search_params, page)
        
        # –ï—Å–ª–∏ –∑–∞–≥—Ä—É–∑–∫–∞ –Ω–µ —É–¥–∞–ª–∞—Å—å (–≤–µ—Ä–Ω—É–ª–∞ None –∏–ª–∏ –ø—É—Å—Ç–æ–π —Å–ª–æ–≤–∞—Ä—å)
        if not vac_data:
            print(f"–ü—Ä–æ–ø—É—Å–∫ —Å—Ç—Ä–∞–Ω–∏—Ü—ã {page} –∏–∑-–∑–∞ –æ—à–∏–±–∫–∏ –∑–∞–ø—Ä–æ—Å–∞.")
            continue # –ü–µ—Ä–µ—Ö–æ–¥–∏–º –∫ —Å–ª–µ–¥—É—é—â–µ–π –∏—Ç–µ—Ä–∞—Ü–∏–∏ —Ü–∏–∫–ª–∞
        
        # –ë–ï–ó–û–ü–ê–°–ù–ê–Ø –ü–†–û–í–ï–†–ö–ê (—É—Å—Ç—Ä–∞–Ω—è–µ—Ç KeyError, –∫–∞–∫ –º—ã –æ–±—Å—É–∂–¥–∞–ª–∏)
        if 'items' in vac_data:
            vacancies_on_page = vac_data['items']
            
            if not vacancies_on_page:
                print(f"–°—Ç—Ä–∞–Ω–∏—Ü–∞ {page} –ø—É—Å—Ç–∞. –ó–∞–≤–µ—Ä—à–∞–µ–º —Å–±–æ—Ä –¥–ª—è '{job}'.")
                break # –í—ã—Ö–æ–¥–∏–º –∏–∑ —Ü–∏–∫–ª–∞ –ø–æ —Å—Ç—Ä–∞–Ω–∏—Ü–∞–º, —Ç–∞–∫ –∫–∞–∫ –≤–∞–∫–∞–Ω—Å–∏–∏ –∑–∞–∫–æ–Ω—á–∏–ª–∏—Å—å
            
            for vacancy in vacancies_on_page:
                # 2. –°–ø–ª—é—â–∏–≤–∞–µ–º –∏ –¥–æ–±–∞–≤–ª—è–µ–º –ø–ª–æ—Å–∫–∏–π —Å–ª–æ–≤–∞—Ä—å –≤ —Ñ–∏–Ω–∞–ª—å–Ω—ã–π —Å–ø–∏—Å–æ–∫
                flat_row = process_vacancy(vacancy, job)
                all_vacancies_data.append(flat_row)
                
            print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ {len(vacancies_on_page)} –≤–∞–∫–∞–Ω—Å–∏–π –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ {page}. –í—Å–µ–≥–æ: {len(all_vacancies_data)}")
            
        else:
            print(f"–û—à–∏–±–∫–∞ —Å—Ç—Ä—É–∫—Ç—É—Ä—ã –¥–∞–Ω–Ω—ã—Ö –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ {page}: –ù–µ—Ç –∫–ª—é—á–∞ 'items'.")
            
        time.sleep(0.2) # –ü–∞—É–∑–∞ –º–µ–∂–¥—É –∑–∞–ø—Ä–æ—Å–∞–º–∏

--- –ù–∞—á–∏–Ω–∞–µ–º –ø–∞—Ä—Å–∏–Ω–≥ –¥–ª—è: 'data scientist' ---
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 100 –≤–∞–∫–∞–Ω—Å–∏–π –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ 0. –í—Å–µ–≥–æ: 100
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 100 –≤–∞–∫–∞–Ω—Å–∏–π –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ 1. –í—Å–µ–≥–æ: 200
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 100 –≤–∞–∫–∞–Ω—Å–∏–π –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ 2. –í—Å–µ–≥–æ: 300
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 60 –≤–∞–∫–∞–Ω—Å–∏–π –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ 3. –í—Å–µ–≥–æ: 360
--- –ù–∞—á–∏–Ω–∞–µ–º –ø–∞—Ä—Å–∏–Ω–≥ –¥–ª—è: 'data analyst' ---
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 100 –≤–∞–∫–∞–Ω—Å–∏–π –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ 0. –í—Å–µ–≥–æ: 460
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 100 –≤–∞–∫–∞–Ω—Å–∏–π –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ 1. –í—Å–µ–≥–æ: 560
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 100 –≤–∞–∫–∞–Ω—Å–∏–π –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ 2. –í—Å–µ–≥–æ: 660
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 100 –≤–∞–∫–∞–Ω—Å–∏–π –Ω–∞ —Å—Ç—Ä–∞–Ω–∏—Ü–µ 3. –í—Å–µ–≥–æ: 760


In [21]:
# 5. –°–û–ó–î–ê–ù–ò–ï DATAFRAME –ò CSV
print("\n--- –ó–∞–≤–µ—Ä—à–µ–Ω–∏–µ —Å–±–æ—Ä–∞ –¥–∞–Ω–Ω—ã—Ö ---")
print(f"–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å–æ–±—Ä–∞–Ω–Ω—ã—Ö –∑–∞–ø–∏—Å–µ–π: {len(all_vacancies_data)}")

if all_vacancies_data:
    df = pd.DataFrame(all_vacancies_data)
    
    # –ó–∞–ø–æ–ª–Ω—è–µ–º –ø—É—Å—Ç—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è (None) –≤ –∑–∞—Ä–ø–ª–∞—Ç–µ –Ω—É–ª–µ–º –¥–ª—è —É–¥–æ–±—Å—Ç–≤–∞ –∞–Ω–∞–ª–∏–∑–∞
    df['min_salary'] = df['min_salary'].fillna(0) 
    
    filename = 'hh_vacancies_data.csv'
    # index=False –∏—Å–∫–ª—é—á–∞–µ—Ç —Å—Ç–æ–ª–±–µ—Ü —Å –∏–Ω–¥–µ–∫—Å–∞–º–∏ –∏–∑ CSV
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"\n‚úÖ –î–∞—Ç–∞—Å–µ—Ç —É—Å–ø–µ—à–Ω–æ —Å–æ–∑–¥–∞–Ω –∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω –≤ —Ñ–∞–π–ª: {filename}")
else:
    print("‚ùå –°–ø–∏—Å–æ–∫ –≤–∞–∫–∞–Ω—Å–∏–π –ø—É—Å—Ç. –§–∞–π–ª CSV –Ω–µ —Å–æ–∑–¥–∞–Ω.")


--- –ó–∞–≤–µ—Ä—à–µ–Ω–∏–µ —Å–±–æ—Ä–∞ –¥–∞–Ω–Ω—ã—Ö ---
–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å–æ–±—Ä–∞–Ω–Ω—ã—Ö –∑–∞–ø–∏—Å–µ–π: 760

‚úÖ –î–∞—Ç–∞—Å–µ—Ç —É—Å–ø–µ—à–Ω–æ —Å–æ–∑–¥–∞–Ω –∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω –≤ —Ñ–∞–π–ª: hh_vacancies_data.csv


In [23]:
df.head()

Unnamed: 0,full_description,search_term,vacancy_id,vacancy_name,city_name,min_salary,employer_name,published_at,experience,schedule,employment,requirement,snippet,responsibility
0,<p>–ü—Ä–∏–≤–µ—Ç! –≠—Ç–æ –∫–æ–º–∞–Ω–¥–∞ Research-–ø—Ä–æ–µ–∫—Ç–æ–≤ –ø–æ–∏—Å–∫...,data scientist,121071193,"–°—Ç–∞—Ä—à–∏–π Data Scientist, RND –ø—Ä–æ–µ–∫—Ç—ã –ø–æ–∏—Å–∫–∞",–ú–æ—Å–∫–≤–∞,0.0,Ozon,2025-10-25T10:49:11+0300,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω—ã–π –¥–µ–Ω—å,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–û–ø—ã—Ç —Ä–∞–±–æ—Ç—ã –≤ <highlighttext>Data</highlightte...,{'requirement': '–û–ø—ã—Ç —Ä–∞–±–æ—Ç—ã –≤ <highlighttext>...,"–†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ —Å–∏—Å—Ç–µ–º—ã –∞–≥—Ä–µ–≥–∞—Ü–∏–∏ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∏ –ø–æ ""—Ä–µ..."
1,<p><strong>–ü—Ä–∏–≤–µ—Ç!</strong></p> <p>–ú—ã —Å–ø–æ—Ä—Ç–∏–≤–Ω...,data scientist,125971527,Data Scientist / ML Engineer,–ú–æ—Å–∫–≤–∞,250000.0,–†–ë,2025-10-24T20:41:14+0300,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–û–ø—ã—Ç —Ä–∞–±–æ—Ç—ã —Å ML/DL –æ—Ç 3 –ª–µ—Ç. –ó–Ω–∞–Ω–∏–µ –∞–ª–≥–æ—Ä–∏—Ç–º–æ...,{'requirement': '–û–ø—ã—Ç —Ä–∞–±–æ—Ç—ã —Å ML/DL –æ—Ç 3 –ª–µ—Ç....,–£—á–∞—Å—Ç–≤–æ–≤–∞—Ç—å –≤ —É–ª—É—á—à–µ–Ω–∏–∏ –ø—Ä–æ–¥—É–∫—Ç–∞ –Ω–∞ –≤—Å–µ—Ö —ç—Ç–∞–ø–∞...
2,<p><strong>–ú–ï–ñ–î–£–ù–ê–†–û–î–ù–ê–Ø –ö–û–ú–ê–ù–î–ê –ò–©–ï–¢ –ü–†–û–ì–†–ê–ú–ú...,data scientist,126926550,–ü—Ä–æ–≥—Ä–∞–º–º–∏—Å—Ç hr-–ø—Ä–æ–µ–∫—Ç–∞,–°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥,184000.0,–ö–∞–º–∞–ª–æ–≤ –õ–∏–Ω–∞—Ä –ó—É—Ñ–∞—Ä–æ–≤–∏—á,2025-10-24T19:51:33+0300,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–ü—Ä–æ–¥—É–∫—Ç–æ–≤—ã–π –ª–∏–¥–µ—Ä —Å –æ–ø—ã—Ç–æ–º –ø–æ—Å—Ç—Ä–æ–µ–Ω–∏—è –º–µ–∂–¥—É–Ω–∞—Ä...,{'requirement': '–ü—Ä–æ–¥—É–∫—Ç–æ–≤—ã–π –ª–∏–¥–µ—Ä —Å –æ–ø—ã—Ç–æ–º –ø–æ...,–°—Ç–∞—Ä—à–∏–π –¥–∏–∑–∞–π–Ω–µ—Ä –∏–∑ —ç–∫–æ—Å–∏—Å—Ç–µ–º—ã Terra ‚Äî –æ—Ç–≤–µ—á–∞–µ...
3,<p>–ü—Ä–∏–≤–µ—Ç! –≠—Ç–æ –∫–æ–º–∞–Ω–¥–∞ –ü–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ –∑–∞–ø—Ä–æ—Å–æ–≤<br ...,data scientist,118916097,"–°—Ç–∞—Ä—à–∏–π Data Scientist, –ü–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ –∑–∞–ø—Ä–æ—Å–æ–≤",–ú–æ—Å–∫–≤–∞,0.0,Ozon,2025-10-24T18:20:23+0300,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,"–û—Ç–ª–∏—á–Ω–æ–µ –∑–Ω–∞–Ω–∏–µ Python, —É–º–µ–Ω–∏–µ –±—ã—Å—Ç—Ä–æ –ø–∏—Å–∞—Ç—å —á...","{'requirement': '–û—Ç–ª–∏—á–Ω–æ–µ –∑–Ω–∞–Ω–∏–µ Python, —É–º–µ–Ω–∏...","–ú—ã –∏—â–µ–º —á–µ–ª–æ–≤–µ–∫–∞, –∫–æ—Ç–æ—Ä—ã–π –ø–æ–º–æ–∂–µ—Ç –∫–æ–º–∞–Ω–¥–µ —Å –∏—Å..."
4,<p><strong>–ì–ö ¬´–¢–µ—Ö–Ω–æ–ø–∞—Ä–∫¬ª</strong> ‚Äî –ø—Ä–æ–∏–∑–≤–æ–¥–∏...,data scientist,126793682,Data Engineer/ ML Engineer,–ù–∏–∂–Ω–∏–π –ù–æ–≤–≥–æ—Ä–æ–¥,250000.0,–¢–µ—Ö–Ω–æ–ø–∞—Ä–∫,2025-10-24T17:36:18+0300,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,3+ –ª–µ—Ç –≤ —Ä–æ–ª–∏ <highlighttext>Data</highlightte...,{'requirement': '3+ –ª–µ—Ç –≤ —Ä–æ–ª–∏ <highlighttext>...,"–°–±–æ—Ä, –æ—á–∏—Å—Ç–∫–∞ –∏ –∞–≥—Ä–µ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö –∏–∑ —Ä–∞–∑–Ω—ã—Ö..."


In [25]:
df['employer_name'].value_counts()

employer_name
–°–ë–ï–†                  53
Ozon                  21
–±–∏–ª–∞–π–Ω                18
WILDBERRIES           13
–ê–ª—å—Ñ–∞-–ë–∞–Ω–∫            13
                      ..
–ú–∞—Ä—Ç—Ñ–∞—Ä–º               1
–°–±–µ—Ä–ú–µ–¥–ò–ò              1
Data Acquisition       1
Meetology              1
Lenkep recruitment     1
Name: count, Length: 425, dtype: int64

In [29]:
df['full_description'][0]

'<p>–ü—Ä–∏–≤–µ—Ç! –≠—Ç–æ –∫–æ–º–∞–Ω–¥–∞ Research-–ø—Ä–æ–µ–∫—Ç–æ–≤ –ø–æ–∏—Å–∫–∞ Ozon</p> <p>–ú—ã —Ä–∞–±–æ—Ç–∞–µ–º –Ω–∞–¥ –∏–Ω–Ω–æ–≤–∞—Ü–∏–æ–Ω–Ω—ã–º–∏ DS-–∑–∞–¥–∞—á–∞–º–∏, –∫–æ—Ç–æ—Ä—ã–µ –Ω–∞–ø—Ä—è–º—É—é –≤–ª–∏—è—é—Ç –Ω–∞ –∫–∞—á–µ—Å—Ç–≤–æ –ø–æ–∏—Å–∫–æ–≤–æ–π –≤—ã–¥–∞—á–∏ –∏ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å—Å–∫–∏–π –æ–ø—ã—Ç –º–∏–ª–ª–∏–æ–Ω–æ–≤ –∫–ª–∏–µ–Ω—Ç–æ–≤. –ù–∞–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –ø–æ–∏—Å–∫–∞ –≤–æ–∑–≥–ª–∞–≤–ª—è–µ—Ç –ò–≥–æ—Ä—å –ö—É—Ä–∞–ª–µ–Ω–æ–∫, –≤ –ø—Ä–æ—à–ª–æ–º —Ä—É–∫–æ–≤–æ–¥–∏—Ç–µ–ª—å –æ—Ç–¥–µ–ª–∞ –æ—Ü–µ–Ω–∫–∏ –∫–∞—á–µ—Å—Ç–≤–∞ –ø–æ–∏—Å–∫–∞ Yandex, R&amp;D –ª–∞–±–æ—Ä–∞—Ç–æ—Ä–∏–∏ –≤ Huawei –∏ –∞–≤—Ç–æ—Ä –¥–µ—Å—è—Ç–∫–æ–≤ –Ω–∞—É—á–Ω—ã—Ö –ø–µ–π–ø–µ—Ä–æ–≤ –ø—Ä–æ –ø–æ–∏—Å–∫, ML –∏ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–Ω—ã–µ –≤—ã—á–∏—Å–ª–µ–Ω–∏—è.</p> <p>–ú—ã —Ä–µ—à–∞–µ–º —Å–ª–æ–∂–Ω—ã–µ –∏ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–µ –∑–∞–¥–∞—á–∏ –Ω–∞ —Å—Ç—ã–∫–µ –º–∞—à–∏–Ω–Ω–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è, –∞–Ω–∞–ª–∏—Ç–∏–∫–∏ –∏ –±–∏–∑–Ω–µ—Å–∞, –≤–æ—Ç –ø—Ä–∏–º–µ—Ä –æ—Å–Ω–æ–≤–Ω—ã—Ö –ø—Ä–æ–µ–∫—Ç–æ–≤ –Ω–∞ –±–ª–∏–∂–∞–π—à–∏–π –ø–µ—Ä–∏–æ–¥:</p> <ul> <li

In [35]:
#—á–∏—Å–ª–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö –æ–ø–∏—Å–∞–Ω–∏–π
unique_description = df['full_description'].nunique()
unique_description

700

In [39]:
unique_employers = df['employer_name'].nunique()
total_vacancies = len(df)
print(f"–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –≤–∞–∫–∞–Ω—Å–∏–π: {total_vacancies}")
print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö —Ä–∞–±–æ—Ç–æ–¥–∞—Ç–µ–ª–µ–π: {unique_employers}")

–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –≤–∞–∫–∞–Ω—Å–∏–π: 760
–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö —Ä–∞–±–æ—Ç–æ–¥–∞—Ç–µ–ª–µ–π: 425


In [41]:
top_employers = df['employer_name'].value_counts().head(10)
print("\n–¢–æ–ø-10 —Ä–∞–±–æ—Ç–æ–¥–∞—Ç–µ–ª–µ–π:")
print(top_employers)


–¢–æ–ø-10 —Ä–∞–±–æ—Ç–æ–¥–∞—Ç–µ–ª–µ–π:
employer_name
–°–ë–ï–†                         53
Ozon                         21
–±–∏–ª–∞–π–Ω                       18
WILDBERRIES                  13
–ê–ª—å—Ñ–∞-–ë–∞–Ω–∫                   13
X5 Tech                      12
–†—É—Å—Å–∫–∏–π –§–æ–Ω–¥ –ù–µ–¥–≤–∏–∂–∏–º–æ—Å—Ç–∏    11
–ú–¢–°                          10
–¢-–ë–∞–Ω–∫                        8
Aston                         7
Name: count, dtype: int64


In [43]:
df_alt = df.copy()

—Å–æ–∑–¥–∞–º –ø–æ—Ä–æ–≥ –∏ –ø–æ—á–∏—â—É –¥–∞—Ç–∞—Å–µ—Ç –æ—Ç –Ω—É–ª–µ–≤—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π

In [45]:
df_alt.notnull().sum()

full_description    760
search_term         760
vacancy_id          760
vacancy_name        760
city_name           760
min_salary          760
employer_name       760
published_at        760
experience          760
schedule            760
employment          760
requirement         757
snippet             760
responsibility      758
dtype: int64

In [47]:
#df_alt = df_alt.loc[:, df_alt.notnull().sum() >= treshhold]
#df_alt.info()

In [49]:
df_alt.head(2)

Unnamed: 0,full_description,search_term,vacancy_id,vacancy_name,city_name,min_salary,employer_name,published_at,experience,schedule,employment,requirement,snippet,responsibility
0,<p>–ü—Ä–∏–≤–µ—Ç! –≠—Ç–æ –∫–æ–º–∞–Ω–¥–∞ Research-–ø—Ä–æ–µ–∫—Ç–æ–≤ –ø–æ–∏—Å–∫...,data scientist,121071193,"–°—Ç–∞—Ä—à–∏–π Data Scientist, RND –ø—Ä–æ–µ–∫—Ç—ã –ø–æ–∏—Å–∫–∞",–ú–æ—Å–∫–≤–∞,0.0,Ozon,2025-10-25T10:49:11+0300,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω—ã–π –¥–µ–Ω—å,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–û–ø—ã—Ç —Ä–∞–±–æ—Ç—ã –≤ <highlighttext>Data</highlightte...,{'requirement': '–û–ø—ã—Ç —Ä–∞–±–æ—Ç—ã –≤ <highlighttext>...,"–†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ —Å–∏—Å—Ç–µ–º—ã –∞–≥—Ä–µ–≥–∞—Ü–∏–∏ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∏ –ø–æ ""—Ä–µ..."
1,<p><strong>–ü—Ä–∏–≤–µ—Ç!</strong></p> <p>–ú—ã —Å–ø–æ—Ä—Ç–∏–≤–Ω...,data scientist,125971527,Data Scientist / ML Engineer,–ú–æ—Å–∫–≤–∞,250000.0,–†–ë,2025-10-24T20:41:14+0300,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,–û–ø—ã—Ç —Ä–∞–±–æ—Ç—ã —Å ML/DL –æ—Ç 3 –ª–µ—Ç. –ó–Ω–∞–Ω–∏–µ –∞–ª–≥–æ—Ä–∏—Ç–º–æ...,{'requirement': '–û–ø—ã—Ç —Ä–∞–±–æ—Ç—ã —Å ML/DL –æ—Ç 3 –ª–µ—Ç....,–£—á–∞—Å—Ç–≤–æ–≤–∞—Ç—å –≤ —É–ª—É—á—à–µ–Ω–∏–∏ –ø—Ä–æ–¥—É–∫—Ç–∞ –Ω–∞ –≤—Å–µ—Ö —ç—Ç–∞–ø–∞...


–û—á–∏—â—É –¥–∞—Ç–∞—Å–µ—Ç –¥–æ –º–∏–Ω–∏–º—É–º–∞ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ–π –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏

In [33]:
#features_to_save = ['id', 'name', 'full_description', 'snippet.responsibility', 'snippet.requirement',
                    #'schedule.name', 'experience.name', 'employment.name', 'address.city']
#df_final = df_alt[features_to_save]

In [34]:
#df_final.fillna('', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.fillna('', inplace=True)


In [53]:
df_alt.fillna('', inplace=True)

–æ—á–∏—â—É –¥–∞–Ω–Ω—ã–µ –æ—Ç html —Å–∏–º–≤–æ–ª–æ–≤, –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏ –∏ —Ç–¥

In [56]:
import nltk
import emoji
import unicodedata
import contractions
import inflect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import stanza

In [38]:
#nltk.download('stopwords')
#nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [58]:
nlp_ru = stanza.Pipeline(lang='ru', processors='tokenize,lemma')
russian_stopwords = set(stopwords.words('russian'));

2025-10-25 13:27:30 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  ‚Ä¶

2025-10-25 13:27:30 INFO: Downloaded file to C:\Users\User\stanza_resources\resources.json
2025-10-25 13:27:30 INFO: Loading these models for language: ru (Russian):
| Processor | Package            |
----------------------------------
| tokenize  | syntagrus          |
| lemma     | syntagrus_nocharlm |

2025-10-25 13:27:30 INFO: Using device: cpu
2025-10-25 13:27:30 INFO: Loading: tokenize
2025-10-25 13:27:42 INFO: Loading: lemma
2025-10-25 13:27:45 INFO: Done loading processors!


In [68]:
# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –æ—á–∏—Å—Ç–∫–∏ —Ç–µ–∫—Å—Ç–∞
def clean_and_lemmatize_russian(input_text, nlp_pipeline, stop_words_set):    
    if pd.isna(input_text) or input_text is None:
        return ""

    clean_text = str(input_text)
    # HTML-—Ç–µ–≥–∏
    clean_text = re.sub('<[^<]+?>', '', clean_text)
    
    # URL –∏ —Å—Å—ã–ª–∫–∏
    clean_text = re.sub(r'http\S+', '', clean_text)

    clean_text = clean_text.lower()
    # –£–±–∏—Ä–∞–µ–º —Å–ø–µ—Ü–∏–∞–ª—å–Ω—ã–µ —Å–∏–º–≤–æ–ª—ã –∏ –∑–Ω–∞–∫–∏ –ø—Ä–µ–ø–∏–Ω–∞–Ω–∏—è
    clean_text = re.sub(r'[^a-zA-Z–∞-—è–ê-–Ø—ë–Å0-9\s]', ' ', clean_text)

    # 5. –†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –Ω–∞ —Å–ª–æ–≤–∞ –∏ —É–¥–∞–ª–µ–Ω–∏–µ —Å—Ç–æ–ø-—Å–ª–æ–≤
    words = clean_text.split(' ')

    # –£–±–∏—Ä–∞–µ–º –≤—Å–µ –ø—Ä–æ–±–µ–ª—ã
    clean_text = re.sub('\s+', ' ', clean_text)
    
    # –§–∏–ª—å—Ç—Ä—É–µ–º —Å–ª–æ–≤–∞, —É–±–∏—Ä–∞—è —Å—Ç–æ–ø-—Å–ª–æ–≤–∞ –∏ –ø—É—Å—Ç—ã–µ —Å—Ç—Ä–æ–∫–∏ (–≤–æ–∑–Ω–∏–∫—à–∏–µ –ø—Ä–∏ –æ—á–∏—Å—Ç–∫–µ)
    filtered_words: List[str] = [
        word for word in words 
        if word and word not in stop_words_set
    ]
    
    # –í–æ–∑–≤—Ä–∞—â–∞–µ–º –æ—á–∏—â–µ–Ω–Ω—ã–π —Ç–µ–∫—Å—Ç, —Å–æ—Å—Ç–æ—è—â–∏–π –∏–∑ –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã—Ö —Å–ª–æ–≤ (–±–µ–∑ –ª–µ–º–º–∞—Ç–∏–∑–∞—Ü–∏–∏)
    return ' '.join(filtered_words)

  clean_text = re.sub('\s+', ' ', clean_text)


In [70]:
text = '–û –∫–æ–º–ø–∞–Ω–∏–∏ –ú—ã ‚Äî —É—Å–ø–µ—à–Ω—ã–π fashion-–±—Ä–µ–Ω–¥ —Å 15-–ª–µ—Ç–Ω–µ–π –∏—Å—Ç–æ—Ä–∏–µ–π –∏ —Å–æ–±—Å—Ç–≤–µ–Ω–Ω—ã–º –ø—Ä–æ–∏–∑–≤–æ–¥—Å—Ç–≤–æ–º –∂–µ–Ω—Å–∫–æ–π –æ–¥–µ–∂–¥—ã. –ü–æ–ª–Ω—ã–π —Ü–∏–∫–ª: –æ—Ç –∑–∞–∫—É–ø–∫–∏ —Ç–∫–∞–Ω–∏ –¥–æ –ø—Ä–æ–¥–∞–∂–∏ –Ω–∞ –º–∞—Ä–∫–µ—Ç–ø–ª–µ–π—Å–∞—Ö. –ù–∞—à –æ—Å–Ω–æ–≤–Ω–æ–π –æ–±–æ—Ä–æ—Ç –∏–¥—ë—Ç —á–µ—Ä–µ–∑ Wildberries –∏ Ozon (–≤—Ö–æ–¥–∏–º –≤ —Ç–æ–ø-1% –ø—Ä–æ–¥–∞–≤—Ü–æ–≤ –æ–¥–µ–∂–¥—ã –Ω–∞ Ozon ). –°–µ–π—á–∞—Å –º—ã –∑–∞–ø—É—Å–∫–∞–µ–º –ø—Ä–æ–µ–∫—Ç –ø–æ –ø–æ—Å—Ç—Ä–æ–µ–Ω–∏—é –∞–Ω–∞–ª–∏—Ç–∏—á–µ—Å–∫–æ–π/—Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ç–µ–ª—å–Ω–æ–π –ø–ª–∞—Ç—Ñ–æ—Ä–º—ã —Å –Ω—É–ª—è . –¶–µ–ª—å ‚Äî —Å–¥–µ–ª–∞—Ç—å —É–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –∞—Å—Å–æ—Ä—Ç–∏–º–µ–Ω—Ç–æ–º, —Ü–µ–Ω–∞–º–∏, –∑–∞–∫—É–ø–∫–∞–º–∏, –ø—Ä–æ–∏–∑–≤–æ–¥—Å—Ç–≤–æ–º –∏ —Ä–µ–∫–ª–∞–º–æ–π –º–∞–∫—Å–∏–º–∞–ª—å–Ω–æ –ø—Ä–æ–∑—Ä–∞—á–Ω—ã–º –∏ –æ—Å–Ω–æ–≤–∞–Ω–Ω—ã–º –Ω–∞ –¥–∞–Ω–Ω—ã—Ö . –ï—Å–ª–∏ –≤–Ω—É—Ç—Ä–µ–Ω–Ω—è—è —Å–∏—Å—Ç–µ–º–∞ –ø–æ–∫–∞–∂–µ—Ç —Ä–µ–∑—É–ª—å—Ç–∞—Ç, –º—ã –ø–ª–∞–Ω–∏—Ä—É–µ–º –≤—ã–≤–µ—Å—Ç–∏ –µ—ë –Ω–∞ —Ä—ã–Ω–æ–∫ –∫–∞–∫ –æ—Ç–¥–µ–ª—å–Ω—ã–π SaaS-–ø—Ä–æ–¥—É–∫—Ç –¥–ª—è –¥—Ä—É–≥–∏—Ö –ø—Ä–æ–¥–∞–≤—Ü–æ–≤ –º–∞—Ä–∫–µ—Ç–ø–ª–µ–π—Å–æ–≤. –≠—Ç–æ –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –ø—Ä–∏—Å–æ–µ–¥–∏–Ω–∏—Ç—å—Å—è –Ω–∞ —Å—Ç–∞—Ä—Ç–µ –∏ –≤–ª–∏—è—Ç—å –Ω–∞ –∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä—É –ø—Ä–æ–¥—É–∫—Ç–∞.(—É–∂–µ –µ—Å—Ç—å –∑–∞–∏–Ω—Ç–µ—Ä–µ—Å–æ–≤–∞–Ω–Ω—ã–µ —Å–µ–ª–ª–µ—Ä—ã) –ó–∞–¥–∞—á–∏ 1. –°–±–æ—Ä –¥–∞–Ω–Ω—ã—Ö –∏ –∏–Ω—Ç–µ–≥—Ä–∞—Ü–∏–∏ –ü–æ–¥–∫–ª—é—á–µ–Ω–∏–µ API Wildberries, Ozon –∏ –¥—Ä—É–≥–∏—Ö –ø–ª–æ—â–∞–¥–æ–∫. –ü–æ–ª—É—á–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö: –ü—Ä–æ–¥–∞–∂–∏ (–≤—ã—Ä—É—á–∫–∞, –∑–∞–∫–∞–∑—ã, –≤—ã–∫—É–ø—ã, –≤–æ–∑–≤—Ä–∞—Ç—ã, —Å—Ç–∞—Ç—É—Å—ã). –û—Å—Ç–∞—Ç–∫–∏ –Ω–∞ —Å–∫–ª–∞–¥–∞—Ö, –ª–æ–≥–∏—Å—Ç–∏–∫–∞. –†–µ–∫–ª–∞–º–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏ (–ø–æ–∫–∞–∑—ã, –∫–ª–∏–∫–∏, CTR, CPC, CPA, ROI). –¶–µ–Ω—ã, –¥–∏–Ω–∞–º–∏–∫–∞ —Å–∫–∏–¥–æ–∫, —É—á–∞—Å—Ç–∏–µ –≤ –∞–∫—Ü–∏—è—Ö. –ü–æ–∑–∏—Ü–∏–∏ –≤ –ø–æ–∏—Å–∫–µ, –æ—Ç–∑—ã–≤—ã, —Ä–µ–π—Ç–∏–Ω–≥. –ò–Ω—Ç–µ–≥—Ä–∞—Ü–∏—è –¥–∞–Ω–Ω—ã—Ö –∏–∑ 1–° (–ø–æ ODATA) . –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∏–∑ Excel/CSV. –ü–∞—Ä—Å–∏–Ω–≥ –º–∞—Ä–∫–µ—Ç–ø–ª–µ–π—Å–æ–≤ –¥–ª—è –∫–æ–Ω–∫—É—Ä–µ–Ω—Ç–Ω–æ–≥–æ –∞–Ω–∞–ª–∏–∑–∞. 2. –ü–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ —Ö—Ä–∞–Ω–∏–ª–∏—â–∞ –¥–∞–Ω–Ω—ã—Ö –ü—Ä–æ–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ –∏ —Ä–µ–∞–ª–∏–∑–∞—Ü–∏—è Data LakeHouse . –•—Ä–∞–Ω–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö –≤ S3 –∏–ª–∏ –∞–Ω–∞–ª–æ–≥–∞—Ö , –ø–µ—Ä–≤–∏—á–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞. –°–æ–∑–¥–∞–Ω–∏–µ —Å—Ç—Ä—É–∫—Ç—É—Ä—ã –¥–∞–Ω–Ω—ã—Ö (—Å—ã—Ä—ã–µ ‚Üí –æ—á–∏—â–µ–Ω–Ω—ã–µ ‚Üí –≤–∏—Ç—Ä–∏–Ω—ã). –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ ClickHouse –∏–ª–∏ –¥—Ä—É–≥–∏—Ö –∫–æ–ª–æ–Ω–æ—á–Ω—ã—Ö –ë–î. –ù–∞—Å—Ç—Ä–æ–π–∫–∞ –±–∞–∑–æ–≤—ã—Ö ETL/ELT-–ø—Ä–æ—Ü–µ—Å—Å–æ–≤ . 3. –ê–Ω–∞–ª–∏—Ç–∏–∫–∞ –∏ –ø—Ä–æ–≥–Ω–æ–∑–∏—Ä–æ–≤–∞–Ω–∏–µ –ü—Ä–æ–≥–Ω–æ–∑ —Å–ø—Ä–æ—Å–∞ –∏ –ø—Ä–æ–¥–∞–∂ (–ø–æ —Ç–æ–≤–∞—Ä–∞–º –∏ –∫–∞—Ç–µ–≥–æ—Ä–∏—è–º). –û–ø—Ç–∏–º–∏–∑–∞—Ü–∏—è –æ—Å—Ç–∞—Ç–∫–æ–≤ –∏ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ—Å—Ç–∞–≤–æ–∫ –ø–æ —Å–∫–ª–∞–¥–∞–º WB/Ozon . –ê–≤—Ç–æ–º–∞—Ç–∏–∑–∞—Ü–∏—è —Ä–µ–∫–ª–∞–º–Ω—ã—Ö –∫–∞–º–ø–∞–Ω–∏–π: –¥–∏–Ω–∞–º–∏—á–µ—Å–∫–∏–µ —Å—Ç–∞–≤–∫–∏, —É–¥–∞–ª–µ–Ω–∏–µ –Ω–µ—ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω—ã—Ö –∫–ª—é—á–µ–π –∏ –∫–ª–∞—Å—Ç–µ—Ä–æ–≤, –∞–Ω–∞–ª–∏–∑ CTR/–∫–æ–Ω–≤–µ—Ä—Å–∏–π. –†–∞—Å—á—ë—Ç –º–∞—Ä–∂–∏–Ω–∞–ª—å–Ω–æ—Å—Ç–∏ –∏ –ø—Ä–∏–±—ã–ª–∏. –û—Ü–µ–Ω–∫–∞ —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç–∏ SKU, –ø–æ—Å—Ç–∞–≤–æ–∫ –∏ –ª–æ–≥–∏—Å—Ç–∏–∫–∏. 4. –í–∏–∑—É–∞–ª–∏–∑–∞—Ü–∏—è –∏ –æ—Ç—á—ë—Ç–Ω–æ—Å—Ç—å –ü–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ –¥–∞—à–±–æ—Ä–¥–æ–≤ –≤ Yandex DataLens . –°–æ–∑–¥–∞–Ω–∏–µ —Ç–∞–±–ª–∏—Ü –∏ –∞–Ω–∞–ª–∏—Ç–∏—á–µ—Å–∫–∏—Ö –æ—Ç—á—ë—Ç–æ–≤. –í–æ–∑–º–æ–∂–Ω–∞ —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞ —á–∞—Å—Ç–∏ –∞–Ω–∞–ª–∏—Ç–∏–∫–∏ –≤ —Å–∞–º–æ–ø–∏—Å–Ω–æ–º –≤–µ–±-–ø—Ä–∏–ª–æ–∂–µ–Ω–∏–∏ . –¢—Ä–µ–±–æ–≤–∞–Ω–∏—è –û–ø—ã—Ç —Ä–∞–±–æ—Ç—ã —Å –¥–∞–Ω–Ω—ã–º–∏: ETL, SQL, Python . –ó–Ω–∞–Ω–∏–µ API: —Ä–∞–±–æ—Ç–∞ —Å REST API –º–∞—Ä–∫–µ—Ç–ø–ª–µ–π—Å–æ–≤, –∏–Ω—Ç–µ–≥—Ä–∞—Ü–∏—è —Å 1–° ODATA. –û–ø—ã—Ç —Å –ë–î: PostgreSQL, ClickHouse (–∏–ª–∏ –∞–Ω–∞–ª–æ–≥–∞–º–∏). –ò–Ω—Å—Ç—Ä—É–º–µ–Ω—Ç—ã: Docker, Git . –ë–∏–±–ª–∏–æ—Ç–µ–∫–∏: pandas, requests, airflow (–∏–ª–∏ –æ–ø—ã—Ç –¥—Ä—É–≥–∏—Ö –ø–∞–π–ø–ª–∞–π–Ω-–º–µ–Ω–µ–¥–∂–µ—Ä–æ–≤). BI: DataLens (–æ–±—è–∑–∞—Ç–µ–ª—å–Ω–æ), –ø–æ–Ω–∏–º–∞–Ω–∏–µ –º–µ—Ç—Ä–∏–∫ –∞–Ω–∞–ª–∏—Ç–∏–∫–∏. –£–º–µ–Ω–∏–µ —Å—Ç—Ä–æ–∏—Ç—å –ø—Ä–æ–≥–Ω–æ–∑—ã (time series, ML ‚Äî –ø–ª—é—Å). –£–º–µ–Ω–∏–µ —Å–∞–º–æ—Å—Ç–æ—è—Ç–µ–ª—å–Ω–æ –¥–æ–≤–æ–¥–∏—Ç—å –∑–∞–¥–∞—á–∏ –¥–æ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞. –ü–ª—é—Å–æ–º –±—É–¥–µ—Ç –û–ø—ã—Ç –ø—Ä–æ–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–∏—è Data LakeHouse. –û–ø—ã—Ç —Ä–∞–±–æ—Ç—ã —Å –º–∞—Ä–∫–µ—Ç–ø–ª–µ–π—Å–∞–º–∏ (WB/Ozon). –ó–Ω–∞–Ω–∏–µ MLOps. –û–ø—ã—Ç —Ä–∞–±–æ—Ç—ã –≤ e-commerce –∏–ª–∏ –ø—Ä–æ–∏–∑–≤–æ–¥—Å—Ç–≤–µ. –ù–∞–≤—ã–∫–∏ FastAPI/Flask –¥–ª—è –∏–Ω—Ç–µ–≥—Ä–∞—Ü–∏–π –∏ —Å–µ—Ä–≤–∏—Å–æ–≤. –ú—ã –ø—Ä–µ–¥–ª–∞–≥–∞–µ–º –£—á–∞—Å—Ç–∏–µ –≤ –ø—Ä–æ–µ–∫—Ç–µ —Å –Ω—É–ª—è ‚Äî –∫–ª—é—á–µ–≤–∞—è —Ä–æ–ª—å. –í–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –≤–ª–∏—è—Ç—å –Ω–∞ –∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä—É –∏ —Ä–µ—à–µ–Ω–∏—è. –†–æ—Å—Ç –≤–º–µ—Å—Ç–µ —Å –ø—Ä–æ–¥—É–∫—Ç–æ–º. –ì–∏–±–∫–∏–π –≥—Ä–∞—Ñ–∏–∫, –≥–∏–±—Ä–∏–¥–Ω—ã–π —Ñ–æ—Ä–º–∞—Ç (–ø–æ—Å–ª–µ –ò–° ‚Äî —á–∞—Å—Ç–∏—á–Ω–æ —É–¥–∞–ª—ë–Ω–∫–∞). –†–µ–∞–ª—å–Ω–∞—è —Å–≤–æ–±–æ–¥–∞ –≤—ã–±–æ—Ä–∞ —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π –∏ —Ä–µ—à–µ–Ω–∏–π. –î–æ–ª–≥–æ—Å—Ä–æ—á–Ω–∞—è —Ä–∞–±–æ—Ç–∞ –Ω–∞–¥ –ø—Ä–æ–¥—É–∫—Ç–æ–º, –∞ –Ω–µ ¬´—Ç–∞—Å–∫-–º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç¬ª. –í–∞–∂–Ω–æ ‚Äî –ø–µ—Ä–µ–¥ –æ—Ç–∫–ª–∏–∫–æ–º –ú—ã –∏—â–µ–º —Å–∞–º–æ—Å—Ç–æ—è—Ç–µ–ª—å–Ω–æ–≥–æ —Å–ø–µ—Ü–∏–∞–ª–∏—Å—Ç–∞ , –∞ –Ω–µ —Å—Ç–∞–∂—ë—Ä–∞ –∏–ª–∏ –Ω–∞—á–∏–Ω–∞—é—â–µ–≥–æ —É—Ä–æ–≤–Ω—è. –ù–∞ —Å—Ç–∞—Ä—Ç–µ —É –Ω–∞—Å –Ω–µ—Ç —Ä–µ—Å—É—Ä—Å–æ–≤ —É—á–∏—Ç—å —Å –Ω—É–ª—è , –ø–æ—ç—Ç–æ–º—É: ‚ùó –ï—Å–ª–∏ –≤—ã –Ω–µ —É–º–µ–µ—Ç–µ –∏–ª–∏ –Ω–µ –≥–æ—Ç–æ–≤—ã –±—ã—Å—Ç—Ä–æ –Ω–∞—É—á–∏—Ç—å—Å—è: —Ä–∞–±–æ—Ç–∞—Ç—å —Å API WB/Ozon, ETL, Python, —Ä–∞–±–æ—Ç–∞—Ç—å —Å ClickHouse/PostgreSQL, —Å—Ç—Ä–æ–∏—Ç—å –¥–∞—à–±–æ—Ä–¥—ã –≤ DataLens(–∏–ª–∏ –¥—Ä—É–≥–∏—Ö —Å–∏—Å—Ç–µ–º–∞—Ö) ‚Äî ‚Äî –Ω–µ –æ—Ç–∫–ª–∏–∫–∞–π—Ç–µ—Å—å –Ω–∞ –≤–∞–∫–∞–Ω—Å–∏—é. –í —Å–æ–ø—Ä–æ–≤–æ–¥–∏—Ç–µ–ª—å–Ω–æ–º –ø–∏—Å—å–º–µ —É–∫–∞–∂–∏—Ç–µ: –í–∞—à —É—Ä–æ–≤–µ–Ω—å –ø–æ –Ω–∞–≤—ã–∫–∞–º: Python / SQL / API / ClickHouse / DataLens (–ø–æ 10-–±–∞–ª–ª—å–Ω–æ–π —à–∫–∞–ª–µ). –†–µ–∞–ª—å–Ω—ã–π –æ–ø—ã—Ç: 2‚Äì3 –ø—Ä–æ–µ–∫—Ç–∞ –∏–ª–∏ –∑–∞–¥–∞—á–∏, –∫–æ—Ç–æ—Ä—ã–º–∏ –≤—ã –≥–æ—Ä–¥–∏—Ç–µ—Å—å. –ñ–µ–ª–∞—Ç–µ–ª—å–Ω—ã–π —É—Ä–æ–≤–µ–Ω—å –¥–æ—Ö–æ–¥–∞ –∏ —Ñ–æ—Ä–º–∞—Ç —Ä–∞–±–æ—Ç—ã (–æ—Ñ–∏—Å/–≥–∏–±—Ä–∏–¥). ‚Äî –Ω–∞ –∏—Å–ø—ã—Ç–∞—Ç–µ–ª—å–Ω—ã–π —Å—Ä–æ–∫ –º—ã –∏—â–µ–º —Å–æ—Ç—Ä—É–¥–Ω–∏–∫–æ–≤ –∏—Å–∫–ª—é—á–∏—Ç–µ–ª—å–Ω–æ –≤ –æ—Ñ–∏—Å!!! , –Ω–µ –æ—Ç–∫–ª–∏–∫–∞–π—Ç–µ—Å—å –Ω–∞ –≤–∞–∫–∞–Ω—Å–∏—é –µ—Å–ª–∏ —ç—Ç–æ –≤–∞—Å –Ω–µ —É—Å—Ç—Ä–∞–∏–≤–∞–µ—Ç! p.s.: –ú—ã –Ω–µ —Ö–æ—Ç–∏–º –¥–µ–ª–∞—Ç—å –∫–æ–ø–∏—é —Å—É—â–µ—Å—Ç–≤—É—é—â–∏—Ö –Ω–∞ —Ä—ã–Ω–∫–µ –ø—Ä–æ–¥—É–∫—Ç–æ–≤, –º—ã —Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–ª–∏ –º–Ω–æ–≥–∏–µ, –Ω–æ –æ–Ω–∏ –Ω–µ –æ—Ç–≤–µ—á–∞—é—Ç –Ω–∞—à–∏–º –∑–∞–¥–∞—á–∞–º –∏ –∑–∞–¥–∞—á–∞–º —Å–µ–ª–ª–µ—Ä–æ–≤ —Å –∫–µ–º –º—ã –∑–Ω–∞–∫–æ–º—ã, —É –Ω–∞—Å —É–∂–µ –µ—Å—Ç—å –Ω–µ–∫–æ—Ç–æ—Ä—ã–µ –Ω–∞—Ä–∞–±–æ—Ç–∫–∏, –ø–æ—ç—Ç–æ–º—É —ç—Ç–æ —Å–∫–æ—Ä–µ–µ –≥–∏–±—Ä–∏–¥ –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö –ø—Ä–æ–¥—É–∫—Ç–æ–≤, —á—Ç–æ–±—ã –º–∞–∫—Å–∏–º–∞–ª—å–Ω–æ –∞–≤—Ç–æ–º–∞—Ç–∏–∑–∏—Ä–æ–≤–∞—Ç—å —Ä—É—Ç–∏–Ω–Ω—ã–µ –∑–∞–¥–∞—á–∏ –∏ –±—ã—Å—Ç—Ä–æ –ø—Ä–∏–Ω–∏–º–∞—Ç—å —É–ø—Ä–∞–≤–ª–µ–Ω—á–µ—Å–∫–∏–µ —Ä–µ—à–µ–Ω–∏—è.'

In [72]:
cleaned_text = clean_and_lemmatize_russian(text, nlp_ru, russian_stopwords)

In [74]:
print(cleaned_text)

–∫–æ–º–ø–∞–Ω–∏–∏ —É—Å–ø–µ—à–Ω—ã–π fashion –±—Ä–µ–Ω–¥ 15 –ª–µ—Ç–Ω–µ–π –∏—Å—Ç–æ—Ä–∏–µ–π —Å–æ–±—Å—Ç–≤–µ–Ω–Ω—ã–º –ø—Ä–æ–∏–∑–≤–æ–¥—Å—Ç–≤–æ–º –∂–µ–Ω—Å–∫–æ–π –æ–¥–µ–∂–¥—ã –ø–æ–ª–Ω—ã–π —Ü–∏–∫–ª –∑–∞–∫—É–ø–∫–∏ —Ç–∫–∞–Ω–∏ –ø—Ä–æ–¥–∞–∂–∏ –º–∞—Ä–∫–µ—Ç–ø–ª–µ–π—Å–∞—Ö –Ω–∞—à –æ—Å–Ω–æ–≤–Ω–æ–π –æ–±–æ—Ä–æ—Ç –∏–¥—ë—Ç wildberries ozon –≤—Ö–æ–¥–∏–º —Ç–æ–ø 1 –ø—Ä–æ–¥–∞–≤—Ü–æ–≤ –æ–¥–µ–∂–¥—ã ozon –∑–∞–ø—É—Å–∫–∞–µ–º –ø—Ä–æ–µ–∫—Ç –ø–æ—Å—Ç—Ä–æ–µ–Ω–∏—é –∞–Ω–∞–ª–∏—Ç–∏—á–µ—Å–∫–æ–π —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ç–µ–ª—å–Ω–æ–π –ø–ª–∞—Ç—Ñ–æ—Ä–º—ã –Ω—É–ª—è —Ü–µ–ª—å —Å–¥–µ–ª–∞—Ç—å —É–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –∞—Å—Å–æ—Ä—Ç–∏–º–µ–Ω—Ç–æ–º —Ü–µ–Ω–∞–º–∏ –∑–∞–∫—É–ø–∫–∞–º–∏ –ø—Ä–æ–∏–∑–≤–æ–¥—Å—Ç–≤–æ–º —Ä–µ–∫–ª–∞–º–æ–π –º–∞–∫—Å–∏–º–∞–ª—å–Ω–æ –ø—Ä–æ–∑—Ä–∞—á–Ω—ã–º –æ—Å–Ω–æ–≤–∞–Ω–Ω—ã–º –¥–∞–Ω–Ω—ã—Ö –≤–Ω—É—Ç—Ä–µ–Ω–Ω—è—è —Å–∏—Å—Ç–µ–º–∞ –ø–æ–∫–∞–∂–µ—Ç —Ä–µ–∑—É–ª—å—Ç–∞—Ç –ø–ª–∞–Ω–∏—Ä—É–µ–º –≤—ã–≤–µ—Å—Ç–∏ –µ—ë —Ä—ã–Ω–æ–∫ –æ—Ç–¥–µ–ª—å–Ω—ã–π saas –ø—Ä–æ–¥—É–∫—Ç –¥—Ä—É–≥–∏—Ö –ø—Ä–æ–¥–∞–≤—Ü–æ–≤ –º–∞—Ä–∫–µ—Ç–ø–ª–µ–π—Å–æ–≤ —ç—Ç–æ –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –ø—Ä–∏—Å–

In [80]:
'''plt.figure(figsize=(10, 5))

sns.histplot(data=df_alt,
             x='experience_name',
             bins=15,
             kde=False,
             palette='Set2',
             multiple='stack')
plt.xlabel('–∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –ª–µ—Ç')
plt.ylabel('–ß–∞—Å—Ç–æ—Ç–∞')

plt.tight_layout()
plt.show()'''
             
             

"plt.figure(figsize=(10, 5))\n\nsns.histplot(data=df_alt,\n             x='experience_name',\n             bins=15,\n             kde=False,\n             palette='Set2',\n             multiple='stack')\nplt.xlabel('–∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –ª–µ—Ç')\nplt.ylabel('–ß–∞—Å—Ç–æ—Ç–∞')\n\nplt.tight_layout()\nplt.show()"

–û–±—Ä–∞–±–æ—Ç–∞—é —Ç–µ–∫—Å—Ç–æ–≤—ã–µ —Ñ–∏—á–∏ —Ñ—É–Ω–∫—Ü–∏–µ–π –¥–ª—è —É–¥–∞–ª–µ–Ω–∏—è html, –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏, –ø—Ä–∏–≤–µ–¥–µ–Ω–∏—è –∫—Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω–æ–º—É –≤–∏–¥—É

In [96]:
df_alt['description_lemmatized'] = df_alt['full_description'].apply(
    lambda text: clean_and_lemmatize_russian(text, nlp_ru, russian_stopwords)
)

In [100]:
df_alt['requirement_lemmatized'] = df_alt['requirement'].apply(
    lambda text: clean_and_lemmatize_russian(text, nlp_ru, russian_stopwords))

In [106]:
df_alt['requirement_lemmatized'][1]

'–æ–ø—ã—Ç —Ä–∞–±–æ—Ç—ã ml dl 3 –ª–µ—Ç –∑–Ω–∞–Ω–∏–µ –∞–ª–≥–æ—Ä–∏—Ç–º–æ–≤ ml dl –º–µ—Ç—Ä–∏–∫ –æ—Ü–µ–Ω–∫–∏ –∫–∞—á–µ—Å—Ç–≤–∞ –º–æ–¥–µ–ª–µ–π –æ–ø—ã—Ç –ø–æ–ª–Ω–æ–≥–æ —Ü–∏–∫–ª–∞'

In [108]:
df_final = df_alt.copy()

In [110]:
df_final = df_final.drop(['full_description', 'requirement'], axis=1)

In [112]:
df_final.head()

Unnamed: 0,search_term,vacancy_id,vacancy_name,city_name,min_salary,employer_name,published_at,experience,schedule,employment,snippet,responsibility,description_lemmatized,snippet.requirement_lemmatized,snippet_lemmatized,requirement_lemmatized
0,data scientist,121071193,"–°—Ç–∞—Ä—à–∏–π Data Scientist, RND –ø—Ä–æ–µ–∫—Ç—ã –ø–æ–∏—Å–∫–∞",–ú–æ—Å–∫–≤–∞,0.0,Ozon,2025-10-25T10:49:11+0300,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–ü–æ–ª–Ω—ã–π –¥–µ–Ω—å,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,{'requirement': '–û–ø—ã—Ç —Ä–∞–±–æ—Ç—ã –≤ <highlighttext>...,"–†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ —Å–∏—Å—Ç–µ–º—ã –∞–≥—Ä–µ–≥–∞—Ü–∏–∏ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∏ –ø–æ ""—Ä–µ...",–ø—Ä–∏–≤–µ—Ç —ç—Ç–æ –∫–æ–º–∞–Ω–¥–∞ research –ø—Ä–æ–µ–∫—Ç–æ–≤ –ø–æ–∏—Å–∫–∞ oz...,–æ–ø—ã—Ç —Ä–∞–±–æ—Ç—ã data science 3—Ö –ª–µ—Ç –≥–ª—É–±–æ–∫–∏–µ –∑–Ω–∞–Ω–∏...,requirement –æ–ø—ã—Ç —Ä–∞–±–æ—Ç—ã data science 3—Ö –ª–µ—Ç –≥–ª...,–æ–ø—ã—Ç —Ä–∞–±–æ—Ç—ã data science 3—Ö –ª–µ—Ç –≥–ª—É–±–æ–∫–∏–µ –∑–Ω–∞–Ω–∏...
1,data scientist,125971527,Data Scientist / ML Engineer,–ú–æ—Å–∫–≤–∞,250000.0,–†–ë,2025-10-24T20:41:14+0300,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,{'requirement': '–û–ø—ã—Ç —Ä–∞–±–æ—Ç—ã —Å ML/DL –æ—Ç 3 –ª–µ—Ç....,–£—á–∞—Å—Ç–≤–æ–≤–∞—Ç—å –≤ —É–ª—É—á—à–µ–Ω–∏–∏ –ø—Ä–æ–¥—É–∫—Ç–∞ –Ω–∞ –≤—Å–µ—Ö —ç—Ç–∞–ø–∞...,–ø—Ä–∏–≤–µ—Ç —Å–ø–æ—Ä—Ç–∏–≤–Ω–æ–µ digital —Å–º–∏ —á–∏—Ç–∞—é—Ç 10 —Å—Ç—Ä–∞–Ω–∞...,–æ–ø—ã—Ç —Ä–∞–±–æ—Ç—ã ml dl 3 –ª–µ—Ç –∑–Ω–∞–Ω–∏–µ –∞–ª–≥–æ—Ä–∏—Ç–º–æ–≤ ml d...,requirement –æ–ø—ã—Ç —Ä–∞–±–æ—Ç—ã ml dl 3 –ª–µ—Ç –∑–Ω–∞–Ω–∏–µ –∞–ª–≥...,–æ–ø—ã—Ç —Ä–∞–±–æ—Ç—ã ml dl 3 –ª–µ—Ç –∑–Ω–∞–Ω–∏–µ –∞–ª–≥–æ—Ä–∏—Ç–º–æ–≤ ml d...
2,data scientist,126926550,–ü—Ä–æ–≥—Ä–∞–º–º–∏—Å—Ç hr-–ø—Ä–æ–µ–∫—Ç–∞,–°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥,184000.0,–ö–∞–º–∞–ª–æ–≤ –õ–∏–Ω–∞—Ä –ó—É—Ñ–∞—Ä–æ–≤–∏—á,2025-10-24T19:51:33+0300,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,{'requirement': '–ü—Ä–æ–¥—É–∫—Ç–æ–≤—ã–π –ª–∏–¥–µ—Ä —Å –æ–ø—ã—Ç–æ–º –ø–æ...,–°—Ç–∞—Ä—à–∏–π –¥–∏–∑–∞–π–Ω–µ—Ä –∏–∑ —ç–∫–æ—Å–∏—Å—Ç–µ–º—ã Terra ‚Äî –æ—Ç–≤–µ—á–∞–µ...,–º–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω–∞—è –∫–æ–º–∞–Ω–¥–∞ –∏—â–µ—Ç –ø—Ä–æ–≥—Ä–∞–º–º–∏—Å—Ç–∞ hr –ø—Ä–æ...,–ø—Ä–æ–¥—É–∫—Ç–æ–≤—ã–π –ª–∏–¥–µ—Ä –æ–ø—ã—Ç–æ–º –ø–æ—Å—Ç—Ä–æ–µ–Ω–∏—è –º–µ–∂–¥—É–Ω–∞—Ä–æ–¥...,requirement –ø—Ä–æ–¥—É–∫—Ç–æ–≤—ã–π –ª–∏–¥–µ—Ä –æ–ø—ã—Ç–æ–º –ø–æ—Å—Ç—Ä–æ–µ–Ω–∏...,–ø—Ä–æ–¥—É–∫—Ç–æ–≤—ã–π –ª–∏–¥–µ—Ä –æ–ø—ã—Ç–æ–º –ø–æ—Å—Ç—Ä–æ–µ–Ω–∏—è –º–µ–∂–¥—É–Ω–∞—Ä–æ–¥...
3,data scientist,118916097,"–°—Ç–∞—Ä—à–∏–π Data Scientist, –ü–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ –∑–∞–ø—Ä–æ—Å–æ–≤",–ú–æ—Å–∫–≤–∞,0.0,Ozon,2025-10-24T18:20:23+0300,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,"{'requirement': '–û—Ç–ª–∏—á–Ω–æ–µ –∑–Ω–∞–Ω–∏–µ Python, —É–º–µ–Ω–∏...","–ú—ã –∏—â–µ–º —á–µ–ª–æ–≤–µ–∫–∞, –∫–æ—Ç–æ—Ä—ã–π –ø–æ–º–æ–∂–µ—Ç –∫–æ–º–∞–Ω–¥–µ —Å –∏—Å...",–ø—Ä–∏–≤–µ—Ç —ç—Ç–æ –∫–æ–º–∞–Ω–¥–∞ –ø–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ –∑–∞–ø—Ä–æ—Å–æ–≤–Ω–∞—à–∞ –∫–æ–º...,–æ—Ç–ª–∏—á–Ω–æ–µ –∑–Ω–∞–Ω–∏–µ python —É–º–µ–Ω–∏–µ –±—ã—Å—Ç—Ä–æ –ø–∏—Å–∞—Ç—å —á–∏...,requirement –æ—Ç–ª–∏—á–Ω–æ–µ –∑–Ω–∞–Ω–∏–µ python —É–º–µ–Ω–∏–µ –±—ã—Å—Ç...,–æ—Ç–ª–∏—á–Ω–æ–µ –∑–Ω–∞–Ω–∏–µ python —É–º–µ–Ω–∏–µ –±—ã—Å—Ç—Ä–æ –ø–∏—Å–∞—Ç—å —á–∏...
4,data scientist,126793682,Data Engineer/ ML Engineer,–ù–∏–∂–Ω–∏–π –ù–æ–≤–≥–æ—Ä–æ–¥,250000.0,–¢–µ—Ö–Ω–æ–ø–∞—Ä–∫,2025-10-24T17:36:18+0300,–û—Ç 3 –¥–æ 6 –ª–µ—Ç,–£–¥–∞–ª–µ–Ω–Ω–∞—è —Ä–∞–±–æ—Ç–∞,–ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å,{'requirement': '3+ –ª–µ—Ç –≤ —Ä–æ–ª–∏ <highlighttext>...,"–°–±–æ—Ä, –æ—á–∏—Å—Ç–∫–∞ –∏ –∞–≥—Ä–µ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö –∏–∑ —Ä–∞–∑–Ω—ã—Ö...",–≥–∫ —Ç–µ—Ö–Ω–æ–ø–∞—Ä–∫ –ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç–µ–ª—å –ø–æ—Å—Ç–∞–≤—â–∏–∫ –æ–±–æ—Ä—É–¥–æ–≤–∞...,3 –ª–µ—Ç —Ä–æ–ª–∏ data scientist ml engineer —É–≤–µ—Ä–µ–Ω–Ω–æ...,requirement 3 –ª–µ—Ç —Ä–æ–ª–∏ data scientist ml engin...,3 –ª–µ—Ç —Ä–æ–ª–∏ data scientist ml engineer —É–≤–µ—Ä–µ–Ω–Ω–æ...


–ß–∞–Ω–∫–æ–≤–∞–Ω–∏–µ

In [54]:
df_super_final['meta_header'] = (
    '–í–∞–∫–∞–Ω—Å–∏—è: ' + df_super_final['name'].fillna('') + '.'
    ' –≥. ' + df_super_final['address.city'].fillna('') + '.' +
    " –û–ø—ã—Ç: " + df_super_final['experience.name'].fillna('') + ". " +
    "–ì—Ä–∞—Ñ–∏–∫: " + df_super_final['schedule.name'].fillna('') + ". " +
    "–ó–∞–Ω—è—Ç–æ—Å—Ç—å: " + df_super_final['employment.name'].fillna('') + ". ")
    
    

In [55]:
df_super_final['description_lemmatized'][0]

'–º–∏—Å—Å–∏—è –Ω–∞—à –∫–æ–º–∞–Ω–¥–∞ –º–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏–π ai —Ä–µ—à–µ–Ω–∏–µ –∏–∑–º–µ—Ä–∏–º—ã–π —Ñ–∏–Ω–∞–Ω—Å–æ–≤—ã–π —ç—Ñ—Ñ–µ–∫—Ç –≤–µ—Å—å —ç–∫–æ—Å–∏—Å—Ç–µ–º–∞ —Å–±–µ—Ä–∞–π –ø–æ–º–æ–≥–∞–π –∫–∞–∂–¥—ã–π –∫–ª–∏–µ–Ω—Ç –ø–æ–ª—É—á–∞—Ç—å –ø–µ—Ä—Å–æ–Ω–∞–ª—å–Ω—ã–π —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è –∫—Ä–µ–¥–∏—Ç –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏—è —Å—Ç—Ä–∞—Ö–æ–≤–æ–≤–∞—Ç—å —Å–ø–µ—Ü–ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π —Ç–æ—á–Ω–æ –Ω—É–∂–Ω—ã–π –º–æ–º–µ–Ω—Ç –≤–∞—à –º–æ–¥–µ–ª—å –Ω–∞–ø—Ä—è–º—É—é –≤–ª–∏—è—Ç—å —Ñ–∏–Ω–∞–Ω—Å–æ–≤—ã–π —Ä–µ—à–µ–Ω–∏–µ –º–∏–ª–ª–∏–æ–Ω —á–µ–ª–æ–≤–µ–∫ —Ö–æ—Ç–µ—Ç—å —Å—Ç—Ä–æ–∏—Ç—å ml —Å–∏—Å—Ç–µ–º–∞ –∫–æ—Ç–æ—Ä—ã–π —Ä–∞–±–æ—Ç–∞—Ç—å –º–∞—Å—à—Ç–∞–± –≤–µ—Å—å —Å—Ç—Ä–∞–Ω–∞ –ø—Ä–∏—Å—ã–ª–∞—Ç—å —Ä–µ–∑—é–º–µ —Ä–∞—Å—Å–∫–∞–∑–∞—Ç—å —Å–≤–æ–π —Å–∞–º—ã–π —Å–ª–æ–∂–Ω—ã–π ml –ø—Ä–æ–µ–∫—Ç –æ–±—è–∑–∞–Ω–Ω–æ—Å—Ç—å —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞ ml –º–æ–¥–µ–ª—å –∫–ª–∞—Å—Å–∏—á–µ—Å–∫–∏–π –∞–ª–≥–æ—Ä–∏—Ç–º –Ω–µ–π—Ä–æ—Å–µ—Ç —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç –ø–µ—Ä–µ–¥–æ–≤–æ–π —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—è gigachat –º—É–ª—å—Ç–∏–∞–≥–µ–Ω—Ç–Ω—ã–π —Å–∏—Å—Ç–µ–º–∞ –ø–æ–ª–Ω—ã–π —

In [56]:
df_requirements = df_super_final.copy()
df_requirements['rag_chunk'] = (df_requirements['meta_header'] +
                                '–¢—Ä–µ–±–æ–≤–∞–Ω–∏—è: ' + df_requirements['snippet.requirement_lemmatized'].fillna('—Ç—Ä–µ–±–æ–≤–∞–Ω–∏—è –Ω–µ —É–∫–∞–∑–∞–Ω—ã')+ '.'
                               " –û–±—è–∑–∞–Ω–Ω–æ—Å—Ç–∏: " + df_requirements['snippet.responsibility'].fillna('–û–±—è–∑–∞–Ω–Ω–æ—Å—Ç–∏ –Ω–µ —É–∫–∞–∑–∞–Ω—ã.'))

In [57]:
df_requirements = df_requirements[['id', 'rag_chunk']]

In [58]:
df_requirements.head()

Unnamed: 0,id,rag_chunk
0,126836771,–í–∞–∫–∞–Ω—Å–∏—è: Middle Data Scientist (–∫–æ–º–∞–Ω–¥–∞ SberC...
1,126836682,–í–∞–∫–∞–Ω—Å–∏—è: Senior IT & Digital Recruiter (–ú–æ—Å–∫–≤...
2,126834799,–í–∞–∫–∞–Ω—Å–∏—è: –í–µ–¥—É—â–∏–π Data Scientist. –≥. . –û–ø—ã—Ç: –ë...
3,126832050,–í–∞–∫–∞–Ω—Å–∏—è: ML engineer / Data Scientist. –≥. . –û...
4,125890390,–í–∞–∫–∞–Ω—Å–∏—è: Data Scientist (ML/LLM). –≥. –ú–æ—Å–∫–≤–∞. ...


In [59]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [60]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n\n", "\n", " ", ""] # –ü–æ–ø—ã—Ç–∫–∞ —Ä–∞–∑–±–∏—Ç—å —Å–Ω–∞—á–∞–ª–∞ –ø–æ –∞–±–∑–∞—Ü–∞–º, –ø–æ—Ç–æ–º –ø–æ —Å—Ç—Ä–æ–∫–∞–º, –ø–æ—Ç–æ–º –ø–æ –ø—Ä–æ–±–µ–ª–∞–º
)

In [61]:
def create_general_chunks(row, splitter):
    """
    –†–∞–∑–±–∏–≤–∞–µ—Ç –¥–ª–∏–Ω–Ω–æ–µ –ª–µ–º–º–∞—Ç–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω–æ–µ –æ–ø–∏—Å–∞–Ω–∏–µ –Ω–∞ —á–∞—Å—Ç–∏ (—Å—É–±-—á–∞–Ω–∫–∏),
    –ø—Ä–∏–∫—Ä–µ–ø–ª—è—è –∫ –∫–∞–∂–¥–æ–π —á–∞—Å—Ç–∏ –∑–∞–≥–æ–ª–æ–≤–æ–∫ —Å –º–µ—Ç–∞–¥–∞–Ω–Ω—ã–º–∏.
    
    –ê—Ä–≥—É–º–µ–Ω—Ç—ã:
        row (pd.Series): –¢–µ–∫—É—â–∞—è —Å—Ç—Ä–æ–∫–∞ DataFrame.
        splitter (RecursiveCharacterTextSplitter): –ü—Ä–µ–¥–≤–∞—Ä–∏—Ç–µ–ª—å–Ω–æ –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–π –æ–±—ä–µ–∫—Ç LangChain.
        
    –í–æ–∑–≤—Ä–∞—â–∞–µ—Ç:
        list: –°–ø–∏—Å–æ–∫ —Å–ª–æ–≤–∞—Ä–µ–π, –∫–∞–∂–¥—ã–π –∏–∑ –∫–æ—Ç–æ—Ä—ã—Ö –ø—Ä–µ–¥—Å—Ç–∞–≤–ª—è–µ—Ç —Å–æ–±–æ–π –≥–æ—Ç–æ–≤—ã–π —á–∞–Ω–∫.
    """
    # 1. –ò–∑–≤–ª–µ—á–µ–Ω–∏–µ –Ω–µ–æ–±—Ö–æ–¥–∏–º—ã—Ö –¥–∞–Ω–Ω—ã—Ö
    meta_header = row['meta_header']
    description_text = row['description_lemmatized']
    vacancy_id = row['id']

    if pd.isna(description_text) or not description_text:
        return [] # –í–æ–∑–≤—Ä–∞—â–∞–µ–º –ø—É—Å—Ç–æ–π —Å–ø–∏—Å–æ–∫, –µ—Å–ª–∏ –Ω–µ—Ç –æ–ø–∏—Å–∞–Ω–∏—è

    # 2. –†–∞–∑–±–∏–µ–Ω–∏–µ –¥–ª–∏–Ω–Ω–æ–≥–æ —Ç–µ–∫—Å—Ç–∞
    # –†–∞–∑–±–∏–≤–∞—Ç–µ–ª—å –ø—Ä–∏–Ω–∏–º–∞–µ—Ç —Å–ø–∏—Å–æ–∫ —Å—Ç—Ä–æ–∫, –≤–æ–∑–≤—Ä–∞—â–∞–µ—Ç —Å–ø–∏—Å–æ–∫ –æ–±—ä–µ–∫—Ç–æ–≤ Document
    chunks = splitter.create_documents([description_text])
    
    general_chunks_list = []
    
    # 3. –§–æ—Ä–º–∏—Ä–æ–≤–∞–Ω–∏–µ —Ñ–∏–Ω–∞–ª—å–Ω—ã—Ö —á–∞–Ω–∫–æ–≤
    for chunk in chunks:
        # –¢–µ–∫—Å—Ç —á–∞–Ω–∫–∞ –Ω–∞—Ö–æ–¥–∏—Ç—Å—è –≤ chunk.page_content
        chunk_text = chunk.page_content.strip()

        # –°–æ–∑–¥–∞–Ω–∏–µ —Ñ–∏–Ω–∞–ª—å–Ω–æ–≥–æ RAG-—á–∞–Ω–∫–∞
        final_rag_chunk = (
            meta_header + 
            "–û–ë–©–ï–ï –û–ü–ò–°–ê–ù–ò–ï: " + chunk_text
        )
        
        # –î–æ–±–∞–≤–ª–µ–Ω–∏–µ —á–∞–Ω–∫–∞ –≤ —Å–ø–∏—Å–æ–∫ —Å ID –¥–ª—è –æ—Ç—Å–ª–µ–∂–∏–≤–∞–Ω–∏—è
        general_chunks_list.append({
            'id': vacancy_id, 
            'rag_chunk': final_rag_chunk
        })
        
    return general_chunks_list

In [62]:
all_general_chunks = []
for index, row in df_super_final.iterrows():
    # üí° –í—ã–∑—ã–≤–∞–µ–º –Ω–∞—à—É —Ñ—É–Ω–∫—Ü–∏—é –¥–ª—è –ø–æ–ª—É—á–µ–Ω–∏—è —Å–ø–∏—Å–∫–∞ —á–∞–Ω–∫–æ–≤ –∏–∑ –æ–¥–Ω–æ–π —Å—Ç—Ä–æ–∫–∏
    chunks_for_row = create_general_chunks(row, splitter=text_splitter)
    
    # üí° –î–æ–±–∞–≤–ª—è–µ–º –≤—Å–µ —á–∞–Ω–∫–∏ –∏–∑ —Ç–µ–∫—É—â–µ–π —Å—Ç—Ä–æ–∫–∏ –≤ –æ–±—â–∏–π —Å–ø–∏—Å–æ–∫
    all_general_chunks.extend(chunks_for_row)

In [63]:
df_general_chunks = pd.DataFrame(all_general_chunks)
df_general_chunks.head()

Unnamed: 0,id,rag_chunk
0,126836771,–í–∞–∫–∞–Ω—Å–∏—è: Middle Data Scientist (–∫–æ–º–∞–Ω–¥–∞ SberC...
1,126836771,–í–∞–∫–∞–Ω—Å–∏—è: Middle Data Scientist (–∫–æ–º–∞–Ω–¥–∞ SberC...
2,126836771,–í–∞–∫–∞–Ω—Å–∏—è: Middle Data Scientist (–∫–æ–º–∞–Ω–¥–∞ SberC...
3,126836771,–í–∞–∫–∞–Ω—Å–∏—è: Middle Data Scientist (–∫–æ–º–∞–Ω–¥–∞ SberC...
4,126836682,–í–∞–∫–∞–Ω—Å–∏—è: Senior IT & Digital Recruiter (–ú–æ—Å–∫–≤...


In [64]:
print(f"–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —á–∞–Ω–∫–æ–≤, —Å–æ–∑–¥–∞–Ω–Ω—ã—Ö –∏–∑ –æ–ø–∏—Å–∞–Ω–∏–π: {len(df_general_chunks)}")

–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —á–∞–Ω–∫–æ–≤, —Å–æ–∑–¥–∞–Ω–Ω—ã—Ö –∏–∑ –æ–ø–∏—Å–∞–Ω–∏–π: 2012


In [65]:
df_final_indexing = pd.concat([df_requirements, df_general_chunks], ignore_index=True)

In [66]:
df_final_indexing['rag_chunk'][0]

'–í–∞–∫–∞–Ω—Å–∏—è: Middle Data Scientist (–∫–æ–º–∞–Ω–¥–∞ SberCampaigning). –≥. . –û–ø—ã—Ç: –û—Ç 1 –≥–æ–¥–∞ –¥–æ 3 –ª–µ—Ç. –ì—Ä–∞—Ñ–∏–∫: –ü–æ–ª–Ω—ã–π –¥–µ–Ω—å. –ó–∞–Ω—è—Ç–æ—Å—Ç—å: –ü–æ–ª–Ω–∞—è –∑–∞–Ω—è—Ç–æ—Å—Ç—å. –¢—Ä–µ–±–æ–≤–∞–Ω–∏—è: 2 –≥–æ–¥ –æ–ø—ã—Ç ml pandas sklearn lightgbm pytorch catboost numpy –Ω–∞–≤—ã–∫ —Ä–∞–±–æ—Ç–∞ –±–æ–ª—å—à–æ–π –¥–∞–Ω–Ω—ã–µ spark sql hadoop. –û–±—è–∑–∞–Ω–Ω–æ—Å—Ç–∏: –†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ ML-–º–æ–¥–µ–ª–µ–π (–æ—Ç –∫–ª–∞—Å—Å–∏—á–µ—Å–∫–∏—Ö –∞–ª–≥–æ—Ä–∏—Ç–º–æ–≤ –¥–æ –Ω–µ–π—Ä–æ—Å–µ—Ç–µ–π). –≠–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç—ã —Å –ø–µ—Ä–µ–¥–æ–≤—ã–º–∏ —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—è–º–∏ (GigaChat, –º—É–ª—å—Ç–∏–∞–≥–µ–Ω—Ç–Ω—ã–µ —Å–∏—Å—Ç–µ–º—ã). –ü–æ–ª–Ω—ã–π —Ü–∏–∫–ª: –æ—Ç –≥–∏–ø–æ—Ç–µ–∑—ã –∏...'

In [67]:
total_chunks = len(df_final_indexing)
print(f"–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —á–∞–Ω–∫–æ–≤ –≤ —Ñ–∏–Ω–∞–ª—å–Ω–æ–º –¥–∞—Ç–∞—Å–µ—Ç–µ: {total_chunks}")
print(df_final_indexing.head())

–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —á–∞–Ω–∫–æ–≤ –≤ —Ñ–∏–Ω–∞–ª—å–Ω–æ–º –¥–∞—Ç–∞—Å–µ—Ç–µ: 2412
          id                                          rag_chunk
0  126836771  –í–∞–∫–∞–Ω—Å–∏—è: Middle Data Scientist (–∫–æ–º–∞–Ω–¥–∞ SberC...
1  126836682  –í–∞–∫–∞–Ω—Å–∏—è: Senior IT & Digital Recruiter (–ú–æ—Å–∫–≤...
2  126834799  –í–∞–∫–∞–Ω—Å–∏—è: –í–µ–¥—É—â–∏–π Data Scientist. –≥. . –û–ø—ã—Ç: –ë...
3  126832050  –í–∞–∫–∞–Ω—Å–∏—è: ML engineer / Data Scientist. –≥. . –û...
4  125890390  –í–∞–∫–∞–Ω—Å–∏—è: Data Scientist (ML/LLM). –≥. –ú–æ—Å–∫–≤–∞. ...


In [68]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

In [69]:
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

In [70]:
embeddings = HuggingFaceEmbeddings(model_name=model_name);

  embeddings = HuggingFaceEmbeddings(model_name=model_name);


In [71]:
documents = [
    Document(
        # page_content: —Ç–µ–∫—Å—Ç, –∫–æ—Ç–æ—Ä—ã–π –±—É–¥–µ—Ç –≤–µ–∫—Ç–æ—Ä–∏–∑–æ–≤–∞–Ω –∏ –≤–æ–∑–≤—Ä–∞—â–µ–Ω
        page_content=row['rag_chunk'],
        # metadata: –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞—è –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è, –ø—Ä–∏–≤—è–∑–∞–Ω–Ω–∞—è –∫ –≤–µ–∫—Ç–æ—Ä—É
        metadata={"id": row['id']} 
    ) 
    # –ü—Ä–∏–º–µ–Ω—è–µ–º –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ –∫ –∫–∞–∂–¥–æ–π —Å—Ç—Ä–æ–∫–µ DataFrame
    for index, row in df_final_indexing.iterrows()
]

In [72]:
#–°–æ–∑–¥–∞–Ω–∏–µ FAISS-–∏–Ω–¥–µ–∫—Å–∞ –≤ –ø–∞–º—è—Ç–∏
vectorstore = FAISS.from_documents(
    documents=documents, 
    embedding=embeddings
)

In [73]:
index_path = "./faiss_index_vacancies"
vectorstore.save_local(index_path)