In [26]:
import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from openai import OpenAI
from typing import List, Dict

In [27]:
# Load variables through python-dotenv (called with override=True), then read
# the OpenAI key from the process environment; the result is None when unset.
load_dotenv(override=True)
api_key = os.environ.get('OPENAI_API_KEY')

In [3]:
# Default-constructed OpenAI client.
# NOTE(review): the name `openai` is not referenced by any other cell shown here;
# extract_keywords_from_jobs uses the separately created `client` instead.
openai = OpenAI()

In [5]:
# Notebook-wide configuration: root URL of the HH.ru API and the OpenAI
# credential read from the environment (None when the variable is not set).
HH_API_BASE_URL = "https://api.hh.ru"
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [21]:
BASE_URL = "https://api.hh.ru"

def fetch_jobs(text=None, area=1, count=10, timeout=10.0) -> List[Dict]:
    """Fetch vacancies from the HH.ru API and return their cleaned descriptions.

    A single search request is sent to ``/vacancies``; every hit is then
    fetched individually from ``/vacancies/{id}`` to read its ``description``
    field, which is passed through ``clean_html`` and stripped.

    Args:
        text: Search query. A falsy value falls back to a built-in ML/AI query.
        area: Value sent as the ``area`` search parameter.
        count: Maximum number of vacancies to process. Only page 0 is
            requested and ``per_page`` is capped at 100, so at most 100
            vacancies are ever returned. A value <= 0 returns ``[]`` without
            touching the network.
        timeout: Seconds passed as the ``timeout`` of every HTTP request, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of ``{"name": ..., "description": ...}`` dicts, one per vacancy
        whose detail request succeeded and whose cleaned description is
        non-empty.

    Raises:
        requests.HTTPError: If the search request returns an error status.
        requests.RequestException: If the search request itself fails
            (connection error, timeout). Failures of individual detail
            requests are skipped, not raised.
    """
    # Nothing to fetch: avoid a pointless (or invalid) API call.
    if count <= 0:
        return []

    if not text:
        text = "ML OR Machine Learning OR LLM OR RAG OR LoRA OR Data Scientist OR AI OR GPT"

    params = {
        "text": text,
        "area": area,
        "per_page": min(count, 100),
        "professional_role": [164, 165, 96],  # ML, DS
        "page": 0,
    }

    job_descriptions = []

    # One Session lets the search call and the per-vacancy detail calls share
    # a connection instead of opening a new one for each request.
    with requests.Session() as session:
        response = session.get(f"{BASE_URL}/vacancies", params=params, timeout=timeout)
        response.raise_for_status()
        items = response.json().get("items", [])[:count]

        for vacancy in items:
            vacancy_id = vacancy.get("id")
            if not vacancy_id:
                # Without an id the URL would become ".../vacancies/None".
                continue

            # Detail fetching is best-effort: a failed request skips only
            # this vacancy, exactly like a non-200 response does.
            try:
                detail_resp = session.get(
                    f"{BASE_URL}/vacancies/{vacancy_id}", timeout=timeout
                )
            except requests.RequestException:
                continue
            if detail_resp.status_code != 200:
                continue

            detail = detail_resp.json()
            # `or ""` also covers an explicit null description in the payload.
            raw_description = detail.get("description") or ""
            # NOTE(review): clean_html is not defined in the cells shown here;
            # it must be defined in the kernel before this function is called.
            cleaned_description = clean_html(raw_description).strip()

            if not cleaned_description:
                continue

            job_descriptions.append({
                "name": detail.get("name", "No title"),
                "description": cleaned_description
            })

    return job_descriptions

In [22]:
# NOTE(review): OpenAI is already imported in the notebook's first cell, so this
# mid-notebook re-import is redundant.
from openai import OpenAI

# Module-level client used by extract_keywords_from_jobs; the key is the
# OPENAI_API_KEY value read from the environment in an earlier cell.
client = OpenAI(api_key=OPENAI_API_KEY)

def _build_keyword_prompt(job) -> str:
    """Render the keyword-extraction prompt for one job posting."""
    return f"""
You are given a job posting. Extract keywords ONLY in these 4 categories:

- programming languages
- databases
- tools
- job title

Return the output in this format:
{{
  "job title": "...",
  "programming languages": [...],
  "databases": [...],
  "tools": [...]
}}

Job posting:
Title: {job['name']}
Description:
{job['description']}
"""


def _unique_result_key(name, taken) -> str:
    """Return `name`, or `name (2)`, `name (3)`, ... if it is already in `taken`."""
    if name not in taken:
        return name
    suffix = 2
    while f"{name} ({suffix})" in taken:
        suffix += 1
    return f"{name} ({suffix})"


def extract_keywords_from_jobs(jobs) -> dict:
    """Ask the chat model for structured keywords for each job posting.

    Args:
        jobs: Iterable of dicts with ``"name"`` and ``"description"`` keys
            (the shape produced by ``fetch_jobs``). ``None`` or an empty
            iterable yields an empty result.

    Returns:
        A dict mapping each job title to the model's raw text reply (stripped).
        Postings that share a title get a numeric suffix — ``"Title (2)"``,
        ``"Title (3)"`` — so no posting overwrites another. If the request or
        the reply handling fails for a posting, its value is an error string
        and processing continues with the next posting.
    """
    results = {}

    for job in jobs or []:
        # Job titles are not unique; keying by the bare title would silently
        # drop all but the last posting with a given title.
        key = _unique_result_key(job['name'], results)
        prompt = _build_keyword_prompt(job)

        try:
            # The API call sits inside the try so that one failed request
            # does not discard the results already collected.
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that extracts structured keyword data from job descriptions."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3
            )
            result_text = response.choices[0].message.content.strip()
            results[key] = result_text
        except Exception as e:
            results[key] = f"⚠️ Error parsing result: {e}"

    return results

In [25]:
def main():
    """Run the demo pipeline: fetch postings, report sizes, print extracted keywords.

    Calls fetch_jobs with a fixed query and count, prints one summary line per
    posting, passes the postings to extract_keywords_from_jobs, and prints the
    text returned for each title.
    """
    print("Fetching job postings from HH.ru...")
    postings = fetch_jobs(text="машинное обучение", count=10)

    print(f"✅ {len(postings)} job postings fetched.")
    for posting in postings:
        title, body = posting['name'], posting['description']
        print(f"- {title}: {len(body)} characters")

    print("Sending each job to OpenAI for keyword extraction...")
    keywords_by_title = extract_keywords_from_jobs(postings)

    print("\n✅ Extracted Keywords Per Job:")
    for title in keywords_by_title:
        extracted = keywords_by_title[title]
        print(f"\n🧩 {title}:\n{extracted}")


# Execute the pipeline only when this code runs as the top-level module.
if __name__ == "__main__":
    main()

Fetching job postings from HH.ru...
✅ 10 job postings fetched.
- Frontend-разработчик (React): 2220 characters
- Frontend-разработчик: 2130 characters
- Data Scientist: 3978 characters
- Middle data scientist (классический ml): 1888 characters
- Программист с обучением: 1955 characters
- Junior Data Scientist: 1831 characters
- Middle Java Developer: 2201 characters
- Python ML Engineer: 3066 characters
- ML-инженер (Classic ML, CVM): 2880 characters
- Quantitative researcher / исследователь-аналитик (количественные методы): 2611 characters
Sending each job to OpenAI for keyword extraction...

✅ Extracted Keywords Per Job:

🧩 Frontend-разработчик (React):
{
  "job title": "Frontend-разработчик",
  "programming languages": ["TypeScript", "JavaScript", "React"],
  "databases": [],
  "tools": ["React.js", "Redux", "RTK", "Linux", "Docker", "GitLab", "Jira", "Confluence", "vite", "webpack", "grpc", "jest", "vitest"]
}

🧩 Frontend-разработчик:
{
  "job title": "Frontend-разработчик",
  "pro