In [0]:
# Install openai package if missing
%pip install -q openai

In [0]:
# Restart the notebook's Python process; run after the %pip cell so the freshly
# installed package is the one picked up by the imports below.
# NOTE: restarting clears all Python state, so this must stay ahead of every other code cell.
dbutils.library.restartPython()

In [0]:
from openai import OpenAI
import os
import time

In [0]:
def _get_openai_client():
  """Return the OpenAI client for the Databricks serving endpoint, creating it once.

  The client is stored on the function object after the first call so that the
  per-job scoring loop reuses one client (and one secret lookup) instead of
  constructing a new client for every row.
  """
  client = getattr(_get_openai_client, "_client", None)
  if client is None:
    client = OpenAI(
      api_key=dbutils.secrets.get("job_notification", "PAT_Token"),
      base_url="https://dbc-ecfaa4af-d4ab.cloud.databricks.com/serving-endpoints"
    )
    _get_openai_client._client = client
  return client


def get_matching_score(description):
  """Ask the model how well a job description matches the embedded candidate profile.

  Args:
    description: Job description text (a ``str``); it is appended to the prompt.
      Passing a non-string (e.g. ``None``) raises ``TypeError`` on concatenation.

  Returns:
    The model's reply as a string with surrounding whitespace removed. The
    prompt asks for a bare number between 0 and 100, but the model is not
    guaranteed to comply, so callers should validate before converting.
    An empty string is returned when the reply carries no text content.
  """
  chat_completion = _get_openai_client().chat.completions.create(
    messages=[
    {
      "role": "system",
      "content": "You are a recruiter"
    },
    {
      "role": "user",
      "content": """Give the profile matching percentage in range 0 to 100, Just return the output as number and don't return any additional info 
      Ex-1 output: 10, Ex-2 output: 90
      my details: {
      "Objective": "Data Engineer with 2+ years of experience designing and optimizing scalable ETL pipelines, data warehouses, and cloud-based solutions. Skilled in Azure, PySpark, and SQL with a proven record of improving cost efficiency, reliability, and performance. Seeking to leverage expertise in Azure Data Engineering and modern cloud platforms to build robust, data-driven solutions that support business growth.",
      "Technical Skills": 
      {
          "Programming Language": "Python, SQL, PySpark",
          "Cloud & Data Services": "Azure Databricks, Azure Data Factory (ADF), ADLS, Azure Synapse Analytics (ASA)",
          "Databases": "SQL Server, Oracle, Snowflake",
          "Cloud Platforms": "Azure",
          "Tools": "GitHub, Azure DevOps CI/CD, Prefect",
          "Concepts": "ETL/ELT, Medallion Architecture, Data Modeling, Data Warehousing, API Integration"
      }
  }
  Job description: """ + description
    }
    ],
    model="databricks-llama-4-maverick",
    max_tokens=1000
  )

  # The SDK types message.content as optional; normalise a missing reply to ""
  # and trim whitespace/newlines the model may wrap around the number.
  reply = chat_completion.choices[0].message.content
  return (reply or "").strip()

In [0]:
# Raw job postings as a Spark DataFrame (kept for the join after scoring).
all_jobs = spark.table(
    "raw_catalogue.jobs.linkedjobs"
)

# Pull every row to the driver so the scoring loop can iterate in plain Python.
jobs = all_jobs.collect()

In [0]:
def _parse_score(raw_score):
    """Convert the model's reply into an integer score in 0..100, or None.

    Accepts replies such as "85", " 85\n", "85%" or "85.0". Anything that is
    not a single number in range (free text, empty reply, None) yields None
    so the caller can skip the row instead of aborting the whole run.
    """
    try:
        value = float(str(raw_score).strip().rstrip("%").strip())
    except ValueError:
        return None
    # The chained comparison is False for NaN and infinities as well.
    if not 0 <= value <= 100:
        return None
    return int(value)


scored_jobs = []   # one {"url", "score"} dict per successfully scored job
skipped_urls = []  # jobs with no text to score or an unparseable model reply

for job in jobs:
    # description and/or skills may be missing for a row; only join what is present
    # so a None value cannot break the string concatenation.
    text_parts = [part for part in (job['description'], job['skills']) if part]
    if not text_parts:
        skipped_urls.append(job['url'])
        continue
    description = ", ".join(text_parts)

    # Pause between requests (presumably to stay under the endpoint's rate limit).
    time.sleep(1)
    score = _parse_score(get_matching_score(description))
    if score is None:
        # One malformed reply should not discard the scores already collected.
        skipped_urls.append(job['url'])
        continue

    scored_jobs.append({
        "url": job['url'],
        "score": score
    })

if skipped_urls:
    print(f"Skipped {len(skipped_urls)} job(s) that could not be scored: {skipped_urls}")
display(scored_jobs)

In [0]:
# A job is kept only when its match percentage is strictly above this value.
MIN_MATCH_SCORE = 60

# Build the scores DataFrame with an explicit schema: schema inference fails on
# an empty list, which happens when no job could be scored. `bigint` matches the
# type Spark infers for Python ints, so the resulting table schema is unchanged.
scored_df = spark.createDataFrame(
    [(entry["url"], entry["score"]) for entry in scored_jobs],
    schema="url string, score bigint",
)

# Attach the score to the full job record, keep the good matches, and replace
# the staging table with the result.
df = scored_df.join(all_jobs, on=["url"])
filtered_df = df.filter(df.score > MIN_MATCH_SCORE)
filtered_df.write.mode('overwrite').option("mergeSchema", "true").saveAsTable("staging_catalogue.jobs.linkedJobs")

In [0]:
# Show the jobs that passed the score filter (the same DataFrame that was written to the staging table).
display(filtered_df)