# Set-Up Code

In [None]:
OPENAI_API_KEY="*********"
OPENAI_PROJECT="*********"
OPENAI_ORG="*********"

In [2]:
# from utils.envs import OPENAI_API_KEY
import os
import base64
from typing import List, Dict
from openai import OpenAI
import openai
import glob
import pprint
import tqdm
import numpy as np
import json
import tiktoken
import pandas as pd
import re
import ast
import time
from datetime import datetime

### Initialize OpenAI Client

In [3]:
client = OpenAI(api_key=OPENAI_API_KEY, project=OPENAI_PROJECT, organization=OPENAI_ORG)

### Test that I can use Assistants & Responses API

In [4]:
# test that i can prompt a model

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello! How can I use OpenAI's API to classify job descriptions?"}
    ]
)   

print(response.choices[0].message.content)

To use OpenAI's API for classifying job descriptions, you can leverage the powerful natural language processing capabilities of the API to analyze and classify the text data. Here's a general overview of the steps you can take:

1. Get an API key: Sign up for access to the OpenAI API on their website and obtain the necessary API key.

2. Install the OpenAI Python client: You can use the OpenAI Python client to interact with the API. You can install it using pip:
   ```
   pip install openai
   ```

3. Prepare your job descriptions: Collect or input the job descriptions that you want to classify. Make sure the text data is clean and formatted properly for input to the API.

4. Make a request to the API: Use the OpenAI Python client to make a classification request to the API. You can use the `classification` endpoint for this purpose. Provide the job description text as the input data along with any relevant parameters for classification.

5. Receive and interpret the response: The API 

In [5]:
response = client.responses.create(
    model="gpt-5",
    input = "Hello! How can I use OpenAI's API to classify job descriptions?",
    instructions = "You are a helpful assistant."
)

print(response.output[1].content[0].text)

Great use case. Here’s a practical way to classify job descriptions with OpenAI, with structured JSON output you can trust in code.

High-level approach
- Define a clear taxonomy (categories, seniority, etc.).
- Prompt a model with strict instructions and use JSON Schema output so results are machine-parseable.
- Set temperature to 0 for consistency.
- Optionally add a few labeled examples to improve accuracy.

Python example (single-label classification)
- Installs: pip install openai
- Model: gpt-4.1-mini (good quality/cost for classification)
- Output: strict JSON matching your schema

```python
from openai import OpenAI
import json

client = OpenAI()

JOB_SCHEMA = {
  "name": "job_classification",
  "schema": {
    "type": "object",
    "additionalProperties": False,
    "properties": {
      "primary_category": {
        "type": "string",
        "description": "Best single category for this job.",
        "enum": [
          "Software Engineering", "Data Science / ML", "Product M

In [6]:
client.vector_stores.list()

SyncCursorPage[VectorStore](data=[VectorStore(id='vs_69025034e2fc81918fdcad91301db0cf', created_at=1761759285, file_counts=FileCounts(cancelled=0, completed=1, failed=0, in_progress=0, total=1), last_active_at=1761759310, metadata={}, name='column_mapper_v1', object='vector_store', status='completed', usage_bytes=9184, expires_after=None, expires_at=None, description=None), VectorStore(id='vs_68fdb1fa7e688191a6a4f690441695ea', created_at=1761456634, file_counts=FileCounts(cancelled=0, completed=5, failed=0, in_progress=0, total=5), last_active_at=1761456685, metadata={}, name='job_classifier_GPT_isha', object='vector_store', status='completed', usage_bytes=5282022, expires_after=None, expires_at=None, description=None), VectorStore(id='vs_68fba18f05408191853bfea00e219bc4', created_at=1761321359, file_counts=FileCounts(cancelled=0, completed=5, failed=0, in_progress=0, total=5), last_active_at=1761321615, metadata={}, name='job_classifier_gpt_no_aliases', object='vector_store', status='

# Read in Competitor extracted columns

In [7]:
with open("./../Responses/Extracted_Columns/Argenx_competitor_columns.txt", "r") as f:
    df_comp_columns = [line.strip() for line in f if line.strip()]

print(df_comp_columns)

['ID', 'Gender', 'Business Title', 'Country', 'argenx job level', 'Currency', 'Actual Annual Salary', 'Currency.1', 'Variable Pay - Plan', 'Target Variable Pay - Percent', 'Allowance - Car', 'Allowance - Phone', 'Allowance - Representation', 'Equity Amount - Annual', 'Cost Center - Name']


In [8]:
print(len(df_comp_columns))
print(df_comp_columns[:10])
print(df_comp_columns[-10:])

15
['ID', 'Gender', 'Business Title', 'Country', 'argenx job level', 'Currency', 'Actual Annual Salary', 'Currency.1', 'Variable Pay - Plan', 'Target Variable Pay - Percent']
['Currency', 'Actual Annual Salary', 'Currency.1', 'Variable Pay - Plan', 'Target Variable Pay - Percent', 'Allowance - Car', 'Allowance - Phone', 'Allowance - Representation', 'Equity Amount - Annual', 'Cost Center - Name']


# PROMPTING

### System Instructions

In [12]:
system_instructions = f"""
You are an expert data-mapping model trained on Korn Ferry’s column architecture.

You have access to the file `fixed_KF_template.txt`, which contains three columns:
- **KF_Column_Title**: the canonical Korn Ferry column name
- **Description**: a detailed explanation of what that column represents
- **Status**: one of "Required", "Optional", or "If Applicable"

Your task is to map input column names from another dataset to the most semantically similar columns in Korn Ferry’s architecture.

You will receive a list of {len(df_comp_columns)} input column names.

For each input column name, you must:
1. Compare it to all KF_Column_Titles and their Descriptions.
2. Select the **three most relevant Korn Ferry columns** based on meaning, function, or usage similarity.
3. Return your output in **strict JSON format** as follows:

{{
  "input_column": "<input_column_1_name>",
  "predictions": {{
      "matched_KF_column_1": "<prediction_1_of_KF_Column_Title>",
      "matched_KF_column_2": "<prediction_2_of_KF_Column_Title>",
      "matched_KF_column_3": "<prediction_3_of_KF_Column_Title>"
  }},
  "input_column": "<input_column_2_name>",
  "predictions": {{
      "matched_KF_column_1": "<prediction_1_of_KF_Column_Title>",
      "matched_KF_column_2": "<prediction_2_of_KF_Column_Title>",
      "matched_KF_column_3": "<prediction_3_of_KF_Column_Title>"
  }},
  ...
}}

Guidelines:
- Always return **three** matches per input column — no fewer.
- Use both title and description context from Korn Ferry’s architecture to infer matches.
- If unsure, select the closest conceptual matches based on meaning.
- Do **not** include commentary, explanations, or text outside the JSON output.
- Preserve exact spelling and casing of all KF_Column_Title values.
"""


In [13]:
user_query = f"""
You are given the following {len(df_comp_columns)} input column names that need to be mapped to Korn Ferry’s column architecture:

{df_comp_columns}

Please map each input column to its **three closest** KF_Column_Titles as described in your system instructions.
"""

In [14]:
print(len(df_comp_columns))
df_comp_columns

15


['ID',
 'Gender',
 'Business Title',
 'Country',
 'argenx job level',
 'Currency',
 'Actual Annual Salary',
 'Currency.1',
 'Variable Pay - Plan',
 'Target Variable Pay - Percent',
 'Allowance - Car',
 'Allowance - Phone',
 'Allowance - Representation',
 'Equity Amount - Annual',
 'Cost Center - Name']

In [16]:
print(system_instructions)


You are an expert data-mapping model trained on Korn Ferry’s column architecture.

You have access to the file `fixed_KF_template.txt`, which contains three columns:
- **KF_Column_Title**: the canonical Korn Ferry column name
- **Description**: a detailed explanation of what that column represents
- **Status**: one of "Required", "Optional", or "If Applicable"

Your task is to map input column names from another dataset to the most semantically similar columns in Korn Ferry’s architecture.

You will receive a list of 15 input column names.

For each input column name, you must:
1. Compare it to all KF_Column_Titles and their Descriptions.
2. Select the **three most relevant Korn Ferry columns** based on meaning, function, or usage similarity.
3. Return your output in **strict JSON format** as follows:

{
  "input_column": "<input_column_1_name>",
  "predictions": {
      "matched_KF_column_1": "<prediction_1_of_KF_Column_Title>",
      "matched_KF_column_2": "<prediction_2_of_KF_Colum

In [17]:
print(user_query)


You are given the following 15 input column names that need to be mapped to Korn Ferry’s column architecture:

['ID', 'Gender', 'Business Title', 'Country', 'argenx job level', 'Currency', 'Actual Annual Salary', 'Currency.1', 'Variable Pay - Plan', 'Target Variable Pay - Percent', 'Allowance - Car', 'Allowance - Phone', 'Allowance - Representation', 'Equity Amount - Annual', 'Cost Center - Name']

Please map each input column to its **three closest** KF_Column_Titles as described in your system instructions.



# Testing the prompts

In [18]:
start_time = time.time()
response_0 = client.responses.create(input=user_query,
                                    instructions=system_instructions,
                                    model="gpt-5",
                                    tools=[{"type": "file_search",
                                            "vector_store_ids": ["vs_69025034e2fc81918fdcad91301db0cf"],
                                            }]
                                    )
end_time = time.time()

In [19]:
latency = end_time - start_time
print(f"Response time: {latency} seconds")

Response time: 147.80661606788635 seconds


In [20]:
# write latency to .txt file

with open("./../Responses/Latency/Argenx_predictions_latency.txt", "w") as f:
    f.write(f'Response time: {latency} seconds')
    f.write('\n')
    f.write(f'Mapped {len(df_comp_columns)} columns')

In [21]:
output = response_0.output[-1].content[-1].text
print(type(output))
print(output)

<class 'str'>
[
  {
    "input_column": "ID",
    "predictions": {
      "matched_KF_column_1": "Employee ID",
      "matched_KF_column_2": "Manager Employee  ID",
      "matched_KF_column_3": "Korn Ferry Reference  Job Code"
    }
  },
  {
    "input_column": "Gender",
    "predictions": {
      "matched_KF_column_1": "Gender",
      "matched_KF_column_2": "Executive/Non-Executive Indicator (E / N)",
      "matched_KF_column_3": "Expat"
    }
  },
  {
    "input_column": "Business Title",
    "predictions": {
      "matched_KF_column_1": "Employee  Job Title",
      "matched_KF_column_2": "Department",
      "matched_KF_column_3": "Legal Entity / Business Unit"
    }
  },
  {
    "input_column": "Country",
    "predictions": {
      "matched_KF_column_1": "Country",
      "matched_KF_column_2": "Employee Work Location / Zip/Postal Code",
      "matched_KF_column_3": "Legal Entity / Business Unit"
    }
  },
  {
    "input_column": "argenx job level",
    "predictions": {
      "matche

In [23]:
# turn output into JSON object
mapping_results = json.loads(output)
type(mapping_results)


list

In [24]:
# length
len(mapping_results)

15

In [25]:
import pandas as pd

df = pd.DataFrame([
    {
        "input_column": item["input_column"],
        "matched_KF_column_1": item["predictions"]["matched_KF_column_1"],
        "matched_KF_column_2": item["predictions"]["matched_KF_column_2"],
        "matched_KF_column_3": item["predictions"]["matched_KF_column_3"]
    }
    for item in mapping_results
])

df.head()

Unnamed: 0,input_column,matched_KF_column_1,matched_KF_column_2,matched_KF_column_3
0,ID,Employee ID,Manager Employee ID,Korn Ferry Reference Job Code
1,Gender,Gender,Executive/Non-Executive Indicator (E / N),Expat
2,Business Title,Employee Job Title,Department,Legal Entity / Business Unit
3,Country,Country,Employee Work Location / Zip/Postal Code,Legal Entity / Business Unit
4,argenx job level,Employee Grade / Band,Korn Ferry Reference Level (not required if ...,Korn Ferry Total Points


In [26]:
df.tail()

Unnamed: 0,input_column,matched_KF_column_1,matched_KF_column_2,matched_KF_column_3
10,Allowance - Car,Car Allowance (Annual full-time equivalent),Car Eligibility (Y/N),Transportation / Commuting Allowance (Annual f...
11,Allowance - Phone,Telecommunication Allowance (Annual full-time ...,All Other Allowances (Annual full-time equival...,Other Benefits and Allowances (Annual full-tim...
12,Allowance - Representation,All Other Allowances (Annual full-time equival...,Other Benefits and Allowances (Annual full-tim...,Meal Allowance (Annual full-time equivalent)
13,Equity Amount - Annual,Actual Fair Value of Total LTI Granted,Target Fair Value of Total LTI Granted as % of...,Long-term Incentive Eligibility (Y/N)
14,Cost Center - Name,Department,Legal Entity / Business Unit,Function Code (not required if Ref. Job Code ...


In [None]:
# save to CSV
df.to_csv(f"./../Responses/Predictions/Argenx_GPT5_predictions.csv", index=False)