# Set-Up Code

In [None]:
OPENAI_API_KEY = '*********'
OPENAI_PROJECT="*********"
OPENAI_ORG="*********"


In [1]:
# from utils.envs import OPENAI_API_KEY
import os
import base64
from typing import List, Dict
from openai import OpenAI
import openai
import glob
import pprint
import tqdm
import numpy as np
import json
import tiktoken
import pandas as pd
import re
import ast
import time
from datetime import datetime

### Initialize OpenAI Client

In [3]:
client = OpenAI(api_key=OPENAI_API_KEY, project=OPENAI_PROJECT, organization=OPENAI_ORG)

### Test that I can use Assistants & Responses API

In [4]:
# test that i can prompt a model

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello! How can I use OpenAI's API to classify job descriptions?"}
    ]
)   

print(response.choices[0].message.content)

To classify job descriptions using OpenAI's API, you can follow these general steps:

1. Obtain access to OpenAI's API: Sign up for an account on OpenAI's platform and get access to the API keys required to make requests.

2. Prepare your job description data: Gather a dataset of job descriptions that you want to classify. Ensure that the data is properly formatted and ready for analysis.

3. Choose a suitable model: Determine which model provided by OpenAI's API would be best suited for your classification task. OpenAI offers several models, such as GPT-3 or DALL-E, that can be used for text analysis tasks.

4. Make API requests: Use the API keys provided by OpenAI to make requests to the chosen model with your job descriptions as input data. The model will generate predictions based on the input data.

5. Evaluate the results: Review the output of the model to see how well it has classified the job descriptions. You may need to fine-tune the model or adjust parameters to improve perf

In [5]:
response = client.responses.create(
    model="gpt-5",
    input = "Hello! How can I use OpenAI's API to classify job descriptions?",
    instructions = "You are a helpful assistant."
)

print(response.output[1].content[0].text)

Great question. There are three common ways to classify job descriptions with OpenAI’s API, depending on your data and constraints:

1) Zero/few-shot classification with structured JSON output (fastest to start)
- Use a small reasoning model like o4-mini or a lightweight model like gpt-4o-mini.
- Constrain the output with a JSON Schema so you always get machine-parseable labels.
- Add a few in-prompt examples for tricky edge cases.
- Set temperature=0 for deterministic results.

Python example (single-call classifier with strict JSON output)
- Replace YOUR_JOB_TEXT with your JD text.

from openai import OpenAI
import json

client = OpenAI()

job_schema = {
  "name": "job_classification",
  "schema": {
    "type": "object",
    "additionalProperties": False,
    "properties": {
      "job_family": {
        "type": "string",
        "enum": [
          "Software Engineering","Data Science","Product Management",
          "Sales","Marketing","Design","Human Resources",
          "Finance

In [7]:
client.vector_stores.list()

SyncCursorPage[VectorStore](data=[VectorStore(id='vs_69025034e2fc81918fdcad91301db0cf', created_at=1761759285, file_counts=FileCounts(cancelled=0, completed=1, failed=0, in_progress=0, total=1), last_active_at=1761759310, metadata={}, name='column_mapper_v1', object='vector_store', status='completed', usage_bytes=9184, expires_after=None, expires_at=None, description=None), VectorStore(id='vs_68fdb1fa7e688191a6a4f690441695ea', created_at=1761456634, file_counts=FileCounts(cancelled=0, completed=5, failed=0, in_progress=0, total=5), last_active_at=1761456685, metadata={}, name='job_classifier_GPT_isha', object='vector_store', status='completed', usage_bytes=5282022, expires_after=None, expires_at=None, description=None), VectorStore(id='vs_68fba18f05408191853bfea00e219bc4', created_at=1761321359, file_counts=FileCounts(cancelled=0, completed=5, failed=0, in_progress=0, total=5), last_active_at=1761321615, metadata={}, name='job_classifier_gpt_no_aliases', object='vector_store', status='

# Read in Competitor Data

In [2]:
df_competitor = pd.read_excel('./../Files/Competitor/All Countries - 2025 Submission_WTW_test 1.xlsx')
print(df_competitor.shape)
df_competitor.head()

(9, 198)


Unnamed: 0,Country Code,Case Number,Reports to Case Number,Area/City Code,Postal/ZIP Code,Unique Incumbent Identifier,Department Name,Internal Position Title,Internal Grade/Level/Band,Paterson Grade,...,Number of Shares/Units Granted.2,Grant Structure (SO and PP plans only).2,Long-Term Incentive Plan Type.3,Long-Term Incentive Plan Number.3,Grant Date.3,Grant Price Currency.3,Grant Price Per Share/Unit.3,Accounting Value per Share/Unit of Award.3,Number of Shares/Units Granted.3,Grant Structure (SO and PP plans only).3
0,,,,"APAC, EMEA, LATAM ONLY","APAC, EMEA, CANADA, U.S. ONLY",,,,,SUB-SAHARAN AFRICA ONLY,...,,"APAC, EMEA, LATAM ONLY",,,,,,,,"APAC, EMEA, LATAM ONLY"
1,AUT,1.0,,AT001,,183019.0,0.0,Retail Operative (Sales Assistant),1.0,,...,,,,,,,,,,
2,AUT,2.0,,AT001,,183038.0,0.0,Supervisor,2.0,,...,,,,,,,,,,
3,AUT,3.0,,AT001,,183064.0,0.0,Retail Operative (Sales Assistant),1.0,,...,,,,,,,,,,
4,AUT,4.0,,AT001,,183087.0,0.0,Retail Operative (Sales Assistant),1.0,,...,,,,,,,,,,


In [10]:
df_comp_columns = df_competitor.columns.tolist()
print(len(df_comp_columns))
print(df_comp_columns[:10])
print(df_comp_columns[-10:])

198
['Country Code', 'Case Number', 'Reports to Case Number', 'Area/City Code', 'Postal/ZIP Code', 'Unique Incumbent Identifier', 'Department Name', 'Internal Position Title', 'Internal Grade/Level/Band', 'Paterson Grade']
['Number of Shares/Units Granted.2', 'Grant Structure (SO and PP plans only).2', 'Long-Term Incentive Plan Type.3', 'Long-Term Incentive Plan Number.3', 'Grant Date.3', 'Grant Price Currency.3', 'Grant Price Per Share/Unit.3', 'Accounting Value per Share/Unit of Award.3', 'Number of Shares/Units Granted.3', 'Grant Structure (SO and PP plans only).3']


# PROMPTING

### System Instructions

In [11]:
system_instructions = f"""
You are an expert data-mapping model trained on Korn Ferry’s column architecture.

You have access to the file `fixed_KF_template.txt`, which contains three columns:
- **KF_Column_Title**: the canonical Korn Ferry column name
- **Description**: a detailed explanation of what that column represents
- **Status**: one of "Required", "Optional", or "If Applicable"

Your task is to map input column names from another dataset to the most semantically similar columns in Korn Ferry’s architecture.

You will receive a list of {len(df_comp_columns)} input column names.

For each input column name, you must:
1. Compare it to all KF_Column_Titles and their Descriptions.
2. Select the **three most relevant Korn Ferry columns** based on meaning, function, or usage similarity.
3. Return your output in **strict JSON format** as follows:

{{
  "input_column": "<input_column_1_name>",
  "predictions": {{
      "matched_KF_column_1": "<prediction_1_of_KF_Column_Title>",
      "matched_KF_column_2": "<prediction_2_of_KF_Column_Title>",
      "matched_KF_column_3": "<prediction_3_of_KF_Column_Title>"
  }},
  "input_column": "<input_column_2_name>",
  "predictions": {{
      "matched_KF_column_1": "<prediction_1_of_KF_Column_Title>",
      "matched_KF_column_2": "<prediction_2_of_KF_Column_Title>",
      "matched_KF_column_3": "<prediction_3_of_KF_Column_Title>"
  }},
  ...
}}

Guidelines:
- Always return **three** matches per input column — no fewer.
- Use both title and description context from Korn Ferry’s architecture to infer matches.
- If unsure, select the closest conceptual matches based on meaning.
- Do **not** include commentary, explanations, or text outside the JSON output.
- Preserve exact spelling and casing of all KF_Column_Title values.
"""


In [12]:
user_query = f"""
You are given the following {len(df_comp_columns)} input column names that need to be mapped to Korn Ferry’s column architecture:

{df_comp_columns}

Please map each input column to its **three closest** KF_Column_Titles as described in your system instructions.
"""

In [13]:
print(len(df_comp_columns))
df_comp_columns

198


['Country Code',
 'Case Number',
 'Reports to Case Number',
 'Area/City Code',
 'Postal/ZIP Code',
 'Unique Incumbent Identifier',
 'Department Name',
 'Internal Position Title',
 'Internal Grade/Level/Band',
 'Paterson Grade',
 'Employee Status',
 'Hierarchical Title',
 'Education Level',
 'Livello',
 'Expatriate Status',
 'Expatriate Pay Country',
 'Employment Contract',
 'Gender',
 'Origin Code',
 'Ethnic Origin',
 'Date of Birth',
 'Date of Hire',
 'Primary Medium',
 'Year of Entry into Business/Industry',
 'Date of Contract Termination',
 'Basic Hours Worked Per Week',
 'Date of Appointment',
 'Function Code',
 'Discipline Code',
 'Career Level',
 'Incumbent Survey Grade',
 'PEMP GRADING - Factor 1',
 'PEMP GRADING - Factor 2',
 'PEMP GRADING - Factor 3',
 'PEMP GRADING - Factor 4',
 'PEMP GRADING - Factor 5',
 'Job Matching Comments',
 'Same Job Match',
 'Business Unit Name',
 'Business Unit Primary Industry',
 'Currency of Financial Job Impact Data',
 'Incumbent Revenue Responsi

# Testing the prompts

In [14]:
start_time = time.time()
response_0 = client.responses.create(input=user_query,
                                    instructions=system_instructions,
                                    model="gpt-5",
                                    tools=[{"type": "file_search",
                                            "vector_store_ids": ["vs_69025034e2fc81918fdcad91301db0cf"],
                                            }]
                                    )
end_time = time.time()

In [None]:
latency = end_time - start_time
print(f"Response time: {latency} seconds")

In [15]:
output = response_0.output[-1].content[-1].text
print(type(output))
print(output)

<class 'str'>
{
  "citation": "  ",
  "mappings": [
    {
      "input_column": "Country Code",
      "predictions": {
        "matched_KF_column_1": "Country",
        "matched_KF_column_2": "Currency",
        "matched_KF_column_3": "Employee Work Location / Zip/Postal Code"
      }
    },
    {
      "input_column": "Case Number",
      "predictions": {
        "matched_KF_column_1": "Employee ID",
        "matched_KF_column_2": "Korn Ferry Reference  Job Code",
        "matched_KF_column_3": "Legal Entity / Business Unit"
      }
    },
    {
      "input_column": "Reports to Case Number",
      "predictions": {
        "matched_KF_column_1": "Manager Employee  ID",
        "matched_KF_column_2": "Employee ID",
        "matched_KF_column_3": "Legal Entity / Business Unit"
      }
    },
    {
      "input_column": "Area/City Code",
      "predictions": {
        "matched_KF_column_1": "Employee Work Location / Zip/Postal Code",
        "matched_KF_column_2": "Country",
        "mat

In [23]:
# turn output into JSON object
mapping_results = json.loads(output)
mapping_results = mapping_results['mappings']
type(mapping_results)


list

In [28]:
# length
len(mapping_results)

198

In [None]:
import pandas as pd

df = pd.DataFrame([
    {
        "input_column": item["input_column"],
        "matched_KF_column_1": item["predictions"]["matched_KF_column_1"],
        "matched_KF_column_2": item["predictions"]["matched_KF_column_2"],
        "matched_KF_column_3": item["predictions"]["matched_KF_column_3"]
    }
    for item in mapping_results
])

# optional: save to CSV
version = 'v1'
df.to_csv(f"./../Responses/DF_of_GPT5_Response_{version}.csv", index=False)