In [None]:
from openai import OpenAI
import pandas as pd
import json
import os
from datetime import datetime
import re
from dotenv import load_dotenv
from time import sleep

In [None]:
def append_log_jsonl(path, abstract, prompt, response, parsed):
    """Append one classification record to a JSON Lines log file.

    Each call writes a single JSON object on its own line containing the
    current local timestamp plus the values passed in.

    Args:
        path: Destination log file. Missing parent directories are created.
        abstract: The patent abstract that was classified.
        prompt: The full prompt sent to the model.
        response: The raw text returned by the model.
        parsed: The structured result extracted from ``response``.

    Raises:
        OSError: If the directory or file cannot be created or written.
        TypeError: If a value is not JSON serializable.
    """
    # The log may live in a folder that does not exist yet on a fresh checkout;
    # opening in append mode would otherwise fail with FileNotFoundError.
    log_dir = os.path.dirname(path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    entry = {
        "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
        "abstract": abstract,
        "prompt": prompt,
        "response": response,
        "parsed": parsed
    }
    # ensure_ascii=False keeps non-ASCII text readable in the log file.
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")


Load ISCO-08 groups: Level 2 (Sub-Major Groups) and Level 3 (Minor Groups)

In [None]:
def load_isco_sub_major(path: str) -> pd.DataFrame:
    """Read the ISCO-08 structure workbook and return its level-2 groups.

    Args:
        path: Location of the Excel workbook containing the
            'ISCO-08 EN Struct and defin' sheet.

    Returns:
        A DataFrame with the columns ``Code`` and ``Title`` holding the rows
        whose ``Level`` equals 2.
    """
    structure = pd.read_excel(path, sheet_name='ISCO-08 EN Struct and defin')
    is_level_two = structure['Level'] == 2
    level_two = structure.loc[is_level_two, ['ISCO 08 Code', 'Title EN']]
    return level_two.rename(columns={'ISCO 08 Code': 'Code', 'Title EN': 'Title'})

def load_isco_minor(path: str) -> pd.DataFrame:
    """Read the ISCO-08 structure workbook and return its level-3 groups.

    Args:
        path: Location of the Excel workbook containing the
            'ISCO-08 EN Struct and defin' sheet.

    Returns:
        A DataFrame with the columns ``Code`` and ``Title`` holding the rows
        whose ``Level`` equals 3.
    """
    structure = pd.read_excel(path, sheet_name='ISCO-08 EN Struct and defin')
    is_level_three = structure['Level'] == 3
    level_three = structure.loc[is_level_three, ['ISCO 08 Code', 'Title EN']]
    return level_three.rename(columns={'ISCO 08 Code': 'Code', 'Title EN': 'Title'})

Build the OpenAI prompt

In [None]:
def generate_prompt(abstract: str, isco_sub_major_df: pd.DataFrame) -> str:
    """Compose the classification prompt for a single patent abstract.

    Args:
        abstract: Text of the patent abstract, embedded verbatim in the prompt.
        isco_sub_major_df: Candidate groups; each row must expose ``Code`` and
            ``Title``. Every row becomes one "<Code>: <Title>" line.

    Returns:
        The complete prompt text, including the requested answer format.
    """
    # One "<code>: <title>" line per candidate group, in DataFrame order.
    group_lines = []
    for _, group in isco_sub_major_df.iterrows():
        group_lines.append(f"{group.Code}: {group.Title}")
    entries = "\n".join(group_lines)

    return f"""You are a labor market expert. Given the following abstract of a patent, your task is to assign the most appropriate ISCO-08 Sub-Major Group code and name (primary match), and list any other relevant secondary matches (optional).

Abstract:
\"\"\"{abstract}\"\"\"

Here is the list of available Sub-Major Groups:
{entries}

Please respond with:
- Primary Sub-Major Group: <code> - <title> with short justification
- Secondary Sub-Major Groups (if any): <code> - <title> with short justification
"""


In [None]:
def parse_classification_output(text: str) -> dict:
    """Extract the primary and secondary group codes from a model reply.

    Two reply layouts are understood:

    * the multi-line layout, where each "<code> - <title>" line is followed by
      a "Justification:" line;
    * the single-line layout requested by the prompt, where the justification
      follows the title on the same line. In that case the remainder of the
      line (title plus justification) is returned as the comment.

    Args:
        text: Raw reply text produced by the model.

    Returns:
        A dict with the keys ``primary_code``, ``primary_code_comment``,
        ``secondary_codes`` (comma-separated) and ``secondary_codes_comment``
        (entries joined by " | "). Values that cannot be found are empty
        strings. If parsing raises (for example ``text`` is not a string),
        ``primary_code`` is "ERROR" and ``primary_code_comment`` is
        "Parsing failed".
    """
    try:
        # --- PRIMARY ---
        primary_code = ""
        primary_comment = ""

        # Tolerant of extra spaces, optional dashes and untidy newlines.
        # The justification ends at the first blank line, at the line that
        # starts the secondary section, or at the end of the text. Without the
        # second stop condition a reply with no blank line between sections
        # would fold the entire secondary section into the primary comment.
        primary_match = re.search(
            r"Primary Sub-Major Group:\s*(\d+)\s*-\s*([^\n]+)\s*\n\s*-?\s*Justification:?\s*(.*?)"
            r"(?=\n\n|\n[^\n]*Secondary Sub-Major Groups|\Z)",
            text,
            re.DOTALL | re.IGNORECASE
        )
        if primary_match:
            primary_code = primary_match.group(1).strip()
            primary_comment = primary_match.group(3).strip()
        else:
            # Single-line layout: "<code> - <title and justification>".
            inline_primary = re.search(
                r"Primary Sub-Major Group:\s*(\d+)\s*-\s*([^\n]*)",
                text,
                re.IGNORECASE
            )
            if inline_primary:
                primary_code = inline_primary.group(1).strip()
                primary_comment = inline_primary.group(2).strip()

        # --- SECONDARY ---
        secondary_codes = []
        secondary_comment_blocks = []

        secondary_section = re.search(
            r"Secondary Sub-Major Groups\s*(?:\(if any\))?:\s*(.*)",
            text,
            re.DOTALL | re.IGNORECASE
        )

        if secondary_section:
            block = secondary_section.group(1)

            # Match every secondary group (tolerant); each entry runs until
            # the next "<code> -" line or the end of the block.
            secondary_matches = re.findall(
                r"-?\s*(\d+)\s*-\s*([^\n]+?)\s*\n\s*-?\s*Justification:?\s*(.*?)(?=\n\s*-?\s*\d+\s*-|\Z)",
                block,
                re.DOTALL
            )

            if secondary_matches:
                for code, title, comment in secondary_matches:
                    secondary_codes.append(code.strip())
                    comment_clean = f"{code.strip()} - {title.strip()}: {comment.strip()}"
                    secondary_comment_blocks.append(comment_clean)
            else:
                # Single-line layout: one "<code> - <title and justification>"
                # per line, optionally preceded by a bullet.
                inline_matches = re.findall(
                    r"^[ \t]*[-*]?[ \t]*(\d+)\s*-\s*([^\n]+)",
                    block,
                    re.MULTILINE
                )
                for code, rest in inline_matches:
                    secondary_codes.append(code.strip())
                    secondary_comment_blocks.append(f"{code.strip()} - {rest.strip()}")

        return {
            "primary_code": primary_code,
            "primary_code_comment": primary_comment,
            "secondary_codes": ", ".join(secondary_codes),
            "secondary_codes_comment": " | ".join(secondary_comment_blocks)
        }

    except Exception as e:
        print(f"‚ùå Parsing error: {e}")
        return {
            "primary_code": "ERROR",
            "primary_code_comment": "Parsing failed",
            "secondary_codes": "",
            "secondary_codes_comment": ""
        }


Call the OpenAI API

In [None]:
def classify_abstract(client, abstract: str, isco_df: pd.DataFrame, log_path=None, model="gpt-5") -> dict:
    """Classify one patent abstract with a chat-completion call.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        abstract: Patent abstract to classify.
        isco_df: Candidate groups passed on to ``generate_prompt``.
        log_path: Optional JSONL file; when given, the abstract, prompt, raw
            reply and parsed result are appended to it.
        model: Model name forwarded to the API.

    Returns:
        The dict produced by ``parse_classification_output``. If the API call
        or parsing raises, a dict with ``primary_code`` set to "ERROR" and the
        exception text in ``primary_code_comment`` is returned instead.
    """
    prompt = generate_prompt(abstract, isco_df)
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}]
        )
        content = response.choices[0].message.content
        parsed = parse_classification_output(content)
    except Exception as e:
        print(f"Errore OpenAI API: {e}")
        return {
            "primary_code": "ERROR",
            "primary_code_comment": str(e),
            "secondary_codes": "",
            "secondary_codes_comment": ""
        }

    if log_path:
        # Logging is best-effort: a failure to write the log must not discard
        # a classification that was already obtained, nor be reported as an
        # API error.
        try:
            append_log_jsonl(log_path, abstract, prompt, content, parsed)
        except Exception as log_error:
            print(f"Warning: could not write log entry to {log_path}: {log_error}")

    return parsed


Global variables

In [None]:
# File paths
# ISCO_PATH: Excel workbook with the ISCO-08 structure (read by the load_isco_* helpers).
# PATENTS_PATH: Excel file with the patents to classify.
# OUTPUT_PATH: CSV file that receives the patents plus the classification columns.
ISCO_PATH = "resources/classification/ISCO-08_structure_and_definitions.xlsx"
PATENTS_PATH = "sample/patents_sample.xlsx"
OUTPUT_PATH = "output/patents_classified.csv"

MODEL = "gpt-5-mini"  # Model to use; can be gpt-3.5-turbo or gpt-4

Load .env

In [None]:
# Load the API key from the .env file
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# One JSONL log file per run, named after the moment this cell is executed
log_path = f"logs/batch_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"

Read data (classification and sample patents)

In [None]:
# Candidate groups offered to the model. The level-2 loader is kept below as
# the alternative; the level-3 loader is the one currently in use.
# NOTE(review): generate_prompt and parse_classification_output are worded for
# "Sub-Major Group" while level-3 groups are loaded here — confirm this is intended.
# isco_df = load_isco_sub_major(ISCO_PATH)
isco_df = load_isco_minor(ISCO_PATH)
# Patents to classify; later cells read the "abstract" column.
patents_df = pd.read_excel(PATENTS_PATH)

In [None]:
# Work on a small sample: drop the "description" column and keep the first
# 10 rows with a fresh 0..n-1 index.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop would raise KeyError.
patents_df = (
    patents_df
    .drop(columns=["description"], errors="ignore")
    .head(10)
    .reset_index(drop=True)
)

Prepare the data structure for storing results

In [None]:
# Column-wise accumulators: one independent list per output column, filled
# in patent order by the classification loop.
results = {
    column: []
    for column in (
        "primary_code",
        "primary_code_comment",
        "secondary_codes",
        "secondary_codes_comment",
    )
}


Call the OpenAI API to classify each patent

In [None]:
# Start from empty accumulators so that re-running this cell does not append
# a second set of results (which would no longer match len(patents_df) when
# the lists are later assigned as columns).
for key in results:
    results[key].clear()

# Classify every patent in order; idx is the 0-based row label.
for idx, row in patents_df.iterrows():
    print(f"üîç Elaborazione {idx+1}/{len(patents_df)}")
    classification = classify_abstract(client, row["abstract"], isco_df, log_path=log_path, model=MODEL)
    # Missing keys become empty strings so every list grows by exactly one item.
    for key in results:
        results[key].append(classification.get(key, ""))
    sleep(0.5)  # short pause between calls to avoid hitting the rate limit

Add results to patents dataframe and save to output folder

In [None]:
 # Add the classification results to the dataframe, one column per key
for key in results:
    patents_df[key] = results[key]

# Create the output folder if it is missing: to_csv does not create parent
# directories, and failing here would lose the results of every API call.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

patents_df.to_csv(OUTPUT_PATH, index=False)
print(f"‚úÖ File generato: {OUTPUT_PATH}")