## Semantic Search - PATENTS coding according to ISCO

In [None]:
! pip install git+https://github.com/istat-methodology/semantic-search.git

In [None]:
import re
import pandas as pd

from semantic_search.data import build_corpus
from semantic_search.local import LocalKnowledgeBase

In [None]:
# Input: ISCO-08 classification workbook (loaded with pd.read_excel further down)
ISCO_PATH = "resources/classification/ISCO-08_structure_and_definitions.xlsx"
# Input: sample of patents (loaded with pd.read_excel further down)
PATENTS_SAMPLE_PATH = "sample/patents_sample.xlsx"
# Output CSV path; not used in the visible part of the notebook —
# presumably where the classified patents are written (TODO confirm)
OUTPUT_PATH = "output/patents_classified_semantic.csv"

### ISCO & PATENTS pre-processing 

Read and pre-process ISCO

In [None]:
# Columns of the ISCO workbook that this notebook never reads
DROP_COLS = ["Tasks include", "Included occupations", "Excluded occupations", "Notes"]

# Load the workbook and discard the unused columns
# (errors="ignore": names missing from the sheet are skipped without raising)
isco_df = pd.read_excel(ISCO_PATH).drop(columns=DROP_COLS, errors="ignore")

# Rows with Level >= 3 (minor + unit groups), as an independent frame. On that frame:
#  - "ISCO 08 Code" is cast to str and trimmed
#  - "isco3" holds the first three digits left after removing every non-digit
#    character (handles formats like '221.1', '221-10', etc.); rows that yield
#    fewer than three digits are removed by a later filter
sub_major_df = (
    isco_df[isco_df["Level"].ge(3)]
    .assign(**{"ISCO 08 Code": lambda df: df["ISCO 08 Code"].astype(str).str.strip()})
    .assign(
        isco3=lambda df: df["ISCO 08 Code"]
        .str.replace(r"\D", "", regex=True)
        .str.slice(stop=3)
    )
)

# Text normaliser: missing value -> "", anything else -> trimmed, single-spaced string
def _clean_text(s: object) -> str:
    """Return ``s`` as a stripped string with whitespace runs collapsed.

    Args:
        s: Any scalar value; values that ``pd.isna`` reports as missing
            (``None``, ``NaN``, ...) are treated as empty text.

    Returns:
        ``""`` for a missing value; otherwise ``str(s)`` with non-breaking
        spaces turned into ordinary spaces, every run of whitespace replaced
        by one space, and leading/trailing whitespace removed.
    """
    missing = pd.isna(s)
    if missing:
        return ""
    # Replace NBSP first, then squeeze all whitespace runs down to one space
    normalised = re.sub(r"\s+", " ", str(s).replace("\xa0", " "))
    return normalised.strip()

# Normalise the two text columns needed downstream, then keep only the rows
# whose derived code has exactly three digits (safety filter); the result is
# an independent copy.
sub_major_df = (
    sub_major_df
    .assign(**{col: sub_major_df[col].map(_clean_text) for col in ("Title EN", "Definition")})
    .loc[lambda df: df["isco3"].str.len().eq(3)]
    .copy()
)


Build the clean ISCO dataset (needed by `semantic_search` to build the knowledge base)

In [None]:
# Titles: first Level == 3 row per 3-digit code, empty titles removed, indexed by code
titles = (
    sub_major_df[sub_major_df["Level"].eq(3)][["isco3", "Title EN"]]
    .drop_duplicates(subset="isco3")
    .loc[lambda df: df["Title EN"].str.strip().ne("")]
    .set_index("isco3")
)

# Definitions: all non-empty definitions sharing a 3-digit code (Level >= 3 rows),
# joined with single spaces and normalised again
definitions = (
    sub_major_df.groupby("isco3")["Definition"]
    .apply(lambda parts: _clean_text(" ".join(filter(None, parts))))
    .to_frame()
)

# Final table, built in one pipeline:
#  1. inner join -> only codes that have a (non-empty) title survive
#  2. descriptor = "**<title>**. <definition>" (markdown ** is intentional emphasis),
#     with whitespace before a period removed and the result trimmed
#  3. tidy column names, fixed column order, rows sorted by code, fresh 0..N-1 index
isco_clean_df = (
    definitions.join(titles, how="inner")
    .reset_index()
    .assign(
        descriptor=lambda df: (
            "**" + df["Title EN"].str.strip() + "**. " + df["Definition"].str.strip()
        )
        .str.replace(r"\s+\.", ".", regex=True)
        .str.strip()
    )
    .rename(columns={"isco3": "ISCO3", "Title EN": "title", "Definition": "definition"})
    [["ISCO3", "title", "definition", "descriptor"]]
    .sort_values("ISCO3")
    .reset_index(drop=True)
)

In [None]:
# Preview the first five rows of the cleaned ISCO table
isco_clean_df.head(5)

Read the sample patents and generate the `descriptor` column (needed by semantic search)

In [None]:
# Load the patent sample. The optional "description" column is not used below,
# so it is dropped when present (errors="ignore" makes the drop a no-op otherwise).
patents_df = pd.read_excel(PATENTS_SAMPLE_PATH).drop(
    columns=["description"], errors="ignore"
)

# Descriptor column is used in semantic search: "**<title>**. <abstract>".
# Missing titles/abstracts are replaced with "" before concatenating, because
# string concatenation with NaN propagates NaN and would leave the whole
# descriptor of that row as NaN instead of text.
# The outer strip removes the trailing space left when the abstract is empty.
patents_df["descriptor"] = (
    "**"
    + patents_df["title"].fillna("").astype(str).str.strip()
    + "**. "
    + patents_df["abstract"].fillna("").astype(str).str.strip()
).str.strip()


### Semantic Search

1. Semantic Search - Build Corpus

In [None]:
# Descriptor strings to embed and search
texts = isco_clean_df["descriptor"].to_list()

# Sequential numeric IDs (1..N), one per text - rows were already sorted by ISCO3 in the previous step
ids = [position for position, _ in enumerate(texts, start=1)]

# One metadata record per text, carrying the 3-digit ISCO code and the title,
# e.g. [{"ISCO3": "111", "title": "Legislators"}, ...]
metadata = pd.DataFrame(
    {
        "ISCO3": isco_clean_df["ISCO3"].astype(str),
        "title": isco_clean_df["title"].fillna("").astype(str).str.strip(),
    }
).to_dict(orient="records")

In [None]:
# Assemble the corpus from the three inputs prepared in the previous cell
# (descriptor texts, their numeric ids and their metadata records)
corpus = build_corpus(texts=texts, ids=ids, metadata=metadata)

2. Semantic Search - Build Knowledge Base (runs only on GPU)

In [None]:
# Knowledge base over the ISCO corpus; "BAAI/bge-m3" is the model identifier
# handed to LocalKnowledgeBase
base = LocalKnowledgeBase(model_id="BAAI/bge-m3", corpus=corpus)