# AI Patents Classification

In [None]:
import re
import pandas as pd
from semantic_search.data import build_corpus
from semantic_search.local import LocalKnowledgeBase

## Data
Here, we import and pre-process the data, both for ISCO classification and for patents.

### ISCO 08 (Level 3 and Level 4)

In [None]:
# Spreadsheet headers -> snake_case names used by the rest of the notebook.
# The "Level" entry never applies below: that column is dropped before renaming.
rename_mapping = {
    "Level": "level",
    "ISCO 08 Code": "code",
    "Title EN": "title",
    "Definition": "definition",
    "Tasks include": "included_tasks",
    "Included occupations": "included_occupations",
    "Excluded occupations": "excluded_occupations",
    "Notes": "note"
}

# Read the ISCO-08 workbook, forcing the code column to be read as text.
notes_df = pd.read_excel(
    "resources/classification/isco_08.xlsx",
    dtype={"ISCO 08 Code": str},
)

# For each level: keep its rows, renumber them from zero, drop the
# now-constant "Level" column and apply the header renaming.
notes_level_3 = (
    notes_df.loc[notes_df["Level"] == 3]
    .reset_index(drop=True)
    .drop(columns="Level")
    .rename(columns=rename_mapping)
)

notes_level_4 = (
    notes_df.loc[notes_df["Level"] == 4]
    .reset_index(drop=True)
    .drop(columns="Level")
    .rename(columns=rename_mapping)
)

---

## Descriptor Template

In [None]:
# Layout of one descriptor: a heading with the title, the definition as a
# paragraph, then one bullet for the tasks and one for the occupations.
desc_template = "\n".join(
    [
        "# {title}",
        "",
        "{definition}",
        "",
        "* {included_tasks}",
        "* {included_occupations}",
    ]
)

Define a function that cleans the occupation texts by replacing each four-digit code (and the space that follows it) with a tab and a dash.

In [None]:
def clean_occupations(text: str) -> str:
    """Replace every four-digit run followed by a space with a tab and a dash.

    Args:
        text: The occupation text to clean.

    Returns:
        ``text`` with each ``"dddd "`` match substituted by ``"\\t-"``.
    """
    code_prefix = re.compile(r"\d{4} ")
    return code_prefix.sub(r"\t-", text)

Create a function that fills the template for each row and stores the result in a new `descriptor` column.

In [None]:
def get_descriptor(df):
    """Render ``desc_template`` for every row and store it in ``df["descriptor"]``.

    The frame is modified in place and also returned, so both
    ``get_descriptor(df)`` and ``df = get_descriptor(df)`` work.

    Args:
        df: DataFrame with the columns ``title``, ``definition``,
            ``included_tasks`` and ``included_occupations``.

    Returns:
        The same DataFrame object, with a ``descriptor`` column added
        (or overwritten).
    """
    descriptors = []
    rows = zip(
        df["title"],
        df["definition"],
        df["included_tasks"],
        df["included_occupations"],
    )
    for title, definition, tasks, occupations in rows:
        # Only real text goes through the regex clean-up; a missing cell
        # (float NaN) would make re.sub raise TypeError. Left untouched it
        # formats as "nan" and is removed by the replacements below.
        if isinstance(occupations, str):
            occupations = clean_occupations(occupations)

        descriptor = desc_template.format(
            title=title,
            definition=definition,
            included_tasks=tasks,
            included_occupations=occupations,
        )
        # Drop bullets and the definition paragraph that rendered as "nan".
        descriptor = descriptor.replace("\n* nan", "")
        descriptor = descriptor.replace("\nnan\n", "\n")
        descriptors.append(descriptor)

    # Assign positionally in one step: this creates the column even for an
    # empty frame and does not depend on the index labels being unique.
    df["descriptor"] = descriptors
    return df

Apply the template to the Level 3 and Level 4 datasets.

In [None]:
# Add the rendered "descriptor" column to both frames (Level 3 first, then Level 4).
notes_level_3, notes_level_4 = (
    get_descriptor(notes_level_3),
    get_descriptor(notes_level_4),
)

---

## Semantic Search

In [None]:
# Build the corpus from the Level 4 descriptors: the row index serves as the
# document id, and code/title are carried along as per-document metadata.
corpus = build_corpus(
    texts=notes_level_4["descriptor"].tolist(),
    ids=notes_level_4.index.tolist(),
    metadata=[
        {"code": code, "title": title}
        for code, title in notes_level_4[["code", "title"]].itertuples(index=False, name=None)
    ],
)

# Knowledge base over that corpus, configured with the "BAAI/bge-m3" model id.
base = LocalKnowledgeBase(corpus=corpus, model_id="BAAI/bge-m3", batch_size=32)

Test the search.

In [None]:
# Sample queries used as a quick smoke test of the knowledge base.
query = ["Armed forces.", "Clerk", "Cleaning jobs", "Researchers"]

# `results` is iterated as one group of hits per query
# (presumably up to `top_k` hits each - confirm against LocalKnowledgeBase.search).
results = base.search(query, top_k=3)

for res in results:
    for res_i in res:
        # Single quotes inside the f-string: reusing the outer double quotes
        # is a SyntaxError on Python versions before 3.12.
        print(f"{res_i.score:.2f} | {res_i.metadata['title']}")
    print("---")