## Logistic Classifier -> ISCO 2-digit

In [None]:
import pandas as pd
import numpy as np
import ast
import re
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, classification_report
from sentence_transformers import SentenceTransformer


Utility functions

In [None]:
def isco3_to_isco2(code):
    """Return the 2-digit ISCO prefix of a code, or None if unavailable.

    The code is stringified, every non-digit character is removed, and the
    first two remaining digits are returned as a string. Missing values
    (as detected by ``pd.isna``) and inputs with fewer than two digits
    yield ``None``.
    """
    if pd.isna(code):
        return None

    # Keep digits only, so separators, whitespace and a float's ".0" suffix
    # never end up in the prefix.
    digits = re.sub(r"\D", "", str(code).strip())
    if len(digits) < 2:
        return None
    return digits[:2]


def parse_secondary_codes(val):
    """Return the secondary codes held in ``val`` as a list.

    A list is returned unchanged. A string is parsed as a Python literal
    and returned only if it evaluates to a list. Any other input, any
    non-list literal and any parsing failure produce an empty list.
    """
    if isinstance(val, list):
        return val
    if not isinstance(val, str):
        return []

    try:
        parsed = ast.literal_eval(val)
    except Exception:
        # Best-effort parsing: an unparsable cell simply contributes no codes.
        return []
    return parsed if isinstance(parsed, list) else []


In [None]:
# Path to the labeled dataset (an Excel workbook, loaded with pd.read_excel)
LABELED_DATASET = "output/patents_classified_500_gpt5_mini.xlsx"

# Support threshold for rare-label filtering: each label's occurrence count
# is compared against this value to decide whether the label is kept.
# Dropping labels with very few samples is meant to avoid overfitting and
# to keep the evaluation meaningful.
MIN_LABEL_SAMPLE = 7

### Data pre-processing

In [None]:
df = pd.read_excel(LABELED_DATASET)

# Model input: title and abstract joined into one string; missing fields
# are treated as empty text.
df["text"] = df["title"].fillna("") + ". " + df["abstract"].fillna("")


def _row_isco2_labels(row):
    """Return the sorted, de-duplicated ISCO-2 labels of one patent row."""
    codes = [row["primary_code"], *parse_secondary_codes(row["secondary_codes"])]
    labels = {isco3_to_isco2(code) for code in codes}
    # Unparsable codes come back as None. They must be removed before
    # sorting: comparing None with str raises TypeError, so a single bad
    # secondary code would otherwise abort the whole run.
    labels.discard(None)
    return sorted(labels)


# Build a list of ISCO-2 labels per patent (primary + secondary)
df["labels"] = df.apply(_row_isco2_labels, axis=1)

# Remove rows with no valid labels
df = df[df["labels"].apply(len) > 0].reset_index(drop=True)

### Analyze label frequency

In [None]:
# Collect every label occurrence across all rows into one flat list,
# then count how often each label appears.
all_labels = []
for row_labels in df["labels"]:
    all_labels.extend(row_labels)
label_freq = Counter(all_labels)

In [None]:
# Keep every label whose support reaches the configured minimum. The
# comparison is inclusive so that MIN_LABEL_SAMPLE really is the minimum
# support, matching the "support < N" wording of the report below.
labels_to_keep = {
    label for label, count in label_freq.items() if count >= MIN_LABEL_SAMPLE
}
labels_to_remove = set(label_freq) - labels_to_keep

# Report the threshold actually in use instead of a hard-coded number.
print(f"Labels to remove (support < {MIN_LABEL_SAMPLE}):", sorted(labels_to_remove))
print("Labels to keep:", sorted(labels_to_keep))

In [None]:
# Strip the rare labels from every row, keeping only labels_to_keep members
kept_labels = df["labels"].apply(
    lambda row_labels: [label for label in row_labels if label in labels_to_keep]
)
df["filtered_labels"] = kept_labels

# Rows left without any label after filtering are dropped
has_label = kept_labels.apply(lambda kept: len(kept) > 0)
df_filtered = df[has_label].reset_index(drop=True)


### Logistic Classifier

In [None]:
# Multi-label binarization: each row's label list becomes one binary
# indicator row of Y, with one column per entry of mlb.classes_
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df_filtered["filtered_labels"])
# Label names in the column order of Y; used as target_names in the
# classification report
class_names = mlb.classes_

In [None]:
# Text embedding using sentence-transformers
# NOTE: `model` is also used by predict_labels() to embed new patents,
# so rebinding this name changes what that function encodes with.
model = SentenceTransformer("all-MiniLM-L6-v2")  # Lightweight model
# One embedding per row of df_filtered, i.e. aligned row-for-row with Y
X = model.encode(df_filtered["text"].tolist(), show_progress_bar=True)

In [None]:
# Train/test split: hold out 20% of the rows for evaluation. The fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# One-vs-Rest Logistic Regression classifier: one binary logistic
# regression is fitted per label column of Y_train.
# class_weight="balanced" asks scikit-learn to reweight classes by their
# frequency; max_iter=1000 sets the solver's iteration cap.
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight="balanced"))
clf.fit(X_train, Y_train)


In [None]:
# Predictions for the held-out rows; compared against Y_test in the
# evaluation step
Y_pred = clf.predict(X_test)

In [None]:
# Evaluation on the held-out split.
# Micro-F1 pools the decisions of all labels; macro-F1 is the unweighted
# mean of the per-label F1 scores.
micro_f1 = f1_score(Y_test, Y_pred, average="micro")
macro_f1 = f1_score(Y_test, Y_pred, average="macro")
report = classification_report(Y_test, Y_pred, target_names=class_names)

# Print results (the emoji were stored as mis-decoded UTF-8 bytes and
# printed as garbage; restored to the intended characters)
print("✅ Micro-F1 score:", micro_f1)
print("✅ Macro-F1 score:", macro_f1)
print("\n📋 Classification Report:\n", report)

In [21]:
def predict_labels(title, abstract, top_n=3):
    """
    Predict the top N ISCO2 labels for a new patent based on probability.

    Relies on the module-level ``model`` (text encoder), ``clf`` (fitted
    classifier) and ``mlb`` (fitted label binarizer).

    Args:
        title: Patent title. ``None`` or an empty string is treated as
            missing text.
        abstract: Patent abstract. ``None`` or an empty string is treated
            as missing text.
        top_n: Maximum number of labels to return. Values of zero or less
            yield an empty list; values larger than the number of known
            labels return every label.

    Returns:
        A list of (label, probability) tuples, sorted by decreasing
        probability.
    """
    # 1. Prepare the input text. Missing fields become empty strings,
    #    mirroring the fillna("") applied when the training text was built.
    title_text = (title or "").strip()
    abstract_text = (abstract or "").strip()
    text = f"{title_text}. {abstract_text}"

    # 2. Compute the embedding
    embedding = model.encode([text])  # one row for the single input text

    # 3. Predict probabilities for each label
    probs = clf.predict_proba(embedding)[0]  # one probability per label

    # 4. Rank all labels by descending probability, then take the first
    #    top_n. Slicing with [-top_n:] would return *every* label for
    #    top_n == 0 (because -0 == 0) and a wrong subset for negative values.
    n_best = max(top_n, 0)
    top_indices = np.argsort(probs)[::-1][:n_best]
    top_labels = mlb.classes_[top_indices]
    top_probs = probs[top_indices]

    # 5. Return top N (label, probability) pairs
    return list(zip(top_labels, top_probs))


In [23]:
# Example: classify a single patent that is not part of the labeled dataset
title = "processing data for interpretation"
abstract = "A system for improving sensor-based decision making provides for the automatic submission of data obtained locally from instrumentation (such as image data) together with the interpretation of that data, which can be the output of some software which has been checked and possibly corrected by a user according to his/her expertise, to a remote database via an internetwork. The submission to the remote database is preferably automatic so that the remote database grows over time. The local site can access the remote database to retrieve information to assist in interpretation of the locally produced data (for example similar images and their corresponding interpretations), or can retrieve updated or improved software or parameters improving the software used for processing the data. The information on the remote database can also be reprocessed by software agents to provide statistical information based on information from a variety of such local sites. The system is particularly useful in improving the interpretation of data which is difficult to interpret such as medical image data (e.g. mammographic or cardiac ultrasound data)."

# Uses the default top_n=3, which the header below reports
results = predict_labels(title, abstract)

# The emoji was stored as mis-decoded UTF-8 bytes; restored to the intended
# character so the header prints correctly.
print("🔮 Top 3 predicted ISCO2 labels:")
for label, prob in results:
    print(f"- {label}: {prob:.2f}")


🔮 Top 3 predicted ISCO2 labels:
- 25: 0.96
- 22: 0.66
- 21: 0.39
