Comparing different sentence transformers for 3-class risk classification

In [None]:
import pandas as pd
import numpy as np
import re

from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report


from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

In [None]:
# ============================================================
# 1. Clean Text Functions
# ============================================================

def clean_text(text):
    """Normalize a raw text value before it is embedded.

    Steps, in order:
      1. Missing values (``None``, ``NaN``, ``pd.NA``) become ``""``.
      2. Every run of ``/`` is replaced by a single space.
      3. A space is inserted between a digit and an ASCII letter that
         directly follows it (``25g`` -> ``25 g``).
      4. Whitespace runs are collapsed to one space and the ends stripped.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # pandas represents empty CSV cells as NaN rather than None; without this
    # check str(NaN) would turn every missing value into the literal "nan".
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return ""
    text = str(text)

    # Replace multiple slashes with space
    text = re.sub(r'/+', ' ', text)

    # Add space between number and letters: 25g → 25 g
    text = re.sub(r'(\d)([A-Za-z])', r'\1 \2', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [None]:


# ============================================================
# 2. Load Data
# ============================================================

# Read the keyword dataset (adjust the path for your environment)
df = pd.read_csv("/content/df_keyword.csv")

# Add a cleaned copy of the subject column; the raw column is left as-is
df["subject_clean"] = df["subject"].map(clean_text)

In [None]:
# ----------------------------
# 3. Recode risk_decision into 3 classes
# ----------------------------
def recode_risk(risk):
    """Map a raw ``risk_decision`` label onto one of three classes.

    Matching ignores letter case and surrounding whitespace, so ``"Serious "``
    is treated like ``"serious"``.

    Args:
        risk: Raw label value.

    Returns:
        0 for low risk (``no risk``, ``not serious``),
        1 for medium / potential risk (``potential risk``, ``undecided``,
        ``potentially serious``),
        2 for high risk (``serious``),
        -1 for anything else, including missing or non-string values.
    """
    # Missing values (None / NaN) and other non-strings cannot match a label.
    if not isinstance(risk, str):
        return -1

    label_to_class = {
        'no risk': 0,              # Low risk
        'not serious': 0,
        'potential risk': 1,       # Medium / potential risk
        'undecided': 1,
        'potentially serious': 1,
        'serious': 2,              # High risk
    }
    # -1 is the safety net for unexpected values
    return label_to_class.get(risk.strip().lower(), -1)

# Recode the raw labels for the whole dataframe
df['risk_decision_3class'] = df['risk_decision'].apply(recode_risk)

# recode_risk returns -1 for labels it does not recognise. Stop here rather
# than letting -1 silently reach the classifiers as an extra class.
unexpected_mask = df['risk_decision_3class'] == -1
if unexpected_mask.any():
    raise ValueError(
        "Unrecognised risk_decision values: "
        f"{df.loc[unexpected_mask, 'risk_decision'].unique().tolist()}"
    )

# Check the class distribution
print(df['risk_decision_3class'].value_counts())

risk_decision_3class
2    14756
1     8090
0     4551
Name: count, dtype: int64


In [None]:
# ============================================================
# 4. ParaBERT Embeddings (only subject )
# ============================================================

# Load the multilingual MPNet model
# (SentenceTransformer is already imported in the imports cell at the top)
model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

print("Encoding SUBJECT…")
# Embed every cleaned subject string
subject_embeddings = model.encode(df["subject_clean"].tolist(), show_progress_bar=True)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding SUBJECT…


Batches:   0%|          | 0/857 [00:00<?, ?it/s]

In [None]:

# Feature matrix: the subject embeddings are the only feature block
X = np.hstack((subject_embeddings,))

# Target vector: the 3-class risk label
y = df.loc[:, "risk_decision_3class"]

In [None]:
# ============================================================
# 5. Train-test split
# ============================================================

# Stratified 80/20 split on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------------------------------------
# 6. DEFINE BASELINE MODELS
# ---------------------------------------------------------
# Tree ensembles only; all of them are fitted on the unscaled embeddings.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
    # use_label_encoder is deprecated in XGBoost (recent releases report it as
    # an unused parameter), so it is no longer passed.
    "XGBoost": XGBClassifier(tree_method="hist", eval_metric="mlogloss", random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=300, random_state=42),
}


# ---------------------------------------------------------
# 7. TRAIN, EVALUATE, COLLECT RESULTS
# ---------------------------------------------------------
# results maps model name -> {"accuracy": ..., "f1_macro": ...}
results = {}

for name, clf in models.items():
    print(f"\nTraining {name}...")

    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)

    acc = accuracy_score(y_test, preds)
    # Macro average weights every class equally regardless of its support
    f1 = f1_score(y_test, preds, average="macro")
    results[name] = {"accuracy": acc, "f1_macro": f1}

    print(f"Accuracy: {acc:.4f} | F1-macro: {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, preds))




Training Random Forest...
Accuracy: 0.7186 | F1-macro: 0.6512
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.41      0.54       910
           1       0.66      0.56      0.60      1618
           2       0.73      0.90      0.81      2952

    accuracy                           0.72      5480
   macro avg       0.73      0.62      0.65      5480
weighted avg       0.72      0.72      0.70      5480


Training XGBoost...
Accuracy: 0.7214 | F1-macro: 0.6691
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.51      0.59       910
           1       0.63      0.59      0.61      1618
           2       0.77      0.86      0.81      2952

    accuracy                           0.72      5480
   macro avg       0.70      0.65      0.67      5480
weighted avg       0.72      0.72      0.71      5480


Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-thr

In [None]:
# ============================================================
# 4b. MiniLM Embeddings (only subject)
# ============================================================

# Load the all-MiniLM-L6-v2 model
# (SentenceTransformer is already imported in the imports cell at the top)
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

print("Encoding SUBJECT…")
# NOTE(review): this call sets normalize_embeddings=True, while the
# paraphrase-multilingual-mpnet-base-v2 run earlier in the notebook does not,
# so the two runs differ in more than the model — confirm this is intended.
subject_embeddings = model.encode(df["subject_clean"].tolist(), show_progress_bar=True, normalize_embeddings=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding SUBJECT…


Batches:   0%|          | 0/857 [00:00<?, ?it/s]

In [None]:

# Features: a single block made of the subject embeddings
X = np.hstack((subject_embeddings,))

# Labels: the recoded 3-class risk column
y = df.loc[:, "risk_decision_3class"]

In [None]:
# ============================================================
# 5. Train-test split
# ============================================================

# Stratified 80/20 split on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------------------------------------
# 6. DEFINE BASELINE MODELS
# ---------------------------------------------------------
# Tree ensembles only; all of them are fitted on the unscaled embeddings.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
    # use_label_encoder is deprecated in XGBoost (recent releases report it as
    # an unused parameter), so it is no longer passed.
    "XGBoost": XGBClassifier(tree_method="hist", eval_metric="mlogloss", random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=300, random_state=42),
}


# ---------------------------------------------------------
# 7. TRAIN, EVALUATE, COLLECT RESULTS
# ---------------------------------------------------------
# results maps model name -> {"accuracy": ..., "f1_macro": ...}
results = {}

for name, clf in models.items():
    print(f"\nTraining {name}...")

    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)

    acc = accuracy_score(y_test, preds)
    # Macro average weights every class equally regardless of its support
    f1 = f1_score(y_test, preds, average="macro")
    results[name] = {"accuracy": acc, "f1_macro": f1}

    print(f"Accuracy: {acc:.4f} | F1-macro: {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, preds))



Training Random Forest...
Accuracy: 0.7119 | F1-macro: 0.6395
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.39      0.52       910
           1       0.64      0.55      0.59      1618
           2       0.73      0.90      0.80      2952

    accuracy                           0.71      5480
   macro avg       0.73      0.61      0.64      5480
weighted avg       0.72      0.71      0.69      5480


Training XGBoost...
Accuracy: 0.7168 | F1-macro: 0.6642
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.50      0.58       910
           1       0.62      0.58      0.60      1618
           2       0.77      0.85      0.81      2952

    accuracy                           0.72      5480
   macro avg       0.69      0.65      0.66      5480
weighted avg       0.71      0.72      0.71      5480


Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-thr