### BERT or BERT Embedding/Target encoding/One-hot encoding for 7 columns
-XGBoost and LightGVM 

In [2]:
import pandas as pd

df=pd.read_csv("/content/df_keyword.csv")

In [4]:
%pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.9.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.9.0-py3-none-any.whl (85 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.9.0


In [26]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from category_encoders import TargetEncoder
from sentence_transformers import SentenceTransformer

from xgboost import XGBClassifier

# =====================================================
# 1. Recode target (same as before)
# =====================================================
def recode_risk(risk):
    if risk in ['no risk', 'not serious', 'potential risk', 'undecided', 'potentially serious']:
        return 0
    elif risk == 'serious':
        return 1
    else:
        return -1

df["risk_decision_2class"] = df["risk_decision"].apply(recode_risk)
df = df[df["risk_decision_2class"] != -1]

TARGET = "risk_decision_2class"


# =====================================================
# 2. Define feature groups
# =====================================================
embedding_cols = ["subject", "category"]
target_encoding_cols = ["notifying_country", "origin", "simplified_hazard"]
onehot_cols = ["type", "classification"]

# Fill missing
for col in embedding_cols + target_encoding_cols + onehot_cols:
    df[col] = df[col].fillna("missing")


# =====================================================
# 3. SentenceTransformer Embedding
# =====================================================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

def embed_column(series):
    return embedder.encode(series.tolist(), convert_to_numpy=True)

print("Generating embeddings for:", embedding_cols)

embedding_features = []
for col in embedding_cols:
    emb = embed_column(df[col])
    # Rename embedding columns (e.g., subject_emb_0, subject_emb_1...)
    emb_df = pd.DataFrame(
        emb,
        columns=[f"{col}_emb_{i}" for i in range(emb.shape[1])]
    )
    embedding_features.append(emb_df)

embedding_df = pd.concat(embedding_features, axis=1)


# =====================================================
# 4. Target Encoding
# =====================================================
print("Applying Target Encoding:", target_encoding_cols)

te = TargetEncoder(cols=target_encoding_cols)
target_encoded_df = te.fit_transform(df[target_encoding_cols], df[TARGET])


# =====================================================
# 5. One-Hot Encoding
# =====================================================
print("Applying One-Hot Encoding:", onehot_cols)

ohe = OneHotEncoder(drop="first", sparse_output=False)
ohe_array = ohe.fit_transform(df[onehot_cols])

ohe_df = pd.DataFrame(
    ohe_array,
    columns=ohe.get_feature_names_out(onehot_cols)
)


# =====================================================
# 6. Combine all engineered features
# =====================================================
X = pd.concat([embedding_df, target_encoded_df, ohe_df], axis=1)
y = df[TARGET]

print("Final feature matrix shape:", X.shape)






Generating embeddings for: ['subject', 'category']
Applying Target Encoding: ['notifying_country', 'origin', 'simplified_hazard']
Applying One-Hot Encoding: ['type', 'classification']
Final feature matrix shape: (27397, 780)


In [27]:
# =====================================================
# 7. Sanitize feature names for LightGBM compatibility
# =====================================================

# LightGBM does not allow characters: {}[]:"', etc.
X.columns = (
    X.columns
      .str.replace(r'[\[\]\{\}\:\"\'\,]', '_', regex=True)
      .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)   # extra safety
)


In [7]:
# =====================================================
# 7. Train/Test Split
# =====================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


# =====================================================
# 8. XGBoost Model
# =====================================================
model = XGBClassifier(
    eval_metric="logloss",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.8,
    random_state=42
)

print("\nTraining XGBoost...")
model.fit(X_train, y_train)


# =====================================================
# 9. Evaluation
# =====================================================
y_pred = model.predict(X_test)

print("\n========= XGBoost RESULTS =========")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Training XGBoost...

Accuracy: 0.8871

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.85      0.87      3793
           1       0.88      0.92      0.90      4427

    accuracy                           0.89      8220
   macro avg       0.89      0.88      0.89      8220
weighted avg       0.89      0.89      0.89      8220



In [17]:
import lightgbm as lgb

# -------------------------
# Train/Test Split
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)


In [18]:

# -------------------------
# LightGBM Classifier
# -------------------------
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'verbose': -1
}

print("\nTraining LightGBM...")
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_eval],
    valid_names=['train','eval'],
)

# -------------------------
# Prediction & Evaluation
# -------------------------
y_pred = model.predict(X_test)
y_pred_binary = (y_pred >= 0.5).astype(int)

print("\n========= LightGBM RESULTS =========")
print(f"Accuracy: {accuracy_score(y_test, y_pred_binary):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_binary))


Training LightGBM...

Accuracy: 0.8873

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.85      0.87      3793
           1       0.87      0.92      0.90      4427

    accuracy                           0.89      8220
   macro avg       0.89      0.88      0.89      8220
weighted avg       0.89      0.89      0.89      8220



In [9]:
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Columns to embed
embed_cols = ['subject', 'category', 'notifying_country', 'origin', 'simplified_hazard', 'type', 'classification']

# Fill missing
for col in embed_cols:
    df[col] = df[col].fillna('missing')

# Load BERT model
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
embedder = SentenceTransformer(model_name)

# Create embeddings
embedding_features = []
for col in embed_cols:
    print(f"Embedding column: {col}")
    emb = embedder.encode(df[col].tolist(), convert_to_numpy=True)
    emb_df = pd.DataFrame(emb, columns=[f"{col}_emb_{i}" for i in range(emb.shape[1])])
    embedding_features.append(emb_df)

X = pd.concat(embedding_features, axis=1)
y = df['risk_decision_2class']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# XGBoost
model = XGBClassifier(
    eval_metric='logloss',
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Embedding column: subject
Embedding column: category
Embedding column: notifying_country
Embedding column: origin
Embedding column: simplified_hazard
Embedding column: type
Embedding column: classification
Accuracy: 0.8793187347931873
              precision    recall  f1-score   support

           0       0.89      0.84      0.87      3793
           1       0.87      0.91      0.89      4427

    accuracy                           0.88      8220
   macro avg       0.88      0.88      0.88      8220
weighted avg       0.88      0.88      0.88      8220



In [11]:
import lightgbm as lgb

# -------------------------
# Train/Test Split
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# -------------------------
# LightGBM Classifier
# -------------------------
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'verbose': -1
}

print("\nTraining LightGBM...")
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_eval],
    valid_names=['train','eval'],
)

# -------------------------
# Prediction & Evaluation
# -------------------------
y_pred = model.predict(X_test)
y_pred_binary = (y_pred >= 0.5).astype(int)

print("\n========= LightGBM RESULTS =========")
print(f"Accuracy: {accuracy_score(y_test, y_pred_binary):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_binary))



Training LightGBM...

Accuracy: 0.8791

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.84      0.86      3793
           1       0.87      0.91      0.89      4427

    accuracy                           0.88      8220
   macro avg       0.88      0.88      0.88      8220
weighted avg       0.88      0.88      0.88      8220



In [19]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer

import lightgbm as lgb
import re


# =====================================================
# 1. Recode target
# =====================================================
def recode_risk(risk):
    if risk in ['no risk', 'not serious', 'potential risk', 'undecided', 'potentially serious']:
        return 0
    elif risk == 'serious':
        return 1
    else:
        return -1

df["risk_decision_2class"] = df["risk_decision"].apply(recode_risk)
df = df[df["risk_decision_2class"] != -1]

TARGET = "risk_decision_2class"


# =====================================================
# 2. Columns to embed using MPNet
# =====================================================
text_cols = [
    "subject",
    "category",
    "notifying_country",
    "origin",
    "simplified_hazard",
    "type",
    "classification"
]

# Fill missing
df[text_cols] = df[text_cols].fillna("missing")


# =====================================================
# 3. Load MPNet Sentence Transformer
# =====================================================
print("Loading MPNet model...")
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")


# =====================================================
# 4. Encode all columns (768-dim each)
# =====================================================
def embed(series):
    return model.encode(series.tolist(), convert_to_numpy=True, batch_size=64)


print("Embedding columns:", text_cols)

embedding_dfs = []
for col in text_cols:
    emb = embed(df[col])
    emb_df = pd.DataFrame(
        emb,
        columns=[f"{col}_mpnet_{i}" for i in range(emb.shape[1])]
    )
    embedding_dfs.append(emb_df)

X = pd.concat(embedding_dfs, axis=1)
y = df[TARGET]

print("Raw embedding feature matrix shape:", X.shape)


# =====================================================
# 5. Optional: PCA Reduction (recommended for boosting)
# =====================================================
USE_PCA = True
PCA_DIM = 256   # from 7×768 dims → 256 dims

if USE_PCA:
    print(f"Applying PCA reduction to {PCA_DIM} dimensions...")
    pca = PCA(n_components=PCA_DIM, random_state=42)
    X_pca = pca.fit_transform(X)
    X = pd.DataFrame(X_pca, columns=[f"pca_{i}" for i in range(PCA_DIM)])

print("Final X shape:", X.shape)


# =====================================================
# 6. Clean column names for LightGBM
# =====================================================
X.columns = (
    X.columns
      .str.replace(r'[\[\]\{\}\:\"\'\,]', '_', regex=True)
      .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)
)


# =====================================================
# 7. Train/Test Split
# =====================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


# =====================================================
# 8. Train LightGBM
# =====================================================
print("Training LightGBM...")

params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "num_leaves": 64,
    "learning_rate": 0.03,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.9,
    "bagging_freq": 3,
    "lambda_l2": 2.0,
    "verbose": -1,
}

train_set = lgb.Dataset(X_train, y_train)
valid_set = lgb.Dataset(X_test, y_test)

model_lgb = lgb.train(
    params,
    train_set,
    valid_sets=[valid_set],
    num_boost_round=300,

)


# =====================================================
# 9. Predictions & Evaluation
# =====================================================
y_pred = (model_lgb.predict(X_test) > 0.5).astype(int)

print("\n=========== LIGHTGBM RESULTS ===========")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Loading MPNet model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding columns: ['subject', 'category', 'notifying_country', 'origin', 'simplified_hazard', 'type', 'classification']
Raw embedding feature matrix shape: (27397, 5376)
Applying PCA reduction to 256 dimensions...
Final X shape: (27397, 256)
Training LightGBM...


TypeError: train() got an unexpected keyword argument 'early_stopping_rounds'

In [20]:
# =====================================================
# 6. Clean column names for LightGBM
# =====================================================
X.columns = (
    X.columns
      .str.replace(r'[\[\]\{\}\:\"\'\,]', '_', regex=True)
      .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)
)


# =====================================================
# 7. Train/Test Split
# =====================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


# =====================================================
# 8. Train LightGBM
# =====================================================
print("Training LightGBM...")

params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "num_leaves": 64,
    "learning_rate": 0.03,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.9,
    "bagging_freq": 3,
    "lambda_l2": 2.0,
    "verbose": -1,
}

train_set = lgb.Dataset(X_train, y_train)
valid_set = lgb.Dataset(X_test, y_test)

model_lgb = lgb.train(
    params,
    train_set,
    valid_sets=[valid_set],
    num_boost_round=300,

)


# =====================================================
# 9. Predictions & Evaluation
# =====================================================
y_pred = (model_lgb.predict(X_test) > 0.5).astype(int)

print("\n=========== LIGHTGBM RESULTS ===========")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Training LightGBM...

Accuracy: 0.8786496350364964

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.84      0.87      2528
           1       0.87      0.91      0.89      2952

    accuracy                           0.88      5480
   macro avg       0.88      0.88      0.88      5480
weighted avg       0.88      0.88      0.88      5480



In [21]:
# =====================================================
# 7. Train/Test Split
# =====================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


# =====================================================
# 8. XGBoost Model
# =====================================================
model = XGBClassifier(
    eval_metric="logloss",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.8,
    random_state=42
)

print("\nTraining XGBoost...")
model.fit(X_train, y_train)


# =====================================================
# 9. Evaluation
# =====================================================
y_pred = model.predict(X_test)

print("\n========= XGBoost RESULTS =========")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Training XGBoost...

Accuracy: 0.8765

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.84      0.86      3793
           1       0.87      0.91      0.89      4427

    accuracy                           0.88      8220
   macro avg       0.88      0.87      0.88      8220
weighted avg       0.88      0.88      0.88      8220



In [24]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from category_encoders import TargetEncoder
from sentence_transformers import SentenceTransformer

from xgboost import XGBClassifier

# =====================================================
# 1. Recode target (same as before)
# =====================================================
def recode_risk(risk):
    if risk in ['no risk', 'not serious', 'potential risk', 'undecided', 'potentially serious']:
        return 0
    elif risk == 'serious':
        return 1
    else:
        return -1

df["risk_decision_2class"] = df["risk_decision"].apply(recode_risk)
df = df[df["risk_decision_2class"] != -1]

TARGET = "risk_decision_2class"


# =====================================================
# 2. Define feature groups
# =====================================================
embedding_cols = ["subject", "category", "simplified_hazard"]
target_encoding_cols = ["notifying_country", "origin"]
onehot_cols = ["type", "classification"]

# Fill missing
for col in embedding_cols + target_encoding_cols + onehot_cols:
    df[col] = df[col].fillna("missing")


# =====================================================
# 3. SentenceTransformer Embedding
# =====================================================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

def embed_column(series):
    return embedder.encode(series.tolist(), convert_to_numpy=True)

print("Generating embeddings for:", embedding_cols)

embedding_features = []
for col in embedding_cols:
    emb = embed_column(df[col])
    # Rename embedding columns (e.g., subject_emb_0, subject_emb_1...)
    emb_df = pd.DataFrame(
        emb,
        columns=[f"{col}_emb_{i}" for i in range(emb.shape[1])]
    )
    embedding_features.append(emb_df)

embedding_df = pd.concat(embedding_features, axis=1)


# =====================================================
# 4. Target Encoding
# =====================================================
print("Applying Target Encoding:", target_encoding_cols)

te = TargetEncoder(cols=target_encoding_cols)
target_encoded_df = te.fit_transform(df[target_encoding_cols], df[TARGET])


# =====================================================
# 5. One-Hot Encoding
# =====================================================
print("Applying One-Hot Encoding:", onehot_cols)

ohe = OneHotEncoder(drop="first", sparse_output=False)
ohe_array = ohe.fit_transform(df[onehot_cols])

ohe_df = pd.DataFrame(
    ohe_array,
    columns=ohe.get_feature_names_out(onehot_cols)
)


# =====================================================
# 6. Combine all engineered features
# =====================================================
X = pd.concat([embedding_df, target_encoded_df, ohe_df], axis=1)
y = df[TARGET]

print("Final feature matrix shape:", X.shape)





Generating embeddings for: ['subject', 'category', 'simplified_hazard']
Applying Target Encoding: ['notifying_country', 'origin']
Applying One-Hot Encoding: ['type', 'classification']
Final feature matrix shape: (27397, 1163)


In [25]:
# -------------------------
# LightGBM Classifier
# -------------------------
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'verbose': -1
}

print("\nTraining LightGBM...")
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_eval],
    valid_names=['train','eval'],
)

# -------------------------
# Prediction & Evaluation
# -------------------------
y_pred = model.predict(X_test)
y_pred_binary = (y_pred >= 0.5).astype(int)

print("\n========= LightGBM RESULTS =========")
print(f"Accuracy: {accuracy_score(y_test, y_pred_binary):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_binary))


Training LightGBM...

Accuracy: 0.8769

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.85      0.86      3793
           1       0.87      0.90      0.89      4427

    accuracy                           0.88      8220
   macro avg       0.88      0.87      0.88      8220
weighted avg       0.88      0.88      0.88      8220

