RandomSearch for best parameter XGBOOST/LightBGM

In [22]:
import pandas as pd

df=pd.read_csv(r"/content/df_keyword.csv")

In [3]:
%pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.9.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.9.0-py3-none-any.whl (85 kB)
[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/85.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m85.9/85.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.9.0


In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from category_encoders import TargetEncoder
from sentence_transformers import SentenceTransformer

from xgboost import XGBClassifier

# =====================================================
# 1. Recode target (same as before)
# =====================================================
def recode_risk(risk):
    if risk in ['no risk', 'not serious', 'potential risk', 'undecided', 'potentially serious']:
        return 0
    elif risk == 'serious':
        return 1
    else:
        return -1

df["risk_decision_2class"] = df["risk_decision"].apply(recode_risk)
df = df[df["risk_decision_2class"] != -1]

TARGET = "risk_decision_2class"


# =====================================================
# 2. Define feature groups
# =====================================================
embedding_cols = ["subject", "category"]
target_encoding_cols = ["notifying_country", "origin", "simplified_hazard"]
onehot_cols = ["type", "classification"]

# Fill missing
for col in embedding_cols + target_encoding_cols + onehot_cols:
    df[col] = df[col].fillna("missing")


# =====================================================
# 3. SentenceTransformer Embedding
# =====================================================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

def embed_column(series):
    return embedder.encode(series.tolist(), convert_to_numpy=True)

print("Generating embeddings for:", embedding_cols)

embedding_features = []
for col in embedding_cols:
    emb = embed_column(df[col])
    # Rename embedding columns (e.g., subject_emb_0, subject_emb_1...)
    emb_df = pd.DataFrame(
        emb,
        columns=[f"{col}_emb_{i}" for i in range(emb.shape[1])]
    )
    embedding_features.append(emb_df)

embedding_df = pd.concat(embedding_features, axis=1)


# =====================================================
# 4. Target Encoding
# =====================================================
print("Applying Target Encoding:", target_encoding_cols)

te = TargetEncoder(cols=target_encoding_cols)
target_encoded_df = te.fit_transform(df[target_encoding_cols], df[TARGET])


# =====================================================
# 5. One-Hot Encoding
# =====================================================
print("Applying One-Hot Encoding:", onehot_cols)

ohe = OneHotEncoder(drop="first", sparse_output=False)
ohe_array = ohe.fit_transform(df[onehot_cols])

ohe_df = pd.DataFrame(
    ohe_array,
    columns=ohe.get_feature_names_out(onehot_cols)
)


# =====================================================
# 6. Combine all engineered features
# =====================================================
X = pd.concat([embedding_df, target_encoded_df, ohe_df], axis=1)
y = df[TARGET]

print("Final feature matrix shape:", X.shape)



# =====================================================
# 7. Sanitize feature names for LightGBM compatibility
# =====================================================

# LightGBM does not allow characters: {}[]:"', etc.
X.columns = (
    X.columns
      .str.replace(r'[\[\]\{\}\:\"\'\,]', '_', regex=True)
      .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)   # extra safety
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings for: ['subject', 'category']
Applying Target Encoding: ['notifying_country', 'origin', 'simplified_hazard']
Applying One-Hot Encoding: ['type', 'classification']
Final feature matrix shape: (27397, 780)


In [6]:
# =====================================================
# 7. Train/Test Split
# =====================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [11]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, classification_report, accuracy_score
from xgboost import XGBClassifier

# ============================================================
# XGB model (GPU + logloss objective)
# ============================================================
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',        # << Changed from 'gpu_hist' to 'hist'
    device='cuda',             # << Added for explicit GPU usage
    predictor='gpu_predictor', # << Kept for consistency, often implied by device='cuda'
    random_state=42
)

# ============================================================
# Parameter grid (your format, fixed)
# ============================================================
param_dist = {
    'n_estimators': [500, 1000, 1500],
    'max_depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.03, 0.05],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 5, 10]
}

# ============================================================
# Scorer (F1 for your binary target)
# ============================================================
scorer = make_scorer(f1_score)

# ============================================================
# Randomized Search (same format, fixed)
# ============================================================
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=20,              # number of random combinations
    scoring=scorer,
    cv=3,                   # 3-fold CV
    verbose=2,              # << SHOWS PROGRESS
    random_state=42,
    n_jobs=1                # IMPORTANT: GPU ‚Üí must be 1
)

# ============================================================
# Fit model (NO eval_set, NO early stopping)
# ============================================================
print("\nüîé Running RandomizedSearchCV...\n")
random_search.fit(X_train, y_train)

print("\nüéâ Best parameters found:")
print(random_search.best_params_)

best_model = random_search.best_estimator_

# ============================================================
# Final Evaluation
# ============================================================
y_pred = best_model.predict(X_test)

print("\n======== TEST RESULTS ========")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



üîé Running RandomizedSearchCV...

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.03, max_depth=10, min_child_weight=10, n_estimators=1000, subsample=1.0; total time=  18.7s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.03, max_depth=10, min_child_weight=10, n_estimators=1000, subsample=1.0; total time=  18.3s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.03, max_depth=10, min_child_weight=10, n_estimators=1000, subsample=1.0; total time=  18.9s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=10, min_child_weight=10, n_estimators=1500, subsample=0.8; total time=  20.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=10, min_child_weight=10, n_estimators=1500, subsample=0.8; total time=  19.9s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=10, min_child_weight=10, n_estimators=1500, subsample=0.8; total time=  20.1s
[CV] END

In [13]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, accuracy_score

lgb_model = LGBMClassifier(
     objective = 'binary',
     metric = 'binary_logloss',
     boosting_type = 'gbdt',
     random_state=42
        )

param_dist = {
        'n_estimators': [1000, 1500],
        'num_leaves': [31, 50, 64, 80],
        'max_depth': [5, 6, 8, 10],
        'learning_rate': [0.01, 0.03, 0.05],
        'subsample': [0.7, 0.8,  1.0],
        'colsample_bytree': [0.7, 0.8, 1.0],
        'reg_alpha': [0, 0.1,  0.5],
        'reg_lambda': [0, 0.1,  0.5],
        'min_child_samples': [10, 20, 30]
        }

# ============================================================
# Scorer (F1 for your binary target)
# ============================================================
scorer = make_scorer(f1_score, average='macro')

# ============================================================
# Randomized Search (same format, fixed)
# ============================================================
random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_dist,
    n_iter=20,              # number of random combinations
    scoring=scorer,
    cv=3,                   # 3-fold CV
    verbose=2,              # << SHOWS PROGRESS
    random_state=42,
    n_jobs=1                # IMPORTANT: GPU ‚Üí must be 1
)


# ============================================================
# Fit model (NO eval_set, NO early stopping)
# ============================================================
print("\nüîé Running RandomizedSearchCV...\n")
random_search.fit(X_train, y_train)

print("\nüéâ Best parameters found:")
print(random_search.best_params_)

best_model = random_search.best_estimator_

# ============================================================
# Final Evaluation
# ============================================================
y_pred = best_model.predict(X_test)

print("\n======== TEST RESULTS ========")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

[1;30;43mÏä§Ìä∏Î¶¨Î∞ç Ï∂úÎ†• ÎÇ¥Ïö©Ïù¥ Í∏∏Ïñ¥ÏÑú ÎßàÏßÄÎßâ 5000Ï§ÑÏù¥ ÏÇ≠Ï†úÎêòÏóàÏäµÎãàÎã§.[0m
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=8, min_child_samples=20, n_estimators=1500, num_leaves=64, reg_alpha=0.1, reg_lambda=0, subsample=1.0; total time=  59.2s
[LightGBM] [Info] Number of positive: 6886, number of negative: 5899
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060421 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 116164
[LightGBM] [Info] Number of data points in the train set: 12785, number of used features: 777
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.538600 -> initscore=0.154708
[LightGBM] [Info] Start training from score 0.154708
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=8, min_child_samples=20, n_estimators=1500, num_leaves=64, reg_alpha=0.1, reg_lambda=0, subsample=1.0; total time=  59.0s
[LightGBM] [Info] Number of positive: 6886,

In [14]:
import joblib

joblib.dump(best_model, "rasff_risk_model1.pkl")
print("Model saved!")

Model saved!


#### Solving overfitting problem


removed simplifed hazard

In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from category_encoders import TargetEncoder
from sentence_transformers import SentenceTransformer

from xgboost import XGBClassifier

# =====================================================
# 1. Recode target (same as before)
# =====================================================
def recode_risk(risk):
    if risk in ['no risk', 'not serious', 'potential risk', 'undecided', 'potentially serious']:
        return 0
    elif risk == 'serious':
        return 1
    else:
        return -1

df["risk_decision_2class"] = df["risk_decision"].apply(recode_risk)
df = df[df["risk_decision_2class"] != -1]

TARGET = "risk_decision_2class"


# =====================================================
# 2. Define feature groups
# =====================================================
embedding_cols = ["subject", "category"]
target_encoding_cols = ["notifying_country", "origin"]
onehot_cols = ["type", "classification", "Hazard_Type"]

# Fill missing
for col in embedding_cols + target_encoding_cols + onehot_cols:
    df[col] = df[col].fillna("missing")


# =====================================================
# 3. SentenceTransformer Embedding
# =====================================================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

def embed_column(series):
    return embedder.encode(series.tolist(), convert_to_numpy=True)

print("Generating embeddings for:", embedding_cols)

embedding_features = []
for col in embedding_cols:
    emb = embed_column(df[col])
    # Rename embedding columns (e.g., subject_emb_0, subject_emb_1...)
    emb_df = pd.DataFrame(
        emb,
        columns=[f"{col}_emb_{i}" for i in range(emb.shape[1])]
    )
    embedding_features.append(emb_df)

embedding_df = pd.concat(embedding_features, axis=1)


# =====================================================
# 4. Target Encoding
# =====================================================
print("Applying Target Encoding:", target_encoding_cols)

te = TargetEncoder(cols=target_encoding_cols)
target_encoded_df = te.fit_transform(df[target_encoding_cols], df[TARGET])


# =====================================================
# 5. One-Hot Encoding
# =====================================================
print("Applying One-Hot Encoding:", onehot_cols)

ohe = OneHotEncoder(drop="first", sparse_output=False)
ohe_array = ohe.fit_transform(df[onehot_cols])

ohe_df = pd.DataFrame(
    ohe_array,
    columns=ohe.get_feature_names_out(onehot_cols)
)


# =====================================================
# 6. Combine all engineered features
# =====================================================
X = pd.concat([embedding_df, target_encoded_df, ohe_df], axis=1)
y = df[TARGET]

print("Final feature matrix shape:", X.shape)



# =====================================================
# 7. Sanitize feature names for LightGBM compatibility
# =====================================================

# LightGBM does not allow characters: {}[]:"', etc.
X.columns = (
    X.columns
      .str.replace(r'[\[\]\{\}\:\"\'\,]', '_', regex=True)
      .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)   # extra safety
)



Generating embeddings for: ['subject', 'category']
Applying Target Encoding: ['notifying_country', 'origin']
Applying One-Hot Encoding: ['type', 'classification', 'Hazard_Type']
Final feature matrix shape: (27397, 787)


In [17]:
# =====================================================
# 7. Train/Test Split
# =====================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [18]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# ============================================
# Recreate model with best parameters
# ============================================
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',           # CPU histogram method (or 'gpu_hist' if preferred)
    device='cuda',                # GPU
    predictor='gpu_predictor',    # GPU predictor
    random_state=42,
    n_estimators=1500,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.6,
    gamma=0.1,
    min_child_weight=1
)

# ============================================
# Fit on training data
# ============================================
xgb_model.fit(X_train, y_train)

# ============================================
# Predict on training data (optional, for quick evaluation)
# ============================================
y_train_pred = xgb_model.predict(X_train)
print("=== Training Data Performance ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# ============================================
# Predict on test data
# ============================================
y_test_pred = xgb_model.predict(X_test)
print("=== Test Data Performance ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))


=== Training Data Performance ===
Accuracy: 0.9947332742347604
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      8848
           1       0.99      1.00      1.00     10329

    accuracy                           0.99     19177
   macro avg       0.99      0.99      0.99     19177
weighted avg       0.99      0.99      0.99     19177

=== Test Data Performance ===
Accuracy: 0.878102189781022
              precision    recall  f1-score   support

           0       0.89      0.84      0.86      3793
           1       0.87      0.91      0.89      4427

    accuracy                           0.88      8220
   macro avg       0.88      0.88      0.88      8220
weighted avg       0.88      0.88      0.88      8220



adding hazard type

In [19]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from category_encoders import TargetEncoder
from sentence_transformers import SentenceTransformer

from xgboost import XGBClassifier

# =====================================================
# 1. Recode target (same as before)
# =====================================================
def recode_risk(risk):
    if risk in ['no risk', 'not serious', 'potential risk', 'undecided', 'potentially serious']:
        return 0
    elif risk == 'serious':
        return 1
    else:
        return -1

df["risk_decision_2class"] = df["risk_decision"].apply(recode_risk)
df = df[df["risk_decision_2class"] != -1]

TARGET = "risk_decision_2class"


# =====================================================
# 2. Define feature groups
# =====================================================
embedding_cols = ["subject", "category"]
target_encoding_cols = ["notifying_country", "origin", "simplified_hazard"]
onehot_cols = ["type", "classification", "Hazard_Type"]

# Fill missing
for col in embedding_cols + target_encoding_cols + onehot_cols:
    df[col] = df[col].fillna("missing")


# =====================================================
# 3. SentenceTransformer Embedding
# =====================================================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

def embed_column(series):
    return embedder.encode(series.tolist(), convert_to_numpy=True)

print("Generating embeddings for:", embedding_cols)

embedding_features = []
for col in embedding_cols:
    emb = embed_column(df[col])
    # Rename embedding columns (e.g., subject_emb_0, subject_emb_1...)
    emb_df = pd.DataFrame(
        emb,
        columns=[f"{col}_emb_{i}" for i in range(emb.shape[1])]
    )
    embedding_features.append(emb_df)

embedding_df = pd.concat(embedding_features, axis=1)


# =====================================================
# 4. Target Encoding
# =====================================================
print("Applying Target Encoding:", target_encoding_cols)

te = TargetEncoder(cols=target_encoding_cols)
target_encoded_df = te.fit_transform(df[target_encoding_cols], df[TARGET])


# =====================================================
# 5. One-Hot Encoding
# =====================================================
print("Applying One-Hot Encoding:", onehot_cols)

ohe = OneHotEncoder(drop="first", sparse_output=False)
ohe_array = ohe.fit_transform(df[onehot_cols])

ohe_df = pd.DataFrame(
    ohe_array,
    columns=ohe.get_feature_names_out(onehot_cols)
)


# =====================================================
# 6. Combine all engineered features
# =====================================================
X = pd.concat([embedding_df, target_encoded_df, ohe_df], axis=1)
y = df[TARGET]

print("Final feature matrix shape:", X.shape)



# =====================================================
# 7. Sanitize feature names for LightGBM compatibility
# =====================================================

# LightGBM does not allow characters: {}[]:"', etc.
X.columns = (
    X.columns
      .str.replace(r'[\[\]\{\}\:\"\'\,]', '_', regex=True)
      .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)   # extra safety
)


Generating embeddings for: ['subject', 'category']
Applying Target Encoding: ['notifying_country', 'origin', 'simplified_hazard']
Applying One-Hot Encoding: ['type', 'classification', 'Hazard_Type']
Final feature matrix shape: (27397, 788)


In [20]:
# =====================================================
# 7. Train/Test Split
# =====================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [21]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# ============================================
# Recreate model with best parameters
# ============================================
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',           # CPU histogram method (or 'gpu_hist' if preferred)
    device='cuda',                # GPU
    predictor='gpu_predictor',    # GPU predictor
    random_state=42,
    n_estimators=1500,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.6,
    gamma=0.1,
    min_child_weight=1
)

# ============================================
# Fit on training data
# ============================================
xgb_model.fit(X_train, y_train)

# ============================================
# Predict on training data (optional, for quick evaluation)
# ============================================
y_train_pred = xgb_model.predict(X_train)
print("=== Training Data Performance ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# ============================================
# Predict on test data
# ============================================
y_test_pred = xgb_model.predict(X_test)
print("=== Test Data Performance ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))


=== Training Data Performance ===
Accuracy: 0.9956197528289096
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      8848
           1       0.99      1.00      1.00     10329

    accuracy                           1.00     19177
   macro avg       1.00      1.00      1.00     19177
weighted avg       1.00      1.00      1.00     19177

=== Test Data Performance ===
Accuracy: 0.8875912408759125
              precision    recall  f1-score   support

           0       0.90      0.85      0.87      3793
           1       0.88      0.92      0.90      4427

    accuracy                           0.89      8220
   macro avg       0.89      0.88      0.89      8220
weighted avg       0.89      0.89      0.89      8220



Rare category group and smoothing ont target encoding
- did not imrpvoe

In [33]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from category_encoders import TargetEncoder
from sentence_transformers import SentenceTransformer

from xgboost import XGBClassifier

# =====================================================
# 1. Recode target (same as before)
# =====================================================
def recode_risk(risk):
    if risk in ['no risk', 'not serious', 'potential risk', 'undecided', 'potentially serious']:
        return 0
    elif risk == 'serious':
        return 1
    else:
        return -1

df["risk_decision_2class"] = df["risk_decision"].apply(recode_risk)
df = df[df["risk_decision_2class"] != -1]

TARGET = "risk_decision_2class"

# =====================================================
# 2. Define feature groups
# =====================================================
embedding_cols = ["subject"]
onehot_cols = ["classification", "Hazard_Type"]

target_encoding_cols = ["notifying_country", "origin", "simplified_hazard"]

# Fill missing
for col in embedding_cols + onehot_cols + target_encoding_cols:
    df[col] = df[col].fillna("missing")


# =====================================================
# 2a. Group rare categories to reduce overfitting
# =====================================================
# Category: group rare categories (<50 samples)
threshold_category = 50
counts = df['category'].value_counts()
rare_categories = counts[counts < threshold_category].index
df['category_grouped'] = df['category'].replace(rare_categories, 'other')

# Target encoding columns: group rare categories
for col in target_encoding_cols:
    counts = df[col].value_counts()
    rare = counts[counts < 50].index
    df[col] = df[col].replace(rare, 'other')


# =====================================================
# 3. SentenceTransformer Embedding
# =====================================================
from sentence_transformers import SentenceTransformer

model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

def embed_column(series):
    return embedder.encode(series.tolist(), convert_to_numpy=True)

print("Generating embeddings for:", embedding_cols)

embedding_features = []
for col in embedding_cols:
    emb = embed_column(df[col])
    # Rename embedding columns (e.g., subject_emb_0, subject_emb_1...)
    emb_df = pd.DataFrame(
        emb,
        columns=[f"{col}_emb_{i}" for i in range(emb.shape[1])]
    )
    embedding_features.append(emb_df)

embedding_df = pd.concat(embedding_features, axis=1)

#PCA Reduce feature dimensionality
#subject embeddings are 384-dim ‚Üí could apply PCA to reduce to 50‚Äì100 dims without losing much info
from sklearn.decomposition import PCA

pca = PCA(n_components=100, random_state=42)
subject_emb_pca = pca.fit_transform(embedding_df.filter(like='subject_emb'))
embedding_df_pca = pd.DataFrame(subject_emb_pca, columns=[f'subject_emb_{i}' for i in range(100)])


# =====================================================
# 4. Smoothed Target Encoding
# =====================================================
from category_encoders import TargetEncoder

print("Applying Smoothed Target Encoding:", target_encoding_cols)

te = TargetEncoder(cols=target_encoding_cols, smoothing=10)
target_encoded_df = te.fit_transform(df[target_encoding_cols], df[TARGET])


# =====================================================
# 5. One-Hot Encoding
# =====================================================
print("Applying One-Hot Encoding:", onehot_cols + ['category_grouped'])

ohe = OneHotEncoder(drop="first", sparse_output=False)
ohe_array = ohe.fit_transform(df[onehot_cols + ['category_grouped']])

ohe_df = pd.DataFrame(
    ohe_array,
    columns=ohe.get_feature_names_out(onehot_cols + ['category_grouped'])
)


# =====================================================
# 6. Combine all engineered features
# =====================================================
X = pd.concat([embedding_df, target_encoded_df, ohe_df], axis=1)
y = df[TARGET]

print("Final feature matrix shape:", X.shape)


# =====================================================
# 7. Sanitize feature names for XGBoost compatibility
# =====================================================
X.columns = (
    X.columns
      .str.replace(r'[\[\]\{\}\:\"\'\,]', '_', regex=True)
      .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)
)


Generating embeddings for: ['subject']
Applying Smoothed Target Encoding: ['notifying_country', 'origin', 'simplified_hazard']
Applying One-Hot Encoding: ['classification', 'Hazard_Type', 'category_grouped']
Final feature matrix shape: (27397, 427)


In [34]:
# =====================================================
# 7. Train/Test Split
# =====================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [27]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# ============================================
# Recreate model with best parameters
# ============================================
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',           # CPU histogram method (or 'gpu_hist' if preferred)
    device='cuda',                # GPU
    predictor='gpu_predictor',    # GPU predictor
    random_state=42,
    n_estimators=1500,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.6,
    gamma=0.1,
    min_child_weight=1
)

# ============================================
# Fit on training data
# ============================================
xgb_model.fit(X_train, y_train)

# ============================================
# Predict on training data (optional, for quick evaluation)
# ============================================
y_train_pred = xgb_model.predict(X_train)
print("=== Training Data Performance ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# ============================================
# Predict on test data
# ============================================
y_test_pred = xgb_model.predict(X_test)
print("=== Test Data Performance ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

=== Training Data Performance ===
Accuracy: 0.994524691036137
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      8848
           1       0.99      1.00      0.99     10329

    accuracy                           0.99     19177
   macro avg       0.99      0.99      0.99     19177
weighted avg       0.99      0.99      0.99     19177

=== Test Data Performance ===
Accuracy: 0.878102189781022
              precision    recall  f1-score   support

           0       0.90      0.83      0.86      3793
           1       0.86      0.92      0.89      4427

    accuracy                           0.88      8220
   macro avg       0.88      0.87      0.88      8220
weighted avg       0.88      0.88      0.88      8220



In [29]:


from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# ============================================
# Recreate model with best parameters
# ============================================
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    device='cuda',
    predictor='gpu_predictor',
    random_state=42,
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    min_child_weight=1,
    reg_alpha=1,    # L1 regularization
    reg_lambda=2    # L2 regularization
)

# ============================================
# Fit on training data
# ============================================
xgb_model.fit(X_train, y_train)

# ============================================
# Predict on training data (optional, for quick evaluation)
# ============================================
y_train_pred = xgb_model.predict(X_train)
print("=== Training Data Performance ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# ============================================
# Predict on test data
# ============================================
y_test_pred = xgb_model.predict(X_test)
print("=== Test Data Performance ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

=== Training Data Performance ===
Accuracy: 0.934452729832612
              precision    recall  f1-score   support

           0       0.95      0.90      0.93      8848
           1       0.92      0.96      0.94     10329

    accuracy                           0.93     19177
   macro avg       0.94      0.93      0.93     19177
weighted avg       0.94      0.93      0.93     19177

=== Test Data Performance ===
Accuracy: 0.8728710462287105
              precision    recall  f1-score   support

           0       0.89      0.83      0.86      3793
           1       0.86      0.91      0.89      4427

    accuracy                           0.87      8220
   macro avg       0.87      0.87      0.87      8220
weighted avg       0.87      0.87      0.87      8220



After pca applied to subject embedding

In [32]:

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# ============================================
# Recreate model with best parameters
# ============================================
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    device='cuda',
    predictor='gpu_predictor',
    random_state=42,
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    min_child_weight=1,
    reg_alpha=1,    # L1 regularization
    reg_lambda=2    # L2 regularization
)

# ============================================
# Fit on training data
# ============================================
xgb_model.fit(X_train, y_train)

# ============================================
# Predict on training data (optional, for quick evaluation)
# ============================================
y_train_pred = xgb_model.predict(X_train)
print("=== Training Data Performance ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# ============================================
# Predict on test data
# ============================================
y_test_pred = xgb_model.predict(X_test)
print("=== Test Data Performance ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

=== Training Data Performance ===
Accuracy: 0.934452729832612
              precision    recall  f1-score   support

           0       0.95      0.90      0.93      8848
           1       0.92      0.96      0.94     10329

    accuracy                           0.93     19177
   macro avg       0.94      0.93      0.93     19177
weighted avg       0.94      0.93      0.93     19177

=== Test Data Performance ===
Accuracy: 0.8728710462287105
              precision    recall  f1-score   support

           0       0.89      0.83      0.86      3793
           1       0.86      0.91      0.89      4427

    accuracy                           0.87      8220
   macro avg       0.87      0.87      0.87      8220
weighted avg       0.87      0.87      0.87      8220



after removing type

In [35]:

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# ============================================
# Recreate model with best parameters
# ============================================
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    device='cuda',
    predictor='gpu_predictor',
    random_state=42,
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    min_child_weight=1,
    reg_alpha=1,    # L1 regularization
    reg_lambda=2    # L2 regularization
)

# ============================================
# Fit on training data
# ============================================
xgb_model.fit(X_train, y_train)

# ============================================
# Predict on training data (optional, for quick evaluation)
# ============================================
y_train_pred = xgb_model.predict(X_train)
print("=== Training Data Performance ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# ============================================
# Predict on test data
# ============================================
y_test_pred = xgb_model.predict(X_test)
print("=== Test Data Performance ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

=== Training Data Performance ===
Accuracy: 0.9332012306408719
              precision    recall  f1-score   support

           0       0.95      0.90      0.93      8848
           1       0.92      0.96      0.94     10329

    accuracy                           0.93     19177
   macro avg       0.94      0.93      0.93     19177
weighted avg       0.93      0.93      0.93     19177

=== Test Data Performance ===
Accuracy: 0.8736009732360097
              precision    recall  f1-score   support

           0       0.89      0.83      0.86      3793
           1       0.86      0.91      0.89      4427

    accuracy                           0.87      8220
   macro avg       0.88      0.87      0.87      8220
weighted avg       0.87      0.87      0.87      8220



original data where I did random serach with xgboost modifed from chatgpt

In [36]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from category_encoders import TargetEncoder
from sentence_transformers import SentenceTransformer

from xgboost import XGBClassifier

# =====================================================
# 1. Recode target (same as before)
# =====================================================
def recode_risk(risk):
    if risk in ['no risk', 'not serious', 'potential risk', 'undecided', 'potentially serious']:
        return 0
    elif risk == 'serious':
        return 1
    else:
        return -1

df["risk_decision_2class"] = df["risk_decision"].apply(recode_risk)
df = df[df["risk_decision_2class"] != -1]

TARGET = "risk_decision_2class"


# =====================================================
# 2. Define feature groups
# =====================================================
embedding_cols = ["subject", "category"]
target_encoding_cols = ["notifying_country", "origin", "simplified_hazard"]
onehot_cols = ["type", "classification"]

# Fill missing
for col in embedding_cols + target_encoding_cols + onehot_cols:
    df[col] = df[col].fillna("missing")


# =====================================================
# 3. SentenceTransformer Embedding
# =====================================================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

def embed_column(series):
    return embedder.encode(series.tolist(), convert_to_numpy=True)

print("Generating embeddings for:", embedding_cols)

embedding_features = []
for col in embedding_cols:
    emb = embed_column(df[col])
    # Rename embedding columns (e.g., subject_emb_0, subject_emb_1...)
    emb_df = pd.DataFrame(
        emb,
        columns=[f"{col}_emb_{i}" for i in range(emb.shape[1])]
    )
    embedding_features.append(emb_df)

embedding_df = pd.concat(embedding_features, axis=1)


# =====================================================
# 4. Target Encoding
# =====================================================
print("Applying Target Encoding:", target_encoding_cols)

te = TargetEncoder(cols=target_encoding_cols)
target_encoded_df = te.fit_transform(df[target_encoding_cols], df[TARGET])


# =====================================================
# 5. One-Hot Encoding
# =====================================================
print("Applying One-Hot Encoding:", onehot_cols)

ohe = OneHotEncoder(drop="first", sparse_output=False)
ohe_array = ohe.fit_transform(df[onehot_cols])

ohe_df = pd.DataFrame(
    ohe_array,
    columns=ohe.get_feature_names_out(onehot_cols)
)


# =====================================================
# 6. Combine all engineered features
# =====================================================
X = pd.concat([embedding_df, target_encoded_df, ohe_df], axis=1)
y = df[TARGET]

print("Final feature matrix shape:", X.shape)



# =====================================================
# 7. Sanitize feature names for LightGBM compatibility
# =====================================================

# LightGBM does not allow characters: {}[]:"', etc.
X.columns = (
    X.columns
      .str.replace(r'[\[\]\{\}\:\"\'\,]', '_', regex=True)
      .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)   # extra safety
)


Generating embeddings for: ['subject', 'category']
Applying Target Encoding: ['notifying_country', 'origin', 'simplified_hazard']
Applying One-Hot Encoding: ['type', 'classification']
Final feature matrix shape: (27397, 780)


In [39]:
# =====================================================
# 7. Train/Test Split
# =====================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [38]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# ============================================
# Recreate model with best parameters
# ============================================
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    device='cuda',
    predictor='gpu_predictor',
    random_state=42,
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    min_child_weight=1,
    reg_alpha=1,    # L1 regularization
    reg_lambda=2    # L2 regularization
)

# ============================================
# Fit on training data
# ============================================
xgb_model.fit(X_train, y_train)

# ============================================
# Predict on training data (optional, for quick evaluation)
# ============================================
y_train_pred = xgb_model.predict(X_train)
print("=== Training Data Performance ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# ============================================
# Predict on test data
# ============================================
y_test_pred = xgb_model.predict(X_test)
print("=== Test Data Performance ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

=== Training Data Performance ===
Accuracy: 0.9360692496219429
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      8848
           1       0.92      0.96      0.94     10329

    accuracy                           0.94     19177
   macro avg       0.94      0.93      0.94     19177
weighted avg       0.94      0.94      0.94     19177

=== Test Data Performance ===
Accuracy: 0.8763990267639903
              precision    recall  f1-score   support

           0       0.89      0.83      0.86      3793
           1       0.86      0.91      0.89      4427

    accuracy                           0.88      8220
   macro avg       0.88      0.87      0.87      8220
weighted avg       0.88      0.88      0.88      8220



In [41]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, classification_report, accuracy_score
from xgboost import XGBClassifier

# ============================================================
# XGB model (GPU + logloss objective)
# ============================================================
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',        # << Changed from 'gpu_hist' to 'hist'
    device='cuda',             # << Added for explicit GPU usage
    predictor='gpu_predictor', # << Kept for consistency, often implied by device='cuda'
    random_state=42

)

# ============================================================
# Parameter grid (your format, fixed)
# ============================================================
param_dist = {
    'n_estimators': [300, 500, 1000],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.03, 0.05],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 5],
    "reg_alpha": [0.05, 1.0, 2.0],
    "reg_lambda": [0.05, 1.0, 2.0]

}


# ============================================================
# Scorer (F1 for your binary target)
# ============================================================
scorer = make_scorer(f1_score)

# ============================================================
# Randomized Search (same format, fixed)
# ============================================================
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=20,              # number of random combinations
    scoring=scorer,
    cv=3,                   # 3-fold CV
    verbose=2,              # << SHOWS PROGRESS
    random_state=42,
    n_jobs=1                # IMPORTANT: GPU ‚Üí must be 1
)

# ============================================================
# Fit model (NO eval_set, NO early stopping)
# ============================================================
print("\nüîé Running RandomizedSearchCV...\n")
random_search.fit(X_train, y_train)

print("\nüéâ Best parameters found:")
print(random_search.best_params_)

best_model = random_search.best_estimator_

# ============================================================
# Final Evaluation
# ============================================================
y_pred = best_model.predict(X_test)

print("\n======== TEST RESULTS ========")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


üîé Running RandomizedSearchCV...

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.05, max_depth=8, min_child_weight=5, n_estimators=1000, reg_alpha=0.05, reg_lambda=2.0, subsample=0.8; total time=  17.6s
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.05, max_depth=8, min_child_weight=5, n_estimators=1000, reg_alpha=0.05, reg_lambda=2.0, subsample=0.8; total time=  16.1s
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.05, max_depth=8, min_child_weight=5, n_estimators=1000, reg_alpha=0.05, reg_lambda=2.0, subsample=0.8; total time=  16.2s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.03, max_depth=8, min_child_weight=1, n_estimators=500, reg_alpha=2.0, reg_lambda=1.0, subsample=1.0; total time=  15.1s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.03, max_depth=8, min_child_weight=1, n_estimators=500, reg_alpha=2.0, reg_lambda=1.0, subsample=1.0; total time=  15.1s
[CV] END c