#### Binary risk classification 
- sentence transform embedding on subject only
- XGBoost showed the highest accuracy (0.75)
- randomized search using XGBoost find best parameter

In [8]:
%pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------------------------------ --- 1.3/1.5 MB 6.5 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 5.9 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import re

from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report


from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")




In [2]:
import pandas as pd

df=pd.read_csv(r"F:\Final_project\rasff_new2.csv")

In [11]:
# ============================================================
# 1. Clean Text Functions
# ============================================================

def clean_text(text):
    if text is None:
        return ""
    text = str(text)

    # Replace multiple slashes with space
    text = re.sub(r'/+', ' ', text)

    # Add space between number and letters: 25g â†’ 25 g
    text = re.sub(r'(\d)([A-Za-z])', r'\1 \2', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean subject + category
df["subject_clean"] = df["subject"].apply(clean_text)
df["category_clean"] = df["category"].apply(clean_text)

In [3]:
# ----------------------------
# 2. Recode risk_decision into 2 classes
# ----------------------------
def recode_risk(risk):
    if risk in ['no risk', 'not serious', 'potential risk', 'undecided', 'potentially serious']:
        return 0   # Lower to medium risk
    elif risk == 'serious':
        return 1   # High risk
    else:
        return -1  # Safety net for unexpected values

In [4]:
df['risk_decision_2class']=df['risk_decision'].apply(recode_risk)
print(df['risk_decision_2class'].value_counts())


risk_decision_2class
1    14756
0    12641
Name: count, dtype: int64


In [5]:
# ----------------------
# Load and prepare data
# ----------------------
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# ----------------------
# Time-based split (80/20)
# ----------------------
cutoff_index = int(len(df) * 0.8)
cutoff_date = df.iloc[cutoff_index]['date']

train_df = df[df['date'] <= cutoff_date]
test_df  = df[df['date'] > cutoff_date]

In [6]:
from sentence_transformers import SentenceTransformer
# ----------------------
# Sentence Transformer Embedding
# ----------------------
model = SentenceTransformer('all-MiniLM-L6-v2')

train_embeddings = model.encode(train_df['subject'].tolist(), show_progress_bar=True)
test_embeddings  = model.encode(test_df['subject'].tolist(), show_progress_bar=True)

# Add embedding vectors back into dataframe (384 dims)
emb_cols = [f"sub_emb_{i}" for i in range(train_embeddings.shape[1])]

train_df[emb_cols] = train_embeddings
test_df[emb_cols]  = test_embeddings

Batches:   0%|          | 0/685 [00:00<?, ?it/s]

Batches:   0%|          | 0/172 [00:00<?, ?it/s]

In [7]:
# Optional: check distribution
print(train_df['risk_decision_2class'].value_counts())
print(test_df['risk_decision_2class'].value_counts())


# Update target variable
target_col = 'risk_decision_2class'

risk_decision_2class
1    12204
0     9714
Name: count, dtype: int64
risk_decision_2class
0    2927
1    2552
Name: count, dtype: int64


In [22]:
# Prepare data
X_train = train_df[emb_cols].values
y_train = train_df[target_col].values
X_test  = test_df[emb_cols].values
y_test  = test_df[target_col].values

In [25]:
# ---------------------------------------------------------
# 6. DEFINE BASELINE MODELS
# ---------------------------------------------------------

num_classes = len(train_df[target_col].unique())

models = {


    "XGBoost_multi": XGBClassifier(
        objective="multi:softmax",
        num_class=num_classes,
        eval_metric="mlogloss",
        learning_rate=0.03,        # eta
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    ),

    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
    "XGBoost": XGBClassifier(tree_method="hist", eval_metric="mlogloss", use_label_encoder=False, random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=300, random_state=42),
}

# ---------------------------------------------------------
# 7. TRAIN, EVALUATE, COLLECT RESULTS
# ---------------------------------------------------------
results = {}

for name, clf in models.items():
    print(f"\nTraining {name}...")


    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average="macro")
    results[name] = {"accuracy": acc, "f1_macro": f1}

    print(f"Accuracy: {acc:.4f} | F1-macro: {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, preds))




Training XGBoost_multi...
Accuracy: 0.7124 | F1-macro: 0.7122
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.65      0.71      2927
           1       0.66      0.79      0.72      2552

    accuracy                           0.71      5479
   macro avg       0.72      0.72      0.71      5479
weighted avg       0.72      0.71      0.71      5479


Training Random Forest...
Accuracy: 0.7405 | F1-macro: 0.7405
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.70      0.74      2927
           1       0.70      0.79      0.74      2552

    accuracy                           0.74      5479
   macro avg       0.74      0.74      0.74      5479
weighted avg       0.75      0.74      0.74      5479


Training XGBoost...
Accuracy: 0.7295 | F1-macro: 0.7295
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.68      

In [27]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score

# Create DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test, label=y_test)

# Define multi-class parameters
num_classes = len(train_df[target_col].unique())
params = {
    'objective': 'multi:softmax',  # or 'multi:softprob' for probabilities
    'num_class': num_classes,
    'eval_metric': 'mlogloss',
    'eta': 0.03,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

# Train
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    early_stopping_rounds=50,
    verbose_eval=50
)

# Predict
y_pred = bst.predict(dtest)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[0]	train-mlogloss:0.68161	test-mlogloss:0.69354
[50]	train-mlogloss:0.49785	test-mlogloss:0.58247
[100]	train-mlogloss:0.41642	test-mlogloss:0.55156
[150]	train-mlogloss:0.36658	test-mlogloss:0.53779
[200]	train-mlogloss:0.33045	test-mlogloss:0.53033
[250]	train-mlogloss:0.30360	test-mlogloss:0.52449
[300]	train-mlogloss:0.28011	test-mlogloss:0.51998
[350]	train-mlogloss:0.25988	test-mlogloss:0.51714
[400]	train-mlogloss:0.24221	test-mlogloss:0.51537
[450]	train-mlogloss:0.22616	test-mlogloss:0.51397
[500]	train-mlogloss:0.21147	test-mlogloss:0.51308
[550]	train-mlogloss:0.19802	test-mlogloss:0.51249
[600]	train-mlogloss:0.18639	test-mlogloss:0.51230
[636]	train-mlogloss:0.17856	test-mlogloss:0.51221
Accuracy: 0.749224311005658
              precision    recall  f1-score   support

           0       0.81      0.69      0.75      2927
           1       0.70      0.81      0.75      2552

    accuracy                           0.75      5479
   macro avg       0.75      0.75      0.75

In [13]:
import xgboost as xgb

xgb=xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_estimators=1000,
    tree_method='hist',
    random_state=42,
    use_label_encoder=False
)

In [14]:

from sklearn.metrics import accuracy_score, classification_report


xgb.fit(
    X_train, y_train
)

y_pred = xgb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7508669465230882
              precision    recall  f1-score   support

           0       0.81      0.69      0.75      2927
           1       0.70      0.82      0.75      2552

    accuracy                           0.75      5479
   macro avg       0.76      0.76      0.75      5479
weighted avg       0.76      0.75      0.75      5479



In [15]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from xgboost import XGBClassifier

# XGB model
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    random_state=42,
    use_label_encoder=False
)

# Parameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': [500, 1000, 1500],
    'max_depth': [4,6,8,10],
    'learning_rate': [0.01,0.03,0.05],
    'subsample': [0.6,0.8,1.0],
    'colsample_bytree': [0.6,0.8,1.0],
    'gamma': [0,0.1,0.2],
    'min_child_weight': [1,5,10]
}

# Scorer
scorer = make_scorer(f1_score)

# RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=20,
    scoring=scorer,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit WITHOUT eval_set or early_stopping
random_search.fit(X_train, y_train)

print("Best parameters found:", random_search.best_params_)
best_model = random_search.best_estimator_

# Evaluate
y_pred = best_model.predict(X_test)
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best parameters found: {'subsample': 1.0, 'n_estimators': 1000, 'min_child_weight': 10, 'max_depth': 10, 'learning_rate': 0.03, 'gamma': 0.2, 'colsample_bytree': 0.6}
Accuracy: 0.7501368862931191
              precision    recall  f1-score   support

           0       0.81      0.70      0.75      2927
           1       0.70      0.81      0.75      2552

    accuracy                           0.75      5479
   macro avg       0.75      0.75      0.75      5479
weighted avg       0.76      0.75      0.75      5479



In [None]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score

### best parameter on subject
xgb=xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    learning_rate=0.03,
    max_depth=10,
    subsample=1.0,
    colsample_bytree=0.6,
    n_estimators=1000,
    min_child_weight= 10
    gamma = 0.2
    tree_method='hist',
    random_state=42,
    use_label_encoder=False
)

xgb.fit(
    X_train, y_train
)

y_pred = xgb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [25]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, accuracy_score

lgb_model = LGBMClassifier(
        n_estimators=1000,
        random_state=42
        )

param_dist = {
        'num_leaves': [31, 50, 64, 80],
        'max_depth': [5, 6, 8, 10, 12],
        'learning_rate': [0.01, 0.03, 0.05, 0.1],
        'subsample': [0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'reg_alpha': [0, 0.1, 0.3, 0.5],
        'reg_lambda': [0, 0.1, 0.3, 0.5],
        'min_child_samples': [10, 20, 30, 40]
        }

scorer = make_scorer(f1_score, average='macro')

random_search = RandomizedSearchCV(
        estimator=lgb_model,
        param_distributions=param_dist,
        n_iter=30, # number of random combinations
        scoring=scorer,
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1
        )

fit_params = {
        "eval_set": [(X_test, y_test)],
        "eval_metric": "multi_logloss",
        }

random_search.fit(X_train, y_train, **fit_params)

print("Best parameters:", random_search.best_params_)
best_model = random_search.best_estimator_

y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1-macro:", f1_score(y_test, y_pred, average='macro'))

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[LightGBM] [Info] Number of positive: 12204, number of negative: 9714
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038742 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 21918, number of used features: 384
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.556803 -> initscore=0.228196
[LightGBM] [Info] Start training from score 0.228196
Best parameters: {'subsample': 1.0, 'reg_lambda': 0.3, 'reg_alpha': 0, 'num_leaves': 80, 'min_child_samples': 20, 'max_depth': 8, 'learning_rate': 0.05, 'colsample_bytree': 0.9}
Accuracy: 0.7488592808906734
F1-macro: 0.7488490322009809


In [None]:
#best parameter
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

# ----------------------
# Train LightGBM model
# ----------------------
lgbm = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth = 8,
    num_leaves= 80,
    subsample=1.0,
    colsample_bytree=0.9,
    reg_lambda= 0.3, 
    reg_alpha=0,
    random_state=42,
    n_jobs=-1
)

lgbm.fit(X_train, y_train)

# ----------------------
# Predict
# ----------------------
y_pred = lgbm.predict(X_test)

# ----------------------
# Evaluate
# ----------------------
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=4)

print("LightGBM Accuracy:", accuracy)
print("\nClassification Report:\n", report)
