#### Target variable : risk decision to 2 levels
#### Feature: Subject (after sentence embedding) only & hazard type (one-hot encoding), category (target encoding)

In [1]:
import pandas as pd

df=pd.read_csv(r"F:\Final_project\rasff_new2.csv")

In [2]:
# ----------------------
# Load and prepare data
# ----------------------
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# ----------------------
# Time-based split (80/20)
# ----------------------
cutoff_index = int(len(df) * 0.8)
cutoff_date = df.iloc[cutoff_index]['date']

train_df = df[df['date'] <= cutoff_date]
test_df  = df[df['date'] > cutoff_date]

In [3]:
from sentence_transformers import SentenceTransformer
# ----------------------
# Sentence Transformer Embedding
# ----------------------
model = SentenceTransformer('all-MiniLM-L6-v2')

train_embeddings = model.encode(train_df['subject'].tolist(), show_progress_bar=True)
test_embeddings  = model.encode(test_df['subject'].tolist(), show_progress_bar=True)

# Add embedding vectors back into dataframe (384 dims)
emb_cols = [f"sub_emb_{i}" for i in range(train_embeddings.shape[1])]

train_df[emb_cols] = train_embeddings
test_df[emb_cols]  = test_embeddings




Batches:   0%|          | 0/685 [00:00<?, ?it/s]

Batches:   0%|          | 0/172 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[emb_cols] = train_embeddings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[emb_cols] = train_embeddings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[emb_cols] = train_embeddings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

In [6]:
# ----------------------------
# Recode risk_decision into 2 classes
# ----------------------------
def recode_risk(risk):
    if risk in ['no risk', 'not serious', 'potential risk', 'undecided', 'potentially serious']:
        return 0   # Lower to medium risk
    elif risk == 'serious':
        return 1   # High risk
    else:
        return -1  # Safety net for unexpected values

# Apply to both train and test
train_df['risk_decision_2class'] = train_df['risk_decision'].apply(recode_risk)
test_df['risk_decision_2class']  = test_df['risk_decision'].apply(recode_risk)

# Optional: check distribution
print(train_df['risk_decision_2class'].value_counts())
print(test_df['risk_decision_2class'].value_counts())

# Update target variable
target_col = 'risk_decision_2class'

risk_decision_2class
1    12204
0     9714
Name: count, dtype: int64
risk_decision_2class
0    2927
1    2552
Name: count, dtype: int64


  train_df['risk_decision_2class'] = train_df['risk_decision'].apply(recode_risk)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['risk_decision_2class'] = train_df['risk_decision'].apply(recode_risk)
  test_df['risk_decision_2class']  = test_df['risk_decision'].apply(recode_risk)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['risk_decision_2class']  = test_df['risk_decision'].apply(recode_risk)


In [7]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, accuracy_score

In [8]:
# -------------------------
# 3. Create CatBoost Pools
# -------------------------
train_pool = Pool(
    data=train_df[emb_cols],
    label=train_df[target_col],
)

test_pool = Pool(
    data=test_df[emb_cols],
    label=test_df[target_col],
)

# -------------------------
# 4. Initialize and train CatBoost
# -------------------------
model_cb = CatBoostClassifier(
     iterations=2000,
    learning_rate=0.03,
    depth=6,                   # slightly shallower for speed
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=200,
    early_stopping_rounds=100   # stop early if no improvement
)

model_cb.fit(train_pool, eval_set=test_pool)

# -------------------------
# 5. Evaluate model
# -------------------------
preds = model_cb.predict(test_pool)
preds = preds.flatten()  # flatten if shape is (n,1)

accuracy = accuracy_score(test_df[target_col], preds)
print(f"Test Accuracy: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(test_df[target_col], preds))

0:	learn: 0.6633817	test: 0.6304070	best: 0.6304070 (0)	total: 229ms	remaining: 7m 36s
200:	learn: 0.7802263	test: 0.6964775	best: 0.6964775 (200)	total: 9.54s	remaining: 1m 25s
400:	learn: 0.8164522	test: 0.7114437	best: 0.7127213 (392)	total: 20.4s	remaining: 1m 21s
600:	learn: 0.8441007	test: 0.7236722	best: 0.7244023 (599)	total: 31.3s	remaining: 1m 12s
800:	learn: 0.8618943	test: 0.7306078	best: 0.7309728 (786)	total: 42.8s	remaining: 1m 4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7326154408
bestIteration = 844

Shrink model to first 845 iterations.
Test Accuracy: 0.7326

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.68      0.73      2927
           1       0.68      0.79      0.73      2552

    accuracy                           0.73      5479
   macro avg       0.74      0.74      0.73      5479
weighted avg       0.74      0.73      0.73      5479



In [9]:
# Prepare data
X_train = train_df[emb_cols].values
y_train = train_df[target_col].values
X_test  = test_df[emb_cols].values
y_test  = test_df[target_col].values

In [10]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score

# Create DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test, label=y_test)

# Define multi-class parameters
num_classes = len(train_df[target_col].unique())
params = {
    'objective': 'multi:softmax',  # or 'multi:softprob' for probabilities
    'num_class': num_classes,
    'eval_metric': 'mlogloss',
    'eta': 0.03,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

# Train
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    early_stopping_rounds=50,
    verbose_eval=50
)

# Predict
y_pred = bst.predict(dtest)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[0]	train-mlogloss:0.68161	test-mlogloss:0.69354
[50]	train-mlogloss:0.49785	test-mlogloss:0.58247
[100]	train-mlogloss:0.41642	test-mlogloss:0.55156
[150]	train-mlogloss:0.36658	test-mlogloss:0.53779
[200]	train-mlogloss:0.33045	test-mlogloss:0.53033
[250]	train-mlogloss:0.30360	test-mlogloss:0.52449
[300]	train-mlogloss:0.28011	test-mlogloss:0.51998
[350]	train-mlogloss:0.25988	test-mlogloss:0.51714
[400]	train-mlogloss:0.24221	test-mlogloss:0.51537
[450]	train-mlogloss:0.22616	test-mlogloss:0.51397
[500]	train-mlogloss:0.21147	test-mlogloss:0.51308
[550]	train-mlogloss:0.19802	test-mlogloss:0.51249
[600]	train-mlogloss:0.18639	test-mlogloss:0.51230
[637]	train-mlogloss:0.17832	test-mlogloss:0.51212
Accuracy: 0.749224311005658
              precision    recall  f1-score   support

           0       0.81      0.69      0.75      2927
           1       0.70      0.81      0.75      2552

    accuracy                           0.75      5479
   macro avg       0.75      0.75      0.75

In [11]:
# ----------------------
# One-hot encoding hazard_type
# ----------------------
train_df = pd.get_dummies(train_df, columns=['Hazard_Type'])
test_df  = pd.get_dummies(test_df, columns=['Hazard_Type'])

# Match test columns to train
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

In [12]:
from category_encoders import CatBoostEncoder

# ----------------------
# Target Encoding
# ----------------------
target_cols = ['category']

encoder = CatBoostEncoder(cols=target_cols)
encoder.fit(train_df[target_cols], train_df['risk_decision_2class'])

train_df[target_cols] = encoder.transform(train_df[target_cols])
test_df[target_cols]  = encoder.transform(test_df[target_cols])

In [13]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, accuracy_score

# -------------------------
# 1. Identify features
# -------------------------
# Assuming your train_df and test_df already have embeddings + encoded features
target_col = 'risk_decision_2class'

# Exclude target and non-feature columns
non_features = ['reference', 'date', 'subject', 'distribution', 'forAttention', 'forFollowUp', 'operator', 'hazards', 'classification', 'origin', 'notifying_country','type', 'risk_decision', target_col]
feature_cols = [col for col in train_df.columns if col not in non_features]

# -------------------------
# 2. Identify categorical features for CatBoost
# -------------------------
# Only include columns that are still strings/categorical
#cat_features = []  # adjust if you one-hot encoded hazard_type

# -------------------------
# 3. Create CatBoost Pools
# -------------------------
train_pool = Pool(
    data=train_df[feature_cols],
    label=train_df[target_col],
)

test_pool = Pool(
    data=test_df[feature_cols],
    label=test_df[target_col],
)

# -------------------------
# 4. Initialize and train CatBoost
# -------------------------
model_cb = CatBoostClassifier(
     iterations=2000,
    learning_rate=0.03,
    depth=6,                   # slightly shallower for speed
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=200,
    early_stopping_rounds=100   # stop early if no improvement
)

model_cb.fit(train_pool, eval_set=test_pool)

# -------------------------
# 5. Evaluate model
# -------------------------
preds = model_cb.predict(test_pool)
preds = preds.flatten()  # flatten if shape is (n,1)

accuracy = accuracy_score(test_df[target_col], preds)
print(f"Test Accuracy: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(test_df[target_col], preds))

0:	learn: 0.7048088	test: 0.6619821	best: 0.6619821 (0)	total: 85.6ms	remaining: 2m 51s
200:	learn: 0.7941418	test: 0.7200219	best: 0.7202044 (195)	total: 9.27s	remaining: 1m 22s
400:	learn: 0.8238890	test: 0.7282351	best: 0.7287826 (385)	total: 22.5s	remaining: 1m 29s
600:	learn: 0.8474770	test: 0.7391860	best: 0.7395510 (593)	total: 34.3s	remaining: 1m 19s
800:	learn: 0.8644037	test: 0.7446614	best: 0.7450265 (799)	total: 46.3s	remaining: 1m 9s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7452089797
bestIteration = 801

Shrink model to first 802 iterations.
Test Accuracy: 0.7452

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.72      0.75      2927
           1       0.71      0.77      0.74      2552

    accuracy                           0.75      5479
   macro avg       0.75      0.75      0.74      5479
weighted avg       0.75      0.75      0.75      5479



In [14]:
# -------------------------
# 1. Identify features
# -------------------------
# Assuming your train_df and test_df already have embeddings + encoded features
target_col = 'risk_decision_2class'

# Exclude target and non-feature columns
non_features = ['reference', 'date', 'subject', 'distribution', 'forAttention', 'forFollowUp', 'operator', 'hazards', 'classification', 'origin', 'notifying_country','type', 'risk_decision', target_col]
feature_cols = [col for col in train_df.columns if col not in non_features]

# Prepare data
X_train = train_df[feature_cols].values
y_train = train_df[target_col].values
X_test  = test_df[feature_cols].values
y_test  = test_df[target_col].values

In [15]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score

# Create DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test, label=y_test)

# Define multi-class parameters
num_classes = len(train_df[target_col].unique())
params = {
    'objective': 'multi:softmax',  # or 'multi:softprob' for probabilities
    'num_class': num_classes,
    'eval_metric': 'mlogloss',
    'eta': 0.03,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

# Train
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    early_stopping_rounds=50,
    verbose_eval=50
)

# Predict
y_pred = bst.predict(dtest)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[0]	train-mlogloss:0.67910	test-mlogloss:0.69220
[50]	train-mlogloss:0.47291	test-mlogloss:0.56240
[100]	train-mlogloss:0.39569	test-mlogloss:0.52819
[150]	train-mlogloss:0.34796	test-mlogloss:0.51147
[200]	train-mlogloss:0.31396	test-mlogloss:0.50181
[250]	train-mlogloss:0.28758	test-mlogloss:0.49628
[300]	train-mlogloss:0.26434	test-mlogloss:0.49216
[350]	train-mlogloss:0.24512	test-mlogloss:0.48921
[400]	train-mlogloss:0.22772	test-mlogloss:0.48677
[450]	train-mlogloss:0.21255	test-mlogloss:0.48562
[500]	train-mlogloss:0.19857	test-mlogloss:0.48461
[550]	train-mlogloss:0.18584	test-mlogloss:0.48446
[600]	train-mlogloss:0.17415	test-mlogloss:0.48378
[650]	train-mlogloss:0.16332	test-mlogloss:0.48321
[700]	train-mlogloss:0.15334	test-mlogloss:0.48311
[716]	train-mlogloss:0.15029	test-mlogloss:0.48301
Accuracy: 0.7609052746851616
              precision    recall  f1-score   support

           0       0.80      0.74      0.77      2927
           1       0.72      0.79      0.75      