#### Target variable : classification to 4 or 3 levels
#### Feature: Subject (after sentence embedding)

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer

df=pd.read_csv(r"F:\Final_project\rasff_new2.csv")

# ----------------------
# Load and prepare data
# ----------------------
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# ----------------------
# Time-based split (80/20)
# ----------------------
cutoff_index = int(len(df) * 0.8)
cutoff_date = df.iloc[cutoff_index]['date']

train_df = df[df['date'] <= cutoff_date]
test_df  = df[df['date'] > cutoff_date]


# ----------------------
# Sentence Transformer Embedding
# ----------------------
model = SentenceTransformer('all-MiniLM-L6-v2')

train_embeddings = model.encode(train_df['subject'].tolist(), show_progress_bar=True)
test_embeddings  = model.encode(test_df['subject'].tolist(), show_progress_bar=True)

# Add embedding vectors back into dataframe (384 dims)
emb_cols = [f"sub_emb_{i}" for i in range(train_embeddings.shape[1])]

train_df[emb_cols] = train_embeddings
test_df[emb_cols]  = test_embeddings




Batches:   0%|          | 0/685 [00:00<?, ?it/s]

Batches:   0%|          | 0/172 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[emb_cols] = train_embeddings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[emb_cols] = train_embeddings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[emb_cols] = train_embeddings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

In [2]:
train_df.to_csv('train_df_sembedding.csv', index=False)

In [3]:
test_df.to_csv('test_df_sembedding.csv', index=False)

In [8]:
# ----------------------------
# Recode risk_decision into 2 classes
# ----------------------------
def notification(risk):
    if risk in ['information notification for follow-up', 'information notification for attention']:
        return 0   # Lower risk
    elif risk == 'alert notification':
        return 1   # medium risk
    elif risk == 'border rejection notification':
        return 2   # High risk
    else:
        return -1  # Safety net for unexpected values

# Apply to both train and test
train_df['classification_3level'] = train_df['classification'].apply(notification)
test_df['classification_3level']  = test_df['classification'].apply(notification)

# Optional: check distribution
print(train_df['classification_3level'].value_counts())
print(test_df['classification_3level'].value_counts())

# Update target variable
target_col = 'classification_3level'

classification_3level
0    8512
2    6897
1    6507
Name: count, dtype: int64
classification_3level
0    2467
2    1523
1    1488
Name: count, dtype: int64


In [10]:
# Drop rows with -1 in the target column
train_df = train_df[train_df['classification_3level'] != -1].reset_index(drop=True)
test_df  = test_df[test_df['classification_3level'] != -1].reset_index(drop=True)

# Check distribution again
print(train_df['classification_3level'].value_counts())
print(test_df['classification_3level'].value_counts())

classification_3level
0    8512
2    6897
1    6507
Name: count, dtype: int64
classification_3level
0    2467
2    1523
1    1488
Name: count, dtype: int64


In [6]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, accuracy_score

# -------------------------
# 3. Create CatBoost Pools
# -------------------------
train_pool = Pool(
    data=train_df[emb_cols],
    label=train_df[target_col],
)

test_pool = Pool(
    data=test_df[emb_cols],
    label=test_df[target_col],
)

# -------------------------
# 4. Initialize and train CatBoost
# -------------------------
model_cb = CatBoostClassifier(
     iterations=2000,
    learning_rate=0.03,
    depth=6,                   # slightly shallower for speed
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=200,
    early_stopping_rounds=100   # stop early if no improvement
)

model_cb.fit(train_pool, eval_set=test_pool)

# -------------------------
# 5. Evaluate model
# -------------------------
preds = model_cb.predict(test_pool)
preds = preds.flatten()  # flatten if shape is (n,1)

accuracy = accuracy_score(test_df[target_col], preds)
print(f"Test Accuracy: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(test_df[target_col], preds))

0:	learn: 0.4302792	test: 0.3864549	best: 0.3864549 (0)	total: 300ms	remaining: 10m
200:	learn: 0.6238365	test: 0.5162468	best: 0.5171595 (199)	total: 19.7s	remaining: 2m 56s
400:	learn: 0.6809637	test: 0.5467324	best: 0.5467324 (400)	total: 43.4s	remaining: 2m 53s
600:	learn: 0.7215733	test: 0.5609712	best: 0.5629792 (590)	total: 1m 4s	remaining: 2m 30s
800:	learn: 0.7487680	test: 0.5682731	best: 0.5691858 (768)	total: 1m 25s	remaining: 2m 8s
1000:	learn: 0.7740920	test: 0.5726543	best: 0.5732019 (923)	total: 1m 48s	remaining: 1m 48s
1200:	learn: 0.7938949	test: 0.5783133	best: 0.5783133 (1194)	total: 2m 29s	remaining: 1m 39s
1400:	learn: 0.8125114	test: 0.5885360	best: 0.5887185 (1399)	total: 3m 11s	remaining: 1m 21s
1600:	learn: 0.8276145	test: 0.5910916	best: 0.5910916 (1600)	total: 3m 56s	remaining: 59.1s
1800:	learn: 0.8418051	test: 0.5921869	best: 0.5930997 (1737)	total: 4m 27s	remaining: 29.6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.5930996714
bestI

In [11]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, accuracy_score

# -------------------------
# 3. Create CatBoost Pools
# -------------------------
train_pool = Pool(
    data=train_df[emb_cols],
    label=train_df[target_col],
)

test_pool = Pool(
    data=test_df[emb_cols],
    label=test_df[target_col],
)

# -------------------------
# 4. Initialize and train CatBoost
# -------------------------
model_cb = CatBoostClassifier(
     iterations=2000,
    learning_rate=0.03,
    depth=6,                   # slightly shallower for speed
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=200,
    early_stopping_rounds=100   # stop early if no improvement
)

model_cb.fit(train_pool, eval_set=test_pool)

# -------------------------
# 5. Evaluate model
# -------------------------
preds = model_cb.predict(test_pool)
preds = preds.flatten()  # flatten if shape is (n,1)

accuracy = accuracy_score(test_df[target_col], preds)
print(f"Test Accuracy: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(test_df[target_col], preds))

0:	learn: 0.5098102	test: 0.4937934	best: 0.4937934 (0)	total: 102ms	remaining: 3m 24s
200:	learn: 0.6753057	test: 0.5989412	best: 0.6002191 (194)	total: 14.4s	remaining: 2m 9s
400:	learn: 0.7280526	test: 0.6224900	best: 0.6230376 (390)	total: 28.8s	remaining: 1m 54s
600:	learn: 0.7656963	test: 0.6319825	best: 0.6327127 (592)	total: 43.8s	remaining: 1m 42s
800:	learn: 0.7918416	test: 0.6438481	best: 0.6442132 (791)	total: 58.8s	remaining: 1m 28s
1000:	learn: 0.8118270	test: 0.6504199	best: 0.6509675 (995)	total: 1m 13s	remaining: 1m 13s
1200:	learn: 0.8302154	test: 0.6526104	best: 0.6540708 (1191)	total: 1m 28s	remaining: 59s
1400:	learn: 0.8468242	test: 0.6571742	best: 0.6582694 (1396)	total: 1m 43s	remaining: 44.4s
1600:	learn: 0.8609235	test: 0.6622855	best: 0.6637459 (1582)	total: 1m 59s	remaining: 29.7s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6637458927
bestIteration = 1582

Shrink model to first 1583 iterations.
Test Accuracy: 0.6637

Classification R

In [12]:
# Prepare data
X_train = train_df[emb_cols].values
y_train = train_df[target_col].values
X_test  = test_df[emb_cols].values
y_test  = test_df[target_col].values

In [13]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score

# Create DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test, label=y_test)

# Define multi-class parameters
num_classes = len(train_df[target_col].unique())
params = {
    'objective': 'multi:softmax',  # or 'multi:softprob' for probabilities
    'num_class': num_classes,
    'eval_metric': 'mlogloss',
    'eta': 0.03,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

# Train
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    early_stopping_rounds=50,
    verbose_eval=50
)

# Predict
y_pred = bst.predict(dtest)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[0]	train-mlogloss:1.08334	test-mlogloss:1.08179
[50]	train-mlogloss:0.77423	test-mlogloss:0.88506
[100]	train-mlogloss:0.64016	test-mlogloss:0.81690
[150]	train-mlogloss:0.55929	test-mlogloss:0.78281
[200]	train-mlogloss:0.49999	test-mlogloss:0.76211
[250]	train-mlogloss:0.45410	test-mlogloss:0.74803
[300]	train-mlogloss:0.41685	test-mlogloss:0.73807
[350]	train-mlogloss:0.38589	test-mlogloss:0.72990
[400]	train-mlogloss:0.35776	test-mlogloss:0.72418
[450]	train-mlogloss:0.33346	test-mlogloss:0.71894
[500]	train-mlogloss:0.31165	test-mlogloss:0.71502
[550]	train-mlogloss:0.29154	test-mlogloss:0.71162
[600]	train-mlogloss:0.27434	test-mlogloss:0.70907
[650]	train-mlogloss:0.25782	test-mlogloss:0.70716
[700]	train-mlogloss:0.24270	test-mlogloss:0.70549
[750]	train-mlogloss:0.22851	test-mlogloss:0.70387
[800]	train-mlogloss:0.21575	test-mlogloss:0.70417
[820]	train-mlogloss:0.21101	test-mlogloss:0.70401
Accuracy: 0.674333698430084
              precision    recall  f1-score   support

  