##### Binary classification with simple one-hot encoding (logistic regression)
-

In [None]:
import pandas as pd

# Source CSV for the notebook's first data load.
# NOTE(review): absolute Windows path — only resolves on the original author's machine.
RASFF_CSV = r"F:\Final_project\rasff_new2.csv"
df = pd.read_csv(RASFF_CSV)

In [None]:
# Print column dtypes and non-null counts to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27397 entries, 0 to 27396
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   reference              27397 non-null  float64
 1   category               27397 non-null  object 
 2   type                   27397 non-null  object 
 3   subject                27397 non-null  object 
 4   date                   27397 non-null  object 
 5   notifying_country      27397 non-null  object 
 6   classification         27397 non-null  object 
 7   risk_decision          27397 non-null  object 
 8   distribution           18759 non-null  object 
 9   forAttention           14966 non-null  object 
 10  forFollowUp            13810 non-null  object 
 11  operator               27303 non-null  object 
 12  origin                 26823 non-null  object 
 13  hazards                20241 non-null  object 
 14  year                   27397 non-null  int64  
 15  mo

In [None]:
# Count how many rows fall into each product category.
df['category'].value_counts()

category
fruits and vegetables                                   4934
nuts, nut products and seeds                            3291
poultry meat and poultry meat products                  2309
dietetic foods, food supplements and fortified foods    1947
cereals and bakery products                             1779
herbs and spices                                        1763
fish and fish products                                  1476
meat and meat products (other than poultry)             1155
food contact materials                                  1114
feed materials                                          1072
other food product / mixed                              1068
milk and milk products                                   655
prepared dishes and snacks                               633
confectionery                                            633
bivalve molluscs and products thereof                    541
cocoa and cocoa preparations, coffee and tea             512
crustaceans and

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
import numpy as np


# ----------------------------
# Recode risk_decision into 2 classes
# ----------------------------
def recode_risk(risk):
    """Collapse a raw ``risk_decision`` label into a binary class code.

    Returns 1 for 'serious', 0 for any of the five other recognised levels,
    and the sentinel -1 for anything else (including missing values).
    """
    if risk == 'serious':
        return 1   # High risk

    lower_levels = ('no risk', 'not serious', 'potential risk', 'undecided', 'potentially serious')
    if risk in lower_levels:
        return 0   # Lower to medium risk

    return -1      # Sentinel for unexpected values

# ----------------------------
# Build the binary target column
# ----------------------------
TARGET = 'risk_decision_2class'
df[TARGET] = df['risk_decision'].apply(recode_risk)
print(df[TARGET].value_counts())

# recode_risk() flags unrecognised labels with the integer sentinel -1, never
# NaN, so dropna() on the target can never remove them. Filter on the sentinel
# explicitly; otherwise -1 would silently be trained on as an extra class.
known_target = df[TARGET] != -1
n_unknown = int((~known_target).sum())
if n_unknown:
    print(f"Dropping {n_unknown} rows with an unrecognised risk_decision value")


# --- 3. Feature Selection and Engineering ---
# Six categorical columns are used as predictors.
FEATURES = ['category', 'type', 'subject', 'origin', 'classification', 'hazards']

# Keep only the predictors and the binary target, on a copy so that `df`
# itself is left untouched apart from the added target column.
df_model = df.loc[known_target, FEATURES + [TARGET]].copy()

# Missing feature values become their own explicit 'missing' category
print(f"Handling NaNs in selected features: {FEATURES}")
df_model[FEATURES] = df_model[FEATURES].fillna('missing')

# One-hot encode every feature (drop_first drops one reference level per column).
# NOTE(review): the recorded run produced 27,418 dummy columns for 27,397 rows,
# so at least one feature is close to unique per row — presumably `subject`;
# confirm and consider a text vectoriser or dropping it.
df_encoded = pd.get_dummies(df_model, columns=FEATURES, drop_first=True)

# X = all one-hot columns, y = binary target
X = df_encoded.drop(columns=[TARGET])
y = df_encoded[TARGET]

print(f"Feature matrix (X) shape after encoding: {X.shape}")
print(f"X now contains {len(X.columns)} features (One-Hot Encoded variables).")


# --- 4. Split Data ---
# Stratified 70/30 split keeps the class proportions equal in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# --- 5. Model Training (Logistic Regression Baseline) ---
model = LogisticRegression(solver='liblinear', random_state=42, max_iter=1000)
print("\nTraining Logistic Regression baseline model...")
model.fit(X_train, y_train)


# --- 6. Prediction and Evaluation ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\n--- BASELINE MODEL RESULTS ---")
print(f"Baseline Model Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Reference point: accuracy of always predicting the most frequent class.
# Computed over the full label column y (train + test), not y_test alone.
majority_accuracy = y.value_counts(normalize=True).max()
print(f"Majority Class Baseline Accuracy (if model always guessed the majority class): {majority_accuracy:.4f}")

risk_decision_2class
1    14756
0    12641
Name: count, dtype: int64
Handling NaNs in selected features: ['category', 'type', 'subject', 'origin', 'classification', 'hazards']
Feature matrix (X) shape after encoding: (27397, 27418)
X now contains 27418 features (One-Hot Encoded variables).

Training Logistic Regression baseline model...

--- BASELINE MODEL RESULTS ---
Baseline Model Accuracy: 0.8549
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.82      0.84      3793
           1       0.85      0.89      0.87      4427

    accuracy                           0.85      8220
   macro avg       0.86      0.85      0.85      8220
weighted avg       0.86      0.85      0.85      8220

Majority Class Baseline Accuracy (if model always guessed the majority class): 0.5386


##### 3-level risk decision with one-hot encoding (logistic regression)

In [None]:
import pandas as pd

# Re-load `df` from a second CSV; this replaces the frame loaded earlier.
# NOTE(review): "/content/..." looks like a Colab path, whereas the first load
# uses a Windows path — confirm which environment the notebook targets.
KEYWORD_CSV = r"/content/df_keyword.csv"
df = pd.read_csv(KEYWORD_CSV)

In [None]:
# Print column dtypes and non-null counts of the re-loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27397 entries, 0 to 27396
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   reference                  27397 non-null  float64
 1   category                   27397 non-null  object 
 2   type                       27397 non-null  object 
 3   subject                    27397 non-null  object 
 4   date                       27397 non-null  object 
 5   notifying_country          27397 non-null  object 
 6   classification             27397 non-null  object 
 7   risk_decision              27397 non-null  object 
 8   distribution               18759 non-null  object 
 9   forAttention               14966 non-null  object 
 10  forFollowUp                13810 non-null  object 
 11  operator                   27303 non-null  object 
 12  origin                     26823 non-null  object 
 13  hazards                    20241 non-null  obj

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
import numpy as np


# ----------------------------
# Recode risk_decision into 3 classes
# ----------------------------
def recode_risk(risk):
    """Map a raw ``risk_decision`` label onto a three-level ordinal code.

    0 = low ('no risk', 'not serious'),
    1 = medium ('potential risk', 'undecided', 'potentially serious'),
    2 = high ('serious'),
    -1 = sentinel for any value not listed above (including missing values).
    """
    tiers = (
        (('no risk', 'not serious'), 0),                                   # Low risk
        (('potential risk', 'undecided', 'potentially serious'), 1),       # Medium / potential risk
        (('serious',), 2),                                                 # High risk
    )
    for labels, code in tiers:
        if risk in labels:
            return code
    return -1  # Sentinel for unexpected values


# ----------------------------
# Build the 3-class target column
# ----------------------------
TARGET = 'risk_decision_3class'
df[TARGET] = df['risk_decision'].apply(recode_risk)
print(df[TARGET].value_counts())

# recode_risk() flags unrecognised labels with the integer sentinel -1, never
# NaN, so dropna() on the target can never remove them. Filter on the sentinel
# explicitly; otherwise -1 would silently be trained on as a fourth class.
known_target = df[TARGET] != -1
n_unknown = int((~known_target).sum())
if n_unknown:
    print(f"Dropping {n_unknown} rows with an unrecognised risk_decision value")


# --- 3. Feature Selection and Engineering ---
# Six categorical columns are used as predictors.
FEATURES = ['category', 'type', 'subject', 'origin', 'classification', 'hazards']

# Keep only the predictors and the 3-class target, on a copy so that `df`
# itself is left untouched apart from the added target column.
df_model = df.loc[known_target, FEATURES + [TARGET]].copy()

# Missing feature values become their own explicit 'missing' category
print(f"Handling NaNs in selected features: {FEATURES}")
df_model[FEATURES] = df_model[FEATURES].fillna('missing')

# One-hot encode every feature (drop_first drops one reference level per column).
# NOTE(review): the recorded run produced 27,418 dummy columns for 27,397 rows,
# so at least one feature is close to unique per row — presumably `subject`;
# confirm and consider a text vectoriser or dropping it.
df_encoded = pd.get_dummies(df_model, columns=FEATURES, drop_first=True)

# X = all one-hot columns, y = 3-class target
X = df_encoded.drop(columns=[TARGET])
y = df_encoded[TARGET]

print(f"Feature matrix (X) shape after encoding: {X.shape}")
print(f"X now contains {len(X.columns)} features (One-Hot Encoded variables).")


# --- 4. Split Data ---
# Stratified 70/30 split keeps the class proportions equal in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# --- 5. Model Training (Logistic Regression Baseline) ---
# NOTE(review): liblinear handles more than two classes one-vs-rest rather than
# with a multinomial loss — confirm that is the intended baseline.
model = LogisticRegression(solver='liblinear', random_state=42, max_iter=1000)
print("\nTraining Logistic Regression baseline model...")
model.fit(X_train, y_train)


# --- 6. Prediction and Evaluation ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\n--- BASELINE MODEL RESULTS ---")
print(f"Baseline Model Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Reference point: accuracy of always predicting the most frequent class.
# Computed over the full label column y (train + test), not y_test alone.
majority_accuracy = y.value_counts(normalize=True).max()
print(f"Majority Class Baseline Accuracy (if model always guessed the majority class): {majority_accuracy:.4f}")

risk_decision_3class
2    14756
1     8090
0     4551
Name: count, dtype: int64
Handling NaNs in selected features: ['category', 'type', 'subject', 'origin', 'classification', 'hazards']
Feature matrix (X) shape after encoding: (27397, 27418)
X now contains 27418 features (One-Hot Encoded variables).

Training Logistic Regression baseline model...

--- BASELINE MODEL RESULTS ---
Baseline Model Accuracy: 0.7563
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.54      0.60      1366
           1       0.66      0.59      0.62      2427
           2       0.82      0.92      0.86      4427

    accuracy                           0.76      8220
   macro avg       0.72      0.68      0.69      8220
weighted avg       0.75      0.76      0.75      8220

Majority Class Baseline Accuracy (if model always guessed the majority class): 0.5386


##### 6-level risk decision with one-hot encoding (logistic regression)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
import numpy as np


# Show the distribution of the raw six-level label before modelling
print(df['risk_decision'].value_counts())

# The raw label column is used directly as the (string-valued) target
TARGET = 'risk_decision'


# --- 3. Feature Selection and Engineering ---
# Six categorical columns are used as predictors.
FEATURES = ['category', 'type', 'subject', 'origin', 'classification', 'hazards']

# Drop rows with a missing label on a derived copy rather than mutating the
# shared `df` in place, so re-running this cell or later cells that reuse `df`
# always start from the same frame.
df_model = df.dropna(subset=[TARGET])[FEATURES + [TARGET]].copy()

# Missing feature values become their own explicit 'missing' category
print(f"Handling NaNs in selected features: {FEATURES}")
df_model[FEATURES] = df_model[FEATURES].fillna('missing')

# One-hot encode every feature (drop_first drops one reference level per column).
# NOTE(review): the recorded run produced 27,418 dummy columns for 27,397 rows,
# so at least one feature is close to unique per row — presumably `subject`;
# confirm and consider a text vectoriser or dropping it.
df_encoded = pd.get_dummies(df_model, columns=FEATURES, drop_first=True)

# X = all one-hot columns, y = six-level target
X = df_encoded.drop(columns=[TARGET])
y = df_encoded[TARGET]

print(f"Feature matrix (X) shape after encoding: {X.shape}")
print(f"X now contains {len(X.columns)} features (One-Hot Encoded variables).")


# --- 4. Split Data ---
# Stratified 70/30 split keeps the class proportions equal in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# --- 5. Model Training (Logistic Regression Baseline) ---
# NOTE(review): liblinear handles more than two classes one-vs-rest rather than
# with a multinomial loss — confirm that is the intended baseline.
model = LogisticRegression(solver='liblinear', random_state=42, max_iter=1000)
print("\nTraining Logistic Regression baseline model...")
model.fit(X_train, y_train)


# --- 6. Prediction and Evaluation ---
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\n--- BASELINE MODEL RESULTS ---")
print(f"Baseline Model Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Reference point: accuracy of always predicting the most frequent class.
# Computed over the full label column y (train + test), not y_test alone.
majority_accuracy = y.value_counts(normalize=True).max()
print(f"Majority Class Baseline Accuracy (if model always guessed the majority class): {majority_accuracy:.4f}")

risk_decision
serious                14756
not serious             4134
undecided               3029
potential risk          2689
potentially serious     2372
no risk                  417
Name: count, dtype: int64
Handling NaNs in selected features: ['category', 'type', 'subject', 'origin', 'classification', 'hazards']
Feature matrix (X) shape after encoding: (27397, 27418)
X now contains 27418 features (One-Hot Encoded variables).

Training Logistic Regression baseline model...

--- BASELINE MODEL RESULTS ---
Baseline Model Accuracy: 0.6899
Classification Report:
                     precision    recall  f1-score   support

            no risk       0.48      0.08      0.14       125
        not serious       0.55      0.72      0.62      1240
     potential risk       0.42      0.29      0.35       807
potentially serious       0.45      0.12      0.19       712
            serious       0.78      0.95      0.86      4427
          undecided       0.50      0.26      0.34       909



##### 3-level risk decision with one-hot encoding (XGBoost)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
import numpy as np
from xgboost import XGBClassifier # Import XGBClassifier


# ----------------------------
# Recode risk_decision into 3 classes
# ----------------------------
def recode_risk(risk):
    """Translate a raw ``risk_decision`` label into one of three class codes.

    Returns 0 (low), 1 (medium / potential) or 2 (high); any value outside
    the six recognised labels yields the sentinel -1.
    """
    low = ('no risk', 'not serious')
    medium = ('potential risk', 'undecided', 'potentially serious')

    if risk in low:
        return 0   # Low risk
    if risk in medium:
        return 1   # Medium / potential risk
    # Only 'serious' maps to the high-risk class; everything else is unexpected
    return 2 if risk == 'serious' else -1


# ----------------------------
# Build the 3-class target column
# ----------------------------
TARGET = 'risk_decision_3class'
df[TARGET] = df['risk_decision'].apply(recode_risk)
print(df[TARGET].value_counts())

# recode_risk() flags unrecognised labels with the integer sentinel -1, never
# NaN, so dropna() on the target can never remove them. Filter on the sentinel
# explicitly so the model only ever sees the class codes 0, 1 and 2.
known_target = df[TARGET] != -1
n_unknown = int((~known_target).sum())
if n_unknown:
    print(f"Dropping {n_unknown} rows with an unrecognised risk_decision value")


# --- 3. Feature Selection and Engineering ---
# Six categorical columns are used as predictors.
FEATURES = ['category', 'type', 'subject', 'origin', 'classification', 'hazards']

# Keep only the predictors and the 3-class target, on a copy so that `df`
# itself is left untouched apart from the added target column.
df_model = df.loc[known_target, FEATURES + [TARGET]].copy()

# Missing feature values become their own explicit 'missing' category
print(f"Handling NaNs in selected features: {FEATURES}")
df_model[FEATURES] = df_model[FEATURES].fillna('missing')

# One-hot encode every feature (drop_first drops one reference level per column).
# NOTE(review): the recorded run produced 27,418 dummy columns for 27,397 rows,
# so at least one feature is close to unique per row — presumably `subject`;
# confirm and consider a text vectoriser or dropping it.
df_encoded = pd.get_dummies(df_model, columns=FEATURES, drop_first=True)

# X = all one-hot columns, y = 3-class target
X = df_encoded.drop(columns=[TARGET])
y = df_encoded[TARGET]

print(f"Feature matrix (X) shape after encoding: {X.shape}")
print(f"X now contains {len(X.columns)} features (One-Hot Encoded variables).")

# =====================================================
# Sanitize feature names for compatibility
# =====================================================

# Replace characters such as {}[]:"', in the column names with underscores.
# The '[' inside the character class is escaped: left bare, Python's `re`
# emits "FutureWarning: Possible nested set" when compiling the pattern.
# The set of matched characters is unchanged.
# NOTE(review): distinct original names can collapse to the same sanitized
# name; this has no effect below because the model is fitted on `.values`.
X.columns = (
    X.columns
      .str.replace(r'[\[\]{}":\',]', '_', regex=True)
      .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)   # extra safety
)


# --- 4. Split Data ---
# Stratified 80/20 split keeps the class proportions equal in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y)

# =====================================================
# 5. XGBoost Model
# =====================================================
# Three target classes, so the multiclass log-loss ("mlogloss") is the matching
# evaluation metric; "logloss" is the binary one.
model = XGBClassifier(
    eval_metric="mlogloss",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.8,
    random_state=42
)
print("\nTraining XGBoost baseline model...")
# Fit on the raw array, so column names are not passed to XGBoost
model.fit(X_train.values, y_train)


# --- 6. Prediction and Evaluation ---
y_pred = model.predict(X_test.values)
accuracy = accuracy_score(y_test, y_pred)

print("\n--- BASELINE MODEL RESULTS ---")
print(f"Baseline Model Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Reference point: accuracy of always predicting the most frequent class.
# Computed over the full label column y (train + test), not y_test alone.
majority_accuracy = y.value_counts(normalize=True).max()
print(f"Majority Class Baseline Accuracy (if model always guessed the majority class): {majority_accuracy:.4f}")


risk_decision_3class
2    14756
1     8090
0     4551
Name: count, dtype: int64
Handling NaNs in selected features: ['category', 'type', 'subject', 'origin', 'classification', 'hazards']
Feature matrix (X) shape after encoding: (27397, 27418)
X now contains 27418 features (One-Hot Encoded variables).


  pat = re.compile(pat, flags=flags)



Training XGBoost baseline model...

--- BASELINE MODEL RESULTS ---
Baseline Model Accuracy: 0.7557
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.59      0.62       910
           1       0.70      0.51      0.59      1618
           2       0.80      0.94      0.87      2952

    accuracy                           0.76      5480
   macro avg       0.72      0.68      0.69      5480
weighted avg       0.75      0.76      0.74      5480

Majority Class Baseline Accuracy (if model always guessed the majority class): 0.5386


##### 2-level (binary) risk decision with one-hot encoding (XGBoost)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
import numpy as np
from xgboost import XGBClassifier # Import XGBClassifier


# ----------------------------
# Recode risk_decision into 2 classes
# ----------------------------
def recode_risk(risk):
    """Binary recode of a raw ``risk_decision`` label.

    'serious' -> 1 (high risk); the five other recognised levels -> 0
    (lower to medium risk); anything else -> -1 as a sentinel.
    """
    not_serious_levels = [
        'no risk',
        'not serious',
        'potential risk',
        'undecided',
        'potentially serious',
    ]
    if risk in not_serious_levels:
        return 0
    return 1 if risk == 'serious' else -1

# ----------------------------
# Build the binary target column
# ----------------------------
TARGET = 'risk_decision_2class'
df[TARGET] = df['risk_decision'].apply(recode_risk)
print(df[TARGET].value_counts())

# recode_risk() flags unrecognised labels with the integer sentinel -1, never
# NaN, so dropna() on the target can never remove them. Filter on the sentinel
# explicitly so the model only ever sees the class codes 0 and 1.
known_target = df[TARGET] != -1
n_unknown = int((~known_target).sum())
if n_unknown:
    print(f"Dropping {n_unknown} rows with an unrecognised risk_decision value")


# --- 3. Feature Selection and Engineering ---
# Six categorical columns are used as predictors.
FEATURES = ['category', 'type', 'subject', 'origin', 'classification', 'hazards']

# Keep only the predictors and the binary target, on a copy so that `df`
# itself is left untouched apart from the added target column.
df_model = df.loc[known_target, FEATURES + [TARGET]].copy()

# Missing feature values become their own explicit 'missing' category
print(f"Handling NaNs in selected features: {FEATURES}")
df_model[FEATURES] = df_model[FEATURES].fillna('missing')

# One-hot encode every feature (drop_first drops one reference level per column).
# NOTE(review): the recorded run produced 27,418 dummy columns for 27,397 rows,
# so at least one feature is close to unique per row — presumably `subject`;
# confirm and consider a text vectoriser or dropping it.
df_encoded = pd.get_dummies(df_model, columns=FEATURES, drop_first=True)

# X = all one-hot columns, y = binary target
X = df_encoded.drop(columns=[TARGET])
y = df_encoded[TARGET]

print(f"Feature matrix (X) shape after encoding: {X.shape}")
print(f"X now contains {len(X.columns)} features (One-Hot Encoded variables).")

# =====================================================
# Sanitize feature names for compatibility
# =====================================================

# Replace characters such as {}[]:"', in the column names with underscores.
# The '[' inside the character class is escaped: left bare, Python's `re`
# emits "FutureWarning: Possible nested set" when compiling the pattern.
# The set of matched characters is unchanged.
# NOTE(review): distinct original names can collapse to the same sanitized
# name; this has no effect below because the model is fitted on `.values`.
X.columns = (
    X.columns
      .str.replace(r'[\[\]{}":\',]', '_', regex=True)
      .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)   # extra safety
)


# --- 4. Split Data ---
# Stratified 80/20 split keeps the class proportions equal in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y)

# =====================================================
# 5. XGBoost Model
# =====================================================
# Binary target, so the binary log-loss is the matching evaluation metric
model = XGBClassifier(
    eval_metric="logloss",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.8,
    random_state=42
)
print("\nTraining XGBoost baseline model...")
# Fit on the raw array, so column names are not passed to XGBoost
model.fit(X_train.values, y_train)


# --- 6. Prediction and Evaluation ---
y_pred = model.predict(X_test.values)
accuracy = accuracy_score(y_test, y_pred)

print("\n--- BASELINE MODEL RESULTS ---")
print(f"Baseline Model Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Reference point: accuracy of always predicting the most frequent class.
# Computed over the full label column y (train + test), not y_test alone.
majority_accuracy = y.value_counts(normalize=True).max()
print(f"Majority Class Baseline Accuracy (if model always guessed the majority class): {majority_accuracy:.4f}")

risk_decision_2class
1    14756
0    12641
Name: count, dtype: int64
Handling NaNs in selected features: ['category', 'type', 'subject', 'origin', 'classification', 'hazards']
Feature matrix (X) shape after encoding: (27397, 27418)
X now contains 27418 features (One-Hot Encoded variables).

Training XGBoost baseline model...

--- BASELINE MODEL RESULTS ---
Baseline Model Accuracy: 0.8540
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.81      0.84      2528
           1       0.85      0.89      0.87      2952

    accuracy                           0.85      5480
   macro avg       0.86      0.85      0.85      5480
weighted avg       0.85      0.85      0.85      5480

Majority Class Baseline Accuracy (if model always guessed the majority class): 0.5386
