In [35]:
import pandas as pd

df=pd.read_csv(r"/content/df_keyword.csv")

In [3]:
%pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.9.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.9.0-py3-none-any.whl (85 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.9.0


In [26]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from category_encoders import TargetEncoder
from sentence_transformers import SentenceTransformer

from xgboost import XGBClassifier

# =====================================================
# 1. Recode target (same as before)
# =====================================================
def recode_risk(risk):
    if risk in ['no risk', 'not serious', 'potential risk', 'undecided', 'potentially serious']:
        return 0
    elif risk == 'serious':
        return 1
    else:
        return -1

df["risk_decision_2class"] = df["risk_decision"].apply(recode_risk)
df = df[df["risk_decision_2class"] != -1]

TARGET = "risk_decision_2class"

# =====================================================
# 2. Define feature groups
# =====================================================
embedding_cols = ["subject"]
onehot_cols = ["classification", "Hazard_Type"]

target_encoding_cols = ["notifying_country", "origin", "simplified_hazard"]

# Fill missing
for col in embedding_cols + onehot_cols + target_encoding_cols:
    df[col] = df[col].fillna("missing")


# =====================================================
# 2a. Group rare categories to reduce overfitting
# =====================================================
# Category: group rare categories (<50 samples)
threshold_category = 50
counts = df['category'].value_counts()
rare_categories = counts[counts < threshold_category].index
df['category_grouped'] = df['category'].replace(rare_categories, 'other')

# Target encoding columns: group rare categories
for col in target_encoding_cols:
    counts = df[col].value_counts()
    rare = counts[counts < 50].index
    df[col] = df[col].replace(rare, 'other')


# =====================================================
# 3. SentenceTransformer Embedding
# =====================================================
from sentence_transformers import SentenceTransformer

model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

def embed_column(series):
    return embedder.encode(series.tolist(), convert_to_numpy=True)

print("Generating embeddings for:", embedding_cols)

embedding_features = []
for col in embedding_cols:
    emb = embed_column(df[col])
    # Rename embedding columns (e.g., subject_emb_0, subject_emb_1...)
    emb_df = pd.DataFrame(
        emb,
        columns=[f"{col}_emb_{i}" for i in range(emb.shape[1])]
    )
    embedding_features.append(emb_df)

embedding_df = pd.concat(embedding_features, axis=1)

#PCA Reduce feature dimensionality
#subject embeddings are 384-dim → could apply PCA to reduce to 50–100 dims without losing much info
from sklearn.decomposition import PCA

pca = PCA(n_components=100, random_state=42)
subject_emb_pca = pca.fit_transform(embedding_df.filter(like='subject_emb'))
embedding_df_pca = pd.DataFrame(subject_emb_pca, columns=[f'subject_emb_{i}' for i in range(100)])


# =====================================================
# 4. Smoothed Target Encoding
# =====================================================
from category_encoders import TargetEncoder

print("Applying Smoothed Target Encoding:", target_encoding_cols)

te = TargetEncoder(cols=target_encoding_cols, smoothing=10)
target_encoded_df = te.fit_transform(df[target_encoding_cols], df[TARGET])


# =====================================================
# 5. One-Hot Encoding
# =====================================================
print("Applying One-Hot Encoding:", onehot_cols + ['category_grouped'])

ohe = OneHotEncoder(drop="first", sparse_output=False)
ohe_array = ohe.fit_transform(df[onehot_cols + ['category_grouped']])

ohe_df = pd.DataFrame(
    ohe_array,
    columns=ohe.get_feature_names_out(onehot_cols + ['category_grouped'])
)


# =====================================================
# 6. Combine all engineered features
# =====================================================
X = pd.concat([embedding_df, target_encoded_df, ohe_df], axis=1)
y = df[TARGET]

print("Final feature matrix shape:", X.shape)


# =====================================================
# 7. Sanitize feature names for XGBoost compatibility
# =====================================================
X.columns = (
    X.columns
      .str.replace(r'[\[\]\{\}\:\"\'\,]', '_', regex=True)
      .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)
)


Generating embeddings for: ['subject']
Applying Smoothed Target Encoding: ['notifying_country', 'origin', 'simplified_hazard']
Applying One-Hot Encoding: ['classification', 'Hazard_Type', 'category_grouped']
Final feature matrix shape: (27397, 427)


In [27]:
# =====================================================
# 7. Train/Test Split
# =====================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [28]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# ============================================
# Recreate model with best parameters
# ============================================
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    device='cuda',
    predictor='gpu_predictor',
    random_state=42,
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    min_child_weight=1,
    reg_alpha=1,    # L1 regularization
    reg_lambda=2    # L2 regularization
)

# ============================================
# Fit on training data
# ============================================
xgb_model.fit(X_train, y_train)

# ============================================
# Predict on training data (optional, for quick evaluation)
# ============================================
y_train_pred = xgb_model.predict(X_train)
print("=== Training Data Performance ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# ============================================
# Predict on test data
# ============================================
y_test_pred = xgb_model.predict(X_test)
print("=== Test Data Performance ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== Training Data Performance ===
Accuracy: 0.9332012306408719
              precision    recall  f1-score   support

           0       0.95      0.90      0.93      8848
           1       0.92      0.96      0.94     10329

    accuracy                           0.93     19177
   macro avg       0.94      0.93      0.93     19177
weighted avg       0.93      0.93      0.93     19177

=== Test Data Performance ===
Accuracy: 0.8736009732360097
              precision    recall  f1-score   support

           0       0.89      0.83      0.86      3793
           1       0.86      0.91      0.89      4427

    accuracy                           0.87      8220
   macro avg       0.88      0.87      0.87      8220
weighted avg       0.87      0.87      0.87      8220



In [29]:
import joblib
import json

# ===============================
# 1. Save Model
# ===============================
joblib.dump(xgb_model, "xgb_model.pkl")

# ===============================
# 2. Save Preprocessing Objects
# ===============================
joblib.dump(te, "target_encoder.pkl")
joblib.dump(ohe, "onehot_encoder.pkl")
joblib.dump(pca, "pca_subject.pkl")

# ===============================
# 3. Save Column Info (JSON)
# ===============================
preprocess_info = {
    "embedding_cols": embedding_cols,
    "target_encoding_cols": target_encoding_cols,
    "onehot_cols": onehot_cols + ["category_grouped"],
    "pca_output_dim": 100,
    "ohe_feature_names": list(ohe.get_feature_names_out(onehot_cols + ["category_grouped"])),
    "final_feature_names": list(X.columns),
    "sentence_model_name": model_name
}

with open("preprocessing_info.json", "w") as f:
    json.dump(preprocess_info, f, indent=4)

print("All model & preprocessing files saved successfully!")


All model & preprocessing files saved successfully!


In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# ============================================
# Recreate model with best parameters
# ============================================
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    device='cuda',
    predictor='gpu_predictor',
    random_state=42,
    n_estimators=1000,          # best parameter
    max_depth=8,                # best parameter
    learning_rate=0.01,         # best parameter
    subsample=0.8,              # best parameter
    colsample_bytree=0.6,       # best parameter
    gamma=0,                    # best parameter
    min_child_weight=5,         # best parameter
    reg_alpha=0.05,             # best parameter
    reg_lambda=2.0              # best parameter
)

# ============================================
# Fit on training data with early stopping
# ============================================
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],  # monitor performance on test set
)

# ============================================
# Predict on training data
# ============================================
y_train_pred = xgb_model.predict(X_train)
print("=== Training Data Performance ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# ============================================
# Predict on test data
# ============================================
y_test_pred = xgb_model.predict(X_test)
print("=== Test Data Performance ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	validation_0-logloss:0.68706
[1]	validation_0-logloss:0.68364
[2]	validation_0-logloss:0.67894
[3]	validation_0-logloss:0.67555
[4]	validation_0-logloss:0.67251
[5]	validation_0-logloss:0.66784
[6]	validation_0-logloss:0.66526
[7]	validation_0-logloss:0.66080
[8]	validation_0-logloss:0.65578
[9]	validation_0-logloss:0.65275
[10]	validation_0-logloss:0.64875
[11]	validation_0-logloss:0.64438
[12]	validation_0-logloss:0.64075
[13]	validation_0-logloss:0.63672
[14]	validation_0-logloss:0.63396
[15]	validation_0-logloss:0.63099
[16]	validation_0-logloss:0.62659
[17]	validation_0-logloss:0.62218
[18]	validation_0-logloss:0.61864
[19]	validation_0-logloss:0.61660
[20]	validation_0-logloss:0.61480
[21]	validation_0-logloss:0.61296
[22]	validation_0-logloss:0.60966
[23]	validation_0-logloss:0.60598
[24]	validation_0-logloss:0.60271
[25]	validation_0-logloss:0.59952
[26]	validation_0-logloss:0.59761
[27]	validation_0-logloss:0.59541
[28]	validation_0-logloss:0.59316
[29]	validation_0-loglos

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


=== Training Data Performance ===
Accuracy: 0.973666371173802
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      8848
           1       0.96      0.99      0.98     10329

    accuracy                           0.97     19177
   macro avg       0.97      0.97      0.97     19177
weighted avg       0.97      0.97      0.97     19177

=== Test Data Performance ===
Accuracy: 0.8774939172749392
              precision    recall  f1-score   support

           0       0.90      0.83      0.86      3793
           1       0.86      0.92      0.89      4427

    accuracy                           0.88      8220
   macro avg       0.88      0.87      0.88      8220
weighted avg       0.88      0.88      0.88      8220



In [19]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from category_encoders import TargetEncoder
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

# =====================================================
# 1. Recode target
# =====================================================
def recode_risk(risk):
    if risk in ['no risk', 'not serious', 'potential risk', 'undecided', 'potentially serious']:
        return 0
    elif risk == 'serious':
        return 1
    else:
        return -1

df["risk_decision_2class"] = df["risk_decision"].apply(recode_risk)
df = df[df["risk_decision_2class"] != -1]

TARGET = "risk_decision_2class"

# =====================================================
# 2. Define feature groups
# =====================================================
text_cols = ["subject", "category"]              # will embed
onehot_cols = ["classification", "Hazard_Type"] # low cardinality
target_encoding_cols = ["notifying_country", "origin"]  # smoothed target encoding

# Fill missing
for col in text_cols + onehot_cols + target_encoding_cols:
    df[col] = df[col].fillna("missing")

# =====================================================
# 2a. Clean text in subject (light cleaning)
# =====================================================
import re
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)           # remove extra spaces
    text = re.sub(r'[^a-z0-9\s]', ' ', text)  # remove special characters
    text = text.strip()
    return text

df['subject_clean'] = df['subject'].apply(clean_text)
df['category_clean'] = df['category'].apply(clean_text)

# =====================================================
# 2b. Handle multiple origins
# =====================================================
# Split multiple origins by comma, then join back with ';' to keep as single string for TE
def preprocess_origin(orig):
    if pd.isna(orig) or orig == "missing":
        return "missing"
    parts = [o.strip() for o in str(orig).split(',')]
    return ';'.join(parts)

df['origin'] = df['origin'].apply(preprocess_origin)

# =====================================================
# 3. SentenceTransformer Embedding
# =====================================================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

def embed_column(series):
    return embedder.encode(series.tolist(), convert_to_numpy=True)

print("Generating embeddings for:", text_cols)

embedding_features = []
for col in ['subject_clean', 'category_clean']:
    emb = embed_column(df[col])
    emb_df = pd.DataFrame(
        emb,
        columns=[f"{col}_emb_{i}" for i in range(emb.shape[1])]
    )
    embedding_features.append(emb_df)

embedding_df = pd.concat(embedding_features, axis=1)

# Optional PCA to reduce dimensions
pca = PCA(n_components=100, random_state=42)
embedding_df_pca = pd.DataFrame(
    pca.fit_transform(embedding_df),
    columns=[f"emb_pca_{i}" for i in range(100)]
)

# =====================================================
# 4. Smoothed Target Encoding
# =====================================================
print("Applying Smoothed Target Encoding:", target_encoding_cols)
te = TargetEncoder(cols=target_encoding_cols, smoothing=10)
target_encoded_df = te.fit_transform(df[target_encoding_cols], df[TARGET])

# =====================================================
# 5. One-Hot Encoding
# =====================================================
print("Applying One-Hot Encoding:", onehot_cols)
ohe = OneHotEncoder(drop="first", sparse_output=False)
ohe_array = ohe.fit_transform(df[onehot_cols])
ohe_df = pd.DataFrame(
    ohe_array,
    columns=ohe.get_feature_names_out(onehot_cols)
)

# =====================================================
# 6. Combine all engineered features
# =====================================================
X = pd.concat([embedding_df_pca, target_encoded_df, ohe_df], axis=1)
y = df[TARGET]

print("Final feature matrix shape:", X.shape)

# =====================================================
# 7. Sanitize feature names for XGBoost compatibility
# =====================================================
X.columns = (
    X.columns
      .str.replace(r'[\[\]\{\}\:\"\'\,]', '_', regex=True)
      .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)
)


Generating embeddings for: ['subject', 'category']
Applying Smoothed Target Encoding: ['notifying_country', 'origin']
Applying One-Hot Encoding: ['classification', 'Hazard_Type']
Final feature matrix shape: (27397, 114)


In [20]:
# =====================================================
# 7. Train/Test Split
# =====================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [21]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# ============================================
# Recreate model with best parameters
# ============================================
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    device='cuda',
    predictor='gpu_predictor',
    random_state=42,
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    min_child_weight=1,
    reg_alpha=1,    # L1 regularization
    reg_lambda=2    # L2 regularization
)

# ============================================
# Fit on training data
# ============================================
xgb_model.fit(X_train, y_train)

# ============================================
# Predict on training data (optional, for quick evaluation)
# ============================================
y_train_pred = xgb_model.predict(X_train)
print("=== Training Data Performance ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# ============================================
# Predict on test data
# ============================================
y_test_pred = xgb_model.predict(X_test)
print("=== Test Data Performance ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== Training Data Performance ===
Accuracy: 0.9191218647337956
              precision    recall  f1-score   support

           0       0.93      0.89      0.91      8848
           1       0.91      0.95      0.93     10329

    accuracy                           0.92     19177
   macro avg       0.92      0.92      0.92     19177
weighted avg       0.92      0.92      0.92     19177

=== Test Data Performance ===
Accuracy: 0.8648418491484184
              precision    recall  f1-score   support

           0       0.88      0.82      0.85      3793
           1       0.85      0.90      0.88      4427

    accuracy                           0.86      8220
   macro avg       0.87      0.86      0.86      8220
weighted avg       0.87      0.86      0.86      8220



In [23]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# ============================================
# 1. Recreate XGBoost model with tuned parameters
# ============================================
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',       # histogram-based tree, GPU compatible
    device='cuda',            # GPU device
    random_state=42,
    n_estimators=1000,        # moderate number of trees
    max_depth=5,              # balanced depth to reduce overfitting
    learning_rate=0.03,       # slower learning rate for stability
    subsample=0.8,            # row sampling
    colsample_bytree=0.8,     # feature sampling
    gamma=0.1,                # minimal loss reduction for split
    min_child_weight=5,       # reduce sensitivity to noise
    reg_alpha=0.5,            # L1 regularization
    reg_lambda=1.0            # L2 regularization
)

# ============================================
# 2. Fit model on training data
# ============================================
xgb_model.fit(X_train, y_train)

# ============================================
# 3. Evaluate on training data
# ============================================
y_train_pred = xgb_model.predict(X_train)
print("=== Training Data Performance ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# ============================================
# 4. Evaluate on test data
# ============================================
y_test_pred = xgb_model.predict(X_test)
print("=== Test Data Performance ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))


=== Training Data Performance ===
Accuracy: 0.9593262762684466
              precision    recall  f1-score   support

           0       0.97      0.94      0.96      8848
           1       0.95      0.98      0.96     10329

    accuracy                           0.96     19177
   macro avg       0.96      0.96      0.96     19177
weighted avg       0.96      0.96      0.96     19177

=== Test Data Performance ===
Accuracy: 0.8710462287104623
              precision    recall  f1-score   support

           0       0.89      0.83      0.86      3793
           1       0.86      0.91      0.88      4427

    accuracy                           0.87      8220
   macro avg       0.87      0.87      0.87      8220
weighted avg       0.87      0.87      0.87      8220



In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from category_encoders import TargetEncoder
from sentence_transformers import SentenceTransformer

# =====================================================
# 1. Recode target
# =====================================================
def recode_risk(risk):
    if risk in ['no risk', 'not serious', 'potential risk', 'undecided', 'potentially serious']:
        return 0
    elif risk == 'serious':
        return 1
    else:
        return -1

df["risk_decision_2class"] = df["risk_decision"].apply(recode_risk)
df = df[df["risk_decision_2class"] != -1]
TARGET = "risk_decision_2class"

# =====================================================
# 2. Define feature groups
# =====================================================
embedding_cols = ["subject"]
onehot_cols = ["category", "classification", "Hazard_Type"]
target_encoding_cols = ["notifying_country", "origin"]

# Fill missing
for col in embedding_cols + onehot_cols + target_encoding_cols:
    df[col] = df[col].fillna("missing")

# =====================================================
# 2a. Group rare categories for target encoding
# =====================================================
for col in target_encoding_cols:
    counts = df[col].value_counts()
    rare = counts[counts < 50].index
    df[col] = df[col].replace(rare, 'other')

# =====================================================
# 3. Subject Embeddings
# =====================================================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

def embed_column(series):
    return embedder.encode(series.tolist(), convert_to_numpy=True)

print("Generating embeddings for:", embedding_cols)
embedding_features = []
for col in embedding_cols:
    emb = embed_column(df[col])
    emb_df = pd.DataFrame(
        emb,
        columns=[f"{col}_emb_{i}" for i in range(emb.shape[1])]
    )
    embedding_features.append(emb_df)

embedding_df = pd.concat(embedding_features, axis=1)

# Reduce dimension using PCA
pca = PCA(n_components=100, random_state=42)
subject_emb_pca = pca.fit_transform(embedding_df.filter(like='subject_emb'))
embedding_df_pca = pd.DataFrame(subject_emb_pca, columns=[f'subject_emb_{i}' for i in range(100)])

# =====================================================
# 4. Smoothed Target Encoding
# =====================================================
print("Applying Smoothed Target Encoding:", target_encoding_cols)
te = TargetEncoder(cols=target_encoding_cols, smoothing=10)
target_encoded_df = te.fit_transform(df[target_encoding_cols], df[TARGET])

# =====================================================
# 5. One-Hot Encoding
# =====================================================
print("Applying One-Hot Encoding:", onehot_cols)
ohe = OneHotEncoder(drop="first", sparse_output=False)
ohe_array = ohe.fit_transform(df[onehot_cols])
ohe_df = pd.DataFrame(
    ohe_array,
    columns=ohe.get_feature_names_out(onehot_cols)
)

# =====================================================
# 6. Combine all engineered features
# =====================================================
X = pd.concat([embedding_df_pca, target_encoded_df, ohe_df], axis=1)
y = df[TARGET]

print("Final feature matrix shape:", X.shape)

# =====================================================
# 7. Sanitize feature names for XGBoost compatibility
# =====================================================
X.columns = (
    X.columns
      .str.replace(r'[\[\]\{\}\:\"\'\,]', '_', regex=True)
      .str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)
)

# =====================================================
# 8. Train/Test Split
# =====================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print("Preprocessing complete. Ready for XGBoost training!")


Generating embeddings for: ['subject']
Applying Smoothed Target Encoding: ['notifying_country', 'origin']
Applying One-Hot Encoding: ['category', 'classification', 'Hazard_Type']
Final feature matrix shape: (27397, 150)
Preprocessing complete. Ready for XGBoost training!


In [25]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# ============================================
# Recreate model with best parameters
# ============================================
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    device='cuda',
    predictor='gpu_predictor',
    random_state=42,
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    min_child_weight=1,
    reg_alpha=1,    # L1 regularization
    reg_lambda=2    # L2 regularization
)

# ============================================
# Fit on training data
# ============================================
xgb_model.fit(X_train, y_train)

# ============================================
# Predict on training data (optional, for quick evaluation)
# ============================================
y_train_pred = xgb_model.predict(X_train)
print("=== Training Data Performance ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# ============================================
# Predict on test data
# ============================================
y_test_pred = xgb_model.predict(X_test)
print("=== Test Data Performance ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== Training Data Performance ===
Accuracy: 0.9202690723262241
              precision    recall  f1-score   support

           0       0.94      0.89      0.91      8848
           1       0.91      0.95      0.93     10329

    accuracy                           0.92     19177
   macro avg       0.92      0.92      0.92     19177
weighted avg       0.92      0.92      0.92     19177

=== Test Data Performance ===
Accuracy: 0.8658150851581509
              precision    recall  f1-score   support

           0       0.88      0.82      0.85      3793
           1       0.86      0.90      0.88      4427

    accuracy                           0.87      8220
   macro avg       0.87      0.86      0.86      8220
weighted avg       0.87      0.87      0.87      8220

