## Load Data

In [2]:
import os, time, json
import kagglehub
import optuna
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.compose import ColumnTransformer
from optuna.integration import XGBoostPruningCallback
from xgboost import XGBClassifier



path = kagglehub.dataset_download("naserabdullahalam/phishing-email-dataset")
data = []

datasets = [
    "Nigerian_Fraud.csv",
    "Ling.csv",
    "Nazario.csv",
    "SpamAssasin.csv",
    "CEAS_08.csv",
    # "Enron.csv"
 ]
for file in datasets:
    print(file)
    csv_path = os.path.join(path, file)
    subset_data = pd.read_csv(csv_path)
    data.append(subset_data)

all_data = pd.concat(data)


Nigerian_Fraud.csv
Ling.csv
Nazario.csv
SpamAssasin.csv
CEAS_08.csv


## Clean & Split Data

In [3]:
all_data['receiver'] = all_data['receiver'].str.replace('undisclosed-recipients:;', 'Unknown')
# -- Timestamp features -- 
all_data['date_parsed'] = pd.to_datetime(all_data['date'], errors='coerce', utc=True)

# Week-of-year
iso_week = all_data['date_parsed'].dt.isocalendar().week
iso_week = iso_week.astype(float)
week0 = ((iso_week - 1) % 52)
theta_week = 2.0 * np.pi * week0 / 52
all_data['sin_week'] = np.where(week0.notna(), np.sin(theta_week), 0.0)
all_data['cos_week'] = np.where(week0.notna(), np.cos(theta_week), 0.0)

# Hour-of-day 
hour = all_data['date_parsed'].dt.hour.astype(float)  # NaN for missing
theta_hour = 2.0 * np.pi * hour / 24
all_data['sin_hour'] = np.where(hour.notna(), np.sin(theta_hour), 0.0)
all_data['cos_hour'] = np.where(hour.notna(), np.cos(theta_hour), 0.0)

# Weekend binary (0/1)
weekday = all_data['date_parsed'].dt.weekday
all_data['is_weekend'] = np.where(weekday.isna(), 0, ((weekday >= 5).astype(int)))

# Timestamp feature list
timestamp_features = [
    "sin_week",
    "cos_week",
    "sin_hour",
    "cos_hour",
    "is_weekend"
]

# -- Sender/reciever feature engineering -- 
with open('domains.json', 'r') as file:
    public_email_domains = json.load(file)
    
email_regex = r'([a-zA-Z0-9._%+\-|{}^&"\'=]+@(?:[a-zA-Z0-9.-]+|\[[0-9.]+\]))'    
for column_name in ('sender', 'receiver'):
    all_data[f'{column_name}_email'] = all_data[column_name].str.extract(email_regex, expand=False)
    all_data[f'{column_name}_domain'] = all_data[f'{column_name}_email'].str.split('@', n=1).str[1]
    all_data[f'{column_name}_domain_len'] = all_data[f'{column_name}_domain'].str.len()
    all_data[f'{column_name}_domain_public'] = all_data[f'{column_name}_domain'].str.lower().isin(public_email_domains).astype(int)
    all_data[f'{column_name}_n_subdomains'] = all_data[f'{column_name}_domain'].str.lower().str.count(r'\.')
    all_data[f'{column_name}_email_n_digits'] = all_data[f'{column_name}_domain'].str.lower().str.count(r'\d')
    
    all_data[f'{column_name}_name'] = all_data[column_name].str.replace(email_regex, '', regex=True)
    all_data[f'{column_name}_name'] = all_data[f'{column_name}_name'].str.replace(r'[<>"\'\(\)]', '', regex=True).str.strip()
    
all_data['is_internal_email'] = (
    (all_data['sender_domain'] == all_data['receiver_domain']) & 
    (all_data['sender_domain'].notna())
).astype(int)

all_data['sender_name_contains_email'] = all_data['sender_name'].str.contains('@', na=False).astype(int)

# Sender/reciever feature list
email_features = [
    "sender_domain_public",
    "sender_domain_len",
    "sender_n_subdomains",
    "sender_email_n_digits",
    "sender_name_contains_email",
    # "is_internal_email"
]

# -- Fill in url count for missing entries -- 
url_regex = r'((?:https?|ftp)://\S+|www\.\S+)'
text_column = 'body' 
missing_count_mask = all_data['urls'].isna()
all_data.loc[missing_count_mask, 'urls'] = (
    all_data.loc[missing_count_mask, text_column]
    .astype(str)
    .str.count(url_regex)
)

all_data[['body', 'subject']] = all_data[['body', 'subject']].fillna('Unknown')

feature_set= [
    'subject',
    'body',
    *email_features,
    *timestamp_features
]

X = all_data[feature_set]
y = all_data['label'] 

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [4]:
# -- Initialize Model --
vectorizer = ColumnTransformer([
    ('subject_word', TfidfVectorizer(lowercase=False, analyzer='word'), 'subject'),
    ('subject_charwb', TfidfVectorizer(lowercase=False, analyzer='char_wb'), 'subject'),
    ('body', TfidfVectorizer(lowercase=True, analyzer='word'), 'body' )
    ],
    remainder="passthrough"
)

model_pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('xgboost', XGBClassifier())
        
])

In [5]:
# -- Hyperparameter tuning with Optuna -- 
GPU = True
def objective(trial):
    start_time = time.time()
    
    # Define Search Space
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    max_depth = trial.suggest_int('max_depth', 5, 12)
    subsample = trial.suggest_float('subsample', 0.15, .9)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    
    ngram_choice = trial.suggest_categorical('ngram_range', ['unigram', 'bigram'])
    ngram_range = (1, 1) if ngram_choice == 'unigram' else (1, 2)
    stop_words_choice = trial.suggest_categorical('stop_words_setting', ['english', 'none'])
    stop_words = 'english' if stop_words_choice == 'english' else None
    df_min_type = trial.suggest_categorical('df_min_type', ['int', 'float'])


    max_features = None
    if df_min_type == 'int':
        min_df = trial.suggest_int('df_min', 2, 20)
    elif df_min_type == 'float':
        min_df = trial.suggest_float('min_df', 0.0001, 0.3, log=True)

    
    max_features = trial.suggest_int('max_features', 10000, 200000) if min_df < 0.01 or min_df == 2 else None

    
    # Build Vectorizer
    vectorizer = ColumnTransformer([
        ('subject_word_vectorizer', TfidfVectorizer(lowercase=False, analyzer='word', ngram_range=ngram_range), 'subject'),
        ('subject_charwb_vectorizer', TfidfVectorizer(lowercase=False, analyzer='char_wb'), 'subject'),
        ('body_vectorizer', TfidfVectorizer(
            lowercase=True, 
            analyzer='word', 
            min_df=min_df, 
            ngram_range=ngram_range,
            stop_words=stop_words,
            max_features=max_features
        ), 'body')
        ],
        remainder="passthrough"
    )
    
    # Train/Validation Split
    X_sub_train, X_sub_valid, y_sub_train, y_sub_valid = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        stratify=y_train
    )
    vectorizer.fit(X_sub_train)
    X_sub_train_vec = vectorizer.transform(X_sub_train)
    X_sub_valid_vec = vectorizer.transform(X_sub_valid)
    
    # Model with pruning + early stopping
    pruning_callback = XGBoostPruningCallback(trial, "validation_0-auc")
    
    model = XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        n_jobs=-1,
        tree_method='hist',
        device='cuda' if GPU else 'cpu',
        objective='binary:logistic',
        eval_metric='auc',
        early_stopping_rounds=30,
        callbacks=[pruning_callback]
    )
    
    model.fit(
        X_sub_train_vec,
        y_sub_train,
        eval_set=[(X_sub_valid_vec, y_sub_valid)],
        verbose=False
    )
    
    preds = model.predict(X_sub_valid_vec)
    f1 = f1_score(y_sub_valid, preds)
    elapsed = time.time() - start_time
    print(f"Trial {trial.number} done in {elapsed:.2f}s | F1: {f1:.4f}")
    return f1
    
start = time.perf_counter()
study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner(n_startup_trials=20, n_warmup_steps=20, interval_steps=100))

study.optimize(objective, n_trials=100, n_jobs=1)

end = time.perf_counter()
duration = end - start
print(f"Study took: {duration//(60**2)}m {duration//60}m {round(duration%60)}s")
print(f"Best value: {study.best_value}")
print(f"Best params: {study.best_params}")

[I 2025-12-10 17:05:17,710] A new study created in memory with name: no-name-ed3e75b2-4897-4610-b35f-acabeebda44b
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 17:07:01,450] Trial 0 finished with value: 0.9812831331818673 and parameters: {'n_estimators': 116, 'learning_rate': 0.0598121908100976, 'max_depth': 12, 'subsample': 0.6042214363162028, 'colsample_bytree': 0.7348192358510411, 'ngram_range': 'bigram', 'stop_words_setting': 'english', 'df_min_type': 'float', 'min_df': 0.09676214928169895}. Best is trial 0 with value: 0.9812831331818673.


Trial 0 done in 103.73s | F1: 0.9813


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 17:10:09,528] Trial 1 finished with value: 0.9933903998266335 and parameters: {'n_estimators': 703, 'learning_rate': 0.15965232560070328, 'max_depth': 6, 'subsample': 0.8363550644662354, 'colsample_bytree': 0.5919439430073383, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'float', 'min_df': 0.008151902229152496, 'max_features': 112776}. Best is trial 1 with value: 0.9933903998266335.


Trial 1 done in 188.08s | F1: 0.9934


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 17:16:30,655] Trial 2 finished with value: 0.9794594594594594 and parameters: {'n_estimators': 715, 'learning_rate': 0.2592395681103579, 'max_depth': 6, 'subsample': 0.24775210554362517, 'colsample_bytree': 0.6841908805653953, 'ngram_range': 'bigram', 'stop_words_setting': 'english', 'df_min_type': 'float', 'min_df': 0.10492651066689693}. Best is trial 1 with value: 0.9933903998266335.


Trial 2 done in 381.12s | F1: 0.9795


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 17:38:06,092] Trial 3 finished with value: 0.9773191443620337 and parameters: {'n_estimators': 193, 'learning_rate': 0.021478218068488878, 'max_depth': 7, 'subsample': 0.25119078828031055, 'colsample_bytree': 0.8330591945131464, 'ngram_range': 'bigram', 'stop_words_setting': 'english', 'df_min_type': 'float', 'min_df': 0.0004326316627501882, 'max_features': 108919}. Best is trial 1 with value: 0.9933903998266335.


Trial 3 done in 1295.39s | F1: 0.9773


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 18:26:26,669] Trial 4 finished with value: 0.9917802292883409 and parameters: {'n_estimators': 642, 'learning_rate': 0.021085890835368788, 'max_depth': 10, 'subsample': 0.1597000678596122, 'colsample_bytree': 0.7272371377121403, 'ngram_range': 'bigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 5}. Best is trial 1 with value: 0.9933903998266335.


Trial 4 done in 2900.51s | F1: 0.9918


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 18:28:11,462] Trial 5 finished with value: 0.9870410367170627 and parameters: {'n_estimators': 350, 'learning_rate': 0.1125844854295576, 'max_depth': 6, 'subsample': 0.18642044376150815, 'colsample_bytree': 0.973276431243713, 'ngram_range': 'unigram', 'stop_words_setting': 'english', 'df_min_type': 'float', 'min_df': 0.005359440336308658, 'max_features': 160039}. Best is trial 1 with value: 0.9933903998266335.


Trial 5 done in 104.79s | F1: 0.9870


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 18:31:05,052] Trial 6 finished with value: 0.9914585360579522 and parameters: {'n_estimators': 335, 'learning_rate': 0.16053837600984247, 'max_depth': 7, 'subsample': 0.3604361505509826, 'colsample_bytree': 0.7927094470615532, 'ngram_range': 'unigram', 'stop_words_setting': 'english', 'df_min_type': 'int', 'df_min': 12}. Best is trial 1 with value: 0.9933903998266335.


Trial 6 done in 173.59s | F1: 0.9915


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 18:33:24,700] Trial 7 finished with value: 0.9912309191295875 and parameters: {'n_estimators': 488, 'learning_rate': 0.20192237238466487, 'max_depth': 8, 'subsample': 0.7347605346093568, 'colsample_bytree': 0.6343705874830627, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'float', 'min_df': 0.0007975630579630452, 'max_features': 73230}. Best is trial 1 with value: 0.9933903998266335.


Trial 7 done in 139.62s | F1: 0.9912


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 18:44:11,789] Trial 8 finished with value: 0.9892055267702936 and parameters: {'n_estimators': 672, 'learning_rate': 0.018856638861387248, 'max_depth': 5, 'subsample': 0.4708351531480908, 'colsample_bytree': 0.7739418919032215, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 7}. Best is trial 1 with value: 0.9933903998266335.


Trial 8 done in 647.06s | F1: 0.9892


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 18:53:48,073] Trial 9 finished with value: 0.9911466206003023 and parameters: {'n_estimators': 719, 'learning_rate': 0.06860851644027985, 'max_depth': 10, 'subsample': 0.5327427434491594, 'colsample_bytree': 0.9355819765713291, 'ngram_range': 'bigram', 'stop_words_setting': 'english', 'df_min_type': 'float', 'min_df': 0.001063082697411438, 'max_features': 86976}. Best is trial 1 with value: 0.9933903998266335.


Trial 9 done in 576.24s | F1: 0.9911


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 18:59:24,353] Trial 10 finished with value: 0.9933989827940699 and parameters: {'n_estimators': 830, 'learning_rate': 0.0939889429150694, 'max_depth': 5, 'subsample': 0.8859176191207755, 'colsample_bytree': 0.5264557537571961, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 19}. Best is trial 10 with value: 0.9933989827940699.


Trial 10 done in 336.28s | F1: 0.9934


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 19:02:48,405] Trial 11 finished with value: 0.9931751706207345 and parameters: {'n_estimators': 939, 'learning_rate': 0.10443073984945891, 'max_depth': 5, 'subsample': 0.8633700156426356, 'colsample_bytree': 0.5078007783532457, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 20}. Best is trial 10 with value: 0.9933989827940699.


Trial 11 done in 204.05s | F1: 0.9932


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 19:10:26,552] Trial 12 finished with value: 0.9913476097772009 and parameters: {'n_estimators': 916, 'learning_rate': 0.03638722196876749, 'max_depth': 5, 'subsample': 0.8933178898259986, 'colsample_bytree': 0.5600304898347226, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 20}. Best is trial 10 with value: 0.9933989827940699.


Trial 12 done in 458.12s | F1: 0.9913


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 19:14:56,914] Trial 13 finished with value: 0.99286023366508 and parameters: {'n_estimators': 842, 'learning_rate': 0.11223970543334273, 'max_depth': 7, 'subsample': 0.7332441722550117, 'colsample_bytree': 0.6021666605758713, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 13}. Best is trial 10 with value: 0.9933989827940699.


Trial 13 done in 270.36s | F1: 0.9929


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 19:19:55,123] Trial 14 finished with value: 0.992528424472117 and parameters: {'n_estimators': 822, 'learning_rate': 0.038008479427827764, 'max_depth': 6, 'subsample': 0.7704781951351636, 'colsample_bytree': 0.50031877726721, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'float', 'min_df': 0.013480478728812991}. Best is trial 10 with value: 0.9933989827940699.


Trial 14 done in 298.21s | F1: 0.9925


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 19:21:36,090] Trial 15 finished with value: 0.9930615784908933 and parameters: {'n_estimators': 548, 'learning_rate': 0.2875822045045738, 'max_depth': 8, 'subsample': 0.6420380247027465, 'colsample_bytree': 0.5989883262940603, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 16}. Best is trial 10 with value: 0.9933989827940699.


Trial 15 done in 100.94s | F1: 0.9931


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 19:31:04,722] Trial 16 finished with value: 0.9863783783783784 and parameters: {'n_estimators': 807, 'learning_rate': 0.010590705837532867, 'max_depth': 5, 'subsample': 0.8035978419724029, 'colsample_bytree': 0.6614477318094053, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 16}. Best is trial 10 with value: 0.9933989827940699.


Trial 16 done in 568.63s | F1: 0.9864


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 19:32:25,631] Trial 17 finished with value: 0.9912252193695158 and parameters: {'n_estimators': 994, 'learning_rate': 0.15590202677705733, 'max_depth': 9, 'subsample': 0.6610637384549367, 'colsample_bytree': 0.5579274287898653, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'float', 'min_df': 0.009057280102075214, 'max_features': 13123}. Best is trial 10 with value: 0.9933989827940699.


Trial 17 done in 80.90s | F1: 0.9912


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 19:38:17,725] Trial 18 finished with value: 0.9936112615051435 and parameters: {'n_estimators': 576, 'learning_rate': 0.08838624134411559, 'max_depth': 12, 'subsample': 0.897454378834651, 'colsample_bytree': 0.5534179261945256, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'float', 'min_df': 0.00016332921233470446, 'max_features': 199823}. Best is trial 18 with value: 0.9936112615051435.


Trial 18 done in 352.06s | F1: 0.9936


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 19:49:41,611] Trial 19 finished with value: 0.992526806021878 and parameters: {'n_estimators': 430, 'learning_rate': 0.0792636271313904, 'max_depth': 12, 'subsample': 0.8992544855694358, 'colsample_bytree': 0.5402787184520802, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 2, 'max_features': 190049}. Best is trial 18 with value: 0.9936112615051435.


Trial 19 done in 683.85s | F1: 0.9925


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 19:57:39,566] Trial 20 finished with value: 0.9925332756195216 and parameters: {'n_estimators': 578, 'learning_rate': 0.04137823787871771, 'max_depth': 11, 'subsample': 0.4304270475540079, 'colsample_bytree': 0.8417691219059162, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'float', 'min_df': 0.00015931281497327164, 'max_features': 198763}. Best is trial 18 with value: 0.9936112615051435.


Trial 20 done in 477.92s | F1: 0.9925


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 20:02:18,896] Trial 21 finished with value: 0.9932827735644637 and parameters: {'n_estimators': 773, 'learning_rate': 0.08973542345716835, 'max_depth': 9, 'subsample': 0.8193087631045505, 'colsample_bytree': 0.6083427968096513, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'float', 'min_df': 0.002857353483806125, 'max_features': 135683}. Best is trial 18 with value: 0.9936112615051435.


Trial 21 done in 279.30s | F1: 0.9933


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 20:04:15,988] Trial 22 finished with value: 0.9912176081535292 and parameters: {'n_estimators': 595, 'learning_rate': 0.15262049354090418, 'max_depth': 6, 'subsample': 0.8165286144739663, 'colsample_bytree': 0.5522540730934171, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'float', 'min_df': 0.02836159460714966}. Best is trial 18 with value: 0.9936112615051435.


Trial 22 done in 117.09s | F1: 0.9912


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 20:12:21,888] Trial 23 finished with value: 0.9929722132122392 and parameters: {'n_estimators': 479, 'learning_rate': 0.047405298619380004, 'max_depth': 11, 'subsample': 0.719364370353502, 'colsample_bytree': 0.667115164816103, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'float', 'min_df': 0.0001322553101993167, 'max_features': 37782}. Best is trial 18 with value: 0.9936112615051435.


Trial 23 done in 485.87s | F1: 0.9930


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 20:15:04,245] Trial 24 finished with value: 0.9926311226701344 and parameters: {'n_estimators': 903, 'learning_rate': 0.19890606083732285, 'max_depth': 7, 'subsample': 0.8430523714745132, 'colsample_bytree': 0.5819888570443559, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'float', 'min_df': 0.002633239765603888, 'max_features': 143567}. Best is trial 18 with value: 0.9936112615051435.


Trial 24 done in 162.35s | F1: 0.9926


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 20:16:11,580] Trial 25 finished with value: 0.9926422852196495 and parameters: {'n_estimators': 723, 'learning_rate': 0.12583797771468216, 'max_depth': 8, 'subsample': 0.6906214142808844, 'colsample_bytree': 0.5232846788868124, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'float', 'min_df': 0.04043753463368777}. Best is trial 18 with value: 0.9936112615051435.


Trial 25 done in 67.33s | F1: 0.9926


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 20:37:39,532] Trial 26 pruned. Trial was pruned at iteration 320.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 20:38:02,792] Trial 27 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 20:41:00,103] Trial 28 finished with value: 0.9935036812472932 and parameters: {'n_estimators': 871, 'learning_rate': 0.12961347164130538, 'max_depth': 11, 'subsample': 0.8994858474512379, 'colsample_bytree': 0.6303809161664422, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 17}. Best is trial 18 with value: 0.9936112615051435.


Trial 28 done in 177.28s | F1: 0.9935


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 20:42:07,893] Trial 29 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 20:44:34,423] Trial 30 finished with value: 0.9914344573349235 and parameters: {'n_estimators': 854, 'learning_rate': 0.211428457482089, 'max_depth': 11, 'subsample': 0.8919524790681683, 'colsample_bytree': 0.6231752498820748, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 18}. Best is trial 18 with value: 0.9936112615051435.


Trial 30 done in 146.49s | F1: 0.9914


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 20:48:02,614] Trial 31 finished with value: 0.9942522502982323 and parameters: {'n_estimators': 869, 'learning_rate': 0.13357091275770183, 'max_depth': 12, 'subsample': 0.8481763931792046, 'colsample_bytree': 0.5840055442759128, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 14}. Best is trial 31 with value: 0.9942522502982323.


Trial 31 done in 208.16s | F1: 0.9943


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 20:50:37,278] Trial 32 finished with value: 0.9939328277356446 and parameters: {'n_estimators': 885, 'learning_rate': 0.13235374734204802, 'max_depth': 12, 'subsample': 0.8543218915541906, 'colsample_bytree': 0.5714526950777784, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 14}. Best is trial 31 with value: 0.9942522502982323.


Trial 32 done in 154.64s | F1: 0.9939


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 20:53:45,876] Trial 33 finished with value: 0.9922027290448343 and parameters: {'n_estimators': 889, 'learning_rate': 0.13068601258487494, 'max_depth': 12, 'subsample': 0.7676811894236957, 'colsample_bytree': 0.5764764990560096, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 14}. Best is trial 31 with value: 0.9942522502982323.


Trial 33 done in 188.57s | F1: 0.9922


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 20:59:52,744] Trial 34 finished with value: 0.9920901506122006 and parameters: {'n_estimators': 954, 'learning_rate': 0.07362428824082809, 'max_depth': 12, 'subsample': 0.8489614804400774, 'colsample_bytree': 0.6527031123377189, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 10}. Best is trial 31 with value: 0.9942522502982323.


Trial 34 done in 366.84s | F1: 0.9921


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:06:15,164] Trial 35 finished with value: 0.9922934983175947 and parameters: {'n_estimators': 259, 'learning_rate': 0.23154022174441838, 'max_depth': 12, 'subsample': 0.8475531061167019, 'colsample_bytree': 0.6960226616349158, 'ngram_range': 'bigram', 'stop_words_setting': 'english', 'df_min_type': 'int', 'df_min': 14}. Best is trial 31 with value: 0.9942522502982323.


Trial 35 done in 382.38s | F1: 0.9923


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:08:21,160] Trial 36 finished with value: 0.9942485078676072 and parameters: {'n_estimators': 769, 'learning_rate': 0.1757318092487825, 'max_depth': 11, 'subsample': 0.7728896381076091, 'colsample_bytree': 0.5780725776574428, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 17}. Best is trial 31 with value: 0.9942522502982323.


Trial 36 done in 125.97s | F1: 0.9942


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:16:34,380] Trial 37 finished with value: 0.99263431542461 and parameters: {'n_estimators': 763, 'learning_rate': 0.17481653352114465, 'max_depth': 11, 'subsample': 0.7735118374640974, 'colsample_bytree': 0.583980790937476, 'ngram_range': 'bigram', 'stop_words_setting': 'english', 'df_min_type': 'int', 'df_min': 14}. Best is trial 31 with value: 0.9942522502982323.


Trial 37 done in 493.18s | F1: 0.9926


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:18:11,733] Trial 38 finished with value: 0.9926454683106208 and parameters: {'n_estimators': 130, 'learning_rate': 0.2909714466198787, 'max_depth': 12, 'subsample': 0.69311134670307, 'colsample_bytree': 0.5368416083792403, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 11}. Best is trial 31 with value: 0.9942522502982323.


Trial 38 done in 97.32s | F1: 0.9926


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:20:09,001] Trial 39 finished with value: 0.9922094784678641 and parameters: {'n_estimators': 658, 'learning_rate': 0.180096479683363, 'max_depth': 11, 'subsample': 0.8156129474287852, 'colsample_bytree': 0.9024138515424651, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 15}. Best is trial 31 with value: 0.9942522502982323.


Trial 39 done in 117.24s | F1: 0.9922


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:20:30,029] Trial 40 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:23:06,456] Trial 41 finished with value: 0.9928571428571429 and parameters: {'n_estimators': 881, 'learning_rate': 0.12124522358100354, 'max_depth': 11, 'subsample': 0.8578316642312614, 'colsample_bytree': 0.6214734146487583, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 18}. Best is trial 31 with value: 0.9942522502982323.


Trial 41 done in 156.40s | F1: 0.9929


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:24:47,703] Trial 42 pruned. Trial was pruned at iteration 120.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:26:25,865] Trial 43 finished with value: 0.9921055477452146 and parameters: {'n_estimators': 955, 'learning_rate': 0.2355406370703531, 'max_depth': 10, 'subsample': 0.8581607239583472, 'colsample_bytree': 0.5945898859998646, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 17}. Best is trial 31 with value: 0.9942522502982323.


Trial 43 done in 98.13s | F1: 0.9921


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:29:42,234] Trial 44 finished with value: 0.9930705933304461 and parameters: {'n_estimators': 866, 'learning_rate': 0.1397898733161579, 'max_depth': 11, 'subsample': 0.8012114090737348, 'colsample_bytree': 0.5187292486893054, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 15}. Best is trial 31 with value: 0.9942522502982323.


Trial 44 done in 196.34s | F1: 0.9931


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:32:36,302] Trial 45 finished with value: 0.9912347148576994 and parameters: {'n_estimators': 349, 'learning_rate': 0.11576740214620153, 'max_depth': 12, 'subsample': 0.8974411825713109, 'colsample_bytree': 0.7487582374500896, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 18}. Best is trial 31 with value: 0.9942522502982323.


Trial 45 done in 174.04s | F1: 0.9912


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:42:50,990] Trial 46 finished with value: 0.9932943975773307 and parameters: {'n_estimators': 692, 'learning_rate': 0.19332753308922343, 'max_depth': 11, 'subsample': 0.8651333415932149, 'colsample_bytree': 0.5496022287251265, 'ngram_range': 'bigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 13}. Best is trial 31 with value: 0.9942522502982323.


Trial 46 done in 614.64s | F1: 0.9933


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:43:15,466] Trial 47 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:43:39,368] Trial 48 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:44:02,250] Trial 49 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:48:21,797] Trial 50 finished with value: 0.9936140274921529 and parameters: {'n_estimators': 930, 'learning_rate': 0.08467354233166735, 'max_depth': 12, 'subsample': 0.5481112414485146, 'colsample_bytree': 0.6094856733340226, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 12}. Best is trial 31 with value: 0.9942522502982323.


Trial 50 done in 259.52s | F1: 0.9936


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:48:51,693] Trial 51 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:49:20,170] Trial 52 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:51:52,703] Trial 53 finished with value: 0.9938264919311166 and parameters: {'n_estimators': 917, 'learning_rate': 0.11079312877001463, 'max_depth': 12, 'subsample': 0.4878707067105498, 'colsample_bytree': 0.5051207498580129, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 13}. Best is trial 31 with value: 0.9942522502982323.


Trial 53 done in 152.50s | F1: 0.9938


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:52:18,850] Trial 54 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:56:09,556] Trial 55 finished with value: 0.9946860427285544 and parameters: {'n_estimators': 837, 'learning_rate': 0.16417635037423906, 'max_depth': 12, 'subsample': 0.43869295336334196, 'colsample_bytree': 0.5228999416673616, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 9}. Best is trial 55 with value: 0.9946860427285544.


Trial 55 done in 230.67s | F1: 0.9947


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 21:59:23,092] Trial 56 finished with value: 0.9920901506122006 and parameters: {'n_estimators': 836, 'learning_rate': 0.16695930302301404, 'max_depth': 12, 'subsample': 0.42941918889716274, 'colsample_bytree': 0.5308141432421072, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 7}. Best is trial 55 with value: 0.9946860427285544.


Trial 56 done in 193.51s | F1: 0.9921


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:01:07,395] Trial 57 finished with value: 0.9926486486486487 and parameters: {'n_estimators': 797, 'learning_rate': 0.15052588368918426, 'max_depth': 12, 'subsample': 0.5063670227761375, 'colsample_bytree': 0.5174393506512805, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 9}. Best is trial 55 with value: 0.9946860427285544.


Trial 57 done in 104.27s | F1: 0.9926


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:04:10,191] Trial 58 finished with value: 0.9914363143631436 and parameters: {'n_estimators': 922, 'learning_rate': 0.24693261995737087, 'max_depth': 11, 'subsample': 0.4363769588490334, 'colsample_bytree': 0.5400716249420199, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 11}. Best is trial 55 with value: 0.9946860427285544.


Trial 58 done in 182.77s | F1: 0.9914


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:18:20,845] Trial 59 finished with value: 0.9932871372888696 and parameters: {'n_estimators': 740, 'learning_rate': 0.11242051986794535, 'max_depth': 12, 'subsample': 0.4640210469731057, 'colsample_bytree': 0.9963181240303558, 'ngram_range': 'bigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 7}. Best is trial 55 with value: 0.9946860427285544.


Trial 59 done in 850.60s | F1: 0.9933


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:22:37,850] Trial 60 finished with value: 0.9928447528187337 and parameters: {'n_estimators': 974, 'learning_rate': 0.2112215478592723, 'max_depth': 11, 'subsample': 0.543681128897887, 'colsample_bytree': 0.5011627753095703, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 8}. Best is trial 55 with value: 0.9946860427285544.


Trial 60 done in 256.98s | F1: 0.9928


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:23:03,321] Trial 61 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:27:07,345] Trial 62 finished with value: 0.9948085658663206 and parameters: {'n_estimators': 838, 'learning_rate': 0.08057162799149066, 'max_depth': 12, 'subsample': 0.6279341206624385, 'colsample_bytree': 0.5508536642950541, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 13}. Best is trial 62 with value: 0.9948085658663206.


Trial 62 done in 244.00s | F1: 0.9948


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:27:35,702] Trial 63 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:28:00,568] Trial 64 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:28:26,909] Trial 65 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:28:56,357] Trial 66 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:31:23,574] Trial 67 finished with value: 0.991121697704634 and parameters: {'n_estimators': 858, 'learning_rate': 0.16456998325623742, 'max_depth': 11, 'subsample': 0.6126829669154609, 'colsample_bytree': 0.6121176693373439, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 14}. Best is trial 62 with value: 0.9948085658663206.


Trial 67 done in 147.18s | F1: 0.9911


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:31:47,122] Trial 68 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:32:10,686] Trial 69 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:35:00,204] Trial 70 finished with value: 0.9922027290448343 and parameters: {'n_estimators': 903, 'learning_rate': 0.12296548322192456, 'max_depth': 12, 'subsample': 0.44971712280742915, 'colsample_bytree': 0.5572259783549576, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 15}. Best is trial 62 with value: 0.9948085658663206.


Trial 70 done in 169.48s | F1: 0.9922


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:35:11,777] Trial 71 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:37:40,571] Trial 72 pruned. Trial was pruned at iteration 120.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:38:06,664] Trial 73 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:40:27,054] Trial 74 finished with value: 0.9919809276116168 and parameters: {'n_estimators': 296, 'learning_rate': 0.1840763069414436, 'max_depth': 11, 'subsample': 0.871399282871818, 'colsample_bytree': 0.575772165158618, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 13}. Best is trial 62 with value: 0.9948085658663206.


Trial 74 done in 140.36s | F1: 0.9920


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:42:51,174] Trial 75 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:46:50,058] Trial 76 finished with value: 0.9922010398613518 and parameters: {'n_estimators': 687, 'learning_rate': 0.11468943022525893, 'max_depth': 11, 'subsample': 0.8246582412591397, 'colsample_bytree': 0.5338344623677622, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 20}. Best is trial 62 with value: 0.9948085658663206.


Trial 76 done in 238.85s | F1: 0.9922


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:47:17,508] Trial 77 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:47:36,834] Trial 78 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:48:04,623] Trial 79 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:48:27,362] Trial 80 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:50:59,718] Trial 81 finished with value: 0.9940418156212761 and parameters: {'n_estimators': 874, 'learning_rate': 0.14740174836922487, 'max_depth': 10, 'subsample': 0.8819751397190585, 'colsample_bytree': 0.6302465095171319, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 19}. Best is trial 62 with value: 0.9948085658663206.


Trial 81 done in 152.32s | F1: 0.9940


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:53:08,509] Trial 82 finished with value: 0.9930720935267374 and parameters: {'n_estimators': 971, 'learning_rate': 0.14971691308648014, 'max_depth': 10, 'subsample': 0.7533415320388855, 'colsample_bytree': 0.5899375765533028, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 19}. Best is trial 62 with value: 0.9948085658663206.


Trial 82 done in 128.76s | F1: 0.9931


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:56:14,345] Trial 83 finished with value: 0.9952308692824626 and parameters: {'n_estimators': 804, 'learning_rate': 0.17389030487747995, 'max_depth': 9, 'subsample': 0.8405243248517169, 'colsample_bytree': 0.6457296831288679, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 19}. Best is trial 83 with value: 0.9952308692824626.


Trial 83 done in 185.80s | F1: 0.9952


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 22:57:45,766] Trial 84 pruned. Trial was pruned at iteration 120.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:00:22,622] Trial 85 finished with value: 0.9927559736187695 and parameters: {'n_estimators': 793, 'learning_rate': 0.2602509545965745, 'max_depth': 10, 'subsample': 0.8796108486443794, 'colsample_bytree': 0.644783762972403, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 18}. Best is trial 83 with value: 0.9952308692824626.


Trial 85 done in 156.83s | F1: 0.9928


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:02:48,343] Trial 86 finished with value: 0.993615409587707 and parameters: {'n_estimators': 894, 'learning_rate': 0.18412037694957709, 'max_depth': 9, 'subsample': 0.809501591827976, 'colsample_bytree': 0.7008894414984157, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 20}. Best is trial 83 with value: 0.9952308692824626.


Trial 86 done in 145.69s | F1: 0.9936


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:08:26,428] Trial 87 pruned. Trial was pruned at iteration 120.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:11:06,839] Trial 88 finished with value: 0.992855596449448 and parameters: {'n_estimators': 891, 'learning_rate': 0.19489314622787218, 'max_depth': 8, 'subsample': 0.7666417977987439, 'colsample_bytree': 0.6779456654523304, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 19}. Best is trial 83 with value: 0.9952308692824626.


Trial 88 done in 160.38s | F1: 0.9929


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:11:28,088] Trial 89 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:11:48,783] Trial 90 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:12:11,234] Trial 91 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:14:05,780] Trial 92 finished with value: 0.9926406926406927 and parameters: {'n_estimators': 941, 'learning_rate': 0.16688212770963082, 'max_depth': 10, 'subsample': 0.8546165789107196, 'colsample_bytree': 0.6259537478893276, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 19}. Best is trial 83 with value: 0.9952308692824626.


Trial 92 done in 114.52s | F1: 0.9926


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:15:28,728] Trial 93 pruned. Trial was pruned at iteration 120.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:17:32,774] Trial 94 finished with value: 0.9931825560004328 and parameters: {'n_estimators': 958, 'learning_rate': 0.15987215816138717, 'max_depth': 9, 'subsample': 0.7809804235716028, 'colsample_bytree': 0.6013112780248508, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 14}. Best is trial 83 with value: 0.9952308692824626.


Trial 94 done in 124.02s | F1: 0.9932


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:19:17,358] Trial 95 finished with value: 0.9934980494148244 and parameters: {'n_estimators': 744, 'learning_rate': 0.26027587113911094, 'max_depth': 10, 'subsample': 0.8107454691899771, 'colsample_bytree': 0.7648901584591274, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 17}. Best is trial 83 with value: 0.9952308692824626.


Trial 95 done in 104.55s | F1: 0.9935


  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:19:51,369] Trial 96 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:20:16,385] Trial 97 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:20:54,196] Trial 98 pruned. Trial was pruned at iteration 20.
  self.starting_round = model.num_boosted_rounds()
[I 2025-12-10 23:22:32,947] Trial 99 finished with value: 0.9926390993721584 and parameters: {'n_estimators': 978, 'learning_rate': 0.17258738786704705, 'max_depth': 12, 'subsample': 0.4585671418943691, 'colsample_bytree': 0.744749295975785, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 19}. Best is trial 83 with value: 0.9952308692824626.


Trial 99 done in 98.72s | F1: 0.9926
Study took: 6.0m 362.0m 17s
Best value: 0.9952308692824626
Best params: {'n_estimators': 804, 'learning_rate': 0.17389030487747995, 'max_depth': 9, 'subsample': 0.8405243248517169, 'colsample_bytree': 0.6457296831288679, 'ngram_range': 'unigram', 'stop_words_setting': 'none', 'df_min_type': 'int', 'df_min': 19}


In [8]:
importances = optuna.importance.get_param_importances(study)

# Print out the dictionary nicely
for param, score in importances.items():
    print(f"{param}: {score:.4f} ({score*100:.2f}%)")


# --- PART 2: Top 10 Models (Trials) ---
print("\n--- Top 10 Models ---")
# Convert the study to a Pandas DataFrame for easy sorting/viewing
df = study.trials_dataframe()

# Sort by objective value (ascending for minimization) and take top 10
# We filter columns to only show params and the resulting value
cols_to_keep = ['value'] + [c for c in df.columns if c.startswith('params_')]
top_10 = df.sort_values('value', ascending=False).head(10)[cols_to_keep]
print(top_10)

subsample: 0.4941 (49.41%)
n_estimators: 0.2525 (25.25%)
stop_words_setting: 0.0808 (8.08%)
learning_rate: 0.0705 (7.05%)
max_depth: 0.0509 (5.09%)
df_min_type: 0.0290 (2.90%)
colsample_bytree: 0.0117 (1.17%)
ngram_range: 0.0104 (1.04%)

--- Top 10 Models ---
       value  params_colsample_bytree  params_df_min params_df_min_type  \
93  0.999502                 0.659702           16.0                int   
84  0.999486                 0.664306           19.0                int   
26  0.999451                 0.634545            9.0                int   
72  0.999389                 0.565086            NaN              float   
87  0.999379                 0.715503           20.0                int   
42  0.999221                 0.632838           17.0                int   
89  0.998377                 0.702237           20.0                int   
98  0.998272                 0.634886           16.0                int   
96  0.998266                 0.576372           20.0             