# Classification

In [1]:
import os
import sys
from pathlib import Path

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

Load Data

In [2]:
notebook_path = Path.cwd()
project_root = notebook_path.parent
sys.path.insert(0, str(project_root))

# print(f"Project Root (added to sys.path): {project_root}")

In [3]:
from src import config
from src.data_loader import load_census_data

raw_df = load_census_data(
    data_path=config.DATA_FILE,
    columns_path=config.COLUMNS_FILE
)
raw_df.head()

Data loaded and initial cleaning complete.
Dataset shape: (199523, 42)


Unnamed: 0,age,class_of_worker,detailed_industry_recode,detailed_occupation_recode,education,wage_per_hour,enroll_in_edu_inst_last_wk,marital_stat,major_industry_code,major_occupation_code,...,country_of_birth_father,country_of_birth_mother,country_of_birth_self,citizenship,own_business_or_self_employed,fill_inc_questionnaire_for_veterans_admin,veterans_benefits,weeks_worked_in_year,year,label
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,0
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,0
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,0
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,0
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,0


Separate the Target, Features and Weights

In [4]:
X = raw_df.drop(columns=[config.TARGET_VARIABLE, 'weight'])
y = raw_df[config.TARGET_VARIABLE]
weights = raw_df['weight']

Feature Engineering

In [5]:
from src.feature_engineering import create_features

X_featured = create_features(X)

print(X_featured.shape[1])
print(X_featured.columns.tolist())

Mapping numeric codes to string labels...
Dropping unused columns: ['detailed_industry_recode', 'detailed_occupation_recode', 'year']
Feature engineering complete. New columns added.
46
['age', 'class_of_worker', 'education', 'wage_per_hour', 'enroll_in_edu_inst_last_wk', 'marital_stat', 'major_industry_code', 'major_occupation_code', 'race', 'hispanic_origin', 'sex', 'member_of_a_labor_union', 'reason_for_unemployment', 'full_or_part_time_employment_stat', 'capital_gains', 'capital_losses', 'dividends_from_stocks', 'tax_filer_stat', 'region_of_previous_residence', 'state_of_previous_residence', 'detailed_household_and_family_stat', 'detailed_household_summary_in_household', 'migration_code_change_in_msa', 'migration_code_change_in_reg', 'migration_code_move_within_reg', 'live_in_this_house_1_year_ago', 'migration_prev_res_in_sunbelt', 'num_persons_worked_for_employer', 'family_members_under_18', 'country_of_birth_father', 'country_of_birth_mother', 'country_of_birth_self', 'citizenshi

Data Split for Training

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_featured, 
    y, 
    test_size=config.TEST_SIZE, 
    random_state=config.RANDOM_STATE,
    stratify=y # To maintain class distribution in splits
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"\nTraining set target distribution:\n{y_train.value_counts(normalize=True)}")
print(f"\nTest set target distribution:\n{y_test.value_counts(normalize=True)}")

Training set shape: (159618, 46)
Test set shape: (39905, 46)

Training set target distribution:
label
0    0.937939
1    0.062061
Name: proportion, dtype: float64

Test set target distribution:
label
0    0.937953
1    0.062047
Name: proportion, dtype: float64


Preprocessing

In [8]:
from src.preprocessing import create_preprocessor

NUMERICAL_COLS = X_featured.select_dtypes(include='number').columns.tolist()
CATEGORICAL_COLS = X_featured.select_dtypes(include='object').columns.tolist()
print(f"Numerical columns: {NUMERICAL_COLS}")
print(f"Categorical columns: {CATEGORICAL_COLS}")

preprocessor = create_preprocessor(NUMERICAL_COLS, CATEGORICAL_COLS)

# Fit the preprocessor ONLY on the training data
preprocessor.fit(X_train)

# Transform both the training and testing data
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# The output is a NumPy array, ready for the model
print(f"\nShape of processed training data: {X_train_processed.shape}")
print(f"Shape of processed testing data: {X_test_processed.shape}")

Numerical columns: ['age', 'wage_per_hour', 'capital_gains', 'capital_losses', 'dividends_from_stocks', 'num_persons_worked_for_employer', 'weeks_worked_in_year', 'net_capital_gain', 'has_investment_income', 'age_squared']
Categorical columns: ['class_of_worker', 'education', 'enroll_in_edu_inst_last_wk', 'marital_stat', 'major_industry_code', 'major_occupation_code', 'race', 'hispanic_origin', 'sex', 'member_of_a_labor_union', 'reason_for_unemployment', 'full_or_part_time_employment_stat', 'tax_filer_stat', 'region_of_previous_residence', 'state_of_previous_residence', 'detailed_household_and_family_stat', 'detailed_household_summary_in_household', 'migration_code_change_in_msa', 'migration_code_change_in_reg', 'migration_code_move_within_reg', 'live_in_this_house_1_year_ago', 'migration_prev_res_in_sunbelt', 'family_members_under_18', 'country_of_birth_father', 'country_of_birth_mother', 'country_of_birth_self', 'citizenship', 'own_business_or_self_employed', 'fill_inc_questionnaire_

In [9]:
# get the feature names after preprocessing
def get_feature_names(preprocessor):
    feature_names = []
    
    # Numerical features
    if 'num' in preprocessor.named_transformers_:
        num_features = preprocessor.named_transformers_['num'].feature_names_in_.tolist()
        feature_names.extend(num_features)
    
    # Categorical features
    if 'cat' in preprocessor.named_transformers_:
        cat_transformer = preprocessor.named_transformers_['cat']
        if hasattr(cat_transformer, 'get_feature_names_out'):
            cat_features = cat_transformer.get_feature_names_out(preprocessor.transformers_[1][2])
            feature_names.extend(cat_features)
        else:
            # Fallback if get_feature_names_out is not available
            cat_features = preprocessor.transformers_[1][2]
            feature_names.extend(cat_features)
    
    return feature_names    
feature_names = get_feature_names(preprocessor)
print(f"\nTotal number of features after preprocessing: {len(feature_names)}")
print(f"First 10 feature names: {feature_names[:10]}")


Total number of features after preprocessing: 251
First 10 feature names: ['age', 'wage_per_hour', 'capital_gains', 'capital_losses', 'dividends_from_stocks', 'num_persons_worked_for_employer', 'weeks_worked_in_year', 'net_capital_gain', 'has_investment_income', 'age_squared']


## Baseline
Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# The full pipeline by combining our preprocessor with the classifier
lr_pipeline_standard = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=config.RANDOM_STATE, max_iter=1000))
])

# Perform 5-fold cross-validation
cv_auc_standard = cross_val_score(
    lr_pipeline_standard, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1
)

cv_f1_standard = cross_val_score(
    lr_pipeline_standard, X_train, y_train, cv=5, scoring='f1_weighted', n_jobs=-1
)

# To see how the model performs for underrepresented classes
cv_f1_macro = cross_val_score(
    lr_pipeline_standard, X_train, y_train, cv=5, scoring='f1_macro', n_jobs=-1
)

print("\nResults for Standard Logistic Regression (5-fold CV):")
print(f"  Mean ROC AUC: {np.mean(cv_auc_standard):.4f} (Std: {np.std(cv_auc_standard):.4f})")
print(f"  Mean F1-Score (Weighted): {np.mean(cv_f1_standard):.4f} (Std: {np.std(cv_f1_standard):.4f})")
print(f"  Mean F1-Score (Macro): {np.mean(cv_f1_macro):.4f} (Std: {np.std(cv_f1_macro):.4f})")


Results for Standard Logistic Regression (5-fold CV):
  Mean ROC AUC: 0.9435 (Std: 0.0012)
  Mean F1-Score (Weighted): 0.9455 (Std: 0.0010)
  Mean F1-Score (Macro): 0.7385 (Std: 0.0051)


## Tree-Based Models

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

import time

models_to_test = {
    "Decision Tree": DecisionTreeClassifier(
        random_state=config.RANDOM_STATE,
        class_weight='balanced'
    ),
    "Random Forest": RandomForestClassifier(
        random_state=config.RANDOM_STATE,
        class_weight='balanced',
        n_jobs=-1  
    ),
    "LightGBM": LGBMClassifier(
        random_state=config.RANDOM_STATE,
        is_unbalance=True, 
        n_jobs=-1
    )
}

results = {}

for name, model in models_to_test.items():
    start_time = time.time()
    print(f"\nTraining {name}...")

    # Create the full Scikit-Learn pipeline
    
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Perform 5-fold cross-validation for ROC AUC and F1-Score
    cv_auc = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1)
    cv_f1 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1_weighted', n_jobs=-1)
    cv_macro_f1 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1_macro', n_jobs=-1)
    
    end_time = time.time()
    training_time = end_time - start_time
    
    # Store the results
    results[name] = {
        "Mean ROC AUC": cv_auc.mean(),
        "Std ROC AUC": cv_auc.std(),
        "Mean F1 (Weighted)": cv_f1.mean(),
        "Std F1 (Weighted)": cv_f1.std(),
        "Mean F1 (Macro)": cv_macro_f1.mean(),
        "Std F1 (Macro)": cv_macro_f1.std(),
        "Training Time (s)": training_time
    }
    
    print(f"{name} trained in {training_time:.2f} seconds.")


Training Decision Tree...
Decision Tree trained in 23.35 seconds.

Training Random Forest...
Random Forest trained in 40.62 seconds.

Training LightGBM...
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027285 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029522 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col



[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039641 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032929 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [In



[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032049 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [In



LightGBM trained in 24.22 seconds.


In [12]:
lightgbm_smote_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=config.RANDOM_STATE)),
    ('classifier', LGBMClassifier(random_state=config.RANDOM_STATE, n_jobs=-1))
])

start_time = time.time()

cv_auc = cross_val_score(lightgbm_smote_pipeline, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1)
cv_f1 = cross_val_score(lightgbm_smote_pipeline, X_train, y_train, cv=5, scoring='f1_weighted', n_jobs=-1)
cv_macro_f1 = cross_val_score(lightgbm_smote_pipeline, X_train, y_train, cv=5, scoring='f1_macro', n_jobs=-1)

end_time = time.time()
training_time = end_time - start_time

results["LightGBM_with_SMOTE"] = {
    "Mean ROC AUC": cv_auc.mean(),
    "Std ROC AUC": cv_auc.std(),
    "Mean F1 (Weighted)": cv_f1.mean(),
    "Std F1 (Weighted)": cv_f1.std(),
    "Mean F1 (Macro)": cv_macro_f1.mean(),
    "Std F1 (Macro)": cv_macro_f1.std(),
    "Training Time (s)": training_time
}

[LightGBM] [Info] Number of positive: 119769, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.098134 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54497
[LightGBM] [Info] Number of data points in the train set: 239538, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 119770, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.103354 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54386
[LightGBM] [Info] Number of data points in the train set: 239540, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> ini



[LightGBM] [Info] Number of positive: 119769, number of negative: 119769
[LightGBM] [Info] Number of positive: 119770, number of negative: 119770
[LightGBM] [Info] Number of positive: 119769, number of negative: 119769
[LightGBM] [Info] Number of positive: 119770, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.178524 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54497
[LightGBM] [Info] Number of data points in the train set: 239538, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.220407 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54609
[LightGBM] [



[LightGBM] [Info] Number of positive: 119769, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.079828 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54609
[LightGBM] [Info] Number of data points in the train set: 239538, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 119769, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.441550 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54497
[LightGBM] [Info] Number of data points in the train set: 239538, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> ini



In [13]:
results_df = pd.DataFrame(results).T # .T transposes the DataFrame
results_df.sort_values(by='Mean ROC AUC', ascending=False, inplace=True)

print("\nModel Performance Results (sorted by Mean ROC AUC):")
display(results_df)


Model Performance Results (sorted by Mean ROC AUC):


Unnamed: 0,Mean ROC AUC,Std ROC AUC,Mean F1 (Weighted),Std F1 (Weighted),Mean F1 (Macro),Std F1 (Macro),Training Time (s)
LightGBM,0.95181,0.001203,0.899153,0.000678,0.696271,0.001275,24.220922
LightGBM_with_SMOTE,0.945777,0.001818,0.949792,0.000839,0.776933,0.003275,38.858648
Random Forest,0.93218,0.001642,0.945092,0.001289,0.735514,0.006846,40.617646
Decision Tree,0.713506,0.007576,0.931743,0.001556,0.709432,0.006816,23.347244


Tuned hyperparameters for the best model (LightGBM)

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Tuned hyperparameters for the best model (LightGBM)
lgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(
        random_state=config.RANDOM_STATE,
        is_unbalance=True, # Continue handling class imbalance
        n_jobs=-1
    ))
])

# Define the parameter distribution to sample from
param_dist = {
    'classifier__n_estimators': randint(100, 1000),
    'classifier__learning_rate': uniform(0.01, 0.2),
    'classifier__num_leaves': randint(20, 50),
    'classifier__max_depth': randint(5, 20),
    'classifier__reg_alpha': uniform(0.1, 0.9), # L1 regularization
    'classifier__reg_lambda': uniform(0.1, 0.9)  # L2 regularization
}

n_iter_search = 25

random_search = RandomizedSearchCV(
    lgbm_pipeline,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=5, # 5-fold cross-validation
    scoring='roc_auc', # primary metric
    random_state=config.RANDOM_STATE,
    n_jobs=-1,
    # verbose=2 # show progress
)

# Fit the search on the TRAINING data
random_search.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050227 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [In











[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049605 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677




[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054480 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.089078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.71555





























[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071663 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551


















[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061919 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543




[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041908 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070956 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [In











[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.170152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7924, number of negative: 119770




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.214222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047015 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[Ligh



[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.142591 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.091172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data point



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.243104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551




[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.287903 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [In







[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.392029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.383130 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [In
























[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.140041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.129089 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.71554















[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037967 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data point



[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.287806 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.278490 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [In



[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.161050 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551




[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.136643 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042068 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [In



[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.379378 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.170809 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [In



[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.389992 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551




[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.453703 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543




[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.358240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543




[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.226949 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677




[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.112916 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.402987 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [In



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.379430 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551




[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.233274 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.393732 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.71567







[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.377299 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.326052 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [In



[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.433458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073142 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [In












[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058147 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677









[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.131754 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551




[LightGBM] [Info] Number of positive: 7925, number of negative: 119770




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075355 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551




[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.058991 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.090882 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.71554



[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.283091 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051930 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [In







[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066306 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.336236 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [In



















[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.217796 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.148786 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551




[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.096194 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.185201 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [In



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.100212 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551




[LightGBM] [Info] Number of positive: 7925, number of negative: 119769




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.422645 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543




[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.323348 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.370586 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [In












[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.278902 seconds.
You can set `force_row_wise=true` to remove the overhead.

[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.183709 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.7155











[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.260610 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677




[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.332545 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.376709 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [In











[LightGBM] [Info] Number of positive: 7925, number of negative: 119769




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035220 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543




[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.134280 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.238374 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.71555
















[LightGBM] [Info] Number of positive: 9906, number of negative: 149712
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022389 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1680
[LightGBM] [Info] Number of data points in the train set: 159618, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062061 -> initscore=-2.715573
[LightGBM] [Info] Start training from score -2.715573


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_distributions,"{'classifier__learning_rate': <scipy.stats....t 0x17fcbf830>, 'classifier__max_depth': <scipy.stats....t 0x341a26ab0>, 'classifier__n_estimators': <scipy.stats....t 0x3442a6150>, 'classifier__num_leaves': <scipy.stats....t 0x341a39400>, ...}"
,n_iter,25
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,0.005
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,boosting_type,'gbdt'
,num_leaves,47
,max_depth,8
,learning_rate,np.float64(0....0318597055907)
,n_estimators,700
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [15]:
import json

print("Best cross-validated ROC AUC score from search: {:.4f}".format(random_search.best_score_))
print("\nBest parameters found:")
print(random_search.best_params_)

print(f"\nSaving best parameters to: {config.HYPERPARAMETERS_FILE}")
with open(config.HYPERPARAMETERS_FILE, 'w') as f:
    json.dump(random_search.best_params_, f, indent=4) # indent=4 makes it human-readable

print("Parameters saved successfully.")

# Compare to the default LightGBM score from the fine-tuning results
default_lgbm_score = results_df.loc['LightGBM', 'Mean ROC AUC']
print(f"\nDefault LightGBM score was: {default_lgbm_score:.4f}")
improvement = (random_search.best_score_ - default_lgbm_score) / default_lgbm_score * 100
print(f"Improvement from tuning: {improvement:.2f}%")

Best cross-validated ROC AUC score from search: 0.9521

Best parameters found:
{'classifier__learning_rate': np.float64(0.023010318597055907), 'classifier__max_depth': 8, 'classifier__n_estimators': 700, 'classifier__num_leaves': 47, 'classifier__reg_alpha': np.float64(0.6069593960609854), 'classifier__reg_lambda': np.float64(0.4468748522859245)}

Saving best parameters to: /Users/zhuoqi/Library/CloudStorage/OneDrive-Personal/Aaaaaandrea/2025-2026/JPMC_Interview/TakeHomeProject_new/models/best_hyperparameters.json
Parameters saved successfully.

Default LightGBM score was: 0.9518
Improvement from tuning: 0.03%


In [16]:
from sklearn.metrics import roc_auc_score, classification_report, ConfusionMatrixDisplay

best_model = random_search.best_estimator_

# Make predictions on the unseen test data
y_pred_test = best_model.predict(X_test)
y_pred_proba_test = best_model.predict_proba(X_test)[:, 1] # Probabilities for the positive class

# --- Calculate and Print Final Metrics ---
final_roc_auc = roc_auc_score(y_test, y_pred_proba_test)
print(f"Final ROC AUC on Test Set: {final_roc_auc:.4f}")

print("\n--- Classification Report on Test Set ---")
print(classification_report(y_test, y_pred_test, target_names=['Income < $50k', 'Income > $50k']))




Final ROC AUC on Test Set: 0.9558

--- Classification Report on Test Set ---
               precision    recall  f1-score   support

Income < $50k       0.99      0.88      0.93     37429
Income > $50k       0.33      0.89      0.48      2476

     accuracy                           0.88     39905
    macro avg       0.66      0.89      0.71     39905
 weighted avg       0.95      0.88      0.91     39905





Experiment with f1_macro

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Tuned hyperparameters for the best model (LightGBM)
lgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(
        random_state=config.RANDOM_STATE,
        is_unbalance=True, # Continue handling class imbalance
        n_jobs=-1
    ))
])

# Define the parameter distribution to sample from
param_dist = {
    'classifier__n_estimators': randint(100, 1000),
    'classifier__learning_rate': uniform(0.01, 0.2),
    'classifier__num_leaves': randint(20, 50),
    'classifier__max_depth': randint(5, 20),
    'classifier__reg_alpha': uniform(0.1, 0.9), # L1 regularization
    'classifier__reg_lambda': uniform(0.1, 0.9)  # L2 regularization
}

n_iter_search = 25

random_search = RandomizedSearchCV(
    lgbm_pipeline,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=5, # 5-fold cross-validation
    scoring='f1_macro', # alternative metric
    random_state=config.RANDOM_STATE,
    n_jobs=-1,
    # verbose=2 # show progress
)

# Fit the search on the TRAINING data
random_search.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040646 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039535 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [In











[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.094016 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [In



[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.107023 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.092201 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [In






























[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.078425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551








[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.203286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.105432 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead o



[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.192769 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543




[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.176636 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.321297 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.71555



[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.335091 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543








[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.249049 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543




[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.190409 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677




[LightGBM] [Info] Number of positive: 7925, number of negative: 119770




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.187668 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551




[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048524 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.118315 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [In











[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.223013 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.262337 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM]























[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.110581 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.257369 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.076633 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.71567



[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.269653 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.247506 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [In











[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.091512 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.293373 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [In



[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064104 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677




[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551




[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041020 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.101045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [In



[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051420 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551




[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.154546 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074475 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [In



[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.312340 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.245392 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[Ligh































[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039710 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035199 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [In











[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074745 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.128588 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data point







[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.108876 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.357881 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [In



















[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056848 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.077113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [In



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.088241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551






[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074419 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061350 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data poi











[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.086676 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040253 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data point







[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.288833 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1673
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715543
[LightGBM] [Info] Start training from score -2.715543
[LightGBM] [Info] Number of positive: 7925, number of negative: 119769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.142569 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 250
[LightGBM] [In























[LightGBM] [Info] Number of positive: 7924, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031429 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 127694, number of used features: 249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062055 -> initscore=-2.715677
[LightGBM] [Info] Start training from score -2.715677




[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062062 -> initscore=-2.715551
[LightGBM] [Info] Start training from score -2.715551
[LightGBM] [Info] Number of positive: 7925, number of negative: 119770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 127695, number of used features: 250
[LightGBM] [In











[LightGBM] [Info] Number of positive: 9906, number of negative: 149712
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025675 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1680
[LightGBM] [Info] Number of data points in the train set: 159618, number of used features: 250
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.062061 -> initscore=-2.715573
[LightGBM] [Info] Start training from score -2.715573


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_distributions,"{'classifier__learning_rate': <scipy.stats....t 0x341acd220>, 'classifier__max_depth': <scipy.stats....t 0x341a3b020>, 'classifier__n_estimators': <scipy.stats....t 0x341a39af0>, 'classifier__num_leaves': <scipy.stats....t 0x341a3b8c0>, ...}"
,n_iter,25
,scoring,'f1_macro'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,0.005
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,boosting_type,'gbdt'
,num_leaves,41
,max_depth,18
,learning_rate,np.float64(0....7228206353053)
,n_estimators,489
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [18]:
import json

print("Best cross-validated f1 macro score from search: {:.4f}".format(random_search.best_score_))
print("\nBest parameters found:")
print(random_search.best_params_)

print(f"\nSaving best parameters to: {config.HYPERPARAMETERS_FILE_alt}")
with open(config.HYPERPARAMETERS_FILE_alt, 'w') as f:
    json.dump(random_search.best_params_, f, indent=4) # indent=4 makes it human-readable

print("Parameters saved successfully.")

# Compare to the default LightGBM score 
default_lgbm_score = results_df.loc['LightGBM', 'Mean F1 Macro']
print(f"\nDefault LightGBM score was: {default_lgbm_score:.4f}")
improvement = (random_search.best_score_ - default_lgbm_score) / default_lgbm_score * 100
print(f"Improvement from tuning: {improvement:.2f}%")

Best cross-validated f1 macro score from search: 0.7433

Best parameters found:
{'classifier__learning_rate': np.float64(0.16107228206353053), 'classifier__max_depth': 18, 'classifier__n_estimators': 489, 'classifier__num_leaves': 41, 'classifier__reg_alpha': np.float64(0.5920392514089517), 'classifier__reg_lambda': np.float64(0.26636900997297436)}


AttributeError: module 'src.config' has no attribute 'HYPERPARAMETERS_FILE_alt'