# Final Model Training

## Import Libraries and Load Data

In [1]:
%load_ext autoreload
%autoreload 2

In [103]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

import random

from difflib import SequenceMatcher

from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score, roc_auc_score
from sklearn.metrics import precision_score, accuracy_score, recall_score
from sklearn import svm

import optuna

from lightgbm import LGBMClassifier, LGBMRanker
from catboost import CatBoostClassifier, CatBoostRanker
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

from collections import Counter

from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings

In [60]:
# EDA Imports
from src.eda import null_columns_checker, column_investigator, find_unique_values, analyser_generic, tree_path_investigator
from src.eda import non_numeric_check, dimension_investigator

# FE Imports
from src.fe import indicate_nulls, fuzzy_search, dynamic_check_text, check_columns_on_query, check_columns_on_text 
from src.fe import dim_binner, check_columns_on_dim, column_dropper, add_new_features

# Feature and Model Selection Imports
from src.feature_model_selection import find_high_corr_pairs, manual_pr_draw, manual_auc_roc_draw, groupper

In [4]:
# Read the raw training set from its Feather file
train_file_path = 'data/train.feather'
ori_train_df = pd.read_feather(path=train_file_path)

## Split data

In [78]:
# Group-aware hold-out split on `query` (test_size=0.2, fixed random_state);
# only the first split produced by the generator is used.
gss = GroupShuffleSplit(test_size=0.2, random_state=42).split(ori_train_df, groups=ori_train_df['query'])
X_train_inds, X_val_inds = next(gss)

# Boolean mask over the columns marking the target column; train_df and val_df
# are row slices of ori_train_df, so the same mask applies to both.
is_target_col = ori_train_df.columns.isin(['is_relevant'])

train_df = ori_train_df.iloc[X_train_inds]
X_train = train_df.loc[:, ~is_target_col]
y_train = train_df.loc[:, is_target_col]

# val_df keeps every column (including `query`), which is needed for later predictions
val_df = ori_train_df.iloc[X_val_inds]
X_val = val_df.loc[:, ~is_target_col]
y_val = val_df.loc[:, is_target_col]

## Feature Engineering

In [83]:
# Run the feature-engineering pipeline on the training split; the log printed
# below lists the steps it reports.
# NOTE(review): the result is treated as a list further down (element 0 is taken
# as the feature frame, the rest is passed to the validation run) — confirm in src.fe.
train_output = add_new_features(train_df, 'train')

null_check_columns:  ['alt', 'sizes', 'class']
columns_to_drop:  ['crossorigin', 'ismap', 'longdesc', 'referrerpolicy']
Successfully dropping fully Null columns!

Successfully indicating partially Null columns!

Successfully adding text-based features!

Successfully adding url-based features!

rel_feature_potential:  ['figure', 'p', 'source']
irrel_feature_potential:  ['a']
Total Data covered: 47.4%
Successfully adding first tree-path feature!

rel_feature_potential:  ['figure', 'p', 'amp-img', 'getpreference', 'readme-toc', 'treasure-overlay-spinner', 'pre', 'task-lists', 'textarea', 'app-pharmacy-layout', 'tw-wrapper', 'app-project-detail', 'ps-carousel', 'bpc-app', 'z-widget', 'bpc-product', 'app-modal', 'swiper', 'brb', 'cont', 'object', 'app-photo-wall', 'rs-layer', 'app-product', 'titles', 'app-person', 'b-container', 'bpc-dynamic-view-carousel', 'bpc-product-card-base', 'enx-image', 'bpc-other-products-base', 'bpc-product-card-image', 'app-medicine-detail', 'flo-root', 'flo-head

In [84]:
# Split the pipeline result without mutating `train_output`, so re-running this
# cell is idempotent (a repeated `.pop(0)` would silently discard another element).
# The first element is the engineered training frame; the remaining elements are
# handed to the validation run of add_new_features.
X_train_fe, *val_inputs = train_output

In [85]:
# Run the same feature pipeline on the validation split, passing in the
# remaining outputs of the training run (`val_inputs`).
val_output = add_new_features(val_df, 'val', val_inputs)
# Element 0 of the returned list is used as the validation feature frame.
X_val_fe = val_output.pop(0)

Successfully dropping fully Null columns!

Successfully indicating partially Null columns!

Successfully adding text-based features!

Successfully adding url-based features!

Successfully adding first tree-path feature!

Successfully adding overall tree-path feature!

Successfully adding text-tag feature!

Successfully adding height-based feature!

Successfully adding sizes-based feature!

Successfully adding srcset-based feature!

Successfully adding width-based feature!

Successfully adding class-based feature!

Successfully adding style-based feature!

Successfully removing high correlated features!

Successfully reordering columns!



In [90]:
# Sanity check: the engineered train and validation frames must expose the same
# columns. Uses X_train_fe / X_val_fe (the frames built above) instead of the
# undefined `train_fe` / `val_fe`, which only worked through leftover kernel state.
print("Missing columns from X_train_fe: ", [col for col in X_val_fe.columns if col not in X_train_fe.columns])
print("Missing columns from X_val_fe: ", [col for col in X_train_fe.columns if col not in X_val_fe.columns])

Missing columns from X_train_fe:  []
Missing columns from X_val_fe:  []


## Model Training

In [93]:
# Preview the integer id assigned to each row's query
le = LabelEncoder().fit(train_df['query'])
le.transform(train_df['query'])

array([  0,   0,   0, ..., 585, 585, 585])

In [98]:
# Inputs for cross-validated tuning: features, target and one integer query id per row
X, y = X_train_fe, y_train
le = LabelEncoder().fit(train_df['query'])
kfold_group = le.transform(train_df['query'])

In [None]:
# Removed a stray bare `trial` expression: no `trial` is bound at this point on a
# fresh kernel, so the cell raised NameError and stopped "Restart & Run All".

In [110]:
def objective_lgbm(trial, X, y, kfold_group):
    """
    Optuna objective: cross-validated score of an LGBMRanker for one trial.

    :param trial: Optuna trial used to sample learning_rate, num_iterations,
        num_leaves and tree_learner (sampled in that order).
    :param X: Feature frame, sliced positionally with ``.iloc``.
    :param y: Target frame/series, sliced positionally with ``.iloc``.
    :param kfold_group: Per-row group ids handed to ``GroupKFold.split``.
    :return: Mean of ``roc_auc_score`` over the 4 GroupKFold folds.
    """
    # Fixed ranking settings plus the sampled values; keyword order mirrors the
    # order in which the trial is asked for suggestions.
    params = dict(
        objective='lambdarank',
        metric='ndcg',
        eval_at=[1, 10, 20, 30],
        learning_rate=trial.suggest_float('learning_rate', 0.1, 0.2),
        num_iterations=trial.suggest_int('num_iterations', 50, 200),
        num_leaves=trial.suggest_int('num_leaves', 20, 40),
        tree_learner=trial.suggest_categorical('tree_learner', ['serial', 'feature', 'data', 'voting']),
        verbose=-1,
    )

    def _fold_score(fit_rows, holdout_rows):
        # Fit on one fold's training rows and score its held-out rows.
        # NOTE(review): `groupper` is imported from src.feature_model_selection and
        # receives the row positions; presumably it derives the ranker's `group`
        # argument from them — confirm against its implementation.
        ranker = LGBMRanker(**params)
        ranker.fit(X.iloc[fit_rows], y.iloc[fit_rows], group=groupper(fit_rows))
        return roc_auc_score(y.iloc[holdout_rows], ranker.predict(X.iloc[holdout_rows]))

    splitter = GroupKFold(n_splits=4)
    fold_scores = [
        _fold_score(fit_rows, holdout_rows)
        for fit_rows, holdout_rows in splitter.split(X, y, kfold_group)
    ]

    # Average score across all folds
    return np.mean(fold_scores)

# Tuning inputs: engineered features, target and one integer query id per row
X = X_train_fe
y = y_train
le = LabelEncoder()
kfold_group = le.fit_transform(train_df['query'])

# Collect warnings raised during tuning into `captured_warnings` instead of
# letting them flood the cell output.
with warnings.catch_warnings(record=True) as captured_warnings:
    warnings.simplefilter("always")

    # Start the Optuna study. The sampler is seeded so the same hyperparameter
    # candidates are proposed on every "Restart & Run All".
    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(lambda trial: objective_lgbm(trial, X, y, kfold_group), n_trials=3)

best_params = study.best_trial.params
print(f'Best parameters: {best_params}')

NameError: name 'warnings' is not defined

In [None]:
import optuna
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

def objective(trial, model_class, X, y, groups, score_func, additional_params=None):
    """
    Objective function for hyperparameter tuning using Optuna.
    Allows for the training and evaluation of different models.

    :param trial: Optuna trial object.
    :param model_class: ML model class to be optimized. Its ``fit`` must accept a
        ``group`` keyword (as LightGBM rankers do).
    :param X: Feature data (positionally indexable via ``.iloc``).
    :param y: Target labels (positionally indexable via ``.iloc``).
    :param groups: Per-row group labels (one label per row of ``X``).
    :param score_func: Function ``score_func(y_true, y_pred)`` used to score each fold.
    :param additional_params: Additional model parameters not being tuned; they
        override the sampled values on key collision.
    :return: Mean of ``score_func`` over the 4 GroupKFold folds.
    """
    # Define hyperparameters to tune (these are for LightGBM, modify as needed for different models)
    hyperparams = {
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.2),
        'num_iterations': trial.suggest_int('num_iterations', 50, 200),
        'num_leaves': trial.suggest_int('num_leaves', 20, 40),
        'tree_learner': trial.suggest_categorical('tree_learner', ['serial', 'feature', 'data', 'voting']),
        'verbose': -1,
    }
    if additional_params:
        hyperparams.update(additional_params)

    # Positional array view of the labels, so `groups[train_index]` is positional
    # even when a pandas Series is passed in.
    groups = np.asarray(groups)

    # Initialize GroupKFold
    group_kfold = GroupKFold(n_splits=4)
    scores = []

    # Split the data and perform training and evaluation
    for train_index, val_index in group_kfold.split(X, y, groups):
        # LightGBM rankers expect `group` to hold the SIZE of each query block
        # (sum(group) == n_samples) with each query's rows contiguous, not one
        # label per row. Stable-sort the fold by label so every query forms one
        # block, then pass the per-query row counts in that same order.
        fold_labels = groups[train_index]
        by_query = np.argsort(fold_labels, kind='stable')
        train_index = np.asarray(train_index)[by_query]
        _, group_sizes = np.unique(fold_labels, return_counts=True)

        X_train_, X_val_ = X.iloc[train_index], X.iloc[val_index]
        y_train_, y_val_ = y.iloc[train_index], y.iloc[val_index]

        # Initialize and train the model
        model = model_class(**hyperparams)
        model.fit(X_train_, y_train_, group=group_sizes)  # Adjust for non-ranking models

        # Predictions and evaluation for each fold
        y_pred = model.predict(X_val_)
        score = score_func(y_val_, y_pred)
        scores.append(score)

    # Return the average score across all folds
    return np.mean(scores)

# Usage example
X = X_train_fe  # Your feature data
y = y_train  # Your target data
le = LabelEncoder()
# Encode the query of each training row; `train_df` is the frame the other
# tuning cells use (`train_fe` is never defined in this notebook).
groups = le.fit_transform(train_df['query'])  # Your group data

# For LightGBM ranking model
from lightgbm import LGBMRanker
additional_params = {'objective': 'lambdarank', 'metric': 'ndcg', 'eval_at': [1, 10, 20, 30]}

# Start the Optuna study. The sampler is seeded so the same hyperparameter
# candidates are proposed on every "Restart & Run All".
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(lambda trial: objective(trial, LGBMRanker, X, y, groups, roc_auc_score, additional_params), n_trials=3)

best_params = study.best_trial.params
print(f'Best parameters: {best_params}')


In [None]:
# Pair each feature name with the fitted model's importance value
col = X_train_fe_used.columns
coef = model.feature_importances_
coef_df = pd.DataFrame({'Feature': col, 'Coefficient': coef})

# Order rows by absolute importance, largest first
rows_by_magnitude = coef_df['Coefficient'].abs().sort_values(ascending=False).index
coef_df = coef_df.loc[rows_by_magnitude]

# Horizontal bars read better with long feature names
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(coef_df['Feature'], coef_df['Coefficient'], color='skyblue')
ax.set_xlabel('Coefficient Value')
ax.set_ylabel('Features')
ax.set_title('Importance of Features in Predicting Result (Gradient Boosting Feature Importance)')
ax.invert_yaxis()  # most important feature at the top
plt.show()

In [None]:
# NOTE(review): this binds the name `trial` to the optuna Trial *class*, not to a
# trial instance. The objective functions in this notebook receive `trial` as a
# parameter, so this looks like leftover scratch work — confirm and remove.
trial = optuna.trial.Trial

In [None]:
def objective(trial):
    """
    Objective function for hyperparameter tuning using Optuna.
    Trains a LightGBM ranking model and evaluates using GroupKFold.

    Reads the notebook-level ``X_train_fe``, ``y_train`` and ``train_df``.

    NOTE(review): an earlier cell defines a different function that is also named
    ``objective``; whichever cell runs last wins.

    :param trial: Optuna trial used to sample learning_rate, num_iterations,
        num_leaves and tree_learner.
    :return: Mean of ``roc_auc_score`` over the 4 GroupKFold folds.
    """
    # Define hyperparameters to tune
    hyperparams = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'eval_at': [1, 10, 20, 30],
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.2),
        'num_iterations': trial.suggest_int('num_iterations', 50, 200),
        'num_leaves': trial.suggest_int('num_leaves', 20, 40),
        # 'lambda_l1': trial.suggest_float('lambda_l1', 0.1, 0.9),
        # 'lambda_l2': trial.suggest_float('lambda_l2', 0.1, 0.9),
        'tree_learner': trial.suggest_categorical('tree_learner', ['serial', 'feature', 'data', 'voting']),
        'verbose': -1,
    }

    # Input
    X = X_train_fe
    y = y_train
    le = LabelEncoder()
    # Query ids come from `train_df`, the same frame the other tuning cells use;
    # `train_fe` is never defined in this notebook and raised NameError on a fresh kernel.
    kfold_group = le.fit_transform(train_df['query'])

    # Initialize GroupKFold
    group_kfold = GroupKFold(n_splits=4)
    scores = []

    # Split the data and perform training and evaluation
    for train_index, val_index in group_kfold.split(X, y, kfold_group):
        X_train_, X_val_ = X.iloc[train_index], X.iloc[val_index]
        y_train_, y_val_ = y.iloc[train_index], y.iloc[val_index]

        # Initialize and train the LightGBM ranker model
        # NOTE(review): `groupper` comes from src.feature_model_selection and is
        # given the row positions; confirm it returns what the ranker's `group` expects.
        ranker = LGBMRanker(**hyperparams)
        ranker.fit(X_train_, y_train_, group=groupper(train_index))

        # Predictions and evaluation for each fold
        y_pred = ranker.predict(X_val_)
        score = roc_auc_score(y_val_, y_pred)
        scores.append(score)

    # Return the average score across all folds
    return np.mean(scores)


# Start the Optuna study. The sampler is seeded so the same hyperparameter
# candidates are proposed on every "Restart & Run All".
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=3)

best_params = study.best_trial.params
print(f'{best_params=}')

In [None]:
# Fit a ranker with the hyperparameters selected by the study
model = LGBMRanker(**best_params)
model.fit(X_train_fe_used, y_train, group=train_groups)


def proba_to_predict(proba, threshold=0.5):
    """Return 1 where `proba` is strictly greater than `threshold`, else 0."""
    return (proba > threshold).astype(int)


# Score the validation features, then derive hard 0/1 labels from the scores
y_pred_proba = model.predict(X_val_fe_used, group=val_groups)
y_pred = proba_to_predict(y_pred_proba)

# ROC plot is given the raw scores; the PR plot is given the thresholded labels
manual_auc_roc_draw(y_val, y_pred_proba)
manual_pr_draw(y_val, y_pred)