<a href="https://colab.research.google.com/github/jaideepmurkute/Active-Learning-Supervised-Machine-Learning-With-Minimal-Labeled-Data/blob/master/s03_e03/play_s03e03_model_xgb_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! pip install wandb -U -qqq                                           
! pip install sklearn -U -qqq
! pip install xgboost==1.6.0 #-U -qqq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import xgboost
xgboost.__version__

'1.6.0'

In [4]:

import os
import random
import sys
import shutil

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import json

sns.set_style('darkgrid')
from sklearn.datasets import fetch_california_housing 

from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.linear_model import LinearRegression, SGDOneClassSVM
from sklearn.preprocessing import StandardScaler, RobustScaler
# from sklearn.neighbors import LocalOutlierFactor
# from sklearn.metrics import rmse

from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.datasets import fetch_california_housing 
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss

import xgboost as xgb
import torch

from scipy.stats.mstats import winsorize
from scipy.stats import mode

import wandb

# import reverse_geocoder
# import geopy
# import category_encoders as ce



In [5]:

def set_seeds(config):
  np.random.seed(config["random_state"])
  random.seed(config["random_state"])
  os.environ["PYTHONHASHSEED"] = str(config["random_state"])
  '''
  torch.manual_seed(config["random_state"])
  if torch.cuda.is_available():
      torch.cuda.manual_seed(config["random_state"])
      torch.cuda.manual_seed_all(config["random_state"])
      torch.backends.cudnn.deterministic = True
      torch.backends.cudnn.benchmark = True 
  '''


def generate_fold_idx(config, train_df, group_col=None):
  if config['fold_split_type'] == 'kfold':
    splitter = KFold(n_splits=config['num_folds'], shuffle=True, 
                                     random_state=config['random_state'])
  elif config['fold_split_type'] == 'strat_kfold':
    splitter = StratifiedKFold(n_splits=config['num_folds'], shuffle=True, 
                                     random_state=config['random_state'])
  elif config['fold_split_type'] == 'group_kfold':
    splitter = GroupKFold(n_splits=config['num_folds'])
  else:
    raise ValueError("fold_split_type {} not recognized... Choose from: \
                    time_series_split, group_time_series_split, purged_time_series_split, kfold")
  
  fold_idx_dict = dict()
  if config['fold_split_type'] == 'group_kfold':
    if group_col in train_df.columns:
      for fold_idx, (train_idx, val_idx) in enumerate(splitter.split(X=train_df, 
                                                                    groups=train_df[group_col].values)):
        fold_idx_dict[fold_idx] = dict()
        fold_idx_dict[fold_idx]['train_idx'] = train_idx
        fold_idx_dict[fold_idx]['val_idx'] = val_idx
  else:
    for fold_idx, (train_idx, val_idx) in enumerate(splitter.split(X=train_df, y=train_df.Attrition.values)):
      fold_idx_dict[fold_idx] = dict()
      fold_idx_dict[fold_idx]['train_idx'] = train_idx
      fold_idx_dict[fold_idx]['val_idx'] = val_idx
  
  return fold_idx_dict


def save_model(config, model):
  model_save_fname = config['model_name'] + '_fold_' + str(config['curr_fold']) + '.xgb'
  
  model_local_save_path = config['local_model_dir'] + '/' + model_save_fname
  model_drive_save_path = config['drive_model_dir'] + '/' + model_save_fname

  print('Saving model...')
  model.save_model(model_local_save_path)

  print('Copying model to drive...')
  shutil.copy(model_local_save_path, model_drive_save_path)
  

def load_model(config):
  model_save_fname = config['model_name'] + '_fold_' + str(config['curr_fold']) + '.xgb'
  
  model_local_save_path = config['local_model_dir'] + '/' + model_save_fname
  model_drive_save_path = config['drive_model_dir'] + '/' + model_save_fname

  if not os.path.exists(model_local_save_path):
    shutil.copy(model_drive_save_path, model_local_save_path)
  
  print('Loading model...')
  model = xgb.XGBClassifier()
  model.load_model(model_local_save_path)

  return model



def get_xgb_params(config):
  xgb_params = {
            'random_state': config['random_state'], 
            'n_jobs': config['n_jobs'], 
            'verbosity': config['verbosity'], 
            
            'tree_method': config['tree_method'], 
            'max_depth': config['max_depth'], 
            'max_leaves': config['max_leaves'], 
            'n_estimators': config['n_estimators'], 
            'early_stopping_rounds': config['early_stopping_rounds'], 
            
            'colsample_bytree': config['colsample_bytree'], 
            'subsample': config['subsample'], 
            'reg_alpha': config['reg_alpha'], 
            'reg_lambda': config['reg_lambda'], 
            'enable_categorical': config['enable_categorical'], 
            
            'learning_rate': config['learning_rate'], 
            'objective': config['objective'], 
            'eval_metric': config['eval_metric'],
            
            # 'num_class': config['num_classes'],
        }
  
  return xgb_params



In [6]:

def get_data(config):
  '''
  data_type : feature / orig
  '''
  if config['feature_version'] == 'v0':
    for fname in ['train.csv', 'test.csv', 'sample_submission.csv']:
      shutil.copy(os.path.join(config['drive_data_dir'], fname), 
                  os.path.join(config['local_data_dir'], fname)
                  )
    train_df = pd.read_csv(os.path.join(config['local_data_dir'], 'train.csv'))
    test_df = pd.read_csv(os.path.join(config['local_data_dir'], 'test.csv'))
    sub_df = pd.read_csv(os.path.join(config['local_data_dir'], 'sample_submission.csv'))

    train_df.drop(['id'], axis=1, inplace=True)
    
    if config['include_orig_data']:
      for fname in ['orig_data.csv']:
        shutil.copy(os.path.join(config['drive_data_dir'], fname), 
                    os.path.join(config['local_data_dir'], fname)
                    )
      orig_data_df = pd.read_csv(os.path.join(config['local_data_dir'], 'orig_data.csv'))
      orig_data_df['Attrition'] = orig_data_df['Attrition'].replace('Yes', 1).replace('No', 0)
      orig_data_df.drop(['EmployeeNumber'], axis=1, inplace=True)

      train_df = pd.concat((train_df, orig_data_df), axis=0)
      
  else:
    train_feature_fname = 'train_features_' + config['feature_version'] + '.csv'
    test_feature_fname = 'test_features_' + config['feature_version'] + '.csv'
    for fname in [train_feature_fname, test_feature_fname]:
      shutil.copy(os.path.join(config['drive_feature_dir'], fname), 
                  os.path.join(config['local_feature_dir'], fname)
                  )
    
    for fname in ['train.csv', 'test.csv', 'sample_submission.csv']:
      shutil.copy(os.path.join(config['drive_data_dir'], fname), 
                  os.path.join(config['local_data_dir'], fname)
                  )
    train_df = pd.read_csv(os.path.join(config['local_feature_dir'], train_feature_fname))
    test_df = pd.read_csv(os.path.join(config['local_feature_dir'], test_feature_fname))
    sub_df = pd.read_csv(os.path.join(config['local_data_dir'], 'sample_submission.csv'))
      
  
  return train_df, test_df, sub_df
  

def encode_features(config, train_df, test_df):
  
  config['nominal_categorical_cols'] = ['Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']
  config['ordinal_categorical_cols'] = ['BusinessTravel', 'Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 
                              'PerformanceRating',  'RelationshipSatisfaction', 'StockOptionLevel', 'TrainingTimesLastYear',
                              'WorkLifeBalance']
  config['ordinal_continuous_cols'] = ['Age', 'DailyRate', 'DistanceFromHome', 'EmployeeCount', 'HourlyRate', 'MonthlyIncome', 'MonthlyRate', 
                             'NumCompaniesWorked', 'PercentSalaryHike', 'StandardHours', 'TotalWorkingYears', 'YearsAtCompany', 
                             'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

  # one-hot encoding for nominal_categorical_cols
  train_df_encoded = pd.get_dummies(train_df, columns=config['nominal_categorical_cols'], prefix=config['nominal_categorical_cols'])
  test_df_encoded = pd.get_dummies(test_df, columns=config['nominal_categorical_cols'], prefix=config['nominal_categorical_cols'])
  
  # ---------------

  # Convert ordinal_categorical column BusinessTravel, with non-numeric values, to numeric values
  BusinessTravel_map = {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2}
  train_df_encoded.BusinessTravel = train_df_encoded.BusinessTravel.map(BusinessTravel_map)
  test_df_encoded.BusinessTravel = test_df_encoded.BusinessTravel.map(BusinessTravel_map)

  # ---------------

  # Handle case of if some categories/values are missing in the test set; so columns mismatch
  missing_cols = set( train_df_encoded.columns ) - set( test_df_encoded.columns )
  print("missing_cols: ", missing_cols)
  # Add a missing column in test set with default value equal to 0
  for c in missing_cols: test_df_encoded[c] = 0
  
  # Ensure the order of column in the test set is in the same order than in train set
  test_df_encoded = test_df_encoded[train_df_encoded.columns]
  
  for col in train_df_encoded.columns:
    if col not in test_df_encoded.columns:
      print("col {} present in train_df_encoded but not in test_df_encoded...")
      raise
  
  # -----------
  # -----------
  
  config['encoded_nominal_categorical_cols'] = []
  for orig_nom_cat_col in config['nominal_categorical_cols']:
    for col in train_df_encoded.columns:
      if col.startswith(orig_nom_cat_col+'_'):
        config['encoded_nominal_categorical_cols'].append(col)
  print("config['encoded_nominal_categorical_cols']: ", config['encoded_nominal_categorical_cols'])
  

  # dupl_np_encoded_nominal_categorical_cols = []
  # np_encoded_nominal_categorical_cols = np.array(config['encoded_nominal_categorical_cols'])
  # for col in np_encoded_nominal_categorical_cols:
  #   if len(np_encoded_nominal_categorical_cols[np_encoded_nominal_categorical_cols==col]) > 1:
  #     dupl_np_encoded_nominal_categorical_cols.append(col)
  # print("dupl_np_encoded_nominal_categorical_cols: ", dupl_np_encoded_nominal_categorical_cols)
  # raise


  config['encoded_ordinal_categorical_cols'] = []
  for orig_ord_cat_col in config['ordinal_categorical_cols']:
    for col in train_df_encoded.columns:
      if col.startswith(orig_ord_cat_col+'_'):
        config['encoded_ordinal_categorical_cols'].append(col)
  print("config['encoded_ordinal_categorical_cols']: ", config['encoded_ordinal_categorical_cols'])
  
  # -----------

  return train_df_encoded, test_df_encoded
  


def get_feature_cols(config, train_df):
  config['id_cols'] = ['id']

  config['cont_cols'] = config['ordinal_continuous_cols']
  
  '''
  # -------------------
  dupl_encoded_nominal_categorical_cols = []
  np_feature_cols = np.array(config['encoded_nominal_categorical_cols'])
  for col in np_feature_cols:
    if len(np_feature_cols[np_feature_cols == col]) > 1:
      dupl_encoded_nominal_categorical_cols.append(col)
  print("dupl_encoded_nominal_categorical_cols: ", dupl_encoded_nominal_categorical_cols)
  # ----------------------
  dupl_encoded_ordinal_categorical_cols = []
  np_feature_cols = np.array(config['encoded_ordinal_categorical_cols'])
  for col in np_feature_cols:
    if len(np_feature_cols[np_feature_cols == col]) > 1:
      dupl_encoded_ordinal_categorical_cols.append(col)
  print("dupl_encoded_ordinal_categorical_cols: ", dupl_encoded_ordinal_categorical_cols)
  # ----------------------
  overlap_cols = []
  for col in config['encoded_nominal_categorical_cols']:
    if col in config['encoded_ordinal_categorical_cols']:
      overlap_cols.append(col)
  # print(f"col: {col} is in both encoded_nominal_categorical_cols & encoded_ordinal_categorical_cols")
  print("cols in both type of categorical features: ", overlap_cols)
  # ----------------------

  raise
  '''
  config['cat_cols'] = config['encoded_nominal_categorical_cols'] + config['encoded_ordinal_categorical_cols']

  config['target_cols'] = ['Attrition']

  config['feature_cols'] = config['cont_cols'] + config['cat_cols']

  return config



In [7]:
def scale_data(train_df, test_df, cols, method):
  if method == 'standard':
    scaler = StandardScaler()
  elif method == 'robust':
    scaler = RobustScaler()
  
  scaler.fit(train_df[cols].values)
  train_df[cols] = scaler.transform(train_df[cols])
  test_df[cols] = scaler.transform(test_df[cols])

  return train_df, test_df
    

In [8]:

def IQR_outlier_handling(train_df, test_df, cols, handling_type):
  for col in cols:
    # calculate interquartile range
    q25, q75 = np.percentile(train_df[col].values, 25), np.percentile(train_df[col].values, 75)
    iqr = q75 - q25
    
    # calculate the outlier cutoff
    cut_off = iqr * 1.5
    lower_cutoff, upper_cutoff = q25 - cut_off, q75 + cut_off
    
    num_outliers = train_df[col].loc[(train_df[col] < lower_cutoff) | (train_df[col] > upper_cutoff)].shape[0]
    print("col: {} \t # num_outliers: {}".format(col, num_outliers))

    if handling_type == 'remove_train_clip_test':
      train_df[col] = train_df[col].loc[(not(train_df[col] < lower_cutoff)) & (not(train_df[col] > upper_cutoff))]
      if col in test_df.columns:
        test_df[col].loc[test_df[col] < lower_cutoff] = lower_cutoff
        test_df[col].loc[test_df[col] > upper_cutoff] = upper_cutoff
    elif handling_type == 'clip':
      train_df[col].loc[train_df[col] < lower_cutoff] = lower_cutoff
      train_df[col].loc[train_df[col] > upper_cutoff] = upper_cutoff
      if col in test_df.columns:
        test_df[col].loc[test_df[col] < lower_cutoff] = lower_cutoff
        test_df[col].loc[test_df[col] > upper_cutoff] = upper_cutoff
      
    return train_df, test_df



def winsorize(train_df, test_df, cols, lower_lim=0.01, upper_lim=0.98):
  # lower_lim = train_df.quantile(0.01)
  # upper_lim = train_df.quantile(0.99)
  for col in cols:
    train_df[col] = winsorize(train_df[col], (lower_lim, upper_lim))
    test_df[col] = winsorize(test_df[col], (lower_lim, upper_lim))
  
  return train_df, test_df



def isolation_forest_outlier_handling(train_df, test_df, cols, outlier_thresh=-0.1, 
                                      handling_method='drop_median', seed=0):
  print("Training Isolation forest model to detect outliers...")
  iso_forest_model = IsolationForest(n_estimators=500, contamination='auto', random_state=seed)
  iso_forest_model.fit(train_df[cols], train_df.MedHouseVal.values)
  
  sample_scores_train = iso_forest_model.decision_function(train_df[cols])
  sample_scores_test = iso_forest_model.decision_function(test_df[cols])

  print("# train outliers: ", np.sum(sample_scores_train < outlier_thresh))
  print("# test outliers: ", np.sum(sample_scores_test < outlier_thresh))
  
  if handling_method == 'drop_median':
    print("Dropping outlier train samples...")
    # drop train samples and replace test sample values with median from train columns
    train_df = train_df.loc[sample_scores_train >= outlier_thresh]
    
    print("Clipping outlier test samples to median value...")
    for col in cols:
      test_df[col].loc[sample_scores_test < outlier_thresh] = train_df[col].median(axis=0)
  elif handling_method == 'winsorize':
    train_df.loc[sample_scores_train < outlier_thresh] = winsorize(train_df.loc[sample_scores_train < outlier_thresh], 
                                                                  cols, lower_lim=0.01, upper_lim=0.98)
    test_df.loc[sample_scores_test < outlier_thresh] = winsorize(test_df.loc[sample_scores_test < outlier_thresh], 
                                                                  cols, lower_lim=0.01, upper_lim=0.98)
    
  return train_df, test_df


def handle_outliers(config, train_df, test_df, cols, method):
  
  if method == 'winsorize':
    train_df, test_df = winsorize(train_df, test_df, cols=cols)
  elif method == 'iso_forest':
    train_df, test_df = isolation_forest_outlier_handling(train_df, test_df, cols=cols, 
                                  outlier_thresh=-0.1, handling_method='drop_median', seed=config['seed'])
  elif method == 'iqr':
    train_df, test_df = IQR_outlier_handling(train_df, test_df, cols, handling_type='clip')

  return train_df, test_df


In [9]:

def rmse(label, pred):
  return np.sqrt(np.mean((label-pred)**2))

def mse(label, pred):
  return np.mean((label-pred)**2)

def get_config():
  return config

def save_config(config):
  config_to_save = {}  # to avoid types like object or others that somtimes cause problem reading data back.
  for k, v in config.items():
    if isinstance(v, (bool, int, float, str, list, dict)):
      config_to_save[k] = v
  
  config_local_save_path = os.path.join(config['local_model_dir'], 'saved_config.json')
  config_drive_save_path = os.path.join(config['drive_model_dir'], 'saved_config.json')
  
  with open(config_local_save_path, 'w') as fp:
    json.dump(config_to_save, fp, indent=4, sort_keys=True)
  
  shutil.copy(config_local_save_path, config_drive_save_path)


def train_k_folds():

  # needed becuse variable that is updated within function becomes a local variable and has to be passed in.
  config = get_config()
  # print("1")
  print("config: ", config)

  create_paths(config)
  # print("2")
  train_df, test_df, sub_df = get_data(config)
  print("train_df.shape: ", train_df.shape)
  
  train_df, test_df = encode_features(config, train_df, test_df)

  config = get_feature_cols(config, train_df)
  print("config['feature_cols']: ", config['feature_cols'])

  
  
  # dupl_cols = []
  # np_feature_cols = np.array(config['feature_cols'])
  # for col in np_feature_cols:
  #   if len(np_feature_cols[np_feature_cols == col]) > 1:
  #     dupl_cols.append(col)
  # print("dupl_cols: ", dupl_cols)

  # -------------

  # if config['handle_outliers']:
  #   # handle outliers
  #   print("Before outlier handling: ")
  #   print(f"train_df.shape: {train_df.shape} \t test_df.shape: {test_df.shape}")
  #   train_df, test_df = handle_outliers(config, train_df, test_df, cols=float_feature_cols, method='iqr')
  #   print("After outlier handling: ")
  #   print(f"train_df.shape: {train_df.shape} \t test_df.shape: {test_df.shape}")
  
  # if config['scale_data']:
  #   print("Scaling data...")
  #   # scale data
  #   # float_feature_cols = [col for col in config['feature_cols'] if col not in ['Latitude', 'Longitude', 'MedHouseVal']]
  #   train_df, test_df = scale_data(train_df, test_df, cols=float_feature_cols, method='robust')

  # ------------------
  
  fold_idx_dict = generate_fold_idx(config, train_df)
  
  per_model_metrics = {
                        'AUROC': {'train': [], 'val': []},
                        'f1_score': {'train': [], 'val': []},
                        'logloss': {'train': [], 'val': []},
                      }
  
  shutil.copy('/content/drive/MyDrive/Playground Series/S03_E03/code/play_s03e03_model_xgb_1.ipynb', 
              os.path.join(config['drive_model_dir'], 'model_2.ipynb'))
  save_config(config)

  # fold_model_name = config['model_name'] + '_fold_' + str(config['curr_fold'])
  # if config['use_wandb']:
  wandb.init(name=config['model_name'], project=config['project_name'], 
            tags=['baseline'], config=config)
  if config['choice'] == 3:
    print("Updating sweep configs...")
    for k, v in wandb.config.items():
      config[k] = v
    print("*** Updated sweep config: ", config)
    
  for fold_num in range(config['num_folds']):
    if fold_num not in config['folds_to_train']:
      continue

    print("Training fold: ", fold_num)
    config['curr_fold'] = fold_num

    # -----------
    # fold_model_name = config['model_name'] + '_fold_' + str(config['curr_fold'])
    # # if config['use_wandb']:
    # wandb.init(name=fold_model_name, project=config['project_name'], 
    #           tags=['baseline'], config=config)
    # if config['choice'] == 3:
    #   print("Updating sweep configs...")
    #   for k, v in wandb.config.items():
    #     config[k] = v
    #   print("*** Updated sweep config: ", config)
    
    set_seeds(config)
    
    # -----------
    
    train_idx = fold_idx_dict[fold_num]['train_idx']
    val_idx = fold_idx_dict[fold_num]['val_idx']
    print("len(train_idx): {} \t len(val_idx): {}".format(len(train_idx), len(val_idx)))
    
    # print("config['feature_cols']: ", config['feature_cols'])
    train_data = train_df[config['feature_cols']].iloc[train_idx]
    train_label = train_df[config['target_cols']].iloc[train_idx]

    val_data = train_df[config['feature_cols']].iloc[val_idx]
    val_label = train_df[config['target_cols']].iloc[val_idx]

    xgb_params = get_xgb_params(config)
    if config['scale_pos_weight'] == 'auto':
      auto_pos_cls_weight = train_label[train_label.values==0].shape[0] / train_label[train_label.values==1].shape[0]
      print("Setting scale_pos_weight to: ", auto_pos_cls_weight)
      xgb_params['scale_pos_weight'] = auto_pos_cls_weight
    else:
      xgb_params['scale_pos_weight'] = config['scale_pos_weight']
    

    model = xgb.XGBClassifier(**xgb_params)

    # print("train_data.shape: ", train_data.shape)
    # print("train_label.shape: ", train_label.shape)
    # print("train_label[:10]: ", train_label[:10])
    
    # print("val_data.shape: ", val_data.shape)
    # print("val_label.shape: ", val_label.shape)
    # print("val_label[:10]: ", val_label[:10])
    
    print("Training model ...")
    model.fit(train_data, train_label, 
              # early_stopping_rounds=config['early_stopping_rounds'], 
              # eval_metric=config['eval_metric'],
              eval_set=[(train_data, train_label), 
                        (val_data, val_label)], 
              verbose=50, 
              )
    
    print("Saving Model...")
    save_model(config, model)

    # -----------
    # print(model.evals_result())
    for i in range(len(model.evals_result()['validation_0'][config['eval_metric']])):
      wandb.log({
        "Per Epoch Train AUC ": model.evals_result()['validation_0'][config['eval_metric']][i], 
        "Per Epoch Val AUC": model.evals_result()['validation_1'][config['eval_metric']][i], 
        }
      )
    # raise
    # print("Best val epoch: min validation_0 auc: ", np.min(model.evals_result()['validation_0'][config['eval_metric']]))
    
    
    print("Best val epoch: min validation_1 auc: ", np.min(model.evals_result()['validation_1'][config['eval_metric']]))
    print("model.best_ntree_limit: ", model.best_ntree_limit)
    
    # print('train_label[:10]: ', train_label[:10])
    # print('val_label[:10]: ', val_label[:10])
    
    train_pred_probs = model.predict_proba(train_data, ntree_limit=model.best_ntree_limit)
    val_pred_probs = model.predict_proba(val_data, ntree_limit=model.best_ntree_limit)
    # print("train_pred_probs.shape: ", train_pred_probs.shape)
    # print("val_pred_probs.shape: ", val_pred_probs.shape)
    # print("train_pred_probs[:10]: ", train_pred_probs[:10])
    # print("val_pred_probs[:10]: ", val_pred_probs[:10])
    
    train_pred_cls = np.argmax(train_pred_probs, axis=1)
    # train_pred_cls = np.copy(train_pred_probs)
    # train_pred_cls[train_pred_cls <= 0.5] = 0
    # train_pred_cls[train_pred_cls > 0.5] = 1
    
    val_pred_cls = np.argmax(val_pred_probs, axis=1)
    # val_pred_cls = np.copy(val_pred_probs)
    # val_pred_cls[val_pred_cls <= 0.5] = 0
    # val_pred_cls[val_pred_cls > 0.5] = 1

    # print("train_pred_cls.shape: ", train_pred_cls.shape)
    # print("val_pred_cls.shape: ", val_pred_cls.shape)

    train_pred_cont = train_pred_probs[:, 1]
    val_pred_cont = val_pred_probs[:, 1]
    # print("train_pred_cont.shape: ", train_pred_cont.shape)
    # print("val_preds_cont.shape: ", val_preds_cont.shape)

    train_auroc = roc_auc_score(train_label.values, train_pred_cont)
    val_auroc = roc_auc_score(val_label.values, val_pred_cont)

    train_f1 = f1_score(train_label.values, train_pred_cls)
    val_f1 = f1_score(val_label.values, val_pred_cls)
    
    train_logloss = log_loss(train_label.values, train_pred_probs)
    val_logloss = log_loss(val_label.values, val_pred_probs)
    
    per_model_metrics['AUROC']['train'] = train_auroc
    per_model_metrics['AUROC']['val'] = val_auroc
    per_model_metrics['f1_score']['train'] = train_f1
    per_model_metrics['f1_score']['val'] = val_f1
    per_model_metrics['logloss']['train'] = train_logloss
    per_model_metrics['logloss']['val'] = val_logloss

    print(f"AUROC: Train: {train_auroc} \t Val: {val_auroc}")
    print(f"F1 score: Train: {train_f1} \t Val: {val_f1}")
    print(f"Log loss: Train: {train_logloss} \t Val: {val_logloss}")
    
    # wandb.log({
    #     "Best Epoch Train AUROC": train_auroc, 
    #     "Best Epoch Val AUROC": val_auroc,

    #     "Best Epoch Train F1": train_f1, 
    #     "Best Epoch Val F1": val_f1,
        
    #     "Best Epoch Train Log Loss": train_logloss, 
    #     "Best Epoch Val Log Loss": val_logloss,
    #     }
    #   )

    print('-'*30)
  
  print("Fold average stats.: ")
  print(f"AUROC: Train: {np.mean(per_model_metrics['AUROC']['train'])} \t Val: {np.mean(per_model_metrics['AUROC']['val'])}")
  print(f"F1 score: Train: {np.mean(per_model_metrics['f1_score']['train'])} \t Val: {np.mean(per_model_metrics['f1_score']['val'])}")
  print(f"Log loss: Train: {np.mean(per_model_metrics['logloss']['train'])} \t Val: {np.mean(per_model_metrics['logloss']['val'])}")

  wandb.log({
        "Fold average Train AUROC": np.mean(per_model_metrics['AUROC']['train']), 
        "Fold average Val AUROC": np.mean(per_model_metrics['AUROC']['val']),
        }
      )
  




In [10]:

def create_paths(config):
  config['local_model_dir'] = '/content/model_store'
  config['drive_model_dir'] = os.path.join(config['drive_project_dir'], 'model_store')
  
  if not os.path.exists(config['local_model_dir']):
    os.mkdir(config['local_model_dir'])
  
  if not os.path.exists(config['drive_model_dir']):
    os.mkdir(config['drive_model_dir'])
  
  # -------------

  config['local_model_dir'] = os.path.join(config['local_model_dir'], config['model_name']) 
  config['drive_model_dir'] = os.path.join(config['drive_model_dir'], config['model_name']) 

  if not os.path.exists(config['local_model_dir']): 
    os.mkdir(config['local_model_dir'])
  if not os.path.exists(config['drive_model_dir']): 
    os.mkdir(config['drive_model_dir'])
  
  # -------------

  config['local_data_dir'] = '/content/data'
  config['drive_data_dir'] = os.path.join(config['drive_project_dir'], 'data/comp_data')

  config['local_feature_dir'] = '/content/feature_store'
  config['drive_feature_dir'] = os.path.join(config['drive_project_dir'], 'feature_store')

  if not os.path.exists(config['local_data_dir']):
    os.mkdir(config['local_data_dir'])
  
  if not os.path.exists(config['local_feature_dir']):
    os.mkdir(config['local_feature_dir'])



In [11]:

def test_model(config):
  create_paths(config)
  
  
  train_df, test_df, sub_df = get_data(config)
  print("train_df.shape: ", train_df.shape)

  test_ids = test_df.id.values
  
  train_df, test_df = encode_features(config, train_df, test_df)

  config = get_feature_cols(config, train_df)

  # ------------------
  
  # float_feature_cols = [col for col in config['feature_cols'] if col not in ['Latitude', 'Longitude', 'MedHouseVal']]

  # if config['handle_outliers']:
  #   # handle outliers
  #   print("Before outlier handling: ")
  #   print(f"train_df.shape: {train_df.shape} \t test_df.shape: {test_df.shape}")
  #   train_df, test_df = handle_outliers(config, train_df, test_df, cols=float_feature_cols, method='iqr')
  #   print("After outlier handling: ")
  #   print(f"train_df.shape: {train_df.shape} \t test_df.shape: {test_df.shape}")
  
  # if config['scale_data']:
  #   print("Scaling data...")
  #   # scale data
  #   # float_feature_cols = [col for col in config['feature_cols'] if col not in ['Latitude', 'Longitude', 'MedHouseVal']]
  #   train_df, test_df = scale_data(train_df, test_df, cols=float_feature_cols, method='robust')
    

  # ------------------

  fold_idx_dict = generate_fold_idx(config, train_df)
  
  per_model_metrics = {
                        'AUROC': {'train': [], 'val': []},
                        'f1_score': {'train': [], 'val': []},
                        'logloss': {'train': [], 'val': []},
                      }

  per_fold_test_pred_probs = None
  per_fold_test_pred_cls = None
  pred_cnt = 0

  for fold_num in range(config['num_folds']):
    if fold_num not in config['folds_to_train']:
      continue
    pred_cnt += 1
    print("Training fold: ", fold_num)
    config['curr_fold'] = fold_num

    # -----------
    fold_model_name = config['model_name'] + '_fold_' + str(config['curr_fold'])

    train_idx = fold_idx_dict[fold_num]['train_idx']
    val_idx = fold_idx_dict[fold_num]['val_idx']
    print("len(train_idx): {} \t len(val_idx): {}".format(len(train_idx), len(val_idx)))
    
    train_data = train_df[config['feature_cols']].iloc[train_idx]
    train_label = train_df[config['target_cols']].iloc[train_idx]

    val_data = train_df[config['feature_cols']].iloc[val_idx]
    val_label = train_df[config['target_cols']].iloc[val_idx]

    test_data = test_df[config['feature_cols']]
    
    # --------

    model = load_model(config)

    train_pred_probs = model.predict_proba(train_data, ntree_limit=model.best_ntree_limit)
    val_pred_probs = model.predict_proba(val_data, ntree_limit=model.best_ntree_limit)
    test_pred_probs = model.predict_proba(test_data, ntree_limit=model.best_ntree_limit)

    # train_pred_cls = np.copy(train_pred_probs)
    # train_pred_cls[train_pred_cls <= 0.5] = 0
    # train_pred_cls[train_pred_cls > 0.5] = 1
    
    # val_pred_cls = np.copy(val_pred_probs)
    # val_pred_cls[val_pred_cls <= 0.5] = 0
    # val_pred_cls[val_pred_cls > 0.5] = 1

    # test_pred_cls = np.copy(test_pred_probs)
    # test_pred_cls[test_pred_cls <= 0.5] = 0
    # test_pred_cls[test_pred_cls > 0.5] = 1

    train_pred_cls = np.argmax(train_pred_probs, axis=1)
    val_pred_cls = np.argmax(val_pred_probs, axis=1)
    test_pred_cls = np.argmax(test_pred_probs, axis=1)

    train_pred_cont = train_pred_probs[:, 1]
    val_pred_cont = val_pred_probs[:, 1]
    test_pred_cont = test_pred_probs[:, 1]
    
    # --------

    train_auroc = roc_auc_score(train_label.values, train_pred_cont)
    val_auroc = roc_auc_score(val_label.values, val_pred_cont)
    train_f1 = f1_score(train_label.values, train_pred_cls)
    val_f1 = f1_score(val_label.values, val_pred_cls)
    train_logloss = log_loss(train_label.values, train_pred_probs)
    val_logloss = log_loss(val_label.values, val_pred_probs)
    
    per_model_metrics['AUROC']['train'] = train_auroc
    per_model_metrics['AUROC']['val'] = val_auroc
    per_model_metrics['f1_score']['train'] = train_f1
    per_model_metrics['f1_score']['val'] = val_f1
    per_model_metrics['logloss']['train'] = train_logloss
    per_model_metrics['logloss']['val'] = val_logloss

    print(f"AUROC: Train: {train_auroc} \t Val: {val_auroc}")
    print(f"F1 score: Train: {train_f1} \t Val: {val_f1}")
    print(f"Log loss: Train: {train_logloss} \t Val: {val_logloss}")
    
    
    # test_pred_probs = test_pred_probs.flatten()
    test_pred_cls = np.reshape(test_pred_cls, newshape=(test_pred_cls.shape[0], 1))
    # if per_fold_test_pred_probs is None:
    #   per_fold_test_pred_probs = test_pred_probs
    #   per_fold_test_pred_cls = test_pred_cls
    # else:
    #   per_fold_test_pred_probs = np.concatenate((per_fold_test_pred_probs, test_pred_probs), axis=1)
    #   per_fold_test_pred_cls = np.concatenate((per_fold_test_pred_cls, test_pred_cls), axis=1)
    if per_fold_test_pred_probs is None:
      per_fold_test_pred_probs = test_pred_probs
      per_fold_test_pred_cls = test_pred_cls
    else:
      per_fold_test_pred_probs += test_pred_probs
      per_fold_test_pred_cls = np.concatenate((per_fold_test_pred_cls, test_pred_cls), axis=1)


  print("Fold average stats.: ")
  print(f"AUROC: Train: {np.mean(per_model_metrics['AUROC']['train'])} \t Val: {np.mean(per_model_metrics['AUROC']['val'])}")
  print(f"F1 score: Train: {np.mean(per_model_metrics['f1_score']['train'])} \t Val: {np.mean(per_model_metrics['f1_score']['val'])}")
  print(f"Log loss: Train: {np.mean(per_model_metrics['logloss']['train'])} \t Val: {np.mean(per_model_metrics['logloss']['val'])}")

  print("per_fold_test_pred_probs.shape: ", per_fold_test_pred_probs.shape)
  print("per_fold_test_pred_cls.shape: ", per_fold_test_pred_cls.shape)

  # avg_test_pred_probs = np.mean(per_fold_test_pred_probs, axis=1).flatten()
  avg_test_pred_probs = per_fold_test_pred_probs / pred_cnt
  mode_test_pred_cls, _ = mode(per_fold_test_pred_cls, axis=1)
  mode_test_pred_cls = mode_test_pred_cls.flatten()
  print("avg_test_pred_probs.shape: ", avg_test_pred_probs.shape)
  print("mode_test_pred_cls.shape: ", mode_test_pred_cls.shape)
  print("avg_test_pred_probs[:10]: ", avg_test_pred_probs[:10])
  print("mode_test_pred_cls[:10]: ", mode_test_pred_cls[:10])
  
  sub_df = pd.DataFrame([])
  sub_df['id'] = test_ids # test_df.id.values
  sub_df['Attrition'] = avg_test_pred_probs[:, 1]
  sub_df.to_csv(os.path.join(config['local_model_dir'], 'sample_submission.csv'), index=False)
  shutil.copy(os.path.join(config['local_model_dir'], 'sample_submission.csv'), 
              os.path.join(config['drive_model_dir'], 'sample_submission.csv'))




In [12]:
'''
 SWEEP best config: 

max_depth': 2, 
    'max_leaves': 8,  
    'learning_rate': 0.1, 

    'reg_alpha': 1.0, 
    'reg_lambda': 5.0, 
'colsample_bytree': 0.8, 
    'subsample': 0.8, 
'''

"\n SWEEP best config: \n\nmax_depth': 2, \n    'max_leaves': 8,  \n    'learning_rate': 0.1, \n\n    'reg_alpha': 1.0, \n    'reg_lambda': 5.0, \n'colsample_bytree': 0.8, \n    'subsample': 0.8, \n"

In [None]:

config = {
    'choice': 3, 
    'random_state': 42, 
    
    'model_name': 'xgb_model_sweep_4', 
    
    'feature_version': 'v0',  # v0 (means original data only) / v1 (orig + extracted features on comp data) / v2: (orig + extracted features on comp + orig data)
    'include_orig_data': True, 
    'handle_outliers': False, 
    'scale_data': False, 
    
    'enable_categorical': False, 
    # 'num_classes': 2, 
    'fold_split_type': 'strat_kfold', #'kfold', 
    'num_folds': 10, 
    'folds_to_train': [0,1,2,3,4,5,6,7,8,9,10], 
    
    'tree_method': 'hist', 
    'n_estimators': 9999, 
    'early_stopping_rounds': 200, 

    'colsample_bytree': 0.8, 
    'subsample': 0.6, 
    
    'max_depth': 5, 
    'max_leaves': 12,  
    'learning_rate': 0.03, 

    'reg_alpha': 0.5, 
    'reg_lambda': 3.0, 

    # if 'auto'; will be overridden as sum(negative instances) / sum(positive instances)
    # else; provided value will be used.
    # with 5 fold strat_kfold; auto pos weight is around 23.1956.
    'scale_pos_weight': 'auto', # 'auto',  / 10 / 25 etc.

    'verbosity': 1,

    'objective': 'binary:logistic', 
    'eval_metric': 'auc', 
    
    'use_gpu_if_available': True, 
    'predictor': 'gpu_predictor',
    'use_wandb': False, # Defaults to true if choice==3.
    'n_jobs': -1, 
    'data_dir': '/content/data/', 
    'drive_project_dir': '/content/drive/MyDrive/Playground Series/S03_E03', 
    'project_name': 'playground_s03_e03', 
}



if config['use_gpu_if_available']:
  if torch.cuda.is_available():
    config['gpu_id'] = 0
    config['tree_method'] = 'gpu_hist'
    config['predictor'] = 'gpu_predictor'
    print("GPU available... XGBoost will use GPU...")
  else:
    print("NOT USING GPU!!!!!! Parameter 'use_gpu_if_available' is set to TRUE; But NO GPU IS VISIBLE!!!!!")
    if config['tree_method'] == 'gpu_hist': config['tree_method'] = 'hist'
    if config['predictor'] == 'gpu_predictor': config['predictor'] = 'cpu_predictor'
else:
  if config['tree_method'] == 'gpu_hist': config['tree_method'] = 'hist'
  if config['predictor'] == 'gpu_predictor': config['predictor'] = 'cpu_predictor' 
  print("NOT USING GPU!!!!!! Parameter 'use_gpu_if_available' is set to False!!!!!!!")    


if config['choice'] == 3: config['use_wandb'] = True

if config['use_wandb']:
  os.environ['WANDB_MODE'] = 'online'
  try: 
    wandb.login(key='d60ad29783a045de090c17001912975dc8f9f2e2') 
  except:
    wandb.login()
# else:
# os.environ['WANDB_MODE'] = 'offline'

set_seeds(config)

if config['choice'] == 1:
  train_k_folds()
elif config['choice'] == 2:
  test_model(config)
elif config['choice'] == 3:
  sweep_configs = {
      "method": "bayes",
      "metric": {
          # "name": "Best Epoch Val AUROC",
          "name": "Fold average Val AUROC",
          "goal": "maximize",
      },
      "parameters": {      
          "colsample_bytree": {
              "values": [0.6, 0.8, 1.0]
          },
          "subsample": {
              "values": [0.6, 0.8, 1.0]
          },
          "max_depth": {
              "values": [2, 4, 6, 8, 12, 24]
          },
          'max_leaves': {
              'values': [2, 4, 8, 16, 32, 64],
          },
          "reg_alpha": {
              "values": [0, 1.0, 2.0, 5.0]
          },
          "reg_lambda": {
              "values": [0, 1.0, 2.0, 5.0]
          },
          "learning_rate": {
              "values": [0.01, 0.05, 0.1, 0.3, 0.5]
          }
          # "random_state": {
          #     "values": [0.01, 0.05, 0.1, 0.3, 0.5]
          # },
        }
  }
  print("Running sweep>>>>>>>>>>>>>>>>>>>>>>>>")
  sweep_id = wandb.sweep(sweep=sweep_configs, project=config['project_name']+'_sweep')
  wandb.agent(sweep_id=sweep_id, function=train_k_folds, count=50)
else:
  raise ValueError(f"Incorrect value for 'choice'={config['choice']} in config")



NOT USING GPU!!!!!! Parameter 'use_gpu_if_available' is set to TRUE; But NO GPU IS VISIBLE!!!!!


ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjaideepmurkute[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Running sweep>>>>>>>>>>>>>>>>>>>>>>>>
Create sweep with ID: axcnih4g
Sweep URL: https://wandb.ai/jaideepmurkute/playground_s03_e03_sweep/sweeps/axcnih4g


[34m[1mwandb[0m: Agent Starting Run: 20iju99j with config:
[34m[1mwandb[0m: 	colsample_bytree: 0.8
[34m[1mwandb[0m: 	learning_rate: 0.5
[34m[1mwandb[0m: 	max_depth: 2
[34m[1mwandb[0m: 	max_leaves: 64
[34m[1mwandb[0m: 	reg_alpha: 1
[34m[1mwandb[0m: 	reg_lambda: 0
[34m[1mwandb[0m: 	subsample: 0.6


config:  {'choice': 3, 'random_state': 42, 'model_name': 'xgb_model_sweep_4', 'feature_version': 'v0', 'include_orig_data': True, 'handle_outliers': False, 'scale_data': False, 'enable_categorical': False, 'fold_split_type': 'strat_kfold', 'num_folds': 10, 'folds_to_train': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'tree_method': 'hist', 'n_estimators': 9999, 'early_stopping_rounds': 200, 'colsample_bytree': 0.8, 'subsample': 0.6, 'max_depth': 5, 'max_leaves': 12, 'learning_rate': 0.03, 'reg_alpha': 0.5, 'reg_lambda': 3.0, 'scale_pos_weight': 'auto', 'verbosity': 1, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'use_gpu_if_available': True, 'predictor': 'cpu_predictor', 'use_wandb': True, 'n_jobs': -1, 'data_dir': '/content/data/', 'drive_project_dir': '/content/drive/MyDrive/Playground Series/S03_E03', 'project_name': 'playground_s03_e03'}
train_df.shape:  (3147, 34)
missing_cols:  {'Attrition'}
config['encoded_nominal_categorical_cols']:  ['Department_Human Resources', 'Department_

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Updating sweep configs...
*** Updated sweep config:  {'choice': 3, 'random_state': 42, 'model_name': 'xgb_model_sweep_4', 'feature_version': 'v0', 'include_orig_data': True, 'handle_outliers': False, 'scale_data': False, 'enable_categorical': False, 'fold_split_type': 'strat_kfold', 'num_folds': 10, 'folds_to_train': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'tree_method': 'hist', 'n_estimators': 9999, 'early_stopping_rounds': 200, 'colsample_bytree': 0.8, 'subsample': 0.6, 'max_depth': 2, 'max_leaves': 64, 'learning_rate': 0.5, 'reg_alpha': 1, 'reg_lambda': 0, 'scale_pos_weight': 'auto', 'verbosity': 1, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'use_gpu_if_available': True, 'predictor': 'cpu_predictor', 'use_wandb': True, 'n_jobs': -1, 'data_dir': '/content/data/', 'drive_project_dir': '/content/drive/MyDrive/Playground Series/S03_E03', 'project_name': 'playground_s03_e03', 'local_model_dir': '/content/model_store/xgb_model_sweep_4', 'drive_model_dir': '/content/drive/MyDrive/Pl



[0]	validation_0-auc:0.70012	validation_1-auc:0.69146
[50]	validation_0-auc:0.89389	validation_1-auc:0.82766
[100]	validation_0-auc:0.92798	validation_1-auc:0.85340
[150]	validation_0-auc:0.95018	validation_1-auc:0.85340
[200]	validation_0-auc:0.96688	validation_1-auc:0.85844
[250]	validation_0-auc:0.97632	validation_1-auc:0.85819
[300]	validation_0-auc:0.98503	validation_1-auc:0.83714
[350]	validation_0-auc:0.98949	validation_1-auc:0.83655
[400]	validation_0-auc:0.99190	validation_1-auc:0.82825
[436]	validation_0-auc:0.99367	validation_1-auc:0.81810
Saving Model...
Saving model...
Copying model to drive...
Best val epoch: min validation_1 auc:  0.6914625964441462
model.best_ntree_limit:  237
AUROC: Train: 0.9737962519574306 	 Val: 0.8731969137873197
F1 score: Train: 0.7314285714285714 	 Val: 0.5454545454545454
Log loss: Train: 0.24856505447779084 	 Val: 0.39769116887983974
------------------------------
Training fold:  2
len(train_idx): 2832 	 len(val_idx): 315
Setting scale_pos_weigh



[50]	validation_0-auc:0.89717	validation_1-auc:0.81097
[100]	validation_0-auc:0.93563	validation_1-auc:0.78900
[150]	validation_0-auc:0.95607	validation_1-auc:0.78153
[200]	validation_0-auc:0.96926	validation_1-auc:0.76786
[244]	validation_0-auc:0.97712	validation_1-auc:0.76426
Saving Model...
Saving model...
Copying model to drive...
Best val epoch: min validation_1 auc:  0.6232388460248239
model.best_ntree_limit:  46
AUROC: Train: 0.8914678459761697 	 Val: 0.8219557195571956
F1 score: Train: 0.5256087321578506 	 Val: 0.46875000000000006
Log loss: Train: 0.41880938718246286 	 Val: 0.45950231753762755
------------------------------
Training fold:  3
len(train_idx): 2832 	 len(val_idx): 315
Setting scale_pos_weight to:  6.206106870229007
Training model ...




[0]	validation_0-auc:0.70449	validation_1-auc:0.68165
[50]	validation_0-auc:0.89055	validation_1-auc:0.80820
[100]	validation_0-auc:0.92999	validation_1-auc:0.79688
[150]	validation_0-auc:0.95030	validation_1-auc:0.80694
[200]	validation_0-auc:0.96653	validation_1-auc:0.79160
[214]	validation_0-auc:0.96973	validation_1-auc:0.79680
Saving Model...
Saving model...
Copying model to drive...
Best val epoch: min validation_1 auc:  0.6816504528681651
model.best_ntree_limit:  16
AUROC: Train: 0.8410545555837238 	 Val: 0.8435927541093593
F1 score: Train: 0.46493902439024387 	 Val: 0.48611111111111105
Log loss: Train: 0.504213881612779 	 Val: 0.49279798107842604
------------------------------
Training fold:  4
len(train_idx): 2832 	 len(val_idx): 315
Setting scale_pos_weight to:  6.206106870229007
Training model ...




[0]	validation_0-auc:0.70193	validation_1-auc:0.65616
[50]	validation_0-auc:0.89244	validation_1-auc:0.74916
[100]	validation_0-auc:0.93175	validation_1-auc:0.72769
[150]	validation_0-auc:0.95444	validation_1-auc:0.71335
[200]	validation_0-auc:0.97011	validation_1-auc:0.70832
[250]	validation_0-auc:0.97931	validation_1-auc:0.74187
[258]	validation_0-auc:0.97992	validation_1-auc:0.74673
Saving Model...
Saving model...
Copying model to drive...
Best val epoch: min validation_1 auc:  0.6561556524656156
model.best_ntree_limit:  60
AUROC: Train: 0.898987717612545 	 Val: 0.7548641395504865
F1 score: Train: 0.5544217687074829 	 Val: 0.40298507462686567
Log loss: Train: 0.4060628720487078 	 Val: 0.5150598233351336
------------------------------
Training fold:  5
len(train_idx): 2832 	 len(val_idx): 315
Setting scale_pos_weight to:  6.206106870229007
Training model ...




[0]	validation_0-auc:0.70635	validation_1-auc:0.73532
[50]	validation_0-auc:0.89786	validation_1-auc:0.73490
[100]	validation_0-auc:0.93467	validation_1-auc:0.71302
[150]	validation_0-auc:0.96026	validation_1-auc:0.71310
[200]	validation_0-auc:0.97250	validation_1-auc:0.71503
[210]	validation_0-auc:0.97552	validation_1-auc:0.71125
Saving Model...
Saving model...
Copying model to drive...
Best val epoch: min validation_1 auc:  0.6880241529688024
model.best_ntree_limit:  11
AUROC: Train: 0.821376966950331 	 Val: 0.7835877222408588
F1 score: Train: 0.44427123928293066 	 Val: 0.4296296296296296
Log loss: Train: 0.5282088627342902 	 Val: 0.5161025182713592
------------------------------
Training fold:  6
len(train_idx): 2832 	 len(val_idx): 315
Setting scale_pos_weight to:  6.206106870229007
Training model ...
[0]	validation_0-auc:0.69396	validation_1-auc:0.70018




[50]	validation_0-auc:0.89721	validation_1-auc:0.75243
[100]	validation_0-auc:0.93658	validation_1-auc:0.71411
[150]	validation_0-auc:0.95609	validation_1-auc:0.69339
[200]	validation_0-auc:0.97164	validation_1-auc:0.71117
[211]	validation_0-auc:0.97457	validation_1-auc:0.70924
Saving Model...
Saving model...
Copying model to drive...
Best val epoch: min validation_1 auc:  0.6895337135189533
model.best_ntree_limit:  13
AUROC: Train: 0.833717777381336 	 Val: 0.7939869171418987
F1 score: Train: 0.461183704842429 	 Val: 0.41958041958041953
Log loss: Train: 0.510545631356953 	 Val: 0.5463243047751132
------------------------------
Training fold:  7
len(train_idx): 2833 	 len(val_idx): 314
Setting scale_pos_weight to:  6.190355329949239
Training model ...
[0]	validation_0-auc:0.70518	validation_1-auc:0.65623




[50]	validation_0-auc:0.90561	validation_1-auc:0.68446
[100]	validation_0-auc:0.93399	validation_1-auc:0.67476
[150]	validation_0-auc:0.95504	validation_1-auc:0.68858
[200]	validation_0-auc:0.96898	validation_1-auc:0.68849
[250]	validation_0-auc:0.97834	validation_1-auc:0.67184
[300]	validation_0-auc:0.98422	validation_1-auc:0.65974
[350]	validation_0-auc:0.99017	validation_1-auc:0.65048
[364]	validation_0-auc:0.99120	validation_1-auc:0.64387
Saving Model...
Saving model...
Copying model to drive...
Best val epoch: min validation_1 auc:  0.6429245687805716
model.best_ntree_limit:  166
AUROC: Train: 0.9607644807412542 	 Val: 0.7053977516519352
F1 score: Train: 0.6916588566073103 	 Val: 0.3770491803278689
Log loss: Train: 0.2776650026826599 	 Val: 0.5967726128175947
------------------------------
Training fold:  8
len(train_idx): 2833 	 len(val_idx): 314
Setting scale_pos_weight to:  6.190355329949239
Training model ...
[0]	validation_0-auc:0.70192	validation_1-auc:0.70411




[50]	validation_0-auc:0.90311	validation_1-auc:0.75200
[100]	validation_0-auc:0.93335	validation_1-auc:0.75680
[150]	validation_0-auc:0.95571	validation_1-auc:0.72985
[200]	validation_0-auc:0.96718	validation_1-auc:0.72994
[206]	validation_0-auc:0.96907	validation_1-auc:0.72548
Saving Model...
Saving model...
Copying model to drive...
Best val epoch: min validation_1 auc:  0.7041105294773878
model.best_ntree_limit:  7
AUROC: Train: 0.8050206771103245 	 Val: 0.782673989530593
F1 score: Train: 0.4411326378539494 	 Val: 0.40243902439024387
Log loss: Train: 0.5464793506784886 	 Val: 0.5857526584273312
------------------------------
Training fold:  9
len(train_idx): 2833 	 len(val_idx): 314
Setting scale_pos_weight to:  6.190355329949239
Training model ...
[0]	validation_0-auc:0.70170	validation_1-auc:0.71437




[50]	validation_0-auc:0.90250	validation_1-auc:0.75732
[100]	validation_0-auc:0.94119	validation_1-auc:0.76186
[150]	validation_0-auc:0.95940	validation_1-auc:0.75122
[200]	validation_0-auc:0.97195	validation_1-auc:0.73363
[250]	validation_0-auc:0.98093	validation_1-auc:0.76375
[277]	validation_0-auc:0.98433	validation_1-auc:0.75388
Saving Model...
Saving model...
Copying model to drive...
Best val epoch: min validation_1 auc:  0.7143653994679482
model.best_ntree_limit:  78
AUROC: Train: 0.92721386604729 	 Val: 0.7755084527589462
F1 score: Train: 0.5965811965811966 	 Val: 0.4159999999999999
Log loss: Train: 0.3683417754415925 	 Val: 0.498010467734391
------------------------------
Fold average stats.: 
AUROC: Train: 0.92721386604729 	 Val: 0.7755084527589462
F1 score: Train: 0.5965811965811966 	 Val: 0.4159999999999999
Log loss: Train: 0.3683417754415925 	 Val: 0.498010467734391




0,1
Fold average Train AUROC,▁
Fold average Val AUROC,▁
Per Epoch Train AUC,▃▅▆▇▄▆▇▇██▃▆▇▇▄▆▇▂▅▆▇▄▆▇▃▆▇▁▅▆▇██▅▆▇▄▆▇█
Per Epoch Val AUC,▇▅▅▅▇▇██▇▇▆▆▅▅▆▆▅▃▄▃▄▄▃▂▄▃▂▁▂▂▂▁▁▄▄▃▄▅▄▅

0,1
Fold average Train AUROC,0.92721
Fold average Val AUROC,0.77551
Per Epoch Train AUC,0.98433
Per Epoch Val AUC,0.75388


[34m[1mwandb[0m: Agent Starting Run: mak1c4wv with config:
[34m[1mwandb[0m: 	colsample_bytree: 1
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	max_leaves: 2
[34m[1mwandb[0m: 	reg_alpha: 2
[34m[1mwandb[0m: 	reg_lambda: 0
[34m[1mwandb[0m: 	subsample: 0.6


config:  {'choice': 3, 'random_state': 42, 'model_name': 'xgb_model_sweep_4', 'feature_version': 'v0', 'include_orig_data': True, 'handle_outliers': False, 'scale_data': False, 'enable_categorical': False, 'fold_split_type': 'strat_kfold', 'num_folds': 10, 'folds_to_train': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'tree_method': 'hist', 'n_estimators': 9999, 'early_stopping_rounds': 200, 'colsample_bytree': 0.8, 'subsample': 0.6, 'max_depth': 2, 'max_leaves': 64, 'learning_rate': 0.5, 'reg_alpha': 1, 'reg_lambda': 0, 'scale_pos_weight': 'auto', 'verbosity': 1, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'use_gpu_if_available': True, 'predictor': 'cpu_predictor', 'use_wandb': True, 'n_jobs': -1, 'data_dir': '/content/data/', 'drive_project_dir': '/content/drive/MyDrive/Playground Series/S03_E03', 'project_name': 'playground_s03_e03', 'local_model_dir': '/content/model_store/xgb_model_sweep_4', 'drive_model_dir': '/content/drive/MyDrive/Playground Series/S03_E03/model_store/xgb_mode

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Updating sweep configs...
*** Updated sweep config:  {'choice': 3, 'random_state': 42, 'model_name': 'xgb_model_sweep_4', 'feature_version': 'v0', 'include_orig_data': True, 'handle_outliers': False, 'scale_data': False, 'enable_categorical': False, 'fold_split_type': 'strat_kfold', 'num_folds': 10, 'folds_to_train': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'tree_method': 'hist', 'n_estimators': 9999, 'early_stopping_rounds': 200, 'colsample_bytree': 1, 'subsample': 0.6, 'max_depth': 4, 'max_leaves': 2, 'learning_rate': 0.05, 'reg_alpha': 2, 'reg_lambda': 0, 'scale_pos_weight': 'auto', 'verbosity': 1, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'use_gpu_if_available': True, 'predictor': 'cpu_predictor', 'use_wandb': True, 'n_jobs': -1, 'data_dir': '/content/data/', 'drive_project_dir': '/content/drive/MyDrive/Playground Series/S03_E03', 'project_name': 'playground_s03_e03', 'local_model_dir': '/content/model_store/xgb_model_sweep_4', 'drive_model_dir': '/content/drive/MyDrive/Play



[0]	validation_0-auc:0.62396	validation_1-auc:0.61074
[50]	validation_0-auc:0.78332	validation_1-auc:0.78715
[100]	validation_0-auc:0.79889	validation_1-auc:0.80766
[150]	validation_0-auc:0.81030	validation_1-auc:0.82481
[200]	validation_0-auc:0.81974	validation_1-auc:0.83672
[250]	validation_0-auc:0.82521	validation_1-auc:0.84602
[300]	validation_0-auc:0.83063	validation_1-auc:0.84913
[350]	validation_0-auc:0.83395	validation_1-auc:0.84829
[400]	validation_0-auc:0.83736	validation_1-auc:0.85106
[450]	validation_0-auc:0.84015	validation_1-auc:0.85282
[500]	validation_0-auc:0.84362	validation_1-auc:0.85525
[550]	validation_0-auc:0.84533	validation_1-auc:0.85659
[600]	validation_0-auc:0.84730	validation_1-auc:0.86104
[650]	validation_0-auc:0.84961	validation_1-auc:0.85886
[700]	validation_0-auc:0.85135	validation_1-auc:0.86271
[750]	validation_0-auc:0.85336	validation_1-auc:0.86431
[800]	validation_0-auc:0.85509	validation_1-auc:0.86288
[850]	validation_0-auc:0.85673	validation_1-auc:0.8



[0]	validation_0-auc:0.62232	validation_1-auc:0.62550
[50]	validation_0-auc:0.78635	validation_1-auc:0.78392
[100]	validation_0-auc:0.80195	validation_1-auc:0.78690
[150]	validation_0-auc:0.81203	validation_1-auc:0.80242
[200]	validation_0-auc:0.81956	validation_1-auc:0.80938
[250]	validation_0-auc:0.82594	validation_1-auc:0.81659
[300]	validation_0-auc:0.83116	validation_1-auc:0.82187
[350]	validation_0-auc:0.83518	validation_1-auc:0.82665
[400]	validation_0-auc:0.83868	validation_1-auc:0.82799
[450]	validation_0-auc:0.84209	validation_1-auc:0.83126
[500]	validation_0-auc:0.84381	validation_1-auc:0.83370
[550]	validation_0-auc:0.84675	validation_1-auc:0.83462
[600]	validation_0-auc:0.84961	validation_1-auc:0.83563
[650]	validation_0-auc:0.85121	validation_1-auc:0.83571
[700]	validation_0-auc:0.85303	validation_1-auc:0.83596
[750]	validation_0-auc:0.85481	validation_1-auc:0.83747
[800]	validation_0-auc:0.85652	validation_1-auc:0.83764
[850]	validation_0-auc:0.85828	validation_1-auc:0.8



[50]	validation_0-auc:0.78129	validation_1-auc:0.78606
[100]	validation_0-auc:0.79431	validation_1-auc:0.80573
[150]	validation_0-auc:0.80907	validation_1-auc:0.81944
[200]	validation_0-auc:0.81990	validation_1-auc:0.82976
[250]	validation_0-auc:0.82590	validation_1-auc:0.83680
[300]	validation_0-auc:0.83083	validation_1-auc:0.83890
[350]	validation_0-auc:0.83590	validation_1-auc:0.84569
[400]	validation_0-auc:0.83923	validation_1-auc:0.84971
[450]	validation_0-auc:0.84216	validation_1-auc:0.85089
[500]	validation_0-auc:0.84499	validation_1-auc:0.85072
[550]	validation_0-auc:0.84709	validation_1-auc:0.85466
[600]	validation_0-auc:0.84917	validation_1-auc:0.85307
[650]	validation_0-auc:0.85108	validation_1-auc:0.85382
[700]	validation_0-auc:0.85356	validation_1-auc:0.85206
[750]	validation_0-auc:0.85512	validation_1-auc:0.85257
[774]	validation_0-auc:0.85498	validation_1-auc:0.85055
Saving Model...
Saving model...
Copying model to drive...
Best val epoch: min validation_1 auc:  0.591999



[0]	validation_0-auc:0.62449	validation_1-auc:0.60617
[50]	validation_0-auc:0.78785	validation_1-auc:0.74337
[100]	validation_0-auc:0.80448	validation_1-auc:0.75805
[150]	validation_0-auc:0.81787	validation_1-auc:0.76275
[200]	validation_0-auc:0.82789	validation_1-auc:0.76811
[250]	validation_0-auc:0.83411	validation_1-auc:0.77650
[300]	validation_0-auc:0.83822	validation_1-auc:0.77952
[350]	validation_0-auc:0.84247	validation_1-auc:0.78346
[400]	validation_0-auc:0.84475	validation_1-auc:0.78371
[450]	validation_0-auc:0.84829	validation_1-auc:0.79067
[500]	validation_0-auc:0.85096	validation_1-auc:0.79135
[550]	validation_0-auc:0.85365	validation_1-auc:0.79386
[600]	validation_0-auc:0.85479	validation_1-auc:0.79520
[650]	validation_0-auc:0.85679	validation_1-auc:0.79545
[700]	validation_0-auc:0.85855	validation_1-auc:0.79604
[750]	validation_0-auc:0.86075	validation_1-auc:0.79587
[800]	validation_0-auc:0.86216	validation_1-auc:0.79545
[850]	validation_0-auc:0.86312	validation_1-auc:0.7



[0]	validation_0-auc:0.63448	validation_1-auc:0.64085
[50]	validation_0-auc:0.78690	validation_1-auc:0.77763
[100]	validation_0-auc:0.80272	validation_1-auc:0.78807
[150]	validation_0-auc:0.81399	validation_1-auc:0.79789
[200]	validation_0-auc:0.82495	validation_1-auc:0.79906
[250]	validation_0-auc:0.83010	validation_1-auc:0.80023
[300]	validation_0-auc:0.83556	validation_1-auc:0.79755
[350]	validation_0-auc:0.83942	validation_1-auc:0.79604
[385]	validation_0-auc:0.84225	validation_1-auc:0.79504
Saving Model...
Saving model...
Copying model to drive...
Best val epoch: min validation_1 auc:  0.640850385776585
model.best_ntree_limit:  187
AUROC: Train: 0.8216774279702086 	 Val: 0.8012411942301241
F1 score: Train: 0.4624090541632983 	 Val: 0.4511278195488721
Log loss: Train: 0.5256886749191599 	 Val: 0.5133797838574364
------------------------------
Training fold:  6
len(train_idx): 2832 	 len(val_idx): 315




Setting scale_pos_weight to:  6.206106870229007
Training model ...
[0]	validation_0-auc:0.62710	validation_1-auc:0.57103
[50]	validation_0-auc:0.78715	validation_1-auc:0.75189
[100]	validation_0-auc:0.80135	validation_1-auc:0.76551
[150]	validation_0-auc:0.81474	validation_1-auc:0.77344
[200]	validation_0-auc:0.82454	validation_1-auc:0.78933
[250]	validation_0-auc:0.83111	validation_1-auc:0.78824
[300]	validation_0-auc:0.83806	validation_1-auc:0.79185
[350]	validation_0-auc:0.84088	validation_1-auc:0.79109
[400]	validation_0-auc:0.84435	validation_1-auc:0.79319
[450]	validation_0-auc:0.84689	validation_1-auc:0.79571
[500]	validation_0-auc:0.84967	validation_1-auc:0.79621
[550]	validation_0-auc:0.85231	validation_1-auc:0.79260
[600]	validation_0-auc:0.85525	validation_1-auc:0.79084
[647]	validation_0-auc:0.85669	validation_1-auc:0.79176
Saving Model...
Saving model...
Copying model to drive...
Best val epoch: min validation_1 auc:  0.5710332103321033
model.best_ntree_limit:  448




AUROC: Train: 0.8468858988844341 	 Val: 0.7976350218047635
F1 score: Train: 0.4803212851405622 	 Val: 0.4525547445255475
Log loss: Train: 0.4915535829449599 	 Val: 0.5326403594324514
------------------------------
Training fold:  7
len(train_idx): 2833 	 len(val_idx): 314
Setting scale_pos_weight to:  6.190355329949239
Training model ...
[0]	validation_0-auc:0.63631	validation_1-auc:0.62426
[50]	validation_0-auc:0.78493	validation_1-auc:0.75088
[100]	validation_0-auc:0.80976	validation_1-auc:0.74273
[150]	validation_0-auc:0.82056	validation_1-auc:0.73612
[200]	validation_0-auc:0.83001	validation_1-auc:0.73003
[247]	validation_0-auc:0.83740	validation_1-auc:0.72488
Saving Model...
Saving model...
Copying model to drive...
Best val epoch: min validation_1 auc:  0.6242598472496352
model.best_ntree_limit:  48
AUROC: Train: 0.7842478297879424 	 Val: 0.7527675276752768
F1 score: Train: 0.42668735453840184 	 Val: 0.34285714285714286
Log loss: Train: 0.5931712861011443 	 Val: 0.597249832407684



Training model ...
[0]	validation_0-auc:0.63905	validation_1-auc:0.59916
[50]	validation_0-auc:0.77944	validation_1-auc:0.77547
[100]	validation_0-auc:0.80063	validation_1-auc:0.78538
[150]	validation_0-auc:0.81484	validation_1-auc:0.79173


In [None]:
raise

In [None]:
# orig_data_df = pd.read_csv('/content/data/orig_data.csv')

In [None]:
# orig_data_df.Attrition.values == 'Yes'