<a href="https://colab.research.google.com/github/jaideepmurkute/100-pandas-puzzles/blob/master/S03_E12/play_s03e12_model_xgb_clf_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so later cells can copy data and models to/from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install wandb -U -qqq
! pip install sklearn -U -qqq
! pip install xgboost -U -qqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 KB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 KB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 KB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone


In [None]:
# Display the installed XGBoost version so the run records which release produced the results.
import xgboost
xgboost.__version__

'1.7.4'

In [None]:

import os
import random
import sys
import shutil

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import json
import pickle

sns.set_style('darkgrid')
from sklearn.datasets import fetch_california_housing 

from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.linear_model import LinearRegression, SGDOneClassSVM
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
# from sklearn.neighbors import LocalOutlierFactor
# from sklearn.metrics import rmse

from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss, mean_squared_error, cohen_kappa_score
from sklearn.decomposition import PCA

import xgboost as xgb
import torch

from scipy.stats.mstats import winsorize
from scipy.stats import mode

import wandb

# import category_encoders as ce
from sklearn.preprocessing import OrdinalEncoder


In [None]:

def set_seeds(config):
  '''Seed the NumPy and stdlib random generators from config["random_state"].

  The same seed is also written (as a string) to the PYTHONHASHSEED
  environment variable. Torch is not seeded; the disabled snippet is kept
  below for reference.

  Args:
    config: mapping providing the "random_state" seed.
  '''
  seed = config["random_state"]

  np.random.seed(seed)
  random.seed(seed)
  os.environ["PYTHONHASHSEED"] = str(seed)

  # Disabled torch seeding, kept for reference:
  #   torch.manual_seed(seed)
  #   if torch.cuda.is_available():
  #       torch.cuda.manual_seed(seed)
  #       torch.cuda.manual_seed_all(seed)
  #       torch.backends.cudnn.deterministic = True
  #       torch.backends.cudnn.benchmark = True


def generate_fold_idx(config, train_df, group_col=None):
  '''Build per-fold train/validation positional indices for cross-validation.

  Args:
    config: mapping with 'fold_split_type' ('kfold', 'strat_kfold' or
      'group_kfold'), 'num_folds' and 'random_state'.
    train_df: training frame. For non-group splits its `Class` column is
      passed to the splitter as `y`.
    group_col: name of the grouping column; required for 'group_kfold'.

  Returns:
    dict mapping fold number -> {'train_idx': ..., 'val_idx': ...} as
    produced by the splitter.

  Raises:
    ValueError: if 'fold_split_type' is not recognized, or if 'group_kfold'
      is requested but `group_col` is not a column of `train_df`.
  '''
  split_type = config['fold_split_type']

  if split_type == 'kfold':
    splitter = KFold(n_splits=config['num_folds'], shuffle=True,
                     random_state=config['random_state'])
  elif split_type == 'strat_kfold':
    splitter = StratifiedKFold(n_splits=config['num_folds'], shuffle=True,
                               random_state=config['random_state'])
  elif split_type == 'group_kfold':
    splitter = GroupKFold(n_splits=config['num_folds'])
  else:
    # List the split types this function actually implements and include the offending value.
    raise ValueError(
        "fold_split_type {!r} not recognized... Choose from: "
        "kfold, strat_kfold, group_kfold".format(split_type))

  if split_type == 'group_kfold':
    # Fail loudly instead of returning an empty fold dict that breaks training later.
    if group_col is None or group_col not in train_df.columns:
      raise ValueError(
          "group_kfold requires group_col to be a column of train_df; got {!r}".format(group_col))
    split_iter = splitter.split(X=train_df, groups=train_df[group_col].values)
  else:
    split_iter = splitter.split(X=train_df, y=train_df.Class.values)

  fold_idx_dict = dict()
  for fold_idx, (train_idx, val_idx) in enumerate(split_iter):
    fold_idx_dict[fold_idx] = {'train_idx': train_idx, 'val_idx': val_idx}

  return fold_idx_dict


def save_model(config, model):
  '''Save the current fold's model locally, then copy the file to Drive.

  The file is named "<model_name>_fold_<curr_fold>.xgb" and is written via
  `model.save_model` into config['local_model_dir'] before being copied
  into config['drive_model_dir'].

  Args:
    config: mapping with 'model_name', 'curr_fold', 'local_model_dir' and
      'drive_model_dir'.
    model: object exposing `save_model(path)`.
  '''
  fname = '{}_fold_{}.xgb'.format(config['model_name'], config['curr_fold'])
  local_path = '/'.join((config['local_model_dir'], fname))
  drive_path = '/'.join((config['drive_model_dir'], fname))

  print('Saving model...', fname)
  model.save_model(local_path)

  print('Copying model to drive...')
  shutil.copy(local_path, drive_path)
  

def load_model(config):
  '''Load the current fold's saved XGBClassifier.

  The model file "<model_name>_fold_<curr_fold>.xgb" is read from
  config['local_model_dir']; when it is not there yet it is first copied
  over from config['drive_model_dir'].

  Args:
    config: mapping with 'model_name', 'curr_fold', 'local_model_dir' and
      'drive_model_dir'.

  Returns:
    An `xgb.XGBClassifier` with the saved model loaded into it.
  '''
  fname = '{}_fold_{}.xgb'.format(config['model_name'], config['curr_fold'])
  local_path = '/'.join((config['local_model_dir'], fname))
  drive_path = '/'.join((config['drive_model_dir'], fname))

  # Only pull from Drive when there is no local copy yet.
  local_copy_missing = not os.path.exists(local_path)
  if local_copy_missing:
    shutil.copy(drive_path, local_path)

  print('Loading model...', fname)
  classifier = xgb.XGBClassifier()
  classifier.load_model(local_path)
  return classifier



def get_xgb_params(config):
  '''Collect the XGBoost constructor parameters out of the config.

  Each listed name is copied verbatim from `config`; a missing name raises
  KeyError.

  Args:
    config: mapping holding every parameter name listed below.

  Returns:
    dict of parameter name -> configured value, in the order listed.
  '''
  param_names = (
      # general
      'random_state', 'n_jobs', 'verbosity',
      # booster / tree structure
      'booster', 'tree_method', 'max_depth', 'max_leaves',
      'n_estimators', 'early_stopping_rounds',
      # sampling / regularisation
      'colsample_bytree', 'subsample', 'reg_alpha', 'reg_lambda',
      'enable_categorical', 'max_bin', 'min_child_weight',
      # learning task
      'learning_rate', 'objective',
      'eval_metric',
      # drop-related options
      'sample_type', 'normalize_type', 'rate_drop', 'skip_drop',
  )
  return {name: config[name] for name in param_names}



In [None]:

def get_data(config):
  '''Copy the competition and original CSVs from Drive and load them.

  train.csv, test.csv and sample_submission.csv are copied from
  config['drive_data_dir'] into config['local_data_dir'] and read; the
  train frame gets `original_data = False`. orig_data.csv is then copied
  and read, tagged `original_data = True` and given a fresh 0..n-1 `id`.

  Args:
    config: mapping with 'drive_data_dir' and 'local_data_dir'.

  Returns:
    (train_df, test_df, sub_df, orig_data_df)
  '''
  drive_dir = config['drive_data_dir']
  local_dir = config['local_data_dir']

  def _stage(fname):
    # Copy one file from Drive into the local data dir and return its local path.
    local_path = os.path.join(local_dir, fname)
    shutil.copy(os.path.join(drive_dir, fname), local_path)
    return local_path

  train_path, test_path, sub_path = [
      _stage(fname) for fname in ('train.csv', 'test.csv', 'sample_submission.csv')
  ]
  train_df = pd.read_csv(train_path)
  test_df = pd.read_csv(test_path)
  sub_df = pd.read_csv(sub_path)
  print("Read shape: train_df.shape: ", train_df.shape)
  print("Read shape: test_df.shape: ", test_df.shape)
  print("Read shape: sub_df.shape: ", sub_df.shape)

  # Flag competition rows so they can be told apart from the original dataset.
  train_df['original_data'] = False

  # Original dataset: flag it and assign sequential ids.
  orig_data_df = pd.read_csv(_stage('orig_data.csv'))
  orig_data_df['original_data'] = True
  orig_data_df['id'] = np.arange(orig_data_df.shape[0])

  return train_df, test_df, sub_df, orig_data_df
  

def encode_data(config, train_df, test_df):
  '''Ordinal-encode the training target so labels run from 0 to n_categories-1.

  The target column is the first entry of config['target_cols'] when that
  is set; otherwise the previously hard-coded 'quality' column is used, so
  existing callers keep their behavior.

  Args:
    config: mapping; 'target_cols' is optional.
    train_df: training frame containing the target column (modified in place).
    test_df: test frame; returned unchanged.

  Returns:
    (train_df, test_df, fitted OrdinalEncoder)
  '''
  target_cols = config.get('target_cols') or ['quality']
  target_col = target_cols[0]

  oe = OrdinalEncoder()
  # The encoder expects a 2-D (n_samples, 1) array; reshape via the array method
  # rather than np.reshape(newshape=...), whose keyword is deprecated.
  train_labels = train_df[target_col].values.reshape(-1, 1)
  train_df[target_col] = oe.fit_transform(train_labels).ravel()

  return train_df, test_df, oe


def get_feature_cols(config, train_df):
  '''Record id / target / feature column names in the config.

  Sets config['id_cols'], config['target_cols'], config['non_feature_cols']
  and config['feature_cols']; the feature columns are every column of
  `train_df` that is not an id or target column, in frame order.

  Args:
    config: mapping to update in place.
    train_df: frame whose columns are classified.

  Returns:
    The same `config` object.
  '''
  id_cols = ['id', 'original_data']
  target_cols = ['Class']
  excluded = id_cols + target_cols

  config['id_cols'] = id_cols
  config['target_cols'] = target_cols
  config['non_feature_cols'] = excluded
  config['feature_cols'] = [col for col in train_df.columns if col not in excluded]

  return config



In [None]:

def scale_data_fn(config, train_df, test_df):
  '''Scale the feature columns with a scaler fitted on the training frame.

  Args:
    config: mapping with 'feature_cols' (columns to scale; they should all
      be continuous) and 'scaler_type' ('standard', 'robust' or 'minmax').
    train_df: training frame; used to fit the scaler and transformed in place.
    test_df: test frame; transformed in place with the train-fitted scaler.

  Returns:
    (train_df, test_df)

  Raises:
    ValueError: if 'scaler_type' is not one of the supported values.
  '''
  cols_to_scale = config['feature_cols'] # MAKE SURE THESE ARE ALL CONT. FEATURES.
  scaler_type = config['scaler_type']

  if scaler_type == 'standard':
    scaler = StandardScaler()
  elif scaler_type == 'robust':
    scaler = RobustScaler()
  elif scaler_type == 'minmax':
    scaler = MinMaxScaler()
  else:
    # Without this branch `scaler` would be unbound and fail with an unrelated error below.
    raise ValueError(
        "scaler_type {!r} not recognized... Choose from: standard, robust, minmax".format(scaler_type))

  scaler.fit(train_df[cols_to_scale])
  train_df[cols_to_scale] = scaler.transform(train_df[cols_to_scale])
  test_df[cols_to_scale] = scaler.transform(test_df[cols_to_scale])

  return train_df, test_df
    

In [None]:

def IQR_outlier_handling(train_df, test_df, cols, handling_type):
  '''Handle outliers column by column using the 1.5 * IQR rule.

  Cutoffs are computed per column from `train_df` as
  [q25 - 1.5 * IQR, q75 + 1.5 * IQR].

  Args:
    train_df: training frame.
    test_df: test frame; columns missing from it are skipped.
    cols: columns to process (every column is processed, not just the first).
    handling_type:
      'clip' - clip train and test values to the cutoffs.
      'remove_train_clip_test' - drop training rows that are outliers in
        any of `cols`; clip test values to the cutoffs.
      Any other value only prints the outlier counts.

  Returns:
    (train_df, test_df); also returned when `cols` is empty.
  '''
  should_clip_test = handling_type in ('remove_train_clip_test', 'clip')
  # Positional mask so frames with duplicate index labels are handled correctly.
  keep_rows = np.ones(train_df.shape[0], dtype=bool)

  for col in cols:
    # calculate interquartile range
    q25, q75 = np.percentile(train_df[col].values, [25, 75])
    iqr = q75 - q25

    # calculate the outlier cutoff
    cut_off = iqr * 1.5
    lower_cutoff, upper_cutoff = q25 - cut_off, q75 + cut_off

    is_outlier = ((train_df[col] < lower_cutoff) | (train_df[col] > upper_cutoff)).to_numpy()
    num_outliers = int(is_outlier.sum())
    print("col: {} \t # num_outliers: {}".format(col, num_outliers))

    if handling_type == 'remove_train_clip_test':
      # Rows are dropped once, after all columns have been inspected.
      keep_rows &= ~is_outlier
    elif handling_type == 'clip':
      train_df[col] = train_df[col].clip(lower=lower_cutoff, upper=upper_cutoff)

    if should_clip_test and col in test_df.columns:
      test_df[col] = test_df[col].clip(lower=lower_cutoff, upper=upper_cutoff)

  if handling_type == 'remove_train_clip_test':
    train_df = train_df.loc[keep_rows]

  return train_df, test_df



def winsorize_outlier_handling(train_df, test_df, cols, lower_lim=0.01, upper_lim=0.98):
  '''Winsorize columns by clipping them to training-set quantiles.

  `lower_lim` and `upper_lim` are quantile levels: values below the
  training `lower_lim` quantile are raised to it and values above the
  training `upper_lim` quantile are lowered to it. The same training
  bounds are applied to the test frame.

  Note: scipy's `winsorize(x, (lo, hi))` treats both numbers as the
  fraction to cut from each end, so passing (0.01, 0.98) to it would clamp
  the top 98% of values; quantile clipping avoids that.

  Args:
    train_df: training frame (columns are replaced in place).
    test_df: test frame (columns are replaced in place); columns missing
      from it are skipped.
    cols: columns to winsorize.
    lower_lim: lower quantile level in [0, 1].
    upper_lim: upper quantile level in [0, 1], not below `lower_lim`.

  Returns:
    (train_df, test_df)

  Raises:
    ValueError: if the limits are not ordered quantile levels in [0, 1].
  '''
  if not 0.0 <= lower_lim <= upper_lim <= 1.0:
    raise ValueError(
        "expected 0 <= lower_lim <= upper_lim <= 1, got lower_lim={} upper_lim={}".format(
            lower_lim, upper_lim))

  for col in cols:
    lower_bound = train_df[col].quantile(lower_lim)
    upper_bound = train_df[col].quantile(upper_lim)

    train_df[col] = train_df[col].clip(lower=lower_bound, upper=upper_bound)
    if col in test_df.columns:
      test_df[col] = test_df[col].clip(lower=lower_bound, upper=upper_bound)

  return train_df, test_df



def isolation_forest_outlier_handling(train_df, test_df, cols, outlier_thresh=-0.1, 
                                      handling_method='drop_median', seed=0):
  '''Detect outlier rows with an Isolation Forest and repair them.

  The forest is fitted on `train_df[cols]` only (no target column is
  needed). Rows whose decision-function score is below `outlier_thresh`
  are treated as outliers.

  Args:
    train_df: training frame.
    test_df: test frame (modified in place for flagged rows).
    cols: feature columns used for detection and repair.
    outlier_thresh: score threshold; lower scores are outliers.
    handling_method:
      'drop_median' - drop flagged training rows and overwrite flagged
        test rows with the median of the remaining training rows.
      'winsorize' - clip flagged train and test rows to the 0.01 / 0.98
        quantiles of the full training column.
      Any other value leaves both frames unchanged.
    seed: random_state for the Isolation Forest.

  Returns:
    (train_df, test_df)
  '''
  print("Training Isolation forest model to detect outliers...")
  iso_forest_model = IsolationForest(n_estimators=500, contamination='auto', random_state=seed)
  # Fit on the features alone; no label column from another dataset is referenced.
  iso_forest_model.fit(train_df[cols])
  
  sample_scores_train = iso_forest_model.decision_function(train_df[cols])
  sample_scores_test = iso_forest_model.decision_function(test_df[cols])
  train_is_outlier = sample_scores_train < outlier_thresh
  test_is_outlier = sample_scores_test < outlier_thresh

  print("# train outliers: ", np.sum(train_is_outlier))
  print("# test outliers: ", np.sum(test_is_outlier))
  
  if handling_method == 'drop_median':
    print("Dropping outlier train samples...")
    # drop train samples and replace test sample values with median from train columns
    train_df = train_df.loc[sample_scores_train >= outlier_thresh]
    
    print("Clipping outlier test samples to median value...")
    for col in cols:
      # Single .loc call so the assignment reaches test_df itself, not a temporary copy.
      test_df.loc[test_is_outlier, col] = train_df[col].median()
  elif handling_method == 'winsorize':
    lower_lim, upper_lim = 0.01, 0.98
    for col in cols:
      lower_bound = train_df[col].quantile(lower_lim)
      upper_bound = train_df[col].quantile(upper_lim)
      train_df.loc[train_is_outlier, col] = (
          train_df.loc[train_is_outlier, col].clip(lower=lower_bound, upper=upper_bound))
      test_df.loc[test_is_outlier, col] = (
          test_df.loc[test_is_outlier, col].clip(lower=lower_bound, upper=upper_bound))
    
  return train_df, test_df


def handle_outliers(config, train_df, test_df):
  '''Dispatch to the outlier-handling routine selected in the config.

  Args:
    config: mapping with 'feature_cols' and 'outlier_handling_method'
      ('winsorize', 'iso_forest' or 'iqr'). For 'iso_forest' the forest
      seed is read from 'seed' when present, otherwise from 'random_state'
      (the seed key used by the rest of the pipeline).
    train_df: training frame.
    test_df: test frame.

  Returns:
    (train_df, test_df); both are returned untouched when the method is
    not one of the supported values.
  '''
  cols = config['feature_cols']
  method = config['outlier_handling_method']

  if method == 'winsorize':
    train_df, test_df = winsorize_outlier_handling(train_df, test_df, cols=cols)
  elif method == 'iso_forest':
    # Fall back to the pipeline-wide seed key instead of failing on a missing 'seed'.
    seed = config['seed'] if 'seed' in config else config['random_state']
    train_df, test_df = isolation_forest_outlier_handling(train_df, test_df, cols=cols, 
                                  outlier_thresh=-0.1, handling_method='drop_median', seed=seed)
  elif method == 'iqr':
    train_df, test_df = IQR_outlier_handling(train_df, test_df, cols, handling_type='clip')

  return train_df, test_df


In [None]:


def get_config():
  '''Return the notebook-level `config` object, looked up as a global at call time.'''
  shared_config = config
  return shared_config


def save_config(config):
  '''Write the config to saved_config.json locally and copy it to Drive.

  Only top-level values that are bool, int, float, str, list, dict or None
  are kept; other values (objects etc.) are skipped because they can cause
  problems when the file is read back. Nested contents are not inspected.

  Args:
    config: mapping with 'local_model_dir' and 'drive_model_dir'.
  '''
  json_safe_types = (bool, int, float, str, list, dict, type(None))
  config_to_save = {
      key: value for key, value in config.items() if isinstance(value, json_safe_types)
  }

  fname = 'saved_config.json'
  local_path = os.path.join(config['local_model_dir'], fname)
  drive_path = os.path.join(config['drive_model_dir'], fname)

  with open(local_path, 'w') as fp:
    json.dump(config_to_save, fp, indent=4, sort_keys=True)

  shutil.copy(local_path, drive_path)


def save_fold_idx(config, fold_idx_dict):
  '''Pickle the fold index dict locally and copy the file to Drive.

  Args:
    config: mapping with 'local_model_dir' and 'drive_model_dir'.
    fold_idx_dict: object to pickle into fold_idx_dict.pkl.
  '''
  fname = 'fold_idx_dict.pkl'
  local_path = os.path.join(config['local_model_dir'], fname)
  drive_path = os.path.join(config['drive_model_dir'], fname)

  with open(local_path, 'wb') as fp:
    pickle.dump(fold_idx_dict, fp)

  shutil.copy(local_path, drive_path)


def load_fold_idx(config):
  '''Copy fold_idx_dict.pkl from Drive to the local model dir and unpickle it.

  The Drive copy always overwrites the local one before loading.

  Args:
    config: mapping with 'local_model_dir' and 'drive_model_dir'.

  Returns:
    The unpickled fold index dict.
  '''
  fname = 'fold_idx_dict.pkl'
  local_path = os.path.join(config['local_model_dir'], fname)
  drive_path = os.path.join(config['drive_model_dir'], fname)

  shutil.copy(drive_path, local_path)

  # NOTE: unpickling runs code embedded in the file; only load pickles written by this notebook.
  with open(local_path, 'rb') as fp:
    return pickle.load(fp)


def get_model_config(config):
  '''Copy saved_config.json from Drive to the local model dir and parse it.

  The Drive copy always overwrites the local one before loading.

  Args:
    config: mapping with 'local_model_dir' and 'drive_model_dir'.

  Returns:
    The parsed JSON content of saved_config.json.
  '''
  fname = 'saved_config.json'
  local_path = os.path.join(config['local_model_dir'], fname)
  drive_path = os.path.join(config['drive_model_dir'], fname)

  shutil.copy(drive_path, local_path)

  with open(local_path, 'r') as fp:
    return json.load(fp)

'''
'id', 'Mean_Integrated', 'SD', 'EK', 'Skewness', 'Mean_DMSNR_Curve',
'SD_DMSNR_Curve', 'EK_DMSNR_Curve', 'Skewness_DMSNR_Curve', 'Class'
'''
def across_col_feat_v1(config, df):
  '''Generate pairwise and element-wise features from every feature column.

  Feature columns are all columns except 'id', 'Class' and 'original_data'.
  Generated, in this column order:
    <a>_by_<b>    a / (b + 1e-4) for every ordered pair a != b
    <a>_into_<b>  a * b for every ordered pair a != b
    <a>_square, <a>_sqrt (only if min(a) > 0), <a>_cube,
    <a>_log (log(a + 1e-4), only if min(a) > 0), <a>_exp

  Afterwards every column (original or generated) that contains NaN or inf
  is reported and dropped. Which sqrt/log features exist and which columns
  get dropped depends on the values in `df`, so two frames processed
  separately can end up with different columns.

  All generated columns are collected first and attached with a single
  concat instead of one concat per feature.

  Args:
    config: unused.
    df: input frame; it is not modified.

  Returns:
    New frame with the surviving original and generated columns.
  '''
  feat_cols = [col for col in df.columns if col not in ['id', 'Class', 'original_data']]

  # name -> Series; dict insertion order fixes the output column order.
  new_feats = {}

  for col1 in feat_cols:
    for col2 in feat_cols:
      if col1 == col2:
        continue
      new_feats[col1 + '_by_' + col2] = df[col1] / (df[col2]+1e-4)

  for col1 in feat_cols:
    for col2 in feat_cols:
      if col1 == col2:
        continue
      new_feats[col1 + '_into_' + col2] = df[col1] * df[col2]

  for col in feat_cols:
    new_feats[col + '_square'] = df[col]**2

  for col in feat_cols:
    if df[col].min() > 0:
      new_feats[col + '_sqrt'] = df[col]**0.5

  for col in feat_cols:
    new_feats[col + '_cube'] = df[col]**3

  for col in feat_cols:
    if df[col].min() > 0:
      new_feats[col+'_log'] = np.log(df[col]+1e-4)

  for col in feat_cols:
    new_feats[col + '_exp'] = np.exp(df[col])

  if new_feats:
    df = pd.concat([df, pd.concat(new_feats, axis=1)], axis=1)

  # ------------------------
  # Report and drop every column holding NaN or inf values.
  cols_to_drop = []
  for col in df.columns:
    nan_cnt = np.sum(np.isnan(df[col]))
    inf_cnt = np.sum(np.isinf(df[col]))
    if nan_cnt > 0:
      print(f"column {col} has {nan_cnt} nans...")
    if inf_cnt > 0:
      print(f"column {col} has {inf_cnt} infs...")

    if nan_cnt > 0 or inf_cnt > 0:
      print(f"Dropping column: {col}")
      cols_to_drop.append(col)
      print("-"*30)

  df = df.drop(columns=cols_to_drop)

  return df

'''
'id', 'Mean_Integrated', 'SD', 'EK', 'Skewness', 'Mean_DMSNR_Curve',
'SD_DMSNR_Curve', 'EK_DMSNR_Curve', 'Skewness_DMSNR_Curve', 'Class'

Skewness-kurtosis diagram: This is a graphical representation of the skewness and kurtosis of a distribution, 
and it can be used to compare the shapes of different distributions. 
Pulsars are expected to have high kurtosis and low skewness values, and you can plot the skewness 
and kurtosis of the radio wave recording on the diagram to determine if it is consistent with a pulsar signal.

'''
def across_col_feat_v2(config, df):
  '''Add hand-crafted statistical features built from the pulsar columns.

  Requires the columns 'Mean_Integrated', 'SD', 'EK', 'Skewness',
  'Mean_DMSNR_Curve', 'SD_DMSNR_Curve', 'EK_DMSNR_Curve' and
  'Skewness_DMSNR_Curve'. Adds, in order:
    - mean +/- k*SD bands for k in 1..3 (observations, then DMSNR)
    - variances (SD squared)
    - 'coef_variation_*' (computed here as variance / mean)
    - 'mod_index_*' (SD / mean)
    - skewness/kurtosis ratios and products

  Ratio features whose denominator is zero would be +/-inf; those entries
  are stored as NaN instead so downstream steps see a missing value rather
  than an infinite one.

  Args:
    config: unused.
    df: input frame; it is not modified.

  Returns:
    A new frame with the 24 additional feature columns appended.
  '''
  def _safe_ratio(numerator, denominator):
    # Division by zero yields +/-inf; turn those into NaN (0/0 is already NaN).
    return (numerator / denominator).replace([np.inf, -np.inf], np.nan)

  df = df.copy()

  # -------
  # observations confidence intervals
  for ci_mult in [1,2,3]:
    df['CI_'+str(ci_mult)+'_pos_observations'] = df['Mean_Integrated'] + (ci_mult*df['SD'])
    df['CI_'+str(ci_mult)+'_neg_observations'] = df['Mean_Integrated'] - (ci_mult*df['SD'])

  # Dispersion Measure (DM) SNR(Signal-to-Noise ratio) confidence intervals
  for ci_mult in [1,2,3]:
    df['CI_'+str(ci_mult)+'_pos_DMSNR'] = df['Mean_DMSNR_Curve'] + (ci_mult*df['SD_DMSNR_Curve'])
    df['CI_'+str(ci_mult)+'_neg_DMSNR'] = df['Mean_DMSNR_Curve'] - (ci_mult*df['SD_DMSNR_Curve'])

  # -------
  # observations variance
  df['var_observations'] = df['SD']**2

  # DMSNR variance
  df['var_DMSNR'] = df['SD_DMSNR_Curve']**2

  # -------
  # "coef of variation" columns - computed as variance / mean
  df['coef_variation_observations'] = _safe_ratio(df['var_observations'], df['Mean_Integrated'])
  df['coef_variation_DMSNR'] = _safe_ratio(df['var_DMSNR'], df['Mean_DMSNR_Curve'])

  # -------
  # modulation index - SD / mean
  df['mod_index_observations'] = _safe_ratio(df['SD'], df['Mean_Integrated'])
  df['mod_index_DMSNR'] = _safe_ratio(df['SD_DMSNR_Curve'], df['Mean_DMSNR_Curve'])

  # -------
  # Pulsars are expected to have high kurtosis and low skewness values
  df['skew_by_excess_kurt_observations'] = _safe_ratio(df['Skewness'], df['EK'])
  df['excess_kurt_by_skewness_observations'] = _safe_ratio(df['EK'], df['Skewness'])
  df['skew_by_excess_kurt_DMSNR'] = _safe_ratio(df['Skewness_DMSNR_Curve'], df['EK_DMSNR_Curve'])
  df['excess_kurt_by_skewness_DMSNR'] = _safe_ratio(df['EK_DMSNR_Curve'], df['Skewness_DMSNR_Curve'])
  df['skew_into_kurt_observations'] = df['Skewness'] * df['EK']
  df['skew_into_kurt_DMSNR'] = df['Skewness_DMSNR_Curve'] * df['EK_DMSNR_Curve']

  return df



def extract_features(config, train_df, test_df):
  '''Apply the configured feature-engineering version to train and test.

  config['feature_version'] selects the builder: None (no extra features),
  'v1' (across_col_feat_v1) or 'v2' (across_col_feat_v2). An unsupported
  value prints a message and returns the frames unchanged.

  Because the builders run on each frame separately and may create or drop
  columns depending on the data, generated features that do not end up in
  both frames are removed so train and test expose the same feature set.

  Args:
    config: mapping with 'feature_version'.
    train_df: training frame.
    test_df: test frame.

  Returns:
    (train_df, test_df)
  '''
  feature_version = config['feature_version']

  if feature_version is None:
    return train_df, test_df
  elif feature_version == 'v1':
    build_features = across_col_feat_v1
  elif feature_version == 'v2':
    build_features = across_col_feat_v2
  else:
    print("Feature version {} not supported. Choose from : None, v1, v2".format(feature_version))
    return train_df, test_df

  train_input_cols = set(train_df.columns)
  test_input_cols = set(test_df.columns)

  train_df = build_features(config, train_df)
  test_df = build_features(config, test_df)

  # Keep only generated features that exist in both frames.
  train_generated = [col for col in train_df.columns if col not in train_input_cols]
  test_generated = [col for col in test_df.columns if col not in test_input_cols]
  unshared = set(train_generated).symmetric_difference(test_generated)
  if unshared:
    print("Dropping generated features not present in both train and test: ", sorted(unshared))
    train_df = train_df.drop(columns=[col for col in train_generated if col in unshared])
    test_df = test_df.drop(columns=[col for col in test_generated if col in unshared])

  return train_df, test_df
    

# consider prior distribution of labels?
def find_optimal_cuts():
  '''Placeholder: not implemented yet; does nothing and returns None.'''
  return None


def convert_cont_preds_to_int(preds):
  '''Round continuous predictions to the nearest whole value with np.round.

  Other options that were considered but are not implemented: searching for
  optimal split points, optionally taking label priors into account.

  Args:
    preds: array-like of continuous predictions.

  Returns:
    The result of np.round(preds).
  '''
  rounded = np.round(preds)
  return rounded


# # def wqkappa(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
# def wqkappa(predt: np.ndarray, dtrain: xgb.DMatrix):
#   ''' Weighted quadratic metric.'''
  
#   y = dtrain.get_label()
  
#   # predt[predt < -1] = -1 + 1e-6
#   # elements = np.power(np.log1p(y) - np.log1p(predt), 2)
#   wqkappa_score = cohen_kappa_score(y, predt, weights='quadratic')
#   return 'wqkappa', wqkappa_score



# def wqkappa(pred: np.ndarray, label: np.ndarray):
#   ''' Weighted quadratic metric.'''
#   wqkappa_score = cohen_kappa_score(label, pred, weights='quadratic')

#   return 'wqkappa', wqkappa_score


def wqkappa(label, pred):
  '''Negated quadratic-weighted Cohen kappa between labels and argmax predictions.

  Args:
    label: true class labels.
    pred: 2-D array of per-class scores; the predicted class is the argmax
      along axis 1.

  Returns:
    -1 * kappa, so that a lower value means better agreement.
  '''
  predicted_classes = np.argmax(pred, axis=1)
  kappa = cohen_kappa_score(label, predicted_classes, weights='quadratic')
  return -1 * kappa


def pca_dim_red(config, train_df, test_df): 
  '''Replace the feature columns with PCA components fitted on the training frame.

  PCA is configured with n_components=0.99 and svd_solver='full'. The
  components are named pca_0, pca_1, ... and are placed in front of the
  non-feature columns of each frame; both frames get a fresh 0..n-1 index.

  Args:
    config: mapping with 'feature_cols'.
    train_df: training frame (used to fit the PCA).
    test_df: test frame (transformed with the train-fitted PCA).

  Returns:
    (train_df, test_df) with PCA components plus the non-feature columns.
  '''
  feature_cols = config['feature_cols']
  pca_obj = PCA(n_components=0.99, svd_solver='full', whiten=False, copy=True, tol=0.0)

  print("Before PCA transform:::::::")
  print("train_df.shape: ", train_df.shape)
  # Report the test frame's own shape (not the train shape).
  print("test_df.shape: ", test_df.shape)
  train_df_trans = pca_obj.fit_transform(train_df[feature_cols])
  test_df_trans = pca_obj.transform(test_df[feature_cols])
  print("After PCA transform:::::::")
  print("train_df_trans.shape: ", train_df_trans.shape)
  print("test_df_trans.shape: ", test_df_trans.shape)

  non_feat_train_cols = [col for col in train_df.columns if col not in feature_cols]
  non_feat_test_cols = [col for col in test_df.columns if col not in feature_cols]

  pca_df_col_names = ['pca_'+str(i) for i in range(train_df_trans.shape[1])]

  train_df_trans = pd.DataFrame(train_df_trans, columns=pca_df_col_names)
  test_df_trans = pd.DataFrame(test_df_trans, columns=pca_df_col_names)

  # Reset indices on both sides so the column-wise concat lines rows up by position.
  train_df = pd.concat((train_df_trans.reset_index(drop=True), 
                        train_df[non_feat_train_cols].reset_index(drop=True)), axis=1)
  test_df = pd.concat((test_df_trans.reset_index(drop=True), 
                       test_df[non_feat_test_cols].reset_index(drop=True)), axis=1)

  print(">>>>")
  print("After merging PCA feats with non feature cols::::")
  print("train_df.shape: ", train_df.shape)
  print("test_df.shape: ", test_df.shape)

  return train_df, test_df

  

In [None]:

def train_k_folds():
  '''Train one XGBoost classifier per requested fold and report fold metrics.

  Steps, in order:
    1. Fetch the shared config, create directories and load the data.
    2. Build fold indices (on competition data only when
       config['validate_only_comp_data'] is set, otherwise after the
       original data has been appended) and save them.
    3. Feature extraction, optional outlier handling, optional scaling and
       optional dimensionality reduction.
    4. Save a copy of the notebook and the config, start a wandb run.
    5. For each fold in config['folds_to_train']: fit, save the model, and
       compute logloss / AUROC / weighted F1 on train and validation data.
    6. Print fold-averaged metrics and log the average logloss to wandb.

  Raises:
    ValueError: if outlier handling changes the number of training rows
      (the saved fold indices are positional and would no longer match), or
      if config['dim_red_method'] is not a known method.
  '''
  # Fetch the shared config explicitly; it is updated throughout this function.
  config = get_config()
  
  create_paths(config)
  train_df, test_df, sub_df, orig_data_df = get_data(config)
  print("train_df.shape: ", train_df.shape)
  print("test_df.shape: ", test_df.shape)
  print("sub_df.shape: ", sub_df.shape)
  print("orig_data_df.shape: ", orig_data_df.shape)
  
  # ------------------------------------------
  # Fold assignment. When validating on competition data only, folds are generated
  # before the original data is appended, so validation rows never include it.
  
  if config['validate_only_comp_data']:
    fold_idx_dict = generate_fold_idx(config, train_df)

  if config['include_orig_data']: 
    train_df = pd.concat((train_df, orig_data_df), axis=0)
    print("After appending orig data to train data: ")
    print("train_df.shape: ", train_df.shape)
  
  if not config['validate_only_comp_data']:
    fold_idx_dict = generate_fold_idx(config, train_df)
  
  # Rebuild each fold's train_idx as "every row position that is not a validation row",
  # so rows appended after fold generation are trained on in every fold.
  all_idx = np.arange(0, train_df.shape[0])
  for fold_num in fold_idx_dict.keys():
    val_idx = fold_idx_dict[fold_num]['val_idx']
    fold_idx_dict[fold_num]['train_idx'] = np.setdiff1d(all_idx, val_idx)
  
  print("Saving fold_idx_dict...")
  save_fold_idx(config, fold_idx_dict)

  # ---------------

  print("Before feature extraction: train_df.shape: ", train_df.shape)
  print("Before feature extraction: test_df.shape: ", test_df.shape)
  train_df, test_df = extract_features(config, train_df, test_df)
  print("After feature extraction: train_df.shape: ", train_df.shape)
  print("After feature extraction: test_df.shape: ", test_df.shape)
  
  # ---------------

  config = get_feature_cols(config, train_df)
  print("config['feature_cols']): ", config['feature_cols'])
  print("# feature_cols: ", len(config['feature_cols']))

  # ---------------
  
  if config['handle_outliers']:
    print("Before outlier handling: ")
    print(f"train_df.shape: {train_df.shape} \t test_df.shape: {test_df.shape}")
    num_train_rows_before = train_df.shape[0]
    train_df, test_df = handle_outliers(config, train_df, test_df)
    print("After outlier handling: ")
    print(f"train_df.shape: {train_df.shape} \t test_df.shape: {test_df.shape}")
    # The fold indices saved above are row positions; they are only valid while the
    # number (and order) of training rows is unchanged.
    if train_df.shape[0] != num_train_rows_before:
      raise ValueError(
          "Outlier handling changed the number of training rows ({} -> {}); the saved fold "
          "indices no longer match train_df.".format(num_train_rows_before, train_df.shape[0]))
  
  # ---------------
  # PCA is always run on scaled data.
  if config['dim_reduction'] and config['dim_red_method']=='PCA':
    config['scale_data'] = True
  
  if config['scale_data']:
    train_df, test_df = scale_data_fn(config, train_df, test_df)
  
  # ------------------
  
  if config['dim_reduction']:
    if config['dim_red_method']=='PCA':
      train_df, test_df = pca_dim_red(config, train_df, test_df)
    elif config['dim_red_method']=='UMAP':
      print("dim reduction method UMAP not yet supported...")
    else: 
      raise ValueError(
          "dim_red_method {!r} not recognized... Choose from: PCA".format(config['dim_red_method']))
    
  # Feature columns may have been replaced (e.g. by PCA components); refresh them.
  config = get_feature_cols(config, train_df)
  print("config['feature_cols']): ", config['feature_cols'])
  print("# feature_cols: ", len(config['feature_cols']))

  # ------------------------
  
  per_model_metrics = {
                        'AUROC': {'train': [], 'val': []},
                        'f1_score': {'train': [], 'val': []},
                        'logloss': {'train': [], 'val': []},
                      }
  
  # Keep a copy of this notebook and the config next to the saved models.
  shutil.copy('/content/drive/MyDrive/Playground Series/S03_E12/code/play_s03e12_model_xgb_clf_1.ipynb', 
              os.path.join(config['drive_model_dir'], 'play_s03e12_model_xgb_clf_1.ipynb'))
  save_config(config)

  # ------------
  # One wandb run covers all folds.
  wandb.init(name=config['model_name'], project=config['project_name'], 
            tags=['baseline'], config=config)
  if config['choice'] == 3:
    print("Updating sweep configs...")
    for k, v in wandb.config.items():
      config[k] = v
    print("*** Updated sweep config: ", config)
  
  # ------------

  for fold_num in range(config['num_folds']):
    if fold_num not in config['folds_to_train']:
      continue

    print("Training fold: ", fold_num)
    config['curr_fold'] = fold_num
    
    set_seeds(config)
    
    # -----------
    
    train_idx = fold_idx_dict[fold_num]['train_idx']
    val_idx = fold_idx_dict[fold_num]['val_idx']
    print("len(train_idx): {} \t len(val_idx): {}".format(len(train_idx), len(val_idx)))
    
    train_data = train_df[config['feature_cols']].iloc[train_idx]
    train_label = train_df[config['target_cols']].iloc[train_idx]

    val_data = train_df[config['feature_cols']].iloc[val_idx]
    val_label = train_df[config['target_cols']].iloc[val_idx]

    xgb_params = get_xgb_params(config)

    if config['scale_pos_weight'] == 'auto':
      # negatives / positives in this fold's training labels
      fold_labels = train_label.values.ravel()
      auto_pos_cls_weight = int(np.sum(fold_labels == 0)) / int(np.sum(fold_labels == 1))
      print("Setting scale_pos_weight to: ", auto_pos_cls_weight)
      xgb_params['scale_pos_weight'] = auto_pos_cls_weight
    else:
      xgb_params['scale_pos_weight'] = config['scale_pos_weight']
    
    # early_stopping_rounds is passed through xgb_params (constructor), not through fit().
    model = xgb.XGBClassifier(**xgb_params)
    
    print("Training model ...")
    model.fit(train_data, train_label, 
              eval_set=[(train_data, train_label), 
                        (val_data, val_label)], 
              verbose=50, 
              )
    
    print("Saving Model...")
    save_model(config, model)

    # -----------
    # eval_set order above: validation_0 = train data, validation_1 = validation data.
    eval_metric = config['eval_metric']
    val_curve = model.evals_result()['validation_1'][eval_metric]

    if config['choice'] == 1:
      train_curve = model.evals_result()['validation_0'][eval_metric]
      for train_value, val_value in zip(train_curve, val_curve):
        wandb.log({
          f"Per Epoch Train {eval_metric}": train_value, 
          f"Per Epoch Val {eval_metric}": val_value, 
          }
        )
    
    if eval_metric in ['loss', 'wqkappa', 'logloss']:
      # These metrics are minimised (wqkappa is reported as -1*kappa), so the best
      # epoch is the one with the smallest value.
      print("Best val epoch: min validation_1 metric: ", np.min(val_curve))
      print("Best val epoch number:  ", np.argmin(val_curve))
    else:
      print("Best val epoch: max validation_1 metric: ", np.max(val_curve))
      print("Best val epoch number:  ", np.argmax(val_curve))
    
    print("model.best_ntree_limit: ", model.best_ntree_limit)

    # ------------------
    
    train_pred_probs = model.predict_proba(train_data, ntree_limit=model.best_ntree_limit)
    val_pred_probs = model.predict_proba(val_data, ntree_limit=model.best_ntree_limit)
    train_preds = np.argmax(train_pred_probs, axis=1)
    val_preds = np.argmax(val_pred_probs, axis=1)
    
    # ------------------
    # Column 1 of the probability matrix is used as the positive-class score.

    train_auroc = roc_auc_score(train_label.values, train_pred_probs[:, 1])
    val_auroc = roc_auc_score(val_label.values, val_pred_probs[:, 1])
    
    train_logloss = log_loss(train_label.values, train_pred_probs[:, 1])
    val_logloss = log_loss(val_label.values, val_pred_probs[:, 1])
    
    train_f1 = f1_score(train_label.values, train_preds, average='weighted')
    val_f1 = f1_score(val_label.values, val_preds, average='weighted')
    
    # ------------------

    per_model_metrics['logloss']['train'].append(train_logloss)
    per_model_metrics['logloss']['val'].append(val_logloss)
    per_model_metrics['AUROC']['train'].append(train_auroc)
    per_model_metrics['AUROC']['val'].append(val_auroc)
    per_model_metrics['f1_score']['train'].append(train_f1)
    per_model_metrics['f1_score']['val'].append(val_f1)
    
    print(f"Logloss: Train: {train_logloss} \t Val: {val_logloss}")
    print(f"AUROC: Train: {train_auroc} \t Val: {val_auroc}")
    print(f"F1 score: Train: {train_f1} \t Val: {val_f1}")
    
    print('-'*30)
  
  print("Fold average stats.: ")
  print(f"Logloss: Train: {np.mean(per_model_metrics['logloss']['train'])} \t Val: {np.mean(per_model_metrics['logloss']['val'])}")
  print(f"AUROC: Train: {np.mean(per_model_metrics['AUROC']['train'])} \t Val: {np.mean(per_model_metrics['AUROC']['val'])}")
  print(f"F1 score: Train: {np.mean(per_model_metrics['f1_score']['train'])} \t Val: {np.mean(per_model_metrics['f1_score']['val'])}")
  
  print("Per fold train logloss: ", per_model_metrics['logloss']['train'])
  print("Per fold val logloss: ", per_model_metrics['logloss']['val'])
  
  wandb.log({
        "Fold Avg. Train Logloss": np.mean(per_model_metrics['logloss']['train']), 
        "Fold Avg. Val Logloss": np.mean(per_model_metrics['logloss']['val']),
        }
      )
  




In [None]:

def create_paths(config):
  """Fill `config` with the model/data/feature directory paths and create the working dirs.

  Keys written to `config`:
    local_model_dir   -> /content/model_store/<model_name>
    drive_model_dir   -> <drive_project_dir>/model_store/<model_name>
    local_data_dir    -> /content/data
    drive_data_dir    -> <drive_project_dir>/data/
    local_feature_dir -> /content/feature_store
    drive_feature_dir -> <drive_project_dir>/feature_store

  The two model directories and the local data/feature directories are created
  if missing. The Drive data/feature directories are only recorded, not created.

  Args:
    config: dict providing 'drive_project_dir' and 'model_name'; mutated in place.

  Raises:
    FileNotFoundError: if config['drive_project_dir'] does not exist.
  """
  drive_root = config['drive_project_dir']
  # Fail loudly instead of letting makedirs silently build a fake Drive tree on
  # local disk when the project directory is absent (e.g. Drive not mounted).
  if not os.path.isdir(drive_root):
    raise FileNotFoundError(f"Drive project directory not found: {drive_root!r}")

  config['local_model_dir'] = os.path.join('/content/model_store', config['model_name'])
  config['drive_model_dir'] = os.path.join(drive_root, 'model_store', config['model_name'])

  config['local_data_dir'] = '/content/data'
  config['drive_data_dir'] = os.path.join(drive_root, 'data/')

  config['local_feature_dir'] = '/content/feature_store'
  config['drive_feature_dir'] = os.path.join(drive_root, 'feature_store')

  # exist_ok=True makes the call idempotent and avoids the check-then-create race;
  # makedirs also creates missing parents (e.g. /content outside Colab).
  for dir_key in ('local_model_dir', 'drive_model_dir', 'local_data_dir', 'local_feature_dir'):
    os.makedirs(config[dir_key], exist_ok=True)



In [None]:

def _probs_to_labels(avg_probs):
  """Turn fold-averaged probabilities into integer class labels.

  A 2-D input is treated as one row per sample with one column per class and is
  arg-maxed over the columns. A 1-D input is treated as the positive-class
  probability and thresholded with `> 0.5`, which is what argmax over
  [1 - p, p] would give (ties go to class 0).
  """
  avg_probs = np.asarray(avg_probs)
  if avg_probs.ndim > 1:
    return np.argmax(avg_probs, axis=1).flatten()
  return (avg_probs > 0.5).astype(int)


def aggregate_preds(config, per_fold_test_preds, per_fold_test_preds_int, per_model_metrics=None): 
  """Combine per-fold test predictions into a single ensemble prediction.

  Args:
    config: dict; config['aggr_type'] selects the strategy:
      'simple'  - unweighted mean over folds,
      'auroc'   - mean weighted by each fold's validation AUROC,
      'logloss' - mean weighted by 1 / validation logloss of each fold,
      'mode'    - majority vote over the per-fold integer predictions.
    per_fold_test_preds: array-like with folds on axis 0, holding per-fold
      probabilities (either one value per sample or one row of class
      probabilities per sample).
    per_fold_test_preds_int: array-like with folds on axis 0, holding per-fold
      integer class predictions; only read for 'mode'.
    per_model_metrics: dict with ['AUROC']['val'] / ['logloss']['val'] lists, one
      entry per fold; required for 'auroc' and 'logloss'.

  Returns:
    (aggr_test_preds_cont, aggr_test_preds_int): the aggregated continuous
    predictions and the corresponding integer labels. For 'mode' both are the
    majority-vote labels.

  Raises:
    ValueError: for an unknown aggr_type, or when a weighted strategy is
      requested without per_model_metrics.
  """
  aggr_type = config['aggr_type']

  if aggr_type in ('auroc', 'logloss') and per_model_metrics is None:
    raise ValueError(f"aggr_type='{aggr_type}' requires per_model_metrics")

  if aggr_type == 'simple':
    # No flatten here: it is a no-op for 1-D averages and would destroy the
    # (sample, class) layout needed by the label conversion for 2-D averages.
    aggr_test_preds_cont = np.average(per_fold_test_preds, axis=0)
    aggr_test_preds_int = _probs_to_labels(aggr_test_preds_cont)
  elif aggr_type == 'auroc':
    aggr_test_preds_cont = np.average(per_fold_test_preds, axis=0, weights=per_model_metrics['AUROC']['val'])
    aggr_test_preds_int = _probs_to_labels(aggr_test_preds_cont)
  elif aggr_type == 'logloss':
    # Lower logloss is better, so folds are weighted by its reciprocal.
    weights = 1/np.array(per_model_metrics['logloss']['val'])
    aggr_test_preds_cont = np.average(per_fold_test_preds, axis=0, weights=weights).flatten()
    # NOTE(review): convert_cont_preds_to_int is defined outside this cell; its
    # thresholding/rounding rule could not be checked here - confirm it suits
    # binary probabilities.
    aggr_test_preds_int = convert_cont_preds_to_int(aggr_test_preds_cont)
  elif aggr_type == 'mode':
    from scipy import stats  # local import: only this branch needs scipy
    # stats.mode returns a (mode, count) result object; keep only the modal
    # values. ravel() normalises the shape across scipy versions.
    aggr_test_preds_int = np.asarray(stats.mode(per_fold_test_preds_int, axis=0).mode).ravel()
    aggr_test_preds_cont = aggr_test_preds_int
  else:
    raise ValueError(f"Unknown aggr_type: {aggr_type!r}")
  
  return aggr_test_preds_cont, aggr_test_preds_int



def test_model(config):
  """Score the saved per-fold models and write an ensembled test submission.

  Steps, in order:
    1. create the working directories (`create_paths`);
    2. copy the data-pipeline keys found in `get_model_config(config)` over `config`;
    3. load the data and rebuild the frames (feature extraction, optional
       outlier handling, scaling and PCA);
    4. for every fold in config['folds_to_train'], load the model, predict on
       the fold's train/val split and on the test set, and record logloss,
       AUROC and weighted F1;
    5. aggregate the per-fold positive-class test probabilities with
       `aggregate_preds` and save them as `sample_submission.csv` in the local
       model directory, then copy the file to the Drive model directory.

  Args:
    config: run configuration dict; mutated in place (paths, overwritten
      pipeline keys, 'curr_fold', feature columns).

  Raises:
    ValueError: if dimensionality reduction is enabled with an unsupported
      method, or if no fold was evaluated.
  """
  create_paths(config)

  # ----------------------------------

  # NOTE(review): presumably the settings stored when the model was trained;
  # get_model_config is defined outside this cell - confirm.
  model_training_config = get_model_config(config)
  print("model_training_config: ", model_training_config)
  for key in ['scale_data', 'scaler_type', 'handle_outliers', 'outlier_handling_method', 
              'feature_version', 'validate_only_comp_data', 'include_orig_train_data', 'include_orig_test_data', 
              'fold_split_type', 'num_folds', 'random_state', 'seed', 
              'dim_reduction', 'dim_red_method']:  
    if key in model_training_config.keys():
      print(f"Overwriting value for {key} with: {model_training_config[key]}")
      config[key] = model_training_config[key]
  
  # PCA is only run on scaled data, so scaling is forced on for it.
  if config['dim_reduction'] and config['dim_red_method']=='PCA':
    config['scale_data'] = True
  
  # ----------------------------------

  train_df, test_df, sub_df, orig_data_df = get_data(config)
  print("train_df.shape: ", train_df.shape)
  print("test_df.shape: ", test_df.shape)
  print("sub_df.shape: ", sub_df.shape)
  print("orig_data_df.shape: ", orig_data_df.shape)
  
  # -------------------------------------------

  # With validate_only_comp_data the folds are drawn before the original
  # dataset is appended, so validation indices only cover competition rows.
  if config['validate_only_comp_data']:
    fold_idx_dict = generate_fold_idx(config, train_df)

  if config['include_orig_data']: 
    train_df = pd.concat((train_df, orig_data_df), axis=0)
    print("After appending orig data to train data: ")
    print("train_df.shape: ", train_df.shape)
  
  if not config['validate_only_comp_data']:
    fold_idx_dict = generate_fold_idx(config, train_df)
  
  # Every row that is not a validation row of the fold is a training row.
  for fold_num in fold_idx_dict.keys():
    val_idx = fold_idx_dict[fold_num]['val_idx']
    all_idx = np.arange(0, train_df.shape[0])
    train_idx = np.setdiff1d(all_idx, val_idx)
    fold_idx_dict[fold_num]['train_idx'] = train_idx
  
  # NOTE(review): the dict built above is replaced by the stored one here. The
  # generation step is kept in case generate_fold_idx has side effects (e.g.
  # saving to disk) that load_fold_idx relies on - confirm and drop if not.
  print("Loading fold_idx_dict...")
  fold_idx_dict = load_fold_idx(config)
  
  # -------------------------------------------

  test_ids = test_df.id.values
  
  # ---------------

  print("Before feature extraction: train_df.shape: ", train_df.shape)
  print("Before feature extraction: test_df.shape: ", test_df.shape)
  train_df, test_df = extract_features(config, train_df, test_df)
  print("After feature extraction: train_df.shape: ", train_df.shape)
  print("After feature extraction: test_df.shape: ", test_df.shape)
  
  # ---------------

  config = get_feature_cols(config, train_df)
  print("# feature_cols: ", len(config['feature_cols']))
  
  # ------------------
  
  if config['handle_outliers']:
    print("Before outlier handling: ")
    print(f"train_df.shape: {train_df.shape} \t test_df.shape: {test_df.shape}")
    train_df, test_df = handle_outliers(config, train_df, test_df)
    print("After outlier handling: ")
    print(f"train_df.shape: {train_df.shape} \t test_df.shape: {test_df.shape}")
  
  # ------------------

  # Re-checked because `config` was re-bound by get_feature_cols above.
  if config['dim_reduction'] and config['dim_red_method']=='PCA':
    config['scale_data'] = True
  
  if config['scale_data']:
    print("Scaling data...")
    train_df, test_df = scale_data_fn(config, train_df, test_df)

  # ------------------
  if config['dim_reduction']:
    if config['dim_red_method']=='PCA':
      train_df, test_df = pca_dim_red(config, train_df, test_df)
    elif config['dim_red_method']=='UMAP':
      print("dim reduction method UMAP not yet supported...")
    else: 
      # A bare `raise` here had no active exception to re-raise.
      raise ValueError(f"Unsupported dim_red_method: {config['dim_red_method']!r}")
    
  # Feature columns are recomputed after scaling / dimensionality reduction.
  config = get_feature_cols(config, train_df)
  print("config['feature_cols']): ", config['feature_cols'])
  print("# feature_cols: ", len(config['feature_cols']))

  # ------------------------

  per_model_metrics = {
                        'logloss': {'train': [], 'val': []},
                        'AUROC': {'train': [], 'val': []},
                        'f1_score': {'train': [], 'val': []},
                      }

  # One entry per evaluated fold; stacked into arrays after the loop.
  fold_test_prob_list = []
  fold_test_pred_list = []

  for fold_num in range(config['num_folds']):
    if fold_num not in config['folds_to_train']:
      continue
    print("Training fold: ", fold_num)
    # load_model receives only `config`, so the fold is communicated through it.
    config['curr_fold'] = fold_num

    # -----------

    train_idx = fold_idx_dict[fold_num]['train_idx']
    val_idx = fold_idx_dict[fold_num]['val_idx']
    print("len(train_idx): {} \t len(val_idx): {}".format(len(train_idx), len(val_idx)))
    
    train_data = train_df[config['feature_cols']].iloc[train_idx]
    train_label = train_df[config['target_cols']].iloc[train_idx]

    val_data = train_df[config['feature_cols']].iloc[val_idx]
    val_label = train_df[config['target_cols']].iloc[val_idx]

    test_data = test_df[config['feature_cols']]
    
    # --------

    model = load_model(config)

    train_pred_probs = model.predict_proba(train_data, ntree_limit=model.best_ntree_limit)
    val_pred_probs = model.predict_proba(val_data, ntree_limit=model.best_ntree_limit)
    test_pred_probs = model.predict_proba(test_data, ntree_limit=model.best_ntree_limit)
    
    train_preds = np.argmax(train_pred_probs, axis=1)
    val_preds = np.argmax(val_pred_probs, axis=1)
    test_preds = np.argmax(test_pred_probs, axis=1)

    # ------------------

    # Column 1 of predict_proba is used as the positive-class probability.
    train_logloss = log_loss(train_label.values, train_pred_probs[:, 1])
    val_logloss = log_loss(val_label.values, val_pred_probs[:, 1])
    train_auroc = roc_auc_score(train_label.values, train_pred_probs[:, 1])
    val_auroc = roc_auc_score(val_label.values, val_pred_probs[:, 1])
    train_f1 = f1_score(train_label.values, train_preds, average='weighted')
    val_f1 = f1_score(val_label.values, val_preds, average='weighted')
    
    # ------------------
    
    per_model_metrics['logloss']['train'].append(train_logloss)
    per_model_metrics['logloss']['val'].append(val_logloss)
    per_model_metrics['AUROC']['train'].append(train_auroc)
    per_model_metrics['AUROC']['val'].append(val_auroc)
    per_model_metrics['f1_score']['train'].append(train_f1)
    per_model_metrics['f1_score']['val'].append(val_f1)
    
    print(f"Logloss: Train: {train_logloss} \t Val: {val_logloss}")
    print(f"AUROC: Train: {train_auroc} \t Val: {val_auroc}")
    print(f"F1 score: Train: {train_f1} \t Val: {val_f1}")
    
    fold_test_prob_list.append(test_pred_probs)
    fold_test_pred_list.append(test_preds)

  if not fold_test_prob_list:
    raise ValueError("No fold was evaluated: config['folds_to_train'] selects no fold in range(config['num_folds'])")

  # Fold axis first: (n_folds, n_test, n_classes) and (n_folds, n_test).
  per_fold_test_pred_probs = np.stack(fold_test_prob_list, axis=0)
  per_fold_test_preds = np.stack(fold_test_pred_list, axis=0)

  print("Fold average stats.: ")
  print(f"Loss: Train: {np.mean(per_model_metrics['logloss']['train'])} \t Val: {np.mean(per_model_metrics['logloss']['val'])}")
  print(f"AUROC: Train: {np.mean(per_model_metrics['AUROC']['train'])} \t Val: {np.mean(per_model_metrics['AUROC']['val'])}")
  print(f"F1 score: Train: {np.mean(per_model_metrics['f1_score']['train'])} \t Val: {np.mean(per_model_metrics['f1_score']['val'])}")
  
  print("-"*30)

  print("Per fold train AUROC: ", per_model_metrics['AUROC']['train'])
  print("Per fold val AUROC: ", per_model_metrics['AUROC']['val'])
  
  print("Per fold train logloss: ", per_model_metrics['logloss']['train'])
  print("Per fold val logloss: ", per_model_metrics['logloss']['val'])

  print("-"*30)
  
  # ------------------

  print("per_fold_test_pred_probs.shape: ", per_fold_test_pred_probs.shape)
  print("per_fold_test_preds.shape: ", per_fold_test_preds.shape)
  print("per_fold_test_pred_probs[:, :, 1].shape: ", per_fold_test_pred_probs[:, :, 1].shape)
  
  # Only the positive-class probabilities are ensembled and submitted.
  aggr_test_pred_probs, aggr_test_preds = \
                  aggregate_preds(config, per_fold_test_pred_probs[:, :, 1], per_fold_test_preds, per_model_metrics)
  print("aggr_test_pred_probs[:3]: ", aggr_test_pred_probs[:3])
  print("aggr_test_preds[:10]: ", aggr_test_preds[:10])

  # Positional shape argument: the `newshape=` keyword is deprecated in recent NumPy.
  aggr_test_preds = np.reshape(aggr_test_preds, (aggr_test_preds.shape[0], 1))
  print("2. aggr_test_preds[:10]: ", aggr_test_preds[:10])
  print("2. aggr_test_preds.shape: ", aggr_test_preds.shape)

  sub_df = pd.DataFrame([])
  sub_df['id'] = test_ids 
  sub_df['Class'] = aggr_test_pred_probs
  sub_df.to_csv(os.path.join(config['local_model_dir'], 'sample_submission.csv'), index=False)
  shutil.copy(os.path.join(config['local_model_dir'], 'sample_submission.csv'), 
              os.path.join(config['drive_model_dir'], 'sample_submission.csv'))


# submission_{notebook_name}_{date}_{time}.csv


In [None]:

def model_analysis():
  """Plot the XGBoost feature importances of each selected fold model.

  Builds the config via `get_config`, prepares paths and data the same way the
  original cell did, then for every fold in config['folds_to_train'] loads the
  model and shows one importance plot titled with the fold number.
  """
  config = get_config()
  create_paths(config)
  
  # get_data yields four frames (train, test, submission, original data), as
  # unpacked in test_model; only the first two are needed here.
  train_df, test_df, _, _ = get_data(config)
  print("train_df.shape: ", train_df.shape)

  # NOTE(review): encode_data is defined outside this cell (test_model has its
  # call commented out) - confirm it is still available and wanted here.
  train_df, test_df, _ = encode_data(config, train_df, test_df)

  config = get_feature_cols(config, train_df)

  # NOTE(review): the fold indices are not used below; the call is kept in case
  # generate_fold_idx has side effects - confirm and drop if not.
  fold_idx_dict = generate_fold_idx(config, train_df)
  
  for fold_num in range(config['num_folds']):
    if fold_num not in config['folds_to_train']:
      continue
    # load_model receives only `config`, so the fold is communicated through it.
    config['curr_fold'] = fold_num

    model = load_model(config)

    # Draw on an explicit axes so the requested figure size is actually used.
    fig, ax = plt.subplots(figsize=(12, 10))
    xgb.plot_importance(model, ax=ax)
    ax.set_title(f"Fold: {fold_num}")
    plt.show()
    plt.close(fig)


In [None]:


# Run configuration shared by the training, testing, sweep and analysis entry points.
config = {
    # Entry point run by the dispatch at the end of this cell:
    # 1 = train_k_folds, 2 = test_model, 3 = W&B sweep over train_k_folds, 4 = model_analysis.
    'choice': 1, 
    'random_state': 21, # previously tried: 2640
    'aggr_type': 'logloss',  # simple / auroc / logloss / mode (branches of aggregate_preds)
    
    'model_name': 'xgb_model_1', 
    
    'feature_version': None,  # None, v1, v2, v3
    'include_orig_data': True,  # append orig_data_df to the competition train data
    'validate_only_comp_data': True,  # draw folds before the original data is appended
    
    'handle_outliers': False, 
    'outlier_handling_method': 'iqr', 
    
    'scale_data': True,  # forced to True when dim_reduction uses PCA
    'scaler_type': 'standard',  # standard / robust / minmax

    # TODO: decide whether an OrdinalEncoder step is needed.
    
    'dim_reduction': False, 
    'dim_red_method': 'PCA',   # PCA / UMAP (UMAP not implemented yet)
    
    'enable_categorical': False, 
    'fold_split_type': 'strat_kfold',  # kfold, strat_kfold
    'num_folds': 5, 
    'folds_to_train': [0], # folds actually processed, e.g. [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    
    'booster': 'gbtree', # gbtree, gblinear, dart; default: gbtree
    'tree_method': 'hist', # auto, exact, approx, hist, gpu_hist; switched to gpu_hist below when a GPU is detected
    'n_estimators': 100, # previously 9999
    'early_stopping_rounds': 30, # previously 200

    'colsample_bytree': 1.0, 
    'subsample': 1.0, 
    
    'max_depth': 5, # previously 6
    'max_leaves': 32, # previously 48
    'learning_rate': 0.1,  

    'reg_alpha': 0.0,  # Default: 0
    'reg_lambda': 0.0,  # NOTE(review): XGBoost documents the default for lambda as 1, not 0 - confirm
    
    # for the dart booster
    'sample_type': 'uniform',  # 'uniform'/ 'weighted'; Default: uniform
    'normalize_type': 'tree',  # 'tree' / 'forest'; Default: tree
    'rate_drop': 0.3,  # Default: 0.0
    'skip_drop': 0.6,  # Default: 0.0 

    'max_bin': 512,  # Default: 256
    'min_child_weight': 1, # Default: 1
    'gamma': 0,   # Default: 0

    # If 'auto', the value is overridden with sum(negative instances) / sum(positive instances).
    # Otherwise the provided value is used.
    # Default: 1
    'scale_pos_weight': 'auto',  # 'auto' / 10 / 25 etc.

    'verbosity': 1,

    # NOTE(review): 'binary:logistic' is the usual objective for a binary classifier - confirm 'reg:logistic' is intended.
    'objective': 'reg:logistic', 
    'eval_metric': 'logloss',  

    'use_gpu_if_available': True, 
    'predictor': 'gpu_predictor',  # replaced by 'cpu_predictor' below when no GPU is used
    'use_wandb': True, # Forced to True when choice == 3.
    'n_jobs': -1, 
    'data_dir': '/content/data/', 
    'drive_project_dir': '/content/drive/MyDrive/Playground Series/S03_E12', 
    'project_name': 'playground_s03_e12', 
}



# Pick the XGBoost device settings: GPU when requested and visible, otherwise
# downgrade any GPU-only options to their CPU equivalents.
gpu_requested = config['use_gpu_if_available']

if gpu_requested and torch.cuda.is_available():
  config.update({'gpu_id': 0, 'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'})
  print("GPU available... XGBoost will use GPU...")
else:
  if gpu_requested:
    print("NOT USING GPU!!!!!! Parameter 'use_gpu_if_available' is set to TRUE; But NO GPU IS VISIBLE!!!!!")
  for option, gpu_value, cpu_value in (('tree_method', 'gpu_hist', 'hist'),
                                       ('predictor', 'gpu_predictor', 'cpu_predictor')):
    if config[option] == gpu_value:
      config[option] = cpu_value
  if not gpu_requested:
    print("NOT USING GPU!!!!!! Parameter 'use_gpu_if_available' is set to False!!!!!!!")


# PCA is run on scaled data, so scaling is switched on whenever PCA is selected.
if config['dim_reduction']:
  if config['dim_red_method'] == 'PCA':
    config['scale_data'] = True


# The sweep (choice 3) reports its metric through W&B, so logging is forced on.
if config['choice'] == 3: config['use_wandb'] = True

if config['use_wandb']:
  os.environ['WANDB_MODE'] = 'online'
  # Credentials must not live in the notebook: the key is read from the
  # WANDB_API_KEY environment variable, with interactive login as the fallback.
  # NOTE(review): an API key was previously hardcoded here and should be
  # revoked/rotated, since it is part of the notebook history.
  wandb_api_key = os.environ.get('WANDB_API_KEY')
  if wandb_api_key:
    try: 
      wandb.login(key=wandb_api_key) 
    except Exception as login_err:
      print(f"wandb.login with WANDB_API_KEY failed ({login_err}); falling back to interactive login")
      wandb.login()
  else:
    wandb.login()
# else:
# os.environ['WANDB_MODE'] = 'offline'

set_seeds(config)

# Dispatch on config['choice']: 1 = train, 2 = test, 3 = W&B sweep, 4 = analysis.
run_mode = config['choice']

if run_mode == 3:
  # Grid sweep over 50 randomly drawn seeds, minimising the fold-averaged
  # validation logloss logged by train_k_folds.
  # Grids explored previously (currently disabled):
  #   colsample_bytree / subsample: [0.6, 0.8, 1.0]
  #   max_depth: [5, 8, 12, 24]          max_leaves: [8, 16, 32, 64]
  #   reg_alpha / reg_lambda: [0, 1.0, 2.0, 5.0]
  #   learning_rate: [0.01, 0.05, 0.1, 0.3, 0.5]
  #   max_bin: [64, 128, 256]            min_child_weight: [1, 16, 32]
  #   gamma: [0, 2, 5, 10]
  seed_grid = random.sample(range(1, 9999), 50)
  sweep_configs = dict(
      method="grid",
      metric=dict(name="Fold Avg. Val Logloss", goal="minimize"),
      parameters=dict(random_state=dict(values=seed_grid)),
  )
  print("Running sweep>>>>>>>>>>>>>>>>>>>>>>>>")
  sweep_id = wandb.sweep(sweep=sweep_configs, project=config['project_name']+'_sweep')
  wandb.agent(sweep_id=sweep_id, function=train_k_folds, count=50)
elif run_mode == 1:
  train_k_folds()
elif run_mode == 2:
  test_model(config)
elif run_mode == 4:
  model_analysis()
else:
  raise ValueError(f"Incorrect value for 'choice'={config['choice']} in config")





NOT USING GPU!!!!!! Parameter 'use_gpu_if_available' is set to TRUE; But NO GPU IS VISIBLE!!!!!
Read shape: train_df.shape:  (117564, 10)
Read shape: test_df.shape:  (78377, 9)
Read shape: sub_df.shape:  (78377, 2)
train_df.shape:  (117564, 11)
test_df.shape:  (78377, 9)
sub_df.shape:  (78377, 2)
orig_data_df.shape:  (17898, 11)
After appending orig data to train data: 
train_df.shape:  (135462, 11)
Saving fold_idx_dict...
Before feature extraction: train_df.shape:  (135462, 11)
Before feature extraction: test_df.shape:  (78377, 9)
After feature extraction: train_df.shape:  (135462, 11)
After feature extraction: test_df.shape:  (78377, 9)
config['feature_cols']):  ['Mean_Integrated', 'SD', 'EK', 'Skewness', 'Mean_DMSNR_Curve', 'SD_DMSNR_Curve', 'EK_DMSNR_Curve', 'Skewness_DMSNR_Curve']
# feature_cols:  8
config['feature_cols']):  ['Mean_Integrated', 'SD', 'EK', 'Skewness', 'Mean_DMSNR_Curve', 'SD_DMSNR_Curve', 'EK_DMSNR_Curve', 'Skewness_DMSNR_Curve']
# feature_cols:  8






0,1
Fold Avg. Train Logloss,▁
Fold Avg. Val Logloss,▁
Per Epoch Train logloss,█▄▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Per Epoch Val logloss,█▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Fold Avg. Train Logloss,0.02794
Fold Avg. Val Logloss,0.03344
Per Epoch Train logloss,0.02211
Per Epoch Val logloss,0.03376


Training fold:  0
len(train_idx): 111949 	 len(val_idx): 23513
Setting scale_pos_weight to:  9.750888312686065
Training model ...
[0]	validation_0-logloss:0.60647	validation_1-logloss:0.60572
[50]	validation_0-logloss:0.07164	validation_1-logloss:0.06886
[100]	validation_0-logloss:0.05654	validation_1-logloss:0.05697
[150]	validation_0-logloss:0.05156	validation_1-logloss:0.05408
[200]	validation_0-logloss:0.04723	validation_1-logloss:0.05181
[250]	validation_0-logloss:0.04393	validation_1-logloss:0.05011
[300]	validation_0-logloss:0.04080	validation_1-logloss:0.04836
[350]	validation_0-logloss:0.03764	validation_1-logloss:0.04663
[400]	validation_0-logloss:0.03523	validation_1-logloss:0.04552
[450]	validation_0-logloss:0.03246	validation_1-logloss:0.04420
[500]	validation_0-logloss:0.03036	validation_1-logloss:0.04344


KeyboardInterrupt: ignored

In [None]:
# NOTE(review): presumably a deliberate "stop here" guard so that Run All halts before the
# scratch cells below - confirm. A bare `raise` with no active exception fails with RuntimeError.
raise

RuntimeError: ignored

In [None]:
# import pandas as pd
# import numpy as np

# base_path = '/content/data'
# train_df = pd.read_csv(os.path.join(base_path, 'train.csv'))
# test_df = pd.read_csv(os.path.join(base_path, 'test.csv'))
# orig_data_df = pd.read_csv(os.path.join(base_path, 'orig_data.csv'))



In [None]:
# train_df.columns

In [None]:

# sub_df = pd.read_csv('/content/model_store/xgb_model_1/sample_submission.csv')
# sub_df
