<a href="https://colab.research.google.com/github/jaideepmurkute/100-pandas-puzzles/blob/master/S03_E08/play_s03e08_model_lgb_regr_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [69]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
! pip install wandb -U -qqq                                           
! pip install sklearn -U -qqq
! pip install xgboost==1.6.0 #-U -qqq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [71]:
# Import xgboost and echo its installed version as the cell output.
import xgboost
xgboost.__version__

'1.6.0'

In [72]:

import os
import random
import sys
import shutil

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import json
import joblib

sns.set_style('darkgrid')
from sklearn.datasets import fetch_california_housing 

from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.linear_model import LinearRegression, SGDOneClassSVM
from sklearn.preprocessing import StandardScaler, RobustScaler
# from sklearn.neighbors import LocalOutlierFactor

from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error

# import xgboost as xgb
import lightgbm as lgb
import torch

from scipy.stats.mstats import winsorize
from scipy.stats import mode

import wandb

# import category_encoders as ce



In [73]:

def set_seeds(config):
  """Seed the global NumPy and Python RNGs from ``config["random_state"]``.

  The seed is also written to the ``PYTHONHASHSEED`` environment variable
  (as a string). Returns ``None``.
  """
  seed = config["random_state"]
  np.random.seed(seed)
  random.seed(seed)
  os.environ["PYTHONHASHSEED"] = str(seed)
  # Torch seeding is currently disabled; kept here for reference:
  #   torch.manual_seed(config["random_state"])
  #   if torch.cuda.is_available():
  #       torch.cuda.manual_seed(config["random_state"])
  #       torch.cuda.manual_seed_all(config["random_state"])
  #       torch.backends.cudnn.deterministic = True
  #       torch.backends.cudnn.benchmark = True


def generate_fold_idx(config, train_df, group_col=None):
  """Build positional train/validation indices for every CV fold.

  Args:
    config: dict with ``fold_split_type`` (one of ``kfold``, ``strat_kfold``,
      ``group_kfold``, ``time_series_split``), ``num_folds`` and, for the
      shuffled splitters, ``random_state``.
    train_df: DataFrame to split. For every splitter except ``group_kfold``
      its ``price`` column is passed as ``y``.
    group_col: name of the grouping column; required for ``group_kfold``.

  Returns:
    ``{fold_idx: {'train_idx': ndarray, 'val_idx': ndarray}}``.

  Raises:
    ValueError: if ``fold_split_type`` is unknown, or ``group_kfold`` is
      requested without a ``group_col`` that exists in ``train_df``.
  """
  split_type = config['fold_split_type']

  if split_type == 'kfold':
    splitter = KFold(n_splits=config['num_folds'], shuffle=True,
                     random_state=config['random_state'])
  elif split_type == 'strat_kfold':
    # NOTE(review): stratification uses the raw `price` values as classes;
    # confirm this is intended for a regression target.
    splitter = StratifiedKFold(n_splits=config['num_folds'], shuffle=True,
                               random_state=config['random_state'])
  elif split_type == 'group_kfold':
    # Fail loudly instead of silently returning an empty fold dict.
    if group_col is None or group_col not in train_df.columns:
      raise ValueError("fold_split_type 'group_kfold' needs a group_col present in "
                       "train_df; got group_col={!r}".format(group_col))
    splitter = GroupKFold(n_splits=config['num_folds'])
  elif split_type == 'time_series_split':  # can use purged as well.
    splitter = TimeSeriesSplit(n_splits=config['num_folds'])
  else:
    raise ValueError("fold_split_type {} not recognized... Choose from: "
                     "kfold, strat_kfold, group_kfold, time_series_split".format(split_type))

  if split_type == 'group_kfold':
    fold_iter = splitter.split(X=train_df, groups=train_df[group_col].values)
  else:
    fold_iter = splitter.split(X=train_df, y=train_df.price.values)

  fold_idx_dict = dict()
  for fold_idx, (train_idx, val_idx) in enumerate(fold_iter):
    fold_idx_dict[fold_idx] = {'train_idx': train_idx, 'val_idx': val_idx}

  return fold_idx_dict


def save_model(config, model):
  """Dump ``model`` with joblib into the local model dir, then copy it to Drive.

  The file name is ``<model_name>_fold_<curr_fold>.pkl``; directories come from
  ``config['local_model_dir']`` and ``config['drive_model_dir']``.
  Returns ``None``.
  """
  fname = '_fold_'.join([config['model_name'], str(config['curr_fold'])]) + '.pkl'
  local_path = '/'.join([config['local_model_dir'], fname])
  drive_path = '/'.join([config['drive_model_dir'], fname])

  print('Saving model...')
  joblib.dump(model, local_path)

  print('Copying model to drive...')
  shutil.copy(local_path, drive_path)
  

def load_model(config):
  """Load the joblib-pickled model for ``config['curr_fold']``.

  If the file is not yet in ``config['local_model_dir']`` it is first copied
  from ``config['drive_model_dir']``. Returns the unpickled model object.
  """
  fname = '_fold_'.join([config['model_name'], str(config['curr_fold'])]) + '.pkl'
  local_path = '/'.join([config['local_model_dir'], fname])
  drive_path = '/'.join([config['drive_model_dir'], fname])

  # Fetch from Drive only when there is no local copy.
  local_copy_missing = not os.path.exists(local_path)
  if local_copy_missing:
    shutil.copy(drive_path, local_path)

  print('Loading model...')
  return joblib.load(local_path)



def get_lgb_params(config):
  """Collect the LightGBM keyword arguments from ``config``.

  Every key below is copied through under the same name; in addition
  ``config['eval_metric']`` is exposed as ``'metric'``. Returns a new dict.
  """
  passthrough_keys = (
      # runtime
      'random_state', 'n_jobs', 'verbosity',
      # tree / boosting structure
      'boosting_type', 'max_depth', 'max_leaves', 'n_estimators',
      'early_stopping_rounds',
      # sampling
      'colsample_bytree', 'subsample',
      # regularisation / dropout
      'reg_alpha', 'reg_lambda', 'drop_rate', 'max_drop',
      # histogram / leaf constraints
      'max_bin', 'min_data_in_leaf',
      # optimisation
      'learning_rate', 'objective',
  )

  lgb_params = {key: config[key] for key in passthrough_keys}
  lgb_params['metric'] = config['eval_metric']

  return lgb_params



In [74]:

def get_data(config):
  """Copy the data files from Drive to local disk and load them.

  Files are copied from ``config['drive_data_dir']`` to
  ``config['local_data_dir']``. The competition train frame gets
  ``original_data=False``; the original dataset loses its ``'Unnamed: 0'``
  column and gets ``original_data=True``.

  Returns:
    ``(train_df, test_df, sub_df, orig_data_df)``.
  """
  def _copy_to_local(fnames):
    for fname in fnames:
      shutil.copy(os.path.join(config['drive_data_dir'], fname),
                  os.path.join(config['local_data_dir'], fname))

  def _read_local_csv(fname):
    return pd.read_csv(os.path.join(config['local_data_dir'], fname))

  _copy_to_local(['train.csv', 'test.csv', 'sample_submission.csv'])
  train_df = _read_local_csv('train.csv')
  test_df = _read_local_csv('test.csv')
  sub_df = _read_local_csv('sample_submission.csv')
  for label, frame in (("train_df", train_df), ("test_df", test_df), ("sub_df", sub_df)):
    print("Read shape: " + label + ".shape: ", frame.shape)

  train_df['original_data'] = False

  # descriptor_dict.xlsx is only copied locally; it is not read here.
  _copy_to_local(['orig_data.csv', 'descriptor_dict.xlsx'])
  orig_data_df = _read_local_csv('orig_data.csv')
  orig_data_df = orig_data_df.drop(columns=['Unnamed: 0'])
  orig_data_df['original_data'] = True

  return train_df, test_df, sub_df, orig_data_df
  


def encode_features(config, train_df, test_df):
  """Ordinal-encode ``cut``, ``color`` and ``clarity`` and add ``price_log``.

  Both frames are modified in place and returned. Values missing from a
  mapping become NaN (``Series.map`` semantics). ``price_log`` (natural log of
  ``price``) is added to ``train_df`` only. ``config`` is not used.
  """
  ordinal_maps = {
      'cut': {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5},
      'color': {'J': 1, 'I': 2, 'H': 3, 'G': 4, 'F': 5, 'E': 6, 'D': 7},
      'clarity': {'FL': 1, 'IF': 2, 'VVS1': 3, 'VVS2': 4, 'VS1': 5, 'VS2': 6,
                  'SI1': 7, 'SI2': 8, 'I1': 9, 'I2': 10, 'I3': 11},
  }

  for col, mapping in ordinal_maps.items():
    for frame in (train_df, test_df):
      frame[col] = frame[col].map(mapping)

  train_df['price_log'] = np.log(train_df['price'].to_numpy())

  return train_df, test_df


def update_feature_dtypes(config, train_df, test_df):
  """Cast every column in ``config['bool_cols']`` to ``category`` in both frames.

  The frames are modified in place and returned as ``(train_df, test_df)``.
  """
  for col in config['bool_cols']:
    for frame in (train_df, test_df):
      frame[col] = frame[col].astype('category')

  return train_df, test_df



def get_feature_cols(config, train_df):
  """Populate the column-role lists in ``config`` and return it.

  Sets ``id_cols``, ``target_cols`` (``price_log`` when
  ``config['predict_log']`` is truthy, else ``price``), an empty ``bool_cols``
  and ``feature_cols`` = every ``train_df`` column that is not an id column,
  a target column, or a column whose name starts with ``'price'``.
  """
  config['id_cols'] = ['id', 'original_data']
  config['target_cols'] = ['price_log'] if config['predict_log'] else ['price']

  price_like_cols = [name for name in train_df.columns if name.startswith('price')]

  config['bool_cols'] = []

  excluded = config['target_cols'] + price_like_cols + config['id_cols']
  config['feature_cols'] = [name for name in train_df.columns if name not in excluded]

  return config



In [75]:

def scale_data_fn(config, train_df, test_df):
  """Scale ``config['feature_cols']`` with a scaler fitted on ``train_df``.

  Args:
    config: needs ``feature_cols`` and ``scaler_type`` (``'standard'`` or
      ``'robust'``).
    train_df, test_df: frames whose feature columns are overwritten in place.

  Returns:
    ``(train_df, test_df)``.

  Raises:
    ValueError: for an unsupported ``scaler_type`` (previously this surfaced
      as an ``UnboundLocalError``).
  """
  cols_to_scale = config['feature_cols'] # MAKE SURE THESE ARE ALL CONT. FEATURES.

  scaler_type = config['scaler_type']
  if scaler_type == 'standard':
    scaler = StandardScaler()
  elif scaler_type == 'robust':
    scaler = RobustScaler()
  else:
    raise ValueError("scaler_type {} not supported. Choose from: standard, robust".format(scaler_type))

  # Fit and transform on plain arrays throughout so the scaler never sees a
  # mix of named and unnamed inputs.
  scaler.fit(train_df[cols_to_scale].values)
  train_df[cols_to_scale] = scaler.transform(train_df[cols_to_scale].values)
  test_df[cols_to_scale] = scaler.transform(test_df[cols_to_scale].values)

  return train_df, test_df
    

In [76]:

def IQR_outlier_handling(train_df, test_df, cols, handling_type):
  """Handle outliers column by column using the 1.5 * IQR rule.

  Cut-offs are computed from ``train_df`` and applied to both frames.

  Args:
    train_df, test_df: input frames.
    cols: columns to process; columns absent from ``test_df`` are only
      handled in ``train_df``.
    handling_type: ``'clip'`` clips both frames to the cut-offs;
      ``'remove_train_clip_test'`` drops outlier rows from ``train_df`` and
      clips ``test_df``. Any other value only prints the outlier counts.

  Returns:
    ``(train_df, test_df)`` after *all* columns have been processed. With
    ``'remove_train_clip_test'`` the returned ``train_df`` is a new, shorter
    frame.
  """
  for col in cols:
    # calculate interquartile range
    q25, q75 = np.percentile(train_df[col].values, [25, 75])
    iqr = q75 - q25

    # calculate the outlier cutoff
    cut_off = iqr * 1.5
    lower_cutoff, upper_cutoff = q25 - cut_off, q75 + cut_off

    is_outlier = (train_df[col] < lower_cutoff) | (train_df[col] > upper_cutoff)
    print("col: {} \t # num_outliers: {}".format(col, int(is_outlier.sum())))

    if handling_type == 'remove_train_clip_test':
      # Drop the offending rows (the boolean mask must be negated with `~`,
      # not the Python `not` operator).
      train_df = train_df.loc[~is_outlier].copy()
    elif handling_type == 'clip':
      train_df[col] = train_df[col].clip(lower=lower_cutoff, upper=upper_cutoff)
    else:
      continue

    if col in test_df.columns:
      test_df[col] = test_df[col].clip(lower=lower_cutoff, upper=upper_cutoff)

  # Return only after every column was handled.
  return train_df, test_df



def winsorize_outlier_handling(train_df, test_df, cols, lower_lim=0.01, upper_lim=0.98):
  """Winsorize ``cols`` to the ``[lower_lim, upper_lim]`` quantile range of train.

  ``lower_lim`` / ``upper_lim`` are quantile positions (e.g. 0.01 and 0.98).
  ``scipy.stats.mstats.winsorize`` expects the *fraction to cut from each
  tail*, so the upper tail fraction passed to it is ``1 - upper_lim``; passing
  ``upper_lim`` itself would flatten almost the whole column.

  ``test_df`` is clipped to the bounds obtained from ``train_df`` rather than
  winsorized on its own distribution. Columns missing from ``test_df`` are
  only handled in ``train_df``.

  Returns:
    ``(train_df, test_df)``, modified in place.

  Raises:
    ValueError: unless ``0 <= lower_lim < upper_lim <= 1``.
  """
  if not (0.0 <= lower_lim < upper_lim <= 1.0):
    raise ValueError("Expected 0 <= lower_lim < upper_lim <= 1, got "
                     "lower_lim={} upper_lim={}".format(lower_lim, upper_lim))

  # Round away float noise such as 1 - 0.9 == 0.09999999999999998, which would
  # otherwise truncate to one fewer clipped sample inside winsorize.
  upper_tail_frac = round(1.0 - upper_lim, 12)

  for col in cols:
    if len(train_df[col]) == 0:
      continue
    winsorized = np.asarray(winsorize(train_df[col].values, limits=(lower_lim, upper_tail_frac)))
    train_df[col] = winsorized
    if col in test_df.columns:
      test_df[col] = test_df[col].clip(lower=winsorized.min(), upper=winsorized.max())

  return train_df, test_df



def isolation_forest_outlier_handling(train_df, test_df, cols, outlier_thresh=-0.1, 
                                      handling_method='drop_median', seed=0):
  """Detect outlier rows with an IsolationForest fitted on ``train_df[cols]``.

  A row is an outlier when its ``decision_function`` score is below
  ``outlier_thresh``.

  Args:
    train_df, test_df: input frames.
    cols: feature columns used to fit and score the forest.
    outlier_thresh: score threshold below which a row counts as an outlier.
    handling_method: ``'drop_median'`` drops outlier rows from ``train_df`` and
      overwrites ``cols`` of outlier test rows with the (filtered) train
      median; ``'winsorize'`` winsorizes ``cols`` via
      ``winsorize_outlier_handling``. Any other value only prints the counts.
    seed: ``random_state`` for the forest.

  Returns:
    ``(train_df, test_df)``.
  """
  print("Training Isolation forest model to detect outliers...")
  iso_forest_model = IsolationForest(n_estimators=500, contamination='auto', random_state=seed)
  # The forest is unsupervised, so no target is passed. The previous
  # `train_df.MedHouseVal` argument named a column this dataset does not have.
  iso_forest_model.fit(train_df[cols])
  
  sample_scores_train = iso_forest_model.decision_function(train_df[cols])
  sample_scores_test = iso_forest_model.decision_function(test_df[cols])

  train_is_outlier = sample_scores_train < outlier_thresh
  test_is_outlier = sample_scores_test < outlier_thresh
  print("# train outliers: ", np.sum(train_is_outlier))
  print("# test outliers: ", np.sum(test_is_outlier))
  
  if handling_method == 'drop_median':
    print("Dropping outlier train samples...")
    # drop train samples and replace test sample values with median from train columns
    train_df = train_df.loc[sample_scores_train >= outlier_thresh].copy()
    
    print("Clipping outlier test samples to median value...")
    for col in cols:
      # Series.mask instead of chained `.loc` assignment, which may write to a
      # temporary copy.
      test_df[col] = test_df[col].mask(test_is_outlier, train_df[col].median())
  elif handling_method == 'winsorize':
    # NOTE(review): the previous call passed the arguments in the wrong
    # positions and could not run. Winsorizing the whole columns is the
    # closest working interpretation - confirm this is the intended behavior.
    train_df, test_df = winsorize_outlier_handling(train_df, test_df, cols, 
                                                   lower_lim=0.01, upper_lim=0.98)
    
  return train_df, test_df


def manual_outlier_handling(config, train_df, test_df):
  """Apply hand-written outlier fixes to ``train_df``.

  Currently replaces ``made == 10000`` with ``2000`` when a ``made`` column
  exists; frames without that column are returned unchanged. ``config`` and
  ``test_df`` are not used.

  Returns:
    ``(train_df, test_df)``.
  """
  if 'made' in train_df.columns:
    # `.loc` writes into train_df itself; assigning through
    # `train_df[mask].made` only modified a temporary copy.
    train_df.loc[train_df['made'] == 10000, 'made'] = 2000  # Not sure what other year to set here.

  return train_df, test_df 


def handle_outliers(config, train_df, test_df, cols=None):
  """Dispatch to the outlier handler named by ``config['outlier_handling_method']``.

  Args:
    config: needs ``outlier_handling_method`` (``manual``, ``winsorize``,
      ``iso_forest`` or ``iqr``). ``iso_forest`` takes its seed from
      ``config['seed']`` when present, otherwise ``config['random_state']``.
    train_df, test_df: input frames.
    cols: columns to process; defaults to ``config['feature_cols']``.

  Returns:
    ``(train_df, test_df)`` as returned by the selected handler.

  Raises:
    ValueError: if the method name is not recognized.
  """
  if cols is None:
    cols = config['feature_cols']

  method = config['outlier_handling_method']
  if method == 'manual':
    # manual_outlier_handling takes (config, train_df, test_df), not `cols`.
    train_df, test_df = manual_outlier_handling(config, train_df, test_df)
  elif method == 'winsorize':
    train_df, test_df = winsorize_outlier_handling(train_df, test_df, cols=cols)
  elif method == 'iso_forest':
    # The rest of the file stores the seed under 'random_state'.
    seed = config['seed'] if 'seed' in config else config['random_state']
    train_df, test_df = isolation_forest_outlier_handling(train_df, test_df, cols=cols, 
                                  outlier_thresh=-0.1, handling_method='drop_median', seed=seed)
  elif method == 'iqr':
    train_df, test_df = IQR_outlier_handling(train_df, test_df, cols, handling_type='clip')
  else:
    raise ValueError("outlier_handling_method {} not supported. Choose from: "
                     "manual, winsorize, iso_forest, iqr".format(method))

  return train_df, test_df


In [77]:


def get_config():
  """Return the ``config`` object.

  ``config`` is not defined inside this function, so it is looked up as a
  global name at call time.
  NOTE(review): presumably assigned in another notebook cell - confirm it is
  defined before this is called.
  """
  return config


def save_config(config):
  """Write the JSON-friendly part of ``config`` to ``saved_config.json``.

  Only entries whose value is a bool, int, float, str, list, dict or ``None``
  are kept, to avoid types that cause problems when reading the file back.
  ``None`` is kept on purpose: settings such as ``feature_version=None`` are
  meaningful and must survive the round trip.

  The file is written to ``config['local_model_dir']`` and then copied to
  ``config['drive_model_dir']``. Returns ``None``.
  """
  serializable_types = (bool, int, float, str, list, dict, type(None))
  config_to_save = {k: v for k, v in config.items() if isinstance(v, serializable_types)}
  
  config_local_save_path = os.path.join(config['local_model_dir'], 'saved_config.json')
  config_drive_save_path = os.path.join(config['drive_model_dir'], 'saved_config.json')
  
  with open(config_local_save_path, 'w') as fp:
    json.dump(config_to_save, fp, indent=4, sort_keys=True)
  
  shutil.copy(config_local_save_path, config_drive_save_path)


def get_model_config(config):
  """Fetch ``saved_config.json`` from the Drive model dir and parse it.

  The file is always copied from ``config['drive_model_dir']`` to
  ``config['local_model_dir']`` (overwriting any local copy) and read from
  there. Returns the decoded JSON object.
  """
  config_fname = 'saved_config.json'
  local_path = os.path.join(config['local_model_dir'], config_fname)
  drive_path = os.path.join(config['drive_model_dir'], config_fname)

  shutil.copy(drive_path, local_path)

  with open(local_path, 'r') as fp:
    return json.load(fp)
  

def across_col_feat_v1(config, df):
  '''
  Add cross-column geometric features to ``df`` in place and return it.

  https://www.kaggle.com/competitions/playground-series-s3e8/discussion/389472
  Features suggested by chatGPT.

  New columns: volume, density, table_percentage, depth_percentage, symmetry,
  surface_area, depth_to_table_ratio. ``config`` is not used.
  '''
  x, y, z = df['x'], df['y'], df['z']
  mean_xy = (x + y) / 2

  df['volume'] = x * y * z
  df['density'] = df['carat'] / df['volume']
  df['table_percentage'] = (df['table'] / mean_xy) * 100
  df['depth_percentage'] = (df['depth'] / mean_xy) * 100
  df['symmetry'] = ((x - z).abs() + (y - z).abs()) / (x + y + z)
  df['surface_area'] = 2 * ((x * y) + (x * z) + (y * z))
  df['depth_to_table_ratio'] = df['depth'] / df['table']

  return df


def extract_features(config, train_df, test_df):
  '''
  Add engineered features according to ``config['feature_version']``.

  Keep it minimal: the original data set is imaginary.
  We can best aim for finding the logic the dataset author may have used to put in price values.

  ``None`` leaves both frames untouched; ``'v1'`` applies
  ``across_col_feat_v1`` to both. Any other value prints a message naming the
  unsupported version and returns the frames unchanged.

  Returns ``(train_df, test_df)``.
  '''
  feature_version = config['feature_version']
  if feature_version is None:
    pass
  elif feature_version == 'v1':
    train_df = across_col_feat_v1(config, train_df)
    test_df = across_col_feat_v1(config, test_df)
  else:
    # The placeholder is now actually filled in with the offending value.
    print("Feature version {} not supported. Choose from : None, v1".format(feature_version))
  
  return train_df, test_df
    

# consider prior distribution of labels?
def find_optimal_cuts():
  """Placeholder that is not implemented yet; always returns ``None``."""
  return None


def preprocess(config, train_df, test_df):
  """Clean zero dimensions and missing depth, then validate ``train_df``.

  * Zeros in ``x``, ``y`` and ``z`` are replaced by ``1e-2``, each column from
    its own values (``y`` used to be overwritten with ``z``).
  * Missing ``depth`` is filled with the median of ``train_df['depth']``.
  * The same replacements are applied to ``test_df`` for the columns it has,
    so that ratio features built from these columns do not divide by zero.
    NOTE(review): test_df was previously left untouched - confirm this is wanted.
  * Every ``train_df`` column except ``id`` is then checked for NaN / Inf.

  ``config`` is not used.

  Returns:
    ``(train_df, test_df)``, modified in place.

  Raises:
    RuntimeError: if NaN or Inf values remain in ``train_df`` (same exception
      type as the previous bare ``raise``, now with a message).
  """
  val = 1e-2
  depth_median = train_df['depth'].median()

  frames = [train_df] if test_df is None else [train_df, test_df]
  for frame in frames:
    for dim_col in ('x', 'y', 'z'):
      if dim_col in frame.columns:
        frame[dim_col] = frame[dim_col].replace(0, val)
    if 'depth' in frame.columns:
      # Plain assignment instead of chained `fillna(inplace=True)`, which may
      # operate on a temporary copy.
      frame['depth'] = frame['depth'].fillna(depth_median)

  bad_cols = []
  for col in train_df.columns:
    if col in ['id']:
      continue
    nan_cnt = int(train_df[col].isna().sum())
    # Inf can only occur in numeric columns.
    if pd.api.types.is_numeric_dtype(train_df[col]):
      inf_cnt = int(np.isinf(train_df[col].to_numpy()).sum())
    else:
      inf_cnt = 0
    if nan_cnt > 0:
      print("col: {} has {} nans...".format(col, nan_cnt))
    if inf_cnt > 0:
      print("col: {} has {} infs...".format(col, inf_cnt))
    if nan_cnt > 0 or inf_cnt > 0:
      bad_cols.append(col)

  if bad_cols:
    print("Nans/Infs detected...")
    raise RuntimeError("Nans/Infs detected in train_df columns: {}".format(bad_cols))

  return train_df, test_df


In [78]:

def train_k_folds():
  """Run the K-fold LightGBM training pipeline end to end.

  Steps, in order: create paths, load data, build fold indices (optionally
  appending the original dataset to the training data), encode / preprocess /
  extract features, optionally handle outliers and scale, then for every fold
  in ``config['folds_to_train']`` train an ``lgb.LGBMRegressor``, save it and
  compute MSE / RMSE against ``price`` (predictions are exponentiated first
  when ``config['predict_log']`` is set). Fold-averaged RMSE is logged to
  Weights & Biases.

  Takes no arguments (the configuration comes from ``get_config()``) and
  returns ``None``.
  """
  # needed because a variable that is updated within a function becomes a local variable and has to be passed in.
  config = get_config()
  
  create_paths(config)
  train_df, test_df, sub_df, orig_data_df = get_data(config)
  print("train_df.shape: ", train_df.shape)
  print("test_df.shape: ", test_df.shape)
  print("sub_df.shape: ", sub_df.shape)
  print("orig_data_df.shape: ", orig_data_df.shape)
  
  # ---------------

  if config['include_orig_data']:
    if config['validate_only_comp_data']:
      # Folds are generated on competition rows only; the original data is
      # appended afterwards so it can only ever land in the training part.
      fold_idx_dict = generate_fold_idx(config, train_df)
      train_df = pd.concat((train_df, orig_data_df), axis=0)
      print("After appending orig data to train data: ")
      print("train_df.shape: ", train_df.shape)
    else:
      train_df = pd.concat((train_df, orig_data_df), axis=0)
      print("After appending orig data to train data: ")
      print("train_df.shape: ", train_df.shape)
      fold_idx_dict = generate_fold_idx(config, train_df)
  else:
    fold_idx_dict = generate_fold_idx(config, train_df)
  
  # Training rows of a fold = every (positional) row that is not a validation row.
  all_idx = np.arange(0, train_df.shape[0])
  for fold_num in fold_idx_dict.keys():
    val_idx = fold_idx_dict[fold_num]['val_idx']
    fold_idx_dict[fold_num]['train_idx'] = np.setdiff1d(all_idx, val_idx)
  
  # Check for the 'id' column by name (the builtin `id` function was tested before).
  print('id in train_df.columns: ', 'id' in train_df.columns)
  # ---------------

  config = get_feature_cols(config, train_df)
  train_df, test_df = encode_features(config, train_df, test_df)
  print("After feature encoding: train_df.shape: ", train_df.shape)
  print("After feature encoding: test_df.shape: ", test_df.shape)
  
  # ---------------
  
  train_df, test_df = preprocess(config, train_df, test_df)
  print("After preprocessing: train_df.shape: ", train_df.shape)
  print("After preprocessing: test_df.shape: ", test_df.shape)

  # ---------------
  
  train_df, test_df = extract_features(config, train_df, test_df)
  print("After feature extraction: train_df.shape: ", train_df.shape)
  print("After feature extraction: test_df.shape: ", test_df.shape)
  
  # ---------------
  
  if config['enable_categorical']:
    train_df, test_df = update_feature_dtypes(config, train_df, test_df)
  
  # ---------------
  
  # Recompute feature columns now that engineered features exist.
  config = get_feature_cols(config, train_df)
  print("config['feature_cols']): ", config['feature_cols'])
  print("# feature_cols: ", len(config['feature_cols']))

  # ---------------
  
  if config['handle_outliers']:
    print("Before outlier handling: ")
    print(f"train_df.shape: {train_df.shape} \t test_df.shape: {test_df.shape}")
    # NOTE(review): fold indices above are positional; a handler that drops
    # train rows would invalidate them - confirm before using such a handler.
    train_df, test_df = handle_outliers(config, train_df, test_df, cols=config['feature_cols'])
    print("After outlier handling: ")
    print(f"train_df.shape: {train_df.shape} \t test_df.shape: {test_df.shape}")
  
  # ---------------
  
  if config['scale_data']:
    train_df, test_df = scale_data_fn(config, train_df, test_df)
  
  # ------------------
  
  per_model_metrics = {
                        'loss': {'train': [], 'val': []},
                        'rmse': {'train': [], 'val': []},
                      }
  
  # Keep a snapshot of the notebook and of the config next to the saved models.
  shutil.copy('/content/drive/MyDrive/Playground Series/S03_E08/code/play_s03e08_model_lgb_regr_1.ipynb', 
              os.path.join(config['drive_model_dir'], 'play_s03e08_model_lgb_regr_1.ipynb'))
  save_config(config)

  wandb.init(name=config['model_name'], project=config['project_name'], 
            tags=['baseline'], config=config)
  if config['choice'] == 3:
    print("Updating sweep configs...")
    for k, v in wandb.config.items():
      config[k] = v
    print("*** Updated sweep config: ", config)
    
  for fold_num in range(config['num_folds']):
    if fold_num not in config['folds_to_train']:
      continue

    print("Training fold: ", fold_num)
    config['curr_fold'] = fold_num

    set_seeds(config)
    
    # -----------
    
    train_idx = fold_idx_dict[fold_num]['train_idx']
    val_idx = fold_idx_dict[fold_num]['val_idx']
    print("len(train_idx): {} \t len(val_idx): {}".format(len(train_idx), len(val_idx)))
    
    train_data = train_df[config['feature_cols']].iloc[train_idx]
    train_label = train_df[config['target_cols']].iloc[train_idx] 
    if config['predict_log']:
      train_label_price = train_df['price'].iloc[train_idx]  # since train label can be price or price_log

    val_data = train_df[config['feature_cols']].iloc[val_idx]
    val_label = train_df[config['target_cols']].iloc[val_idx]
    if config['predict_log']:
      val_label_price = train_df['price'].iloc[val_idx]

    lgb_params = get_lgb_params(config)

    model = lgb.LGBMRegressor(**lgb_params)

    print("Training model ...")
    model.fit(train_data, train_label, 
              eval_set=[(train_data, train_label), 
                        (val_data, val_label)], 
              verbose=50, 
              )
    
    print("Saving Model...")
    save_model(config, model)

    # -----------
    print("model.best_iteration_: ", model.best_iteration_)
    print("model.best_score_: ", model.best_score_)
    print("best rmse score: ", model.best_score_['valid_1']['rmse'])
    # np.argmin is a 0-based position in the per-iteration history, whereas
    # `num_iteration` is a *count* of iterations, hence the +1. (Without it
    # one tree too few is used, and position 0 would mean "use all trees".)
    max_iter = int(np.argmin(model.evals_result_['valid_1']['rmse'])) + 1
    
    # ------------------

    train_preds = model.predict(train_data, num_iteration=max_iter)
    val_preds = model.predict(val_data, num_iteration=max_iter)
    
    # ------------------

    # Metrics are always computed against `price`; log-space predictions are
    # mapped back with exp first.
    if config['predict_log']:
      train_true, train_pred = train_label_price.values, np.exp(train_preds)
      val_true, val_pred = val_label_price.values, np.exp(val_preds)
    else:
      train_true, train_pred = train_label.values, train_preds
      val_true, val_pred = val_label.values, val_preds

    train_mse = mean_squared_error(train_true, train_pred)
    val_mse = mean_squared_error(val_true, val_pred)

    # sqrt(MSE) rather than `squared=False`, which newer scikit-learn releases
    # no longer accept.
    train_rmse = float(np.sqrt(train_mse))
    val_rmse = float(np.sqrt(val_mse))
      
    # ------------------

    per_model_metrics['loss']['train'].append(train_mse)
    per_model_metrics['loss']['val'].append(val_mse)
    per_model_metrics['rmse']['train'].append(train_rmse)
    per_model_metrics['rmse']['val'].append(val_rmse)
    
    print(f"MSE Loss: Train: {train_mse} \t Val: {val_mse}")
    print(f"RMSE: Train: {train_rmse} \t Val: {val_rmse}")

    if config['choice'] == 1:
      plt.figure(figsize=(12, 10))
      lgb.plot_importance(model)
      plt.title(f"Fold: {fold_num}")
      plt.show()
      plt.close()

    print('-'*30)
  
  print("Fold average stats.: ")
  print(f"RMSE: Train: {np.mean(per_model_metrics['rmse']['train'])} \t Val: {np.mean(per_model_metrics['rmse']['val'])}")
  print(f"Loss: Train: {np.mean(per_model_metrics['loss']['train'])} \t Val: {np.mean(per_model_metrics['loss']['val'])}")
  print("per fold VAL RMSEs: ", per_model_metrics['rmse']['val'])

  wandb.log({
        "Fold average train rmse": np.mean(per_model_metrics['rmse']['train']), 
        "Fold average val rmse": np.mean(per_model_metrics['rmse']['val']),
        }
      )
  




In [79]:

def create_paths(config):
  """Fill ``config`` with the local/Drive directory paths and create them.

  Reads ``config['drive_project_dir']`` and ``config['model_name']``. Sets
  ``local_model_dir`` / ``drive_model_dir`` (both ending in the model name),
  ``local_data_dir`` / ``drive_data_dir`` and ``local_feature_dir`` /
  ``drive_feature_dir``. The model dirs and the local data / feature dirs are
  created when missing; the Drive data and feature dirs are not.
  Returns ``None``.
  """
  def _ensure_dir(path):
    # Single-level mkdir, only when the path does not exist yet.
    if not os.path.exists(path):
      os.mkdir(path)

  # Model store roots.
  config['local_model_dir'] = '/content/model_store'
  config['drive_model_dir'] = os.path.join(config['drive_project_dir'], 'model_store')
  _ensure_dir(config['local_model_dir'])
  _ensure_dir(config['drive_model_dir'])

  # Per-model sub-directories inside the roots.
  config['local_model_dir'] = os.path.join(config['local_model_dir'], config['model_name'])
  config['drive_model_dir'] = os.path.join(config['drive_model_dir'], config['model_name'])
  _ensure_dir(config['local_model_dir'])
  _ensure_dir(config['drive_model_dir'])

  # Data and feature stores.
  config['local_data_dir'] = '/content/data'
  config['drive_data_dir'] = os.path.join(config['drive_project_dir'], 'data/')
  config['local_feature_dir'] = '/content/feature_store'
  config['drive_feature_dir'] = os.path.join(config['drive_project_dir'], 'feature_store')
  _ensure_dir(config['local_data_dir'])
  _ensure_dir(config['local_feature_dir'])



In [80]:

def aggregate_preds(config, per_fold_test_preds, per_model_metrics=None): 
  """Combine per-fold predictions into one prediction per sample.

  Args:
    config: needs ``aggr_type``: ``'simple'`` (plain mean),
      ``'rmse_weighted'`` or ``'loss_weighted'`` (mean weighted by the inverse
      of each fold's validation RMSE / loss).
    per_fold_test_preds: array-like reduced along ``axis=1``.
      NOTE(review): presumably shaped (n_samples, n_folds) - confirm with caller.
    per_model_metrics: ``{'rmse': {'val': [...]}, 'loss': {'val': [...]}}``;
      required for the weighted modes.

  Returns:
    Flattened ndarray of aggregated predictions.

  Raises:
    RuntimeError: for an unsupported ``aggr_type`` (same exception type as the
      previous bare ``raise``, now carrying a message).
    TypeError: if a weighted mode is requested without ``per_model_metrics``.
  """
  aggr_type = config['aggr_type']

  if aggr_type == 'simple':
    return np.mean(per_fold_test_preds, axis=1).flatten()

  metric_for_weights = {'rmse_weighted': 'rmse', 'loss_weighted': 'loss'}
  if aggr_type not in metric_for_weights:
    raise RuntimeError("aggr_type: {} not supported...".format(aggr_type))

  if per_model_metrics is None:
    raise TypeError("per_model_metrics is required for aggr_type: {}".format(aggr_type))

  # Folds with a lower validation error get a proportionally larger weight.
  fold_weights = 1/np.array(per_model_metrics[metric_for_weights[aggr_type]]['val'])
  aggr_test_preds_cont = np.average(per_fold_test_preds, axis=1, weights=fold_weights).flatten()

  return aggr_test_preds_cont



def test_model(config):
  """Score the saved fold models and write the aggregated test submission.

  Rebuilds the data pipeline (feature encoding, preprocessing, feature extraction and
  the optional categorical-dtype / outlier-handling / scaling steps). Then, for every
  fold listed in config['folds_to_train'], it loads that fold's model via load_model,
  reports train/val MSE and RMSE and collects its test predictions. The per-fold test
  predictions are combined with aggregate_preds and written as 'sample_submission.csv'
  to config['local_model_dir'], with a copy placed in config['drive_model_dir'].

  Args:
    config: run configuration dict. It is updated in place: create_paths adds the
      directory keys, pipeline settings returned by get_model_config overwrite the
      current values, and 'curr_fold' is set for each evaluated fold.

  Raises:
    ValueError: if no fold in range(config['num_folds']) is listed in
      config['folds_to_train'].
  """
  create_paths(config)

  train_df, test_df, sub_df, orig_data_df = get_data(config)
  print("train_df.shape: ", train_df.shape)
  print("test_df.shape: ", test_df.shape)
  print("sub_df.shape: ", sub_df.shape)
  print("orig_data_df.shape: ", orig_data_df.shape)
  test_ids = test_df.id.values

  # ------------------------------
  # Pipeline settings stored with the trained model take precedence over `config`.
  model_training_config = get_model_config(config)
  print("model_training_config: ", model_training_config)
  for key in ['scale_data', 'scaler_type', 'handle_outliers', 'predict_log',
              'outlier_handling_method', 'feature_version', 'include_orig_data',
              'fold_split_type', 'num_folds', 'random_state', 'validate_only_comp_data',
              'enable_categorical']:
    if key in model_training_config:
      config[key] = model_training_config[key]

  # ---------------

  if config['include_orig_data']:
    if config['validate_only_comp_data']:
      # Folds are generated before the original data is appended, so validation
      # indices only ever point at competition rows.
      fold_idx_dict = generate_fold_idx(config, train_df)
      train_df = pd.concat((train_df, orig_data_df), axis=0)
      print("After appending orig data to train data: ")
      print("train_df.shape: ", train_df.shape)
    else:
      train_df = pd.concat((train_df, orig_data_df), axis=0)
      print("After appending orig data to train data: ")
      print("train_df.shape: ", train_df.shape)
      fold_idx_dict = generate_fold_idx(config, train_df)
  else:
    fold_idx_dict = generate_fold_idx(config, train_df)

  # Every row position that is not in a fold's validation set is a training row for it.
  all_idx = np.arange(0, train_df.shape[0])
  for fold_num in fold_idx_dict.keys():
    val_idx = fold_idx_dict[fold_num]['val_idx']
    fold_idx_dict[fold_num]['train_idx'] = np.setdiff1d(all_idx, val_idx)

  # ------------------------------

  config = get_feature_cols(config, train_df)
  train_df, test_df = encode_features(config, train_df, test_df)
  print("After feature encoding: train_df.shape: ", train_df.shape)
  print("After feature encoding: test_df.shape: ", test_df.shape)

  # ---------------
  train_df, test_df = preprocess(config, train_df, test_df)
  print("After preprocessing 1: train_df.shape: ", train_df.shape)
  print("After preprocessing 2: test_df.shape: ", test_df.shape)

  # ---------------

  print("Before feature extraction: train_df.shape: ", train_df.shape)
  print("Before feature extraction: test_df.shape: ", test_df.shape)
  train_df, test_df = extract_features(config, train_df, test_df)
  print("After feature extraction: train_df.shape: ", train_df.shape)
  print("After feature extraction: test_df.shape: ", test_df.shape)

  # ---------------

  if config['enable_categorical']:
    train_df, test_df = update_feature_dtypes(config, train_df, test_df)

  # ---------------

  # Refresh the feature list now that feature extraction has run.
  config = get_feature_cols(config, train_df)
  print("# feature_cols: ", len(config['feature_cols']))

  # ------------------

  if config['handle_outliers']:
    print("Before outlier handling: ")
    print(f"train_df.shape: {train_df.shape} \t test_df.shape: {test_df.shape}")
    train_df, test_df = handle_outliers(config, train_df, test_df)
    print("After outlier handling: ")
    print(f"train_df.shape: {train_df.shape} \t test_df.shape: {test_df.shape}")

  if config['scale_data']:
    print("Scaling data...")
    train_df, test_df = scale_data_fn(config, train_df, test_df)

  # ------------------

  per_model_metrics = {
      'loss': {'train': [], 'val': []},
      'rmse': {'train': [], 'val': []},
  }
  fold_test_preds = []  # one (n_test, 1) column per evaluated fold

  for fold_num in range(config['num_folds']):
    if fold_num not in config['folds_to_train']:
      continue
    print("Training fold: ", fold_num)
    config['curr_fold'] = fold_num

    # -----------
    train_idx = fold_idx_dict[fold_num]['train_idx']
    val_idx = fold_idx_dict[fold_num]['val_idx']
    print("len(train_idx): {} \t len(val_idx): {}".format(len(train_idx), len(val_idx)))

    train_data = train_df[config['feature_cols']].iloc[train_idx]
    val_data = train_df[config['feature_cols']].iloc[val_idx]
    test_data = test_df[config['feature_cols']]

    # --------

    model = load_model(config)

    # evals_result_ holds one score per boosting round, so the 0-based argmin index i
    # corresponds to a model made of i + 1 rounds. Passing the raw index would drop the
    # best round, and an index of 0 would make LightGBM fall back to all rounds.
    best_iter = int(np.argmin(model.evals_result_['valid_1']['rmse'])) + 1

    # LightGBM's keyword is `num_iteration`; the XGBoost-style `ntree_limit` is not a
    # LightGBM prediction argument, so all three splits now use the same `num_iteration`.
    train_preds = model.predict(train_data, num_iteration=best_iter)
    val_preds = model.predict(val_data, num_iteration=best_iter)
    test_preds = model.predict(test_data, num_iteration=best_iter)

    if config['predict_log']:
      # The model predicts log(price): score and submit on the price scale.
      train_true = train_df['price'].iloc[train_idx].values
      val_true = train_df['price'].iloc[val_idx].values
      train_preds = np.exp(train_preds)
      val_preds = np.exp(val_preds)
      test_preds = np.exp(test_preds)
    else:
      train_true = train_df[config['target_cols']].iloc[train_idx].values
      val_true = train_df[config['target_cols']].iloc[val_idx].values

    # ------------------

    train_mse = mean_squared_error(train_true, train_preds)
    val_mse = mean_squared_error(val_true, val_preds)
    train_rmse = mean_squared_error(train_true, train_preds, squared=False)
    val_rmse = mean_squared_error(val_true, val_preds, squared=False)

    per_model_metrics['loss']['train'].append(train_mse)
    per_model_metrics['loss']['val'].append(val_mse)
    per_model_metrics['rmse']['train'].append(train_rmse)
    per_model_metrics['rmse']['val'].append(val_rmse)

    print(f"MSE: Train: {train_mse} \t Val: {val_mse}")
    print(f"RMSE: Train: {train_rmse} \t Val: {val_rmse}")

    fold_test_preds.append(np.reshape(test_preds, (test_preds.shape[0], 1)))

  if not fold_test_preds:
    raise ValueError("No folds were evaluated: config['folds_to_train'] selects no fold "
                     "in range(config['num_folds']).")

  print("Fold average stats.: ")
  print(f"RMSE: Train: {np.mean(per_model_metrics['rmse']['train'])} \t Val: {np.mean(per_model_metrics['rmse']['val'])}")
  print(f"MSE Loss: Train: {np.mean(per_model_metrics['loss']['train'])} \t Val: {np.mean(per_model_metrics['loss']['val'])}")

  # ------------------

  per_fold_test_preds = np.concatenate(fold_test_preds, axis=1)
  print("per_fold_test_preds.shape: ", per_fold_test_preds.shape)

  test_preds_aggr = aggregate_preds(config, per_fold_test_preds, per_model_metrics)
  print("test_preds_aggr[:10]: ", test_preds_aggr[:10])

  sub_df = pd.DataFrame({'id': test_ids, 'price': test_preds_aggr})
  local_sub_path = os.path.join(config['local_model_dir'], 'sample_submission.csv')
  sub_df.to_csv(local_sub_path, index=False)
  shutil.copy(local_sub_path,
              os.path.join(config['drive_model_dir'], 'sample_submission.csv'))


# submission_{notebook_name}_{date}_{time}.csv


In [81]:

def model_analysis():
  """Plot the feature importance of every saved fold model.

  Loads the configuration with get_config, prepares the paths and feature columns, then
  for each fold listed in config['folds_to_train'] loads the fold's model via load_model
  and shows its LightGBM feature-importance plot.
  """
  config = get_config()
  create_paths(config)

  # get_data returns four frames (train, test, submission, original data), as unpacked
  # in test_model; only train and test are needed here.
  train_df, test_df, _, _ = get_data(config)
  print("train_df.shape: ", train_df.shape)

  train_df, test_df = encode_features(config, train_df, test_df)

  config = get_feature_cols(config, train_df)

  for fold_num in range(config['num_folds']):
    if fold_num not in config['folds_to_train']:
      continue
    config['curr_fold'] = fold_num

    model = load_model(config)

    # The models are LightGBM models, so use lgb.plot_importance (the `xgb` alias is
    # not imported in this notebook) and draw on an explicitly sized axes.
    fig, ax = plt.subplots(figsize=(12, 10))
    lgb.plot_importance(model, ax=ax)
    ax.set_title(f"Fold: {fold_num}")
    plt.show()
    plt.close(fig)


In [84]:

# Run configuration shared by the driver code in this cell.
# 'choice' selects the action: 1 -> train_k_folds(), 2 -> test_model(config),
# 3 -> wandb sweep over train_k_folds, 4 -> model_analysis().
config = {
    'choice': 2, 
    'random_state': 21, 
    'aggr_type': 'rmse_weighted',  # simple / rmse_weighted / loss_weighted (see aggregate_preds)
    
    'model_name': 'lgb_model_1', 
    
    'include_orig_data': True, 

    'feature_version': 'v1',  # None, v1, v2, v3
    
    'predict_log': False, # train the model to predict log(price), since price has a heavy-tailed distribution
 
    'handle_outliers': False, 
    'outlier_handling_method': 'winsorize', 
    
    'scale_data': False, 
    'scaler_type': 'standard',  # standard / robust

    'enable_categorical': False, 

    'fold_split_type': 'kfold',  # kfold, strat_kfold
    'num_folds': 10, 
    'folds_to_train': [0,1,2,3,4,5,6,7,8,9], # folds to process; folds not listed here are skipped
    'validate_only_comp_data': True, # when True, folds are generated before the original data is appended
    
    'boosting_type': 'gbdt',  # gbdt, rf, dart
    'drop_rate': 0.1, 
    'max_drop': 50, 

    'n_estimators': 9999, 
    'early_stopping_rounds': 200, 

    'colsample_bytree': 1.0, 
    'subsample': 0.8,  
    
    'max_depth': 8, 
    'max_leaves': 64,   
    'learning_rate': 0.05, 

    'reg_alpha': 0.0,  # Default: 0
    'reg_lambda': 1.0,  # Default: 0
    
    'max_bin': 256,  # Default: 256
    'min_data_in_leaf': 1, # Default: 1
    'gamma': 0,   # default: 0

    # if 'auto'; will be overridden as sum(negative instances) / sum(positive instances). 
    # Else; provided value will be used.
    'scale_pos_weight': 'auto',  # 'auto' / 10 / 25 etc.

    'verbosity': -1,

    'objective': 'rmse', 
    'eval_metric': 'rmse',  
    
    'use_gpu_if_available': True, 
    'predictor': 'gpu_predictor',
    'use_wandb': False, # Forced to True below when choice == 3.
    'n_jobs': -1, 
    'data_dir': '/content/data/', 
    'drive_project_dir': '/content/drive/MyDrive/Playground Series/S03_E08', 
    'project_name': 'playground_s03_e08', 
}



# if config['use_gpu_if_available']:
#   if torch.cuda.is_available():
#     config['gpu_id'] = 0
#     config['tree_method'] = 'gpu_hist'
#     config['predictor'] = 'gpu_predictor'
#     print("GPU available... XGBoost will use GPU...")
#   else:
#     print("NOT USING GPU!!!!!! Parameter 'use_gpu_if_available' is set to TRUE; But NO GPU IS VISIBLE!!!!!")
#     if config['tree_method'] == 'gpu_hist': config['tree_method'] = 'hist'
#     if config['predictor'] == 'gpu_predictor': config['predictor'] = 'cpu_predictor'
# else:
#   if config['tree_method'] == 'gpu_hist': config['tree_method'] = 'hist'
#   if config['predictor'] == 'gpu_predictor': config['predictor'] = 'cpu_predictor' 
#   print("NOT USING GPU!!!!!! Parameter 'use_gpu_if_available' is set to False!!!!!!!")    


# Sweeps (choice 3) always report to wandb.
if config['choice'] == 3: config['use_wandb'] = True

if config['use_wandb']:
  os.environ['WANDB_MODE'] = 'online'
  # Credentials must never be hard-coded in the notebook. wandb.login() reads the API
  # key from the WANDB_API_KEY environment variable (or a previously stored login) and
  # otherwise prompts for it.
  wandb.login()
else:
  os.environ['WANDB_MODE'] = 'offline'

set_seeds(config)

# Run the action selected by config['choice'].
if config['choice'] == 1:
  train_k_folds()
elif config['choice'] == 2:
  test_model(config)
elif config['choice'] == 3:
  # Bayesian hyper-parameter search that minimizes the fold-averaged validation RMSE.
  sweep_configs = {
      "method": "bayes",
      "metric": {"name": "Fold average val rmse", "goal": "minimize"},
      "parameters": {
          "colsample_bytree": {"values": [0.6, 0.8, 1.0]},
          "subsample": {"values": [0.6, 0.8, 1.0]},
          "max_depth": {"values": [5, 8, 12, 24]},
          "max_leaves": {"values": [32, 64, 128, 256]},
          "reg_lambda": {"values": [1.0, 2.0, 5.0]},
          # Currently excluded from the search:
          #   "reg_alpha":        {"values": [0, 1.0, 2.0, 5.0]}
          #   "learning_rate":    {"values": [0.01, 0.05, 0.1, 0.3, 0.5]}
          #   "random_state":     {"values": [0.01, 0.05, 0.1, 0.3, 0.5]}
          #   "max_bin":          {"values": [64, 128, 256, 512, 1024]}
          #   "min_data_in_leaf": {"values": [1, 32, 64, 128, 256]}
      },
  }
  print("Running sweep>>>>>>>>>>>>>>>>>>>>>>>>")
  sweep_id = wandb.sweep(sweep=sweep_configs, project=config['project_name']+'_sweep')
  wandb.agent(sweep_id=sweep_id, function=train_k_folds, count=50)
elif config['choice'] == 4:
  model_analysis()
else:
  raise ValueError(f"Incorrect value for 'choice'={config['choice']} in config")



Read shape: train_df.shape:  (193573, 11)
Read shape: test_df.shape:  (129050, 10)
Read shape: sub_df.shape:  (129050, 2)
train_df.shape:  (193573, 12)
test_df.shape:  (129050, 10)
sub_df.shape:  (129050, 2)
orig_data_df.shape:  (26967, 11)
model_training_config:  {'aggr_type': 'rmse_weighted', 'bool_cols': [], 'boosting_type': 'gbdt', 'choice': 1, 'colsample_bytree': 1.0, 'data_dir': '/content/data/', 'drive_data_dir': '/content/drive/MyDrive/Playground Series/S03_E08/data/', 'drive_feature_dir': '/content/drive/MyDrive/Playground Series/S03_E08/feature_store', 'drive_model_dir': '/content/drive/MyDrive/Playground Series/S03_E08/model_store/lgb_model_1', 'drive_project_dir': '/content/drive/MyDrive/Playground Series/S03_E08', 'drop_rate': 0.1, 'early_stopping_rounds': 200, 'enable_categorical': False, 'eval_metric': 'rmse', 'feature_cols': ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'volume', 'density', 'table_percentage', 'depth_percentage', 'symmetry', 'sur

In [83]:
# NOTE(review): a bare `raise` with no active exception fails with a RuntimeError.
# It looks like a deliberate barrier that stops "Run all" before the scratch cells
# below -- confirm, and consider removing this cell from the final notebook.
raise

RuntimeError: ignored

In [None]:

'''

Original dataset: 
https://www.kaggle.com/datasets/colearninglounge/gemstone-price-prediction


Problem Statement: 
You are hired by a company Gem Stones co ltd, which is a cubic zirconia manufacturer. 
You are provided with the dataset containing the prices and other attributes of almost 27,000 cubic zirconia 
(which is an inexpensive diamond alternative with many of the same qualities as a diamond). 
The company is earning different profits on different price slots. You have to help the company in predicting 
the price for the stone on the basis of the details given in the dataset so it can distinguish between higher 
profitable stones and lower profitable stones so as to have a better profit share. 
Also, provide them with the best 5 attributes that are most important.


Attributes:
Carat:   Carat weight of the cubic zirconia.
Cut:     Describe the cut quality of the cubic zirconia.
         Quality is increasing order Fair, Good, Very Good, Premium, Ideal.
Color:   Colour of the cubic zirconia, with D being the best and J the worst.
Clarity: Cubic zirconia Clarity refers to the absence of the Inclusions and Blemishes.
         (In order from Best to Worst, FL = flawless, I3= level 3 inclusions) 
         FL, IF, VVS1, VVS2, VS1, VS2, SI1, SI2, I1, I2, I3
Depth:   The Height of a cubic zirconia, measured from the Culet to the table, divided by its average Girdle Diameter.
Table:   The Width of the cubic zirconia's Table expressed as a Percentage of its Average Diameter.
Price:   The Price of the cubic zirconia.
X:       Length of the cubic zirconia in mm.
Y:       Width of the cubic zirconia in mm.
Z:       Height of the cubic zirconia in mm.


'''



In [None]:

# 
# 
# Carat: ordinal, continuous. Do not encode for now.
# Cut: ordinal, discrete. Encode with dict: {Fair: 1, Good: 2, Very Good: 3, Premium: 4, Ideal: 5}.
# Color: ordinal, discrete. Encode with dict: {J: 1, I: 2, H: 3, G: 4, F: 5, E: 6, D: 7}
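# Clarity: ordinal, discrete. 
#           Encode with dict: {FL: 1, IF: 2, VVS1: 3, VVS2: 4, VS1: 5, VS2: 6, SI1: 7, SI2: 8, I1: 9, I2: 10, I3: 11}
#           Note: this numbers clarity from best (FL = 1) to worst (I3 = 11), i.e. in the opposite
#           direction to the Cut and Color encodings, where the worst grade is 1.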
# Depth: ordinal, continuous. Do not encode for now.
# Table: ordinal, continuous. Do not encode for now.
# Price: ordinal, continuous. Do not encode for now.
# X: ordinal, continuous. Do not encode for now.
# Y: ordinal, continuous. Do not encode for now.
# Z: ordinal, continuous. Do not encode for now.
# 
# 


In [None]:
import pandas as pd
import numpy as np

In [None]:

# Directory holding the raw competition files.
base_dir_path = '/content/data'

# The three CSV inputs are read in order: competition train, competition test, original dataset.
train_df, test_df, orig_data_df = (
    pd.read_csv(base_dir_path + file_suffix)
    for file_suffix in ('/train.csv', '/test.csv', '/orig_data.csv')
)
# The descriptor table is stored as an Excel workbook.
descriptor_df = pd.read_excel(base_dir_path + '/descriptor_dict.xlsx')


In [None]:
# Report the shape of each loaded frame, one line per frame.
print("\n".join(
    f"{label} {frame.shape}"
    for label, frame in (
        ("train_df.shape: ", train_df),
        ("test_df.shape: ", test_df),
        ("orig_data_df.shape: ", orig_data_df),
        ("descriptor_df.shape: ", descriptor_df),
    )
))


In [None]:
# Show the column names of the competition training frame.
train_df.columns

In [None]:
# Display the descriptor table read from descriptor_dict.xlsx.
descriptor_df

In [None]:
# Show the column names of the original (non-competition) dataset.
orig_data_df.columns

In [None]:

# np.sum(np.isnan(train_df.color.values))



In [None]:

# np.sum(np.isnan(orig_data_df.depth.values))
