**Mission :** *Investigate cv/data leak*

- Why lb and cv difference????


# Imports

In [1]:
import warnings
import sys
import os
import time
import joblib
import random
from tqdm import tqdm
from pprint import pprint


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder


from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor



warnings.filterwarnings('ignore')

In [2]:
# # wandb stuff for tracking
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# wandb_login = user_secrets.get_secret("wandb_bloom_tracker")

# import wandb
# wandb.login(key=wandb_login)

# Config

In [3]:
class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


# Config
config = {}
config = dotdict(config)
config['RANDOM_SEED'] = 18952


config['unique_id'] = int(time.time())
print(f'unique_id: {config.unique_id}')
config['name'] = f'trees-{config.unique_id}'   

config['PROJECT_NAME'] = 'tick-tick-bloom'
# config['DATA_DIR'] = '../data/'
# config['MODEL_DIR'] = '../models/'
config['SAVE_MODEL'] = True


# Img config
config['IMG_SIZE'] = (136, 136)
config['CHANNELS'] = 3


config['desc'] = 'test run for kaggle ml nb setup'

unique_id: 1674126879


In [4]:
# seed everything
random.seed(config.RANDOM_SEED)
np.random.seed(config.RANDOM_SEED)
os.environ['PYTHONHASHSEED'] = str(config.RANDOM_SEED)

# Utils

In [5]:
def dens_to_sev(x: float)-> int:
    """takes in density value in cells/ml and returns severity category"""
    if (x < 20_000) : return 1
    elif (x >= 20_000) and (x < 100_000) : return 2
    elif (x >= 100_000) and (x < 1_000_000) : return 3
    elif (x >= 1_000_000) and (x < 10_000_000) : return 4
    elif x > 10_000_000 : return 5

# Load data

In [6]:
INPUT_DIR = '/kaggle/input/ticktickbloomdataset'

metadata = pd.read_csv(os.path.join(INPUT_DIR, 'metadata.csv'))
sub_format = pd.read_csv(os.path.join(INPUT_DIR, 'submission_format.csv'))
train_labels = pd.read_csv(os.path.join(INPUT_DIR, 'train_labels.csv'))


metadata.date = pd.to_datetime(metadata.date)

In [7]:
# IMG_DIR = "/kaggle/input/pull-landsat-data-v1-500m/landsat8_500m_v1"

# img_files = os.listdir(IMG_DIR)
# img_file_names = [f.split('.')[0] for f in img_files]

# # get only data for those only in dataset
# metadata_subset = metadata[metadata['uid'].isin(img_file_names)]
# data = metadata_subset[metadata_subset.split == 'train']
# data = data.merge(train_labels, on='uid')

In [8]:
#  get data
metadata.date = pd.to_datetime(metadata.date)

region = pd.concat((train_labels, sub_format[['region', 'uid']]), axis=0)

data = pd.merge(metadata, region, on='uid', how='left')
print(data.shape)

(23570, 8)


In [9]:
# seasons
seasons = {
    1: 1,
    2: 1,
    3: 2,
    4: 2,
    5: 2,
    6: 3,
    7: 3,
    8: 3,
    9: 4,
    10: 4,
    11: 4,
    12: 1
}

#  most of the samples are collected in the months of June, July, August.

# add date time fts.
data['month'] = data.date.dt.month
data['year'] = data.date.dt.year
data['week'] = data.date.dt.isocalendar().week
# data['day_of_year'] = data.date.dt.""
data['season'] = data.month.map(seasons)
                                

data.sort_values(by='date', inplace=True)

In [10]:
data['expanding_severity'] = data.severity.expanding().mean()
data['expanding_severity'] = data['expanding_severity'].apply(np.round)
data


Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density,month,year,week,season,expanding_severity
4387,evep,44.847993,-93.476318,2013-01-04,train,midwest,1.0,115.0,1,2013,1,1,1.0
13644,paev,44.822478,-93.367962,2013-01-04,train,midwest,1.0,1884.0,1,2013,1,1,1.0
5566,gdxr,44.877646,-93.557842,2013-01-04,train,midwest,1.0,1416.0,1,2013,1,1,1.0
6144,guny,44.878889,-93.490833,2013-01-04,train,midwest,1.0,558.0,1,2013,1,1,1.0
5317,fwbt,44.850500,-93.515700,2013-01-04,train,midwest,1.0,476.0,1,2013,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.736800,-121.734000,2021-12-29,test,west,,,12,2021,52,1,2.0
17559,thki,36.725400,-121.730000,2021-12-29,test,west,,,12,2021,52,1,2.0
17452,teuu,36.772300,-121.788000,2021-12-29,test,west,,,12,2021,52,1,2.0
14254,prfi,36.751800,-121.742000,2021-12-29,test,west,,,12,2021,52,1,2.0


In [11]:
train_data = data[data.split == 'train']
test_data = data[data.split == 'test']

In [12]:
grp_by_region = data.groupby('region').severity.expanding(1).mean()
grp_by_region = grp_by_region.map(np.round)
grp_by_region.isna().sum()

4

In [13]:
grp_by_region = data.groupby('region').severity.expanding(1).mean()
grp_by_region = grp_by_region.map(np.round)

grp_by_region['west'].fillna(2, inplace=True)
grp_by_region['northeast'].fillna(2, inplace=True)
print(grp_by_region.isna().sum())   # 5 --> 0.89416

print(mse(train_data.severity.sort_index(), grp_by_region.droplevel(0).loc[train_data.index].sort_index(), squared=False))

data['expndng_sev_by_reg'] = np.nan

south = data.region == 'south'
midwest = data.region == 'midwest'
northeast = data.region == 'northeast'
west = data.region == 'west'

data.loc[south , 'expndng_sev_by_reg'] = grp_by_region['south']
data.loc[midwest , 'expndng_sev_by_reg'] = grp_by_region['midwest']
data.loc[northeast , 'expndng_sev_by_reg'] = grp_by_region['northeast']
data.loc[west , 'expndng_sev_by_reg'] = grp_by_region['west']

print(data.shape)
data.isna().sum()

data.sort_index()

0
0.894165010958815
(23570, 14)


Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density,month,year,week,season,expanding_severity,expndng_sev_by_reg
0,aabm,39.080319,-86.430867,2018-05-14,train,midwest,1.0,585.0,5,2018,20,2,2.0,2.0
1,aabn,36.559700,-121.510000,2016-08-31,test,west,,,8,2016,35,3,2.0,4.0
2,aacd,35.875083,-78.878434,2020-11-19,train,south,1.0,290.0,11,2020,47,4,2.0,2.0
3,aaee,35.487000,-79.062133,2016-08-24,train,south,1.0,1614.0,8,2016,34,3,2.0,2.0
4,aaff,38.049471,-99.827001,2019-07-23,train,midwest,3.0,111825.0,7,2019,30,3,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,west,,,12,2014,49,1,2.0,4.0
23566,zzwo,39.792190,-99.971050,2017-06-19,train,midwest,2.0,48510.0,6,2017,25,3,2.0,2.0
23567,zzwq,35.794000,-79.012551,2015-03-24,train,south,1.0,1271.0,3,2015,13,2,2.0,1.0
23568,zzyb,35.742000,-79.238600,2016-11-21,train,south,1.0,9682.0,11,2016,47,4,2.0,2.0


In [14]:
grp_by_rs = data.groupby(['region', 'season']).severity.expanding(1).mean()
grp_by_rs = grp_by_rs.map(np.round)
print(grp_by_rs.isna().sum()) # 5 --> .86

data['expanding_sev_rs'] =  grp_by_rs.droplevel(0).droplevel(0).sort_index()

# fillna with expanding sev by region
data['expanding_sev_rs'] = np.where(data.expanding_sev_rs.isna(), data.expndng_sev_by_reg, data.expanding_sev_rs)

print(mse(train_data.severity.sort_index(), data['expanding_sev_rs'].sort_index()[data.split == 'train'], squared=False))



# #  make submission for expanding severity by region and season

# expanding_sev_rs = data[data.split == 'test'][['uid', 'expanding_sev_rs']]          # picking up only uids and expanding_sev_rs from test samples
# expanding_sev_rs.expanding_sev_rs = expanding_sev_rs.expanding_sev_rs.astype(int)   # casting to int
# expanding_sev_rs.sort_values(by='uid', inplace=True)                                # sorting by uid -- safest option
# expanding_sev_rs.reset_index(drop=True, inplace=True)                               # matching indexes with submissoin

# sub_format.severity = expanding_sev_rs.expanding_sev_rs
# sub_format.severity.value_counts()  # expected 0.8594349134502333

# sub_format.to_csv('expanding_sev_rs_preds.csv', index=False)

5
0.8594349134502333


In [15]:
all_train = data[data.split == 'train']
all_train.shape

(17060, 15)

In [16]:
# reset training index to avoid wrong submissions
all_train = all_train.sort_values('uid').reset_index(drop=True)
all_train

Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density,month,year,week,season,expanding_severity,expndng_sev_by_reg,expanding_sev_rs
0,aabm,39.080319,-86.430867,2018-05-14,train,midwest,1.0,585.0,5,2018,20,2,2.0,2.0,1.0
1,aacd,35.875083,-78.878434,2020-11-19,train,south,1.0,290.0,11,2020,47,4,2.0,2.0,2.0
2,aaee,35.487000,-79.062133,2016-08-24,train,south,1.0,1614.0,8,2016,34,3,2.0,2.0,2.0
3,aaff,38.049471,-99.827001,2019-07-23,train,midwest,3.0,111825.0,7,2019,30,3,2.0,2.0,2.0
4,aafl,39.474744,-86.898353,2021-08-23,train,midwest,4.0,2017313.0,8,2021,34,3,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17055,zzsv,38.707825,-75.080867,2018-06-27,train,south,3.0,113125.0,6,2018,26,3,2.0,2.0,2.0
17056,zzuq,35.794000,-79.015368,2015-08-06,train,south,3.0,175726.0,8,2015,32,3,2.0,1.0,2.0
17057,zzwo,39.792190,-99.971050,2017-06-19,train,midwest,2.0,48510.0,6,2017,25,3,2.0,2.0,2.0
17058,zzwq,35.794000,-79.012551,2015-03-24,train,south,1.0,1271.0,3,2015,13,2,2.0,1.0,1.0


In [17]:
test_data = metadata[metadata.split == 'test']
test_data = test_data.merge(sub_format, on='uid')

test_data['month'] = test_data.date.dt.month
test_data['year'] = test_data.date.dt.year
test_data['season'] = test_data.month.map(seasons)

In [18]:
tar_cols = ['expanding_severity', 'expndng_sev_by_reg', 'expanding_sev_rs']

In [95]:
req_cols = ['region', 'month', 'year', 'season', tar_cols[0]]
req_cols

['region', 'month', 'year', 'season', 'expanding_severity']

In [96]:
test_data['expanding_severity'] = data[data.split == 'test']['expanding_severity'].sort_index().values
test_data['expndng_sev_by_reg'] = data[data.split == 'test']['expndng_sev_by_reg'].sort_index().values
test_data['expanding_sev_rs'] = data[data.split == 'test']['expanding_sev_rs'].sort_index().values

In [97]:
all_train_ = all_train.sort_values(by='date')[:-1000]
all_test_ = all_train.sort_values(by='date')[-1000:]  # last 1000 samples only for testing

In [98]:
X_ , y_ = all_train_[req_cols], all_train_['severity']
X_.shape, y_.shape

((16060, 5), (16060,))

In [99]:
Xtest_, ytest_ = all_test_[req_cols], all_test_['severity']
Xtest_.shape, ytest_.shape

((1000, 5), (1000,))

In [100]:
X_.isna().sum().sum(), y_.isna().sum()

(0, 0)

In [101]:
# X_train_,X_val_, y_train_, y_val_ = train_test_split(X_, y_, test_size=0.15, random_state=config.RANDOM_SEED, stratify=y_)
# X_train_.shape, y_train_.shape, X_val_.shape, y_val_.shape

# # thuis is causing validataion data leak!

In [102]:
X_test_ = test_data[req_cols]
X_test_.shape

(6510, 5)

In [103]:
# def get_imgs(uids) :
#     imgs = []
#     for uid in uids:
#         arr = joblib.load(IMG_DIR + f'/{uid}.npy')
#         img_arr = arr[:11]
#         # img_arr = np.transpose(img_arr, (2, 1, 0))
#         # resize img
#         img_arr = cv2.resize(img_arr, config.IMG_SIZE)
#         img_arr = img_arr / 255   # normalizeee bro... other wise it's blowing up the networks...
#         imgs.append(img_arr)
#     return np.array(imgs) 


# def get_np_data(split : float = 0.2):
#     """Return np data for training and testing."""

#     print("Loading data...")
#     x_train_uids, x_test_uids, y_train, y_test = train_test_split(
#         data['uid'],
#         data.severity,
#         test_size=split,
#         random_state=config.RANDOM_SEED,
#         stratify=data.severity
#     )

#     x_train = get_imgs(x_train_uids)
#     x_test = get_imgs(x_test_uids)

#     return x_train, y_train, x_test, y_test

In [104]:
# x_train, y_train, x_test, y_test = get_np_data()
# print(y_train.value_counts(normalize=True))
# print(y_test.value_counts(normalize=True))
# print('Done')

# Preprocess

In [105]:
y = y_ - 1
y.value_counts()

0.0    7143
3.0    3415
1.0    3000
2.0    2451
4.0      51
Name: severity, dtype: int64

In [106]:
# y_train = y_train_  -1
# y_val = y_val_ - 1

# y_train.value_counts(normalize=True), y_val.value_counts(normalize=True)

In [107]:
ytest = ytest_-1
ytest.value_counts(normalize=True)

0.0    0.354
2.0    0.268
1.0    0.239
3.0    0.132
4.0    0.007
Name: severity, dtype: float64

In [108]:
# from category_encoders import OrdinalEncoder as COE

# enc_cols = ['year', 'region']

# oe = OrdinalEncoder()
# X_train = oe.fit_transform(X_train_)
# X_test = oe.transform(X_test_)
# X_val = oe.transform(X_val_)
# Xtest = oe.transform(Xtest_)

In [109]:
X_.shape, X_test_.shape, Xtest_.shape

((16060, 5), (6510, 5), (1000, 5))

# Engine Train eval

In [39]:
%%time

def train_eval(model, X_train=None, X_val=None, y_train=None, y_val=None):
    """
    train and eval util func,
    returns trained model, soft_preds, and rmse
    REMEMBER to round myself
    """
    
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = mse(y_val, np.round(preds), squared=False)
    test_rmse = mse(ytest, np.round(model.predict(Xtest_)), squared=False)
    print("Train RMSE: ", mse(y_train, np.round(model.predict(X_train)), squared=False))
    print("Val RMSE:", rmse)
    print('TEST RMSE: ', test_rmse)
    
    # print("TEST RMSE:", mse(y_val, np.round(model.predict(X_val)), squared=False))
    return model, preds, rmse, test_rmse


CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.15 µs


In [41]:
def get_pipe(model):
    """ return simple sklearn pipeline with encoder and model"""
    oe = OrdinalEncoder()
    pipe = make_pipeline(oe, model)
    return pipe

In [42]:
# m, p, e = train_eval(XGBRegressor(n_estimators=500))

In [43]:
#  It's better to cv!

def cv_it(model, X_train=X_, y_train=y, X_test=X_test_, splits=10, cv_predict=False):
#     skf = StratifiedKFold(n_splits=splits, random_state=config.RANDOM_STATE, shuffle=True)
    
    tscv = TimeSeriesSplit(n_splits=splits, test_size=1000)
    
    val_rmse = []
    test_rmses = []
    
    if cv_predict:
        cvpreds_test = np.zeros(shape=(6510, splits))
        cvpreds_train = np.zeros(shape=(len(X_train)))

    for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train, y_train)):

        print(f'-----------------------Fold-{fold}-------------------------')
        X_train_subset, y_train_subset = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_val_subset, y_val_subset = X_train.iloc[val_idx], y_train.iloc[val_idx]
        
        print(f'Training on {X_train_subset.shape[0]} samples' )
        print(f'Validating on {X_val_subset.shape[0]} samples' )

        model, val_preds, rmse, test_rmse= train_eval(model=model, X_train=X_train_subset, y_train=y_train_subset, X_val=X_val_subset, y_val=y_val_subset)
        val_rmse.append(rmse)
        test_rmses.append(test_rmse)
        
        if cv_predict:
            # save predictions for ensembling
            cvpreds_test[:, fold] = model.predict(X_test)
            cvpreds_train[val_idx] = val_preds
                    
    print()
    print("Mean Val RMSE:", np.mean(val_rmse), "std:", np.std(val_rmse))
    print("Mean Test RMSE:", np.mean(test_rmses), "std:", np.std(test_rmses))

    
    if cv_predict:
        return cvpreds_test, cvpreds_train


In [44]:
# expanding_severity : Mean RMSE: 0.8326926618960704 std: 0.010792728721340388
# expndng_sev_by_reg : Mean RMSE: 0.8326043320130792 std: 0.017949794678433155
# expanding_sev_rs : Mean RMSE: 0.831528416210013 std: 0.0188192039517455

In [45]:
req_cols

['region', 'month', 'season', 'expanding_sev_rs']

In [46]:
%%time

xgb_pipe = get_pipe(XGBRegressor(n_estimators=5_000, tree_method='gpu_hist', gpu_id=0, random_state=config.RANDOM_SEED))
xgbreg_cv_test,xgbreg_cv_train = cv_it(
    xgb_pipe,
    X_train = X_,
    y_train = y,
    cv_predict=True,
    splits=10,
    X_test=X_test_
)
xgbreg_cv_test.shape, xgbreg_cv_train.shape

-----------------------Fold-0-------------------------
Training on 6060 samples
Validating on 1000 samples
Train RMSE:  0.7958224257542215
Val RMSE: 0.950263121456368
TEST RMSE:  0.8700574693662483
-----------------------Fold-1-------------------------
Training on 7060 samples
Validating on 1000 samples
Train RMSE:  0.8131357830643765
Val RMSE: 0.7609204951898719
TEST RMSE:  0.8602325267042626
-----------------------Fold-2-------------------------
Training on 8060 samples
Validating on 1000 samples
Train RMSE:  0.8066873104700787
Val RMSE: 1.0737783756436894
TEST RMSE:  0.8602325267042626
-----------------------Fold-3-------------------------
Training on 9060 samples
Validating on 1000 samples
Train RMSE:  0.8402800962635022
Val RMSE: 1.0601886624558858
TEST RMSE:  0.8602325267042626
-----------------------Fold-4-------------------------
Training on 10060 samples
Validating on 1000 samples
Train RMSE:  0.8565616224891659
Val RMSE: 0.8882567196480982
TEST RMSE:  0.8780660567406077
-----

((6510, 10), (16060,))

In [47]:
%%time

xgb_pipe = get_pipe(XGBRegressor(n_estimators=10_000, tree_method='gpu_hist', gpu_id=0, random_state=config.RANDOM_SEED))
xgbreg_cv_test,xgbreg_cv_train = cv_it(
    xgb_pipe,
    X_train = X_,
    y_train = y,
    cv_predict=True,
    splits=10,
    X_test=X_test_
)
xgbreg_cv_test.shape, xgbreg_cv_train.shape

-----------------------Fold-0-------------------------
Training on 6060 samples
Validating on 1000 samples
Train RMSE:  0.7958224257542215
Val RMSE: 0.950263121456368
TEST RMSE:  0.8700574693662483
-----------------------Fold-1-------------------------
Training on 7060 samples
Validating on 1000 samples
Train RMSE:  0.8131357830643765
Val RMSE: 0.7609204951898719
TEST RMSE:  0.8602325267042626
-----------------------Fold-2-------------------------
Training on 8060 samples
Validating on 1000 samples
Train RMSE:  0.8066873104700787
Val RMSE: 1.0737783756436894
TEST RMSE:  0.8602325267042626
-----------------------Fold-3-------------------------
Training on 9060 samples
Validating on 1000 samples
Train RMSE:  0.8402800962635022
Val RMSE: 1.0601886624558858
TEST RMSE:  0.8602325267042626
-----------------------Fold-4-------------------------
Training on 10060 samples
Validating on 1000 samples
Train RMSE:  0.8565616224891659
Val RMSE: 0.8882567196480982
TEST RMSE:  0.8780660567406077
-----

((6510, 10), (16060,))

In [65]:
req_cols

['region', 'month', 'season', 'expndng_sev_by_reg']

In [66]:
%%time

xgb_pipe = get_pipe(XGBRegressor(n_estimators=10_000, tree_method='gpu_hist', gpu_id=0, random_state=config.RANDOM_SEED))
xgbreg_cv_test,xgbreg_cv_train = cv_it(
    xgb_pipe,
    X_train = X_,
    y_train = y,
    cv_predict=True,
    splits=10,
    X_test=X_test_
)
xgbreg_cv_test.shape, xgbreg_cv_train.shape

-----------------------Fold-0-------------------------
Training on 6060 samples
Validating on 1000 samples
Train RMSE:  0.7950963575704407
Val RMSE: 0.9289779329994874
TEST RMSE:  0.8843076387773657
-----------------------Fold-1-------------------------
Training on 7060 samples
Validating on 1000 samples
Train RMSE:  0.8119155121481887
Val RMSE: 0.7687652437513027
TEST RMSE:  0.8837420438114281
-----------------------Fold-2-------------------------
Training on 8060 samples
Validating on 1000 samples
Train RMSE:  0.8054559610623774
Val RMSE: 1.0681760154581266
TEST RMSE:  0.8602325267042626
-----------------------Fold-3-------------------------
Training on 9060 samples
Validating on 1000 samples
Train RMSE:  0.8531895484396187
Val RMSE: 0.8983317872590283
TEST RMSE:  0.8854377448471462
-----------------------Fold-5-------------------------
Training on 11060 samples
Validating on 1000 samples
Train RMSE:  0.8555213716558198
Val RMSE: 1.004987562112089
TEST RMSE:  0.8854377448471462
-----

((6510, 10), (16060,))

In [67]:
X_.columns

Index(['region', 'month', 'season', 'expndng_sev_by_reg'], dtype='object')

In [83]:
req_cols

['region', 'month', 'season', 'expanding_severity']

In [84]:
%%time

xgb_pipe = get_pipe(XGBRegressor(n_estimators=10_000, tree_method='gpu_hist', gpu_id=0, random_state=config.RANDOM_SEED))
xgbreg_cv_test,xgbreg_cv_train = cv_it(
    xgb_pipe,
    X_train = X_,
    y_train = y,
    cv_predict=True,
    splits=10,
    X_test=X_test_
)
xgbreg_cv_test.shape, xgbreg_cv_train.shape

-----------------------Fold-0-------------------------
Training on 6060 samples
Validating on 1000 samples
Train RMSE:  0.8063281072253715
Val RMSE: 0.950263121456368
TEST RMSE:  0.8938679992034618
-----------------------Fold-1-------------------------
Training on 7060 samples
Validating on 1000 samples
Train RMSE:  0.8248100476660539
Val RMSE: 0.762889244910426
TEST RMSE:  0.8677557259966655
-----------------------Fold-2-------------------------
Training on 8060 samples
Validating on 1000 samples
Train RMSE:  0.8173824954821308
Val RMSE: 1.074709263010234
TEST RMSE:  0.8677557259966655
-----------------------Fold-3-------------------------
Training on 9060 samples
Validating on 1000 samples
Train RMSE:  0.8496200942984821
Val RMSE: 1.0630145812734648
TEST RMSE:  0.8677557259966655
-----------------------Fold-4-------------------------
Training on 10060 samples
Validating on 1000 samples
Train RMSE:  0.8669431684121292
Val RMSE: 0.8689073598491384
TEST RMSE:  0.8854377448471462
-------

((6510, 10), (16060,))

In [110]:
req_cols

['region', 'month', 'year', 'season', 'expanding_severity']

In [111]:
%%time

xgb_pipe = get_pipe(XGBRegressor(n_estimators=10_000, tree_method='gpu_hist', gpu_id=0, random_state=config.RANDOM_SEED))
xgbreg_cv_test,xgbreg_cv_train = cv_it(
    xgb_pipe,
    X_train = X_,
    y_train = y,
    cv_predict=True,
    splits=10,
    X_test=X_test_
)
xgbreg_cv_test.shape, xgbreg_cv_train.shape

-----------------------Fold-0-------------------------
Training on 6060 samples
Validating on 1000 samples


ValueError: Found unknown categories [2021] in column 2 during transform

In [None]:
# save cv_preds (raw/soft) hoping all the indices are right!!

xgb_cv_train = pd.DataFrame(columns=['uid', 'xgbreg1k'])
xgb_cv_train.uid = all_train.uid
xgb_cv_train.xgbreg1k = xgbreg_cv_train
xgb_cv_train.to_csv('xgbreg1k_cv_train.csv', index=False)


xgb_cv_test = pd.DataFrame(columns=['uid', 'xgbreg1k_cvpreds'])
xgb_cv_test.uid = test_data.uid
xgb_cv_test.xgbreg1k_cvpreds = np.mean(xgbreg_cv_test, axis=1)
xgb_cv_test.to_csv('xgbreg1k_cv_test.csv', index=False)

region                 object
month                   int64
year                    int64
season                  int64
expanding_severity    float64
dtype: object

In [86]:
catreg_pipe = get_pipe(CatBoostRegressor(n_estimators=1000, task_type="GPU", devices='0:1', verbose=1000, random_seed=config.RANDOM_SEED))
catreg_cv_test, catreg_cv_train = cv_it(
    catreg_pipe,
    X_train = X_,
    y_train = y,
    cv_predict=True,
    splits=10,
    X_test=X_test_
)
catreg_cv_test.shape, catreg_cv_train.shape

-----------------------Fold-0-------------------------
Training on 6060 samples
Validating on 1000 samples
Learning rate set to 0.051147
0:	learn: 1.1938681	total: 50.7ms	remaining: 50.6s
999:	learn: 0.7343039	total: 19.1s	remaining: 0us
Train RMSE:  0.807044070675946
Val RMSE: 0.9513148795220224
TEST RMSE:  0.9235799911215054
-----------------------Fold-1-------------------------
Training on 7060 samples
Validating on 1000 samples
Learning rate set to 0.05218
0:	learn: 1.1755810	total: 22.1ms	remaining: 22.1s
999:	learn: 0.7568325	total: 22.2s	remaining: 0us
Train RMSE:  0.8249817578891417
Val RMSE: 0.762889244910426
TEST RMSE:  0.8677557259966655
-----------------------Fold-2-------------------------
Training on 8060 samples
Validating on 1000 samples
Learning rate set to 0.053094
0:	learn: 1.1755055	total: 17.3ms	remaining: 17.3s
Learning rate set to 0.053913
0:	learn: 1.1634227	total: 18.9ms	remaining: 18.9s
999:	learn: 0.7768448	total: 22.2s	remaining: 0us
Train RMSE:  0.849814938

((6510, 10), (16060,))

In [115]:
categorical_features_indices = np.where(X_.dtypes != np.float)[0]
categorical_features_indices

array([0, 1, 2, 3])

In [116]:
cbreg = CatBoostRegressor(n_estimators=1000, task_type="GPU", devices='0:1',cat_features=categorical_features_indices, verbose=1000, random_seed=config.RANDOM_SEED)
catreg_cv_test, catreg_cv_train = cv_it(
    cbreg,
    X_train = X_,
    y_train = y,
    cv_predict=True,
    splits=10,
    X_test=X_test_
)
catreg_cv_test.shape, catreg_cv_train.shape

-----------------------Fold-0-------------------------
Training on 6060 samples
Validating on 1000 samples
Learning rate set to 0.051147
0:	learn: 1.1953147	total: 105ms	remaining: 1m 44s
999:	learn: 0.7147809	total: 1m 15s	remaining: 0us
Train RMSE:  0.7900996302635347
Val RMSE: 0.9093954035511725
TEST RMSE:  0.9077444574328174
-----------------------Fold-1-------------------------
Training on 7060 samples
Validating on 1000 samples
Learning rate set to 0.05218
0:	learn: 1.1775949	total: 140ms	remaining: 2m 19s
999:	learn: 0.7326565	total: 1m 19s	remaining: 0us
Train RMSE:  0.8042909005398595
Val RMSE: 0.7661592523751182
TEST RMSE:  0.8608135686662938
-----------------------Fold-2-------------------------
Training on 8060 samples
Validating on 1000 samples
Learning rate set to 0.053094
0:	learn: 1.1783108	total: 43.9ms	remaining: 43.9s
Learning rate set to 0.053913
0:	learn: 1.1662855	total: 46.8ms	remaining: 46.7s
999:	learn: 0.7497437	total: 1m 16s	remaining: 0us
Train RMSE:  0.8178

((6510, 10), (16060,))

In [None]:

cat_cv_train = pd.DataFrame(columns=['uid', 'catreg1k'])
cat_cv_train.uid = all_train.uid
cat_cv_train.catreg1k = catreg_cv_train
cat_cv_train.to_csv('catreg1k_cv_train.csv', index=False)


cat_cv_test = pd.DataFrame(columns=['uid', 'catreg1k_cvpreds'])
cat_cv_test.uid = test_data.uid
cat_cv_test.catreg1k_cvpreds = np.mean(catreg_cv_test, axis=1)
cat_cv_test.to_csv('catreg1k_cv_test.csv', index=False)

In [None]:
lgbreg_pipe = get_pipe(LGBMRegressor(n_estimators=500, random_state=config.RANDOM_SEED))
lgbreg_cv_test, lgbreg_cv_train = cv_it(lgbreg_pipe, cv_predict=True, splits=10, X_test=X_test_)   # (.82 to 0.84)
lgbreg_cv_test.shape, lgbreg_cv_train.shape

In [None]:
# save cv preds

lgb_cv_train = pd.DataFrame(columns=['uid', 'lgbreg1k'])
lgb_cv_train.uid = all_train.uid
lgb_cv_train.lgbreg1k = lgbreg_cv_train
lgb_cv_train.to_csv('lgbreg1k_cv_train.csv', index=False)


lgb_cv_test = pd.DataFrame(columns=['uid', 'lgbreg1k_cvpreds'])
lgb_cv_test.uid = test_data.uid
lgb_cv_test.lgbreg1k_cvpreds = np.mean(lgbreg_cv_test, axis=1)
lgb_cv_test.to_csv('lgbreg1k_cv_test.csv', index=False)

In [None]:
%%time

xgb_pipe = get_pipe(XGBClassifier(n_estimators=500))
cv_it(xgb_pipe, cv_predict=False, splits=10, X_test=X_test_)

In [90]:
from sklearn.linear_model import LinearRegression, LogisticRegression
linreg = LinearRegression()
logreg = LogisticRegression()
_ = cv_it(
    get_pipe(linreg),
    X_train = X_,
    y_train = y,
    cv_predict=True,
    splits=10,
    X_test=X_test_
)

print('\n\n')

_ = cv_it(
    get_pipe(logreg),
    X_train = X_,
    y_train = y,
    cv_predict=True,
    splits=10,
    X_test=X_test_
)

-----------------------Fold-0-------------------------
Training on 6060 samples
Validating on 1000 samples
Train RMSE:  0.9837793361207674
Val RMSE: 1.0945318634009702
TEST RMSE:  1.1836384583140243
-----------------------Fold-1-------------------------
Training on 7060 samples
Validating on 1000 samples
Train RMSE:  0.9998583469077106
Val RMSE: 0.9348796714016194
TEST RMSE:  1.1184811129384349
-----------------------Fold-2-------------------------
Training on 8060 samples
Validating on 1000 samples
Train RMSE:  0.9928403748466106
Val RMSE: 1.2755391017134676
TEST RMSE:  1.1549891774384728
-----------------------Fold-3-------------------------
Training on 9060 samples
Validating on 1000 samples
Train RMSE:  1.0422106747235542
Val RMSE: 1.2037441588643327
TEST RMSE:  1.1198214143335534
-----------------------Fold-4-------------------------
Training on 10060 samples
Validating on 1000 samples
Train RMSE:  1.0590892219823995
Val RMSE: 0.997997995989972
TEST RMSE:  1.1198214143335534
-----

In [None]:
# %%time
# def train_eval_density(model, X_train=X_train_trans, y_train=yd_train, X_val=X_val_trans, y_val=yd_val):
#     model.fit(X_train, y_train)
#     preds = model.predict(X_val)
#     hard_preds = pd.Series(preds).map(dens_to_sev)
#     print("Compe RMSE: ", mse(ys_val+1, hard_preds, squared=False))
#     print("RMSE: ", mse(y_val, preds, squared=False))
#     return None

In [None]:
# train_eval_density(XGBRegressor(n_estimators=1000, verbose=0))
# print('-------------------')
# train_eval_density(CatBoostRegressor(n_estimators=1000, verbose=1000))

# Generate predictions

In [None]:
xgbreg_cv_preds = np.round(xgbreg_cv_test.mean(axis=1)) 
xgbreg_cv_preds.shape

In [None]:
np.unique(xgbreg_cv_preds)

In [None]:
sub_format.severity = xgbreg_cv_preds
sub_format.severity = sub_format.severity.astype(int) + 1
sub_format

In [None]:
sub_format.severity.value_counts()

In [None]:
sub_format.to_csv('xgbreg_cvpreds_on_expanding_sev_rs_preds.csv', index=False)
# expecting (0.81 to 0.84)

# So....

- regressing severity is better than classifying it. 
> probbaly coz of optimizing differnet functions! clfs optimzie logloss whereas regs optimize rmse       which is similar to the task, but is this the same for NNs???
- Validation is v.imp and be careful with timeseries data


♦ Now start all again 

- No great improvements with mtdata even! 
- Figure out something else...