**Mission:** *Investigate the CV / data leak*

- Why do the leaderboard (LB) and cross-validation (CV) scores differ?


# Imports

In [37]:
import warnings
import sys
import os
import time
import joblib
import random
from tqdm import tqdm
from pprint import pprint


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder


from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor



warnings.filterwarnings('ignore')

In [None]:
# # wandb stuff for tracking
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# wandb_login = user_secrets.get_secret("wandb_bloom_tracker")

# import wandb
# wandb.login(key=wandb_login)

# Config

In [38]:
class dotdict(dict):
    """Dictionary whose keys can also be read, written and deleted as attributes.

    Reading an attribute that is not a key yields ``None`` (``dict.get``
    semantics) instead of raising, so a mistyped key is silently ``None``.
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails; missing keys give None.
        return self.get(name)

    def __setattr__(self, name, value):
        # Attribute writes are stored as dictionary items.
        self[name] = value

    def __delattr__(self, name):
        # Raises KeyError (not AttributeError) when the key is absent.
        del self[name]


# Config
# Run identifier: wall-clock seconds at the moment this cell is executed.
_run_id = int(time.time())
print(f'unique_id: {_run_id}')

# All tunables live in one attribute-accessible mapping.
config = dotdict(
    RANDOM_SEED=18952,
    unique_id=_run_id,
    name=f'trees-{_run_id}',
    PROJECT_NAME='tick-tick-bloom',
    # DATA_DIR='../data/',
    # MODEL_DIR='../models/',
    SAVE_MODEL=True,
    # Img config
    IMG_SIZE=(136, 136),
    CHANNELS=3,
    desc='test run for kaggle ml nb setup',
)

unique_id: 1674117487


In [39]:
# Seed every RNG the notebook relies on from the single configured value.
_seed = config.RANDOM_SEED
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(_seed)
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# only reaches child processes, not the hash randomisation of the running kernel.
os.environ['PYTHONHASHSEED'] = str(_seed)

# Utils

In [2]:
def dens_to_sev(x: float)-> int:
    """Map a density value in cells/ml to its severity category (1-5).

    Bins are left-closed:
        x < 20_000                  -> 1
        20_000     <= x < 100_000   -> 2
        100_000    <= x < 1_000_000 -> 3
        1_000_000  <= x < 10_000_000 -> 4
        x >= 10_000_000             -> 5

    NaN compares false against every bound and therefore yields ``None``.
    """
    if x < 20_000: return 1
    if x < 100_000: return 2
    if x < 1_000_000: return 3
    if x < 10_000_000: return 4
    # `>=` (previously `>`): exactly 10_000_000 used to match no branch and return None.
    if x >= 10_000_000: return 5
    return None  # only reachable for NaN

# Load data

In [3]:
INPUT_DIR = '/kaggle/input/ticktickbloomdataset'

# Read the three competition tables from the same input directory.
metadata, sub_format, train_labels = (
    pd.read_csv(os.path.join(INPUT_DIR, csv_name))
    for csv_name in ('metadata.csv', 'submission_format.csv', 'train_labels.csv')
)

# Parse the sampling date so the .dt accessors used later work.
metadata['date'] = pd.to_datetime(metadata['date'])

In [4]:
# IMG_DIR = "/kaggle/input/pull-landsat-data-v1-500m/landsat8_500m_v1"

# img_files = os.listdir(IMG_DIR)
# img_file_names = [f.split('.')[0] for f in img_files]

# # get only data for those only in dataset
# metadata_subset = metadata[metadata['uid'].isin(img_file_names)]
# data = metadata_subset[metadata_subset.split == 'train']
# data = data.merge(train_labels, on='uid')

In [6]:
# Assemble one frame holding every sample (train + test) together with its region.
metadata['date'] = pd.to_datetime(metadata['date'])  # leaves an already-parsed column unchanged

# Train rows bring their labels; test rows only contribute (region, uid).
region = pd.concat([train_labels, sub_format.loc[:, ['region', 'uid']]], axis=0)

data = metadata.merge(region, how='left', on='uid')
print(data.shape)

(23570, 8)


In [7]:
# seasons: calendar month -> season code (1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov)
_season_months = {1: (12, 1, 2), 2: (3, 4, 5), 3: (6, 7, 8), 4: (9, 10, 11)}
seasons = dict(sorted(
    (month, code)
    for code, months in _season_months.items()
    for month in months
))

#  most of the samples are collected in the months of June, July, August.

# add date time fts.
_dates = data['date'].dt
data['month'] = _dates.month
data['year'] = _dates.year
data['week'] = _dates.isocalendar().week
# data['day_of_year'] = data.date.dt.""
data['season'] = data['month'].map(seasons)

# Put the rows in chronological order (the original index labels are kept).
data = data.sort_values(by='date')

In [8]:
# Running mean of every label seen so far (rows were sorted by date above), rounded to a class.
# NOTE(review): expanding() includes the current row, so for labelled (train) rows this
# feature contains the row's own severity, while unlabelled (test) rows only see earlier
# labels. That is a possible target leak -- consider a shift(1) before rounding; confirm intent.
_running_mean = data['severity'].expanding().mean()
data['expanding_severity'] = np.round(_running_mean)
data


Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density,month,year,week,season,expanding_severity
4387,evep,44.847993,-93.476318,2013-01-04,train,midwest,1.0,115.0,1,2013,1,1,1.0
13644,paev,44.822478,-93.367962,2013-01-04,train,midwest,1.0,1884.0,1,2013,1,1,1.0
5566,gdxr,44.877646,-93.557842,2013-01-04,train,midwest,1.0,1416.0,1,2013,1,1,1.0
6144,guny,44.878889,-93.490833,2013-01-04,train,midwest,1.0,558.0,1,2013,1,1,1.0
5317,fwbt,44.850500,-93.515700,2013-01-04,train,midwest,1.0,476.0,1,2013,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.736800,-121.734000,2021-12-29,test,west,,,12,2021,52,1,2.0
17559,thki,36.725400,-121.730000,2021-12-29,test,west,,,12,2021,52,1,2.0
17452,teuu,36.772300,-121.788000,2021-12-29,test,west,,,12,2021,52,1,2.0
14254,prfi,36.751800,-121.742000,2021-12-29,test,west,,,12,2021,52,1,2.0


In [9]:
# Split the combined frame back into its labelled and unlabelled parts.
train_data = data.loc[data['split'].eq('train')]
test_data = data.loc[data['split'].eq('test')]

In [11]:
grp_by_region = data.groupby('region').severity.expanding(1).mean()
grp_by_region = grp_by_region.map(np.round)
grp_by_region.isna().sum()

4

In [10]:
# Per-region running mean of the labels seen so far, rounded to a class.
# The result is indexed by (region, original row label).
# NOTE(review): expanding() includes the current row, so a train row's own severity is part
# of its feature value -- a possible target leak; confirm whether a per-region shift(1) is wanted.
grp_by_region = data.groupby('region').severity.expanding(1).mean()
grp_by_region = grp_by_region.map(np.round)

# Rows that precede the first label of their region have no history yet; use 2 as the prior
# for 'west' and 'northeast'. This is done with a mask on the full Series instead of
# `grp_by_region['west'].fillna(2, inplace=True)`: that form modifies a temporary selection
# and is not guaranteed to write back into grp_by_region.
_region_level = grp_by_region.index.get_level_values(0)
_needs_prior = grp_by_region.isna() & _region_level.isin(['west', 'northeast'])
grp_by_region = grp_by_region.mask(_needs_prior, 2)
print(grp_by_region.isna().sum())   # 5 --> 0.89416

# Drop the region level so the values are keyed by the original row label only.
_sev_by_row = grp_by_region.droplevel(0)

# RMSE of using the feature itself as the prediction on the train rows.
# np.sqrt(MSE) is used instead of mse(..., squared=False), a keyword that newer
# scikit-learn releases no longer accept.
print(np.sqrt(mse(train_data.severity.sort_index(), _sev_by_row.loc[train_data.index].sort_index())))

# Index-aligned assignment: every row receives the value computed for its own label,
# replacing the four copy-pasted per-region mask assignments.
data['expndng_sev_by_reg'] = _sev_by_row

print(data.shape)

data.sort_index()

0
0.894165010958815
(23570, 14)


Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density,month,year,week,season,expanding_severity,expndng_sev_by_reg
0,aabm,39.080319,-86.430867,2018-05-14,train,midwest,1.0,585.0,5,2018,20,2,2.0,2.0
1,aabn,36.559700,-121.510000,2016-08-31,test,west,,,8,2016,35,3,2.0,4.0
2,aacd,35.875083,-78.878434,2020-11-19,train,south,1.0,290.0,11,2020,47,4,2.0,2.0
3,aaee,35.487000,-79.062133,2016-08-24,train,south,1.0,1614.0,8,2016,34,3,2.0,2.0
4,aaff,38.049471,-99.827001,2019-07-23,train,midwest,3.0,111825.0,7,2019,30,3,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,west,,,12,2014,49,1,2.0,4.0
23566,zzwo,39.792190,-99.971050,2017-06-19,train,midwest,2.0,48510.0,6,2017,25,3,2.0,2.0
23567,zzwq,35.794000,-79.012551,2015-03-24,train,south,1.0,1271.0,3,2015,13,2,2.0,1.0
23568,zzyb,35.742000,-79.238600,2016-11-21,train,south,1.0,9682.0,11,2016,47,4,2.0,2.0


In [26]:
grp_by_rs = data.groupby(['region', 'season']).severity.expanding(1).mean()
grp_by_rs = grp_by_rs.map(np.round)



Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            23560, 23561, 23562, 23563, 23564, 23565, 23566, 23567, 23568,
            23569],
           dtype='int64', length=23570)

In [27]:
# Running mean of the labels seen so far within each (region, season) pair, rounded to a class.
# NOTE(review): as with the other expanding features, the current row's own label is included
# in its value -- a possible target leak for train rows; confirm intent.
grp_by_rs = data.groupby(['region', 'season']).severity.expanding(1).mean()
grp_by_rs = grp_by_rs.map(np.round)
print(grp_by_rs.isna().sum()) # 5 --> .86

# Drop the two group levels so the values are keyed by the original row label, then let
# pandas align them with `data` on that label.
data['expanding_sev_rs'] = grp_by_rs.droplevel([0, 1]).sort_index()

# fillna with expanding sev by region
data['expanding_sev_rs'] = data['expanding_sev_rs'].fillna(data['expndng_sev_by_reg'])

# RMSE of using the feature itself as the prediction on the train rows. Rows are selected by
# label instead of applying a differently ordered boolean mask to a sorted Series, and
# np.sqrt(MSE) replaces mse(..., squared=False), which newer scikit-learn no longer accepts.
print(np.sqrt(mse(train_data.severity.sort_index(), data.loc[train_data.index, 'expanding_sev_rs'].sort_index())))



# #  make submission for expanding severity by region and season

# expanding_sev_rs = data[data.split == 'test'][['uid', 'expanding_sev_rs']]          # picking up only uids and expanding_sev_rs from test samples
# expanding_sev_rs.expanding_sev_rs = expanding_sev_rs.expanding_sev_rs.astype(int)   # casting to int
# expanding_sev_rs.sort_values(by='uid', inplace=True)                                # sorting by uid -- safest option
# expanding_sev_rs.reset_index(drop=True, inplace=True)                               # matching indexes with submissoin

# sub_format.severity = expanding_sev_rs.expanding_sev_rs
# sub_format.severity.value_counts()  # expected 0.8594349134502333

# sub_format.to_csv('expanding_sev_rs_preds.csv', index=False)

5
0.8594349134502333


In [45]:
all_train = data[data.split == 'train']
all_train.shape

(17060, 15)

In [46]:
# reset training index to avoid wrong submissions
all_train = all_train.sort_values('uid').reset_index(drop=True)
all_train

Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density,month,year,week,season,expanding_severity,expndng_sev_by_reg,expanding_sev_rs
0,aabm,39.080319,-86.430867,2018-05-14,train,midwest,1.0,585.0,5,2018,20,2,2.0,2.0,1.0
1,aacd,35.875083,-78.878434,2020-11-19,train,south,1.0,290.0,11,2020,47,4,2.0,2.0,2.0
2,aaee,35.487000,-79.062133,2016-08-24,train,south,1.0,1614.0,8,2016,34,3,2.0,2.0,2.0
3,aaff,38.049471,-99.827001,2019-07-23,train,midwest,3.0,111825.0,7,2019,30,3,2.0,2.0,2.0
4,aafl,39.474744,-86.898353,2021-08-23,train,midwest,4.0,2017313.0,8,2021,34,3,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17055,zzsv,38.707825,-75.080867,2018-06-27,train,south,3.0,113125.0,6,2018,26,3,2.0,2.0,2.0
17056,zzuq,35.794000,-79.015368,2015-08-06,train,south,3.0,175726.0,8,2015,32,3,2.0,1.0,2.0
17057,zzwo,39.792190,-99.971050,2017-06-19,train,midwest,2.0,48510.0,6,2017,25,3,2.0,2.0,2.0
17058,zzwq,35.794000,-79.012551,2015-03-24,train,south,1.0,1271.0,3,2015,13,2,2.0,1.0,1.0


In [47]:
# Rebuild the test frame from the raw metadata joined with the submission template,
# then derive the same calendar features as for the training data.
test_data = (
    metadata.loc[metadata['split'] == 'test']
    .merge(sub_format, on='uid')
    .assign(
        month=lambda df: df['date'].dt.month,
        year=lambda df: df['date'].dt.year,
        season=lambda df: df['month'].map(seasons),
    )
)

In [48]:
tar_cols = ['expanding_severity', 'expndng_sev_by_reg', 'expanding_sev_rs']

In [49]:
req_cols = ['region', 'month', 'year', 'season', tar_cols[2]]
req_cols

['region', 'month', 'year', 'season', 'expanding_sev_rs']

In [50]:
# Copy the expanding-severity features onto the test frame, matching rows by uid.
# The previous positional copy (`.sort_index().values`) was only correct while both frames
# happened to share the same row order; a uid lookup cannot silently misalign.
# verify_integrity makes a duplicated uid fail loudly instead of producing an ambiguous lookup.
_test_feats = (
    data.loc[data['split'] == 'test']
    .set_index('uid', verify_integrity=True)[tar_cols]
)
for _col in tar_cols:
    test_data[_col] = test_data['uid'].map(_test_feats[_col])

In [53]:
# Time-based hold-out: sort once by date and cut off the most recent rows.
_train_by_date = all_train.sort_values(by='date')
all_train_ = _train_by_date.iloc[:-1000]
all_test_ = _train_by_date.iloc[-1000:]  # last 1000 samples only for testing

In [54]:
X_ , y_ = all_train_[req_cols], all_train_['severity']
X_.shape, y_.shape

((16060, 5), (16060,))

In [70]:
Xtest_, ytest_ = all_test_[req_cols], all_test_['severity']
Xtest_.shape, ytest_.shape

((1000, 5), (1000,))

In [71]:
X_.isna().sum().sum(), y_.isna().sum()

(0, 0)

In [65]:
X_train_,X_val_, y_train_, y_val_ = train_test_split(X_, y_, test_size=0.15, random_state=config.RANDOM_SEED, stratify=y_)
X_train_.shape, y_train_.shape, X_val_.shape, y_val_.shape

((13651, 5), (13651,), (2409, 5), (2409,))

In [66]:
X_test_ = test_data[req_cols]
X_test_.shape

(6510, 5)

In [67]:
# def get_imgs(uids) :
#     imgs = []
#     for uid in uids:
#         arr = joblib.load(IMG_DIR + f'/{uid}.npy')
#         img_arr = arr[:11]
#         # img_arr = np.transpose(img_arr, (2, 1, 0))
#         # resize img
#         img_arr = cv2.resize(img_arr, config.IMG_SIZE)
#         img_arr = img_arr / 255   # normalizeee bro... other wise it's blowing up the networks...
#         imgs.append(img_arr)
#     return np.array(imgs) 


# def get_np_data(split : float = 0.2):
#     """Return np data for training and testing."""

#     print("Loading data...")
#     x_train_uids, x_test_uids, y_train, y_test = train_test_split(
#         data['uid'],
#         data.severity,
#         test_size=split,
#         random_state=config.RANDOM_SEED,
#         stratify=data.severity
#     )

#     x_train = get_imgs(x_train_uids)
#     x_test = get_imgs(x_test_uids)

#     return x_train, y_train, x_test, y_test

In [68]:
# x_train, y_train, x_test, y_test = get_np_data()
# print(y_train.value_counts(normalize=True))
# print(y_test.value_counts(normalize=True))
# print('Done')

# Preprocess

In [69]:
y = y_ - 1
y.value_counts()

0.0    7143
3.0    3415
1.0    3000
2.0    2451
4.0      51
Name: severity, dtype: int64

In [76]:
y_train = y_train_  -1
y_val = y_val_ - 1

y_train.value_counts(normalize=True), y_val.value_counts(normalize=True)

(0.0    0.444803
 3.0    0.212658
 1.0    0.186800
 2.0    0.152590
 4.0    0.003150
 Name: severity, dtype: float64,
 0.0    0.444583
 3.0    0.212536
 1.0    0.186800
 2.0    0.152760
 4.0    0.003321
 Name: severity, dtype: float64)

In [72]:
ytest = ytest_-1
ytest.value_counts(normalize=True)

0.0    0.354
2.0    0.268
1.0    0.239
3.0    0.132
4.0    0.007
Name: severity, dtype: float64

In [73]:
# from category_encoders import OrdinalEncoder as COE

enc_cols = ['year', 'region']  # NOTE(review): not used in this cell -- the encoder sees every column

# Learn the category order on the training split only, then apply it to the other sets.
oe = OrdinalEncoder().fit(X_train_)
X_train, X_test, X_val, Xtest = (
    oe.transform(frame) for frame in (X_train_, X_test_, X_val_, Xtest_)
)

In [74]:
X_train.shape, X_val.shape, X_test.shape, Xtest.shape

((13651, 5), (2409, 5), (6510, 5), (1000, 5))

# Engine Train eval

In [87]:
%%time

def train_eval(model, X_train=X_train, X_val=X_val, y_train=y_train, y_val=y_val,
               X_holdout=None, y_holdout=None):
    """
    Fit `model` and print the RMSE of its rounded predictions on train, validation and hold-out data.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    X_train, y_train : training features / targets (defaults are the notebook globals
        bound when this function is defined).
    X_val, y_val : validation features / targets.
    X_holdout, y_holdout : optional hold-out set reported as "TEST RMSE". When omitted,
        the notebook globals ``Xtest_`` / ``ytest`` are used, as before. Pass them explicitly
        when `model` needs inputs in another form (e.g. already-encoded arrays).

    Returns
    -------
    (model, preds, rmse) : the fitted model, the raw (un-rounded) validation predictions,
        and the validation RMSE computed on rounded predictions.
    REMEMBER to round the returned predictions yourself.
    """
    # Fall back to the notebook-level hold-out only when the caller supplied none.
    if X_holdout is None:
        X_holdout = Xtest_
    if y_holdout is None:
        y_holdout = ytest

    def _rounded_rmse(y_true, y_pred):
        # np.sqrt(MSE) instead of mse(..., squared=False): newer scikit-learn
        # releases no longer accept the `squared` keyword.
        return np.sqrt(mse(y_true, np.round(y_pred)))

    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = _rounded_rmse(y_val, preds)
    print("Train RMSE: ", _rounded_rmse(y_train, model.predict(X_train)))
    print("Val RMSE:", rmse)
    print('TEST RMSE: ', _rounded_rmse(y_holdout, model.predict(X_holdout)))

    return model, preds, rmse


CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 10.5 µs


In [88]:
def get_pipe(model, cols=enc_cols):
    """Wrap `model` in a two-step sklearn pipeline: a fresh OrdinalEncoder, then the model.

    NOTE(review): `cols` is accepted but never used; the encoder is applied to
    every input column.
    """
    return make_pipeline(OrdinalEncoder(), model)

In [89]:
# m, p, e = train_eval(XGBRegressor(n_estimators=500))

In [90]:
#  It's better to cv!

def cv_it(model, X_train=X_, y_train=y, X_test=X_test, splits=10, cv_predict=False, splitter=None):
    """
    Cross-validate `model` with `train_eval` and print the mean / std of the fold RMSEs.

    Parameters
    ----------
    model : estimator (or pipeline) exposing ``fit`` and ``predict``; it is refit on every fold.
    X_train, y_train : pandas objects holding features / targets; folds are taken with ``.iloc``.
    X_test : test features predicted once per fold when ``cv_predict`` is True.
    splits : number of folds for the default splitter.
    cv_predict : when True, collect and return test and out-of-fold predictions.
    splitter : optional scikit-learn CV splitter. Defaults to a shuffled, seeded
        StratifiedKFold. For date-ordered data a time-aware splitter such as
        ``TimeSeriesSplit(n_splits=...)`` can be passed instead (it expects the rows of
        ``X_train`` to be in chronological order).

    Returns
    -------
    None when ``cv_predict`` is False; otherwise ``(cvpreds_test, cvpreds_train)`` where
    ``cvpreds_test`` has shape (len(X_test), n_folds) and ``cvpreds_train`` holds the
    out-of-fold prediction of each training row (NaN for rows that never land in a
    validation fold, which can happen with splitters like TimeSeriesSplit).
    """
    if splitter is None:
        # config.RANDOM_SEED is the key that exists; the former `config.RANDOM_STATE`
        # lookup returned None from dotdict, leaving the shuffle unseeded.
        splitter = StratifiedKFold(n_splits=splits, random_state=config.RANDOM_SEED, shuffle=True)
    n_folds = splitter.get_n_splits(X_train, y_train)

    rmse_list = []

    if cv_predict:
        # Sized from the data actually passed in rather than a hard-coded row count.
        cvpreds_test = np.zeros(shape=(len(X_test), n_folds))
        cvpreds_train = np.full(len(X_train), np.nan)

    for fold, (train_idx, val_idx) in enumerate(splitter.split(X_train, y_train)):

        print(f'-----------------------Fold-{fold}-------------------------')
        X_train_subset, y_train_subset = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_val_subset, y_val_subset = X_train.iloc[val_idx], y_train.iloc[val_idx]

        model, val_preds, rmse = train_eval(model=model, X_train=X_train_subset, y_train=y_train_subset, X_val=X_val_subset, y_val=y_val_subset)
        rmse_list.append(rmse)

        if cv_predict:
            # save predictions for ensembling
            cvpreds_test[:, fold] = model.predict(X_test)
            cvpreds_train[val_idx] = val_preds

    print()
    print("Mean RMSE:", np.mean(rmse_list), "std:", np.std(rmse_list))

    if cv_predict:
        return cvpreds_test, cvpreds_train


In [84]:
# expanding_severity : Mean RMSE: 0.8326926618960704 std: 0.010792728721340388
# expndng_sev_by_reg : Mean RMSE: 0.8326043320130792 std: 0.017949794678433155
# expanding_sev_rs : Mean RMSE: 0.831528416210013 std: 0.0188192039517455

In [85]:
req_cols

['region', 'month', 'year', 'season', 'expanding_sev_rs']

In [98]:
%%time

xgb_pipe = get_pipe(XGBRegressor(n_estimators=1000, random_state=config.RANDOM_SEED))   # (0.81 to 0.85)
xgbreg_cv_test,xgbreg_cv_train = cv_it(
    xgb_pipe,
    X_train = X_train_,
    y_train = y_train,
    cv_predict=True,
    splits=10,
    X_test=X_test_
)
xgbreg_cv_test.shape, xgbreg_cv_train.shape

-----------------------Fold-0-------------------------
Train RMSE:  0.8070204776101108
Val RMSE: 0.8391936449030586
TEST RMSE:  0.8520563361656317
-----------------------Fold-1-------------------------
Train RMSE:  0.8101585208531449
Val RMSE: 0.8245322794490704
TEST RMSE:  0.9005553841935542
-----------------------Fold-2-------------------------
Train RMSE:  0.8088514072781452
Val RMSE: 0.8285209028424608
TEST RMSE:  0.8544003745317531
-----------------------Fold-3-------------------------
Train RMSE:  0.8070380626965836
Val RMSE: 0.8473185457363234
TEST RMSE:  0.8689073598491384
-----------------------Fold-4-------------------------
Train RMSE:  0.8071389109397386
Val RMSE: 0.8434188232161477
TEST RMSE:  0.867179335547152
-----------------------Fold-5-------------------------
Train RMSE:  0.8099575637569484
Val RMSE: 0.8227533512074423
TEST RMSE:  0.8882567196480982
-----------------------Fold-6-------------------------
Train RMSE:  0.8110622116202592
Val RMSE: 0.8056576500500998
TES

((6510, 10), (13651,))

In [99]:
# Clearly the validation set is leaked.
# Of course: randomly splitting time-series data lets the model train on rows that come
# later in time than the rows it is validated on.


In [48]:
test_rmse = [
     0.8520563361656317,
     0.9005553841935542,
     0.8544003745317531,
     0.8689073598491384,
     0.867179335547152,
     0.8882567196480982,
     0.8933084573650918,
     0.8988882021697693,
     0.901665126307988,
     0.9137833441248533
]

np.mean(test_rmse), np.std(test_rmse)

(0.8839000639903031, 0.02051040901440761)

In [None]:
# save cv_preds (raw/soft) hoping all the indices are right!!

# The out-of-fold predictions follow the row order of the frame given to cv_it (X_train_),
# which is a shuffled subset of all_train. Look the uids up through that frame's index;
# pairing them with every uid of all_train gives a length mismatch / wrong rows.
xgb_cv_train = pd.DataFrame({
    'uid': all_train.loc[X_train_.index, 'uid'].to_numpy(),
    'xgbreg1k': xgbreg_cv_train,
})
xgb_cv_train.to_csv('xgbreg1k_cv_train.csv', index=False)


# Test predictions are in test_data row order; average the per-fold columns.
xgb_cv_test = pd.DataFrame({
    'uid': test_data['uid'].to_numpy(),
    'xgbreg1k_cvpreds': np.mean(xgbreg_cv_test, axis=1),
})
xgb_cv_test.to_csv('xgbreg1k_cv_test.csv', index=False)

In [None]:
catreg_pipe = get_pipe(CatBoostRegressor(n_estimators=1000, verbose=1000, random_seed=config.RANDOM_SEED))
catreg_cv_test, catreg_cv_train = cv_it(catreg_pipe, cv_predict=True, splits=10, X_test=X_test_)  # (.82 to 0.83)
catreg_cv_test.shape, catreg_cv_train.shape

In [None]:
0.01434931216301096 + 0.8277672088852217

In [None]:

# cv_it was called with its default training frame (X_), so the out-of-fold predictions
# follow X_'s row order (date-sorted, without the hold-out rows). Look the uids up through
# that index; pairing them with every uid of all_train gives a length mismatch / wrong rows.
cat_cv_train = pd.DataFrame({
    'uid': all_train.loc[X_.index, 'uid'].to_numpy(),
    'catreg1k': catreg_cv_train,
})
cat_cv_train.to_csv('catreg1k_cv_train.csv', index=False)


# Test predictions are in test_data row order; average the per-fold columns.
cat_cv_test = pd.DataFrame({
    'uid': test_data['uid'].to_numpy(),
    'catreg1k_cvpreds': np.mean(catreg_cv_test, axis=1),
})
cat_cv_test.to_csv('catreg1k_cv_test.csv', index=False)

In [None]:
lgbreg_pipe = get_pipe(LGBMRegressor(n_estimators=500, random_state=config.RANDOM_SEED))
lgbreg_cv_test, lgbreg_cv_train = cv_it(lgbreg_pipe, cv_predict=True, splits=10, X_test=X_test_)   # (.82 to 0.84)
lgbreg_cv_test.shape, lgbreg_cv_train.shape

In [None]:
# save cv preds

# cv_it was called with its default training frame (X_), so the out-of-fold predictions
# follow X_'s row order (date-sorted, without the hold-out rows). Look the uids up through
# that index; pairing them with every uid of all_train gives a length mismatch / wrong rows.
lgb_cv_train = pd.DataFrame({
    'uid': all_train.loc[X_.index, 'uid'].to_numpy(),
    'lgbreg1k': lgbreg_cv_train,
})
lgb_cv_train.to_csv('lgbreg1k_cv_train.csv', index=False)


# Test predictions are in test_data row order; average the per-fold columns.
lgb_cv_test = pd.DataFrame({
    'uid': test_data['uid'].to_numpy(),
    'lgbreg1k_cvpreds': np.mean(lgbreg_cv_test, axis=1),
})
lgb_cv_test.to_csv('lgbreg1k_cv_test.csv', index=False)

In [None]:
%%time

xgb_pipe = get_pipe(XGBClassifier(n_estimators=500))
cv_it(xgb_pipe, cv_predict=False, splits=10, X_test=X_test_)

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
linreg = LinearRegression()
logreg = LogisticRegression()
cv_it(get_pipe(linreg))
cv_it(get_pipe(logreg))

In [None]:
# %%time
# def train_eval_density(model, X_train=X_train_trans, y_train=yd_train, X_val=X_val_trans, y_val=yd_val):
#     model.fit(X_train, y_train)
#     preds = model.predict(X_val)
#     hard_preds = pd.Series(preds).map(dens_to_sev)
#     print("Compe RMSE: ", mse(ys_val+1, hard_preds, squared=False))
#     print("RMSE: ", mse(y_val, preds, squared=False))
#     return None

In [None]:
# train_eval_density(XGBRegressor(n_estimators=1000, verbose=0))
# print('-------------------')
# train_eval_density(CatBoostRegressor(n_estimators=1000, verbose=1000))

# Generate predictions

In [None]:
xgbreg_cv_preds = np.round(xgbreg_cv_test.mean(axis=1)) 
xgbreg_cv_preds.shape

In [None]:
np.unique(xgbreg_cv_preds)

In [None]:
# The models were trained on classes 0-4 (severity - 1). A regressor can overshoot that
# range, so clip before shifting back to 1-5 to keep every submitted value valid.
# Predictions are in test_data row order; match them to the submission rows by uid rather
# than by position so a differing row order cannot mislabel samples. A uid missing from
# the predictions becomes NaN and makes astype(int) fail loudly.
_pred_by_uid = pd.Series(np.clip(xgbreg_cv_preds, 0, 4), index=test_data['uid'].to_numpy())
sub_format['severity'] = sub_format['uid'].map(_pred_by_uid).astype(int) + 1
sub_format

In [None]:
sub_format.severity.value_counts()

In [None]:
sub_format.to_csv('xgbreg_cvpreds_on_expanding_sev_rs_preds.csv', index=False)
# expecting (0.81 to 0.84)

# So....

- Regressing severity works better than classifying it.
> Probably because they optimize different functions: classifiers optimize log loss, whereas regressors optimize RMSE, which is similar to the task metric. But is this the same for NNs?
- Last time I felt cheated, with trees at 0.84/0.89. Why that discrepancy, though?
- This time the 10-fold CV average stands at 0.83; still doubtful, though.


DIFF MTDTA:

-  expanding_severity : Mean RMSE: 0.8326926618960704 std: 0.010792728721340388

- expndng_sev_by_reg : Mean RMSE: 0.8326043320130792 std: 0.017949794678433155

- expanding_sev_rs : Mean RMSE: 0.831528416210013 std: 0.0188192039517455

All three seem to perform the same! But can't the models simply learn an identity function?