**Mission:** *See how different ML models perform on metadata*

- expanding_sev --> 1.202 local-rmse  
- expanding_sev_by_reg --> 0.8941 local-rmse (trustworthy / 5 imputed values)
- expanding_sev_by_rs --> 0.859 local-rmse     (same as above)

So can the boosted trees get any better than these?

> Get as much as possible from the metadata (without using future data), then add more info with sat images

> NNs seem to fail at extracting tabular info from metadata!

# Imports

In [1]:
import warnings
import sys
import os
import time
import joblib
import random
from tqdm import tqdm
from pprint import pprint


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

from tick_tick_bloom_utils import my_keras_rmse, comp_metric

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor



warnings.filterwarnings('ignore')

In [2]:
# # wandb stuff for tracking
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# wandb_login = user_secrets.get_secret("wandb_bloom_tracker")

# import wandb
# wandb.login(key=wandb_login)

# Config

In [3]:
class dotdict(dict):
    """Dictionary whose keys can also be read, written and deleted as attributes.

    Reading an attribute that is not a key returns ``None`` (the behaviour of
    ``dict.get``) instead of raising, so a misspelt key fails silently.
    Deleting a missing attribute raises ``KeyError``.
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails: fall back to the key.
        return self.get(name)

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        del self[name]


# Run-level configuration, stored in a dot-accessible dict.
config = dotdict()
config.RANDOM_SEED = 18952

# Timestamp-based id so every run gets a distinct name.
config.unique_id = int(time.time())
print(f'unique_id: {config.unique_id}')
config.name = f'trees-{config.unique_id}'

config.PROJECT_NAME = 'tick-tick-bloom'
# config['DATA_DIR'] = '../data/'
# config['MODEL_DIR'] = '../models/'
config.SAVE_MODEL = True

# Image settings
config.IMG_SIZE = (136, 136)
config.CHANNELS = 3

config.desc = 'test run for kaggle ml nb setup'

unique_id: 1673459117


In [4]:
# Seed the Python and NumPy global RNGs with the configured seed.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(config.RANDOM_SEED)

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes -- confirm.
os.environ['PYTHONHASHSEED'] = str(config.RANDOM_SEED)

# Utils

In [5]:
def dens_to_sev(x: float) -> int:
    """Map a density value (cells/ml) to its severity category.

    Buckets are half-open on the right:

    ========================  ========
    density                   severity
    ========================  ========
    x < 20,000                1
    20,000 <= x < 100,000     2
    100,000 <= x < 1,000,000  3
    1,000,000 <= x < 10^7     4
    x >= 10^7                 5
    ========================  ========

    A NaN input compares False against every bound and yields ``None``.
    """
    if x < 20_000:
        return 1
    elif x < 100_000:
        return 2
    elif x < 1_000_000:
        return 3
    elif x < 10_000_000:
        return 4
    elif x >= 10_000_000:
        # `>=` (not `>`): a density of exactly 10,000,000 belongs to the top
        # bucket; with a strict comparison it matched no branch at all.
        return 5
    # Only reachable for NaN: no comparison above is True.
    return None

# Load data

In [6]:
INPUT_DIR = '/kaggle/input/ticktickbloomdataset'

# Read the three competition tables from the input directory.
metadata, sub_format, train_labels = (
    pd.read_csv(os.path.join(INPUT_DIR, csv_name))
    for csv_name in ('metadata.csv', 'submission_format.csv', 'train_labels.csv')
)

# Parse the sampling date so the `.dt` accessors used later work.
metadata['date'] = pd.to_datetime(metadata['date'])

In [7]:
IMG_DIR = "/kaggle/input/pull-landsat-data-v1-500m/landsat8_500m_v1"

# Every entry in the image directory, plus the part of each name before the first dot.
img_files = os.listdir(IMG_DIR)
img_file_names = [fname.partition('.')[0] for fname in img_files]

# # get only data for those only in dataset
# metadata_subset = metadata[metadata['uid'].isin(img_file_names)]
# data = metadata_subset[metadata_subset.split == 'train']
# data = data.merge(train_labels, on='uid')

In [8]:
# Number of entries in the image directory.
len(os.listdir(IMG_DIR))

22766

In [9]:
# Build the working frame: metadata for every sample joined with its labels / region.
metadata['date'] = pd.to_datetime(metadata['date'])

# Stack the train labels on top of the (region, uid) columns of the submission
# format, then left-join that onto the metadata by uid.
region = pd.concat([train_labels, sub_format[['region', 'uid']]], axis=0)

data = metadata.merge(region, how='left', on='uid')
print(data.shape)

(23570, 8)


In [10]:
# Month -> season code: 1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov.
seasons = {
    month: season
    for season, months in {
        1: (12, 1, 2),
        2: (3, 4, 5),
        3: (6, 7, 8),
        4: (9, 10, 11),
    }.items()
    for month in months
}

#  most of the samples are collected in the months of June, July, August.

# Calendar features derived from the sampling date.
data['month'] = data['date'].dt.month
data['year'] = data['date'].dt.year
data['week'] = data['date'].dt.isocalendar().week
# data['day_of_year'] = data.date.dt.""
data['season'] = data['month'].map(seasons)

# Put rows in chronological order; the expanding means computed next run over
# the rows in this order.
data = data.sort_values(by='date')

In [11]:
# Running mean of severity over the rows in their current order, rounded to
# the nearest whole level.
# NOTE(review): an expanding window includes the current row, so for labelled
# rows the feature contains that row's own target -- confirm this is intended.
data['expanding_severity'] = data['severity'].expanding().mean().round()
data


Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density,month,year,week,season,expanding_severity
4387,evep,44.847993,-93.476318,2013-01-04,train,midwest,1.0,115.0,1,2013,1,1,1.0
13644,paev,44.822478,-93.367962,2013-01-04,train,midwest,1.0,1884.0,1,2013,1,1,1.0
5566,gdxr,44.877646,-93.557842,2013-01-04,train,midwest,1.0,1416.0,1,2013,1,1,1.0
6144,guny,44.878889,-93.490833,2013-01-04,train,midwest,1.0,558.0,1,2013,1,1,1.0
5317,fwbt,44.850500,-93.515700,2013-01-04,train,midwest,1.0,476.0,1,2013,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.736800,-121.734000,2021-12-29,test,west,,,12,2021,52,1,2.0
17559,thki,36.725400,-121.730000,2021-12-29,test,west,,,12,2021,52,1,2.0
17452,teuu,36.772300,-121.788000,2021-12-29,test,west,,,12,2021,52,1,2.0
14254,prfi,36.751800,-121.742000,2021-12-29,test,west,,,12,2021,52,1,2.0


In [12]:
# Partition the combined frame by its `split` column.
is_train_row = data['split'] == 'train'
is_test_row = data['split'] == 'test'
train_data = data.loc[is_train_row]
test_data = data.loc[is_test_row]

In [13]:
# Expanding (running) mean of severity inside each region, over rows in their
# current order, rounded to whole levels. The result is a Series indexed by
# (region, original row index).
grp_by_region = data.groupby('region').severity.expanding(1).mean()
grp_by_region = grp_by_region.map(np.round)

# Rows with no labelled sample seen yet in their region are NaN; impute 2 for
# the two regions handled here.
# NOTE(review): this fills in place through a selection of the Series, which
# only writes back if the selection is a view -- confirm under the pandas
# version in use (copy-on-write would make it a no-op).
grp_by_region['west'].fillna(2, inplace=True)
grp_by_region['northeast'].fillna(2, inplace=True)
print(grp_by_region.isna().sum())   # author's note: 5 imputed values --> 0.89416 RMSE

# Baseline: RMSE on the train rows when this feature itself is the prediction.
# NOTE(review): `squared=False` is deprecated in recent scikit-learn -- confirm version.
print(mse(train_data.severity.sort_index(), grp_by_region.droplevel(0).loc[train_data.index].sort_index(), squared=False))

data['expndng_sev_by_reg'] = np.nan

# Boolean masks, one per region.
south = data.region == 'south'
midwest = data.region == 'midwest'
northeast = data.region == 'northeast'
west = data.region == 'west'

# Write each region's values back; the assignment aligns on the row index.
data.loc[south , 'expndng_sev_by_reg'] = grp_by_region['south']
data.loc[midwest , 'expndng_sev_by_reg'] = grp_by_region['midwest']
data.loc[northeast , 'expndng_sev_by_reg'] = grp_by_region['northeast']
data.loc[west , 'expndng_sev_by_reg'] = grp_by_region['west']

print(data.shape)
# Not the last expression of the cell, so this result is not displayed.
data.isna().sum()

data.sort_index()

0
0.894165010958815
(23570, 14)


Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density,month,year,week,season,expanding_severity,expndng_sev_by_reg
0,aabm,39.080319,-86.430867,2018-05-14,train,midwest,1.0,585.0,5,2018,20,2,2.0,2.0
1,aabn,36.559700,-121.510000,2016-08-31,test,west,,,8,2016,35,3,2.0,4.0
2,aacd,35.875083,-78.878434,2020-11-19,train,south,1.0,290.0,11,2020,47,4,2.0,2.0
3,aaee,35.487000,-79.062133,2016-08-24,train,south,1.0,1614.0,8,2016,34,3,2.0,2.0
4,aaff,38.049471,-99.827001,2019-07-23,train,midwest,3.0,111825.0,7,2019,30,3,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,west,,,12,2014,49,1,2.0,4.0
23566,zzwo,39.792190,-99.971050,2017-06-19,train,midwest,2.0,48510.0,6,2017,25,3,2.0,2.0
23567,zzwq,35.794000,-79.012551,2015-03-24,train,south,1.0,1271.0,3,2015,13,2,2.0,1.0
23568,zzyb,35.742000,-79.238600,2016-11-21,train,south,1.0,9682.0,11,2016,47,4,2.0,2.0


In [14]:
# Expanding mean of severity inside each (region, season) group, rounded to
# whole levels. The result is indexed by (region, season, original row index).
grp_by_rs = data.groupby(['region', 'season']).severity.expanding(1).mean()
grp_by_rs = grp_by_rs.map(np.round)
print(grp_by_rs.isna().sum()) # author's note: 5 missing --> ~0.86 RMSE

# Drop the two group levels so the Series aligns with `data` on the row index.
data['expanding_sev_rs'] =  grp_by_rs.droplevel(0).droplevel(0).sort_index()

# fillna with expanding sev by region
data['expanding_sev_rs'] = np.where(data.expanding_sev_rs.isna(), data.expndng_sev_by_reg, data.expanding_sev_rs)

# Baseline: RMSE on the train rows when this feature itself is the prediction.
# NOTE(review): `squared=False` is deprecated in recent scikit-learn -- confirm version.
print(mse(train_data.severity.sort_index(), data['expanding_sev_rs'].sort_index()[data.split == 'train'], squared=False))



# #  make submission for expanding severity by region and season

# expanding_sev_rs = data[data.split == 'test'][['uid', 'expanding_sev_rs']]          # picking up only uids and expanding_sev_rs from test samples
# expanding_sev_rs.expanding_sev_rs = expanding_sev_rs.expanding_sev_rs.astype(int)   # casting to int
# expanding_sev_rs.sort_values(by='uid', inplace=True)                                # sorting by uid -- safest option
# expanding_sev_rs.reset_index(drop=True, inplace=True)                               # matching indexes with submissoin

# sub_format.severity = expanding_sev_rs.expanding_sev_rs
# sub_format.severity.value_counts()  # expected 0.8594349134502333

# sub_format.to_csv('expanding_sev_rs_preds.csv', index=False)

5
0.8594349134502333


In [15]:
# Labelled rows only, now carrying all three expanding-severity features.
all_train = data.loc[data['split'] == 'train']
all_train.shape

(17060, 15)

In [18]:
# Order the training rows by uid and renumber them 0..n-1 so positional
# outputs (e.g. out-of-fold predictions) can be matched back to uids.
all_train = all_train.sort_values(by='uid', ignore_index=True)
all_train

Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density,month,year,week,season,expanding_severity,expndng_sev_by_reg,expanding_sev_rs
0,aabm,39.080319,-86.430867,2018-05-14,train,midwest,1.0,585.0,5,2018,20,2,2.0,2.0,1.0
1,aacd,35.875083,-78.878434,2020-11-19,train,south,1.0,290.0,11,2020,47,4,2.0,2.0,2.0
2,aaee,35.487000,-79.062133,2016-08-24,train,south,1.0,1614.0,8,2016,34,3,2.0,2.0,2.0
3,aaff,38.049471,-99.827001,2019-07-23,train,midwest,3.0,111825.0,7,2019,30,3,2.0,2.0,2.0
4,aafl,39.474744,-86.898353,2021-08-23,train,midwest,4.0,2017313.0,8,2021,34,3,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17055,zzsv,38.707825,-75.080867,2018-06-27,train,south,3.0,113125.0,6,2018,26,3,2.0,2.0,2.0
17056,zzuq,35.794000,-79.015368,2015-08-06,train,south,3.0,175726.0,8,2015,32,3,2.0,1.0,2.0
17057,zzwo,39.792190,-99.971050,2017-06-19,train,midwest,2.0,48510.0,6,2017,25,3,2.0,2.0,2.0
17058,zzwq,35.794000,-79.012551,2015-03-24,train,south,1.0,1271.0,3,2015,13,2,2.0,1.0,1.0


In [20]:
# Test rows of the metadata joined with the submission format, plus the same
# calendar features that were added to `data`.
test_data = (
    metadata.loc[metadata['split'] == 'test']
    .merge(sub_format, on='uid')
    .assign(
        month=lambda frame: frame['date'].dt.month,
        year=lambda frame: frame['date'].dt.year,
        season=lambda frame: frame['month'].map(seasons),
    )
)

In [21]:
# Names of the three expanding-mean severity feature columns.
tar_cols = ['expanding_severity', 'expndng_sev_by_reg', 'expanding_sev_rs']

In [22]:
# Model inputs: region and calendar columns plus one expanding feature
# (tar_cols[2], i.e. 'expanding_sev_rs').
req_cols = ['region', 'month', 'year', 'season', tar_cols[2]]
req_cols

['region', 'month', 'year', 'season', 'expanding_sev_rs']

In [23]:
# Copy the expanding features of the test rows from `data` into `test_data`.
# Values are assigned by position after sorting `data` by its row index.
# NOTE(review): this assumes `test_data` rows follow that same order -- confirm.
test_rows = data.loc[data['split'] == 'test'].sort_index()
for feature in ('expanding_severity', 'expndng_sev_by_reg', 'expanding_sev_rs'):
    test_data[feature] = test_rows[feature].values

In [26]:
# Feature matrix and target for the full training set.
X_ = all_train[req_cols]
y_ = all_train['severity']
X_.shape, y_.shape

((17060, 5), (17060,))

In [27]:
# Sanity check: total missing values in the features, and in the target.
X_.isna().sum().sum(), y_.isna().sum()

(0, 0)

In [30]:
# 80/20 hold-out split, stratified on severity and seeded from the config.
X_train_, X_val_, y_train_, y_val_ = train_test_split(
    X_,
    y_,
    test_size=0.20,
    stratify=y_,
    random_state=config.RANDOM_SEED,
)
X_train_.shape, y_train_.shape, X_val_.shape, y_val_.shape

((13648, 5), (13648,), (3412, 5), (3412,))

In [31]:
# Same feature columns, taken from the test rows.
X_test_ = test_data.loc[:, req_cols]
X_test_.shape

(6510, 5)

In [32]:
# def get_imgs(uids) :
#     imgs = []
#     for uid in uids:
#         arr = joblib.load(IMG_DIR + f'/{uid}.npy')
#         img_arr = arr[:11]
#         # img_arr = np.transpose(img_arr, (2, 1, 0))
#         # resize img
#         img_arr = cv2.resize(img_arr, config.IMG_SIZE)
#         img_arr = img_arr / 255   # normalizeee bro... other wise it's blowing up the networks...
#         imgs.append(img_arr)
#     return np.array(imgs) 


# def get_np_data(split : float = 0.2):
#     """Return np data for training and testing."""

#     print("Loading data...")
#     x_train_uids, x_test_uids, y_train, y_test = train_test_split(
#         data['uid'],
#         data.severity,
#         test_size=split,
#         random_state=config.RANDOM_SEED,
#         stratify=data.severity
#     )

#     x_train = get_imgs(x_train_uids)
#     x_test = get_imgs(x_test_uids)

#     return x_train, y_train, x_test, y_test

In [33]:
# x_train, y_train, x_test, y_test = get_np_data()
# print(y_train.value_counts(normalize=True))
# print(y_test.value_counts(normalize=True))
# print('Done')

# Preprocess

In [34]:
# Shift the target down by one so the lowest level becomes 0.
y = y_.sub(1)
y.value_counts()

0.0    7497
3.0    3547
1.0    3239
2.0    2719
4.0      58
Name: severity, dtype: int64

In [35]:
# Apply the same shift to the hold-out split and compare class proportions.
y_train, y_val = y_train_ - 1, y_val_ - 1

y_train.value_counts(normalize=True), y_val.value_counts(normalize=True)

(0.0    0.439478
 3.0    0.207943
 1.0    0.189845
 2.0    0.159364
 4.0    0.003370
 Name: severity, dtype: float64,
 0.0    0.439332
 3.0    0.207796
 1.0    0.189918
 2.0    0.159437
 4.0    0.003517
 Name: severity, dtype: float64)

In [36]:
# from category_encoders import OrdinalEncoder as COE

enc_cols = ['year', 'region']

# Fit the encoder on the training split only, then apply it to every split.
# It is fitted on the whole feature frame, so every column is encoded, not
# just `enc_cols`.
oe = OrdinalEncoder().fit(X_train_)
X_train, X_val, X_test = (
    oe.transform(part) for part in (X_train_, X_val_, X_test_)
)

In [37]:
# Shapes of the encoded train / validation / test matrices.
X_train.shape, X_val.shape, X_test.shape

((13648, 5), (3412, 5), (6510, 5))

# Engine Train eval

In [38]:
%%time

def train_eval(model, X_train=X_train, X_val=X_val, y_train=y_train, y_val=y_val):
    """Fit ``model`` and report rounded-prediction RMSE on train and validation data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : data the model is fitted on.
    X_val, y_val : held-out data used for the returned validation score.
        All four default to the notebook-level objects of the same name,
        captured when this cell is executed.

    Returns
    -------
    (model, preds, rmse)
        The fitted model, its *unrounded* predictions for ``X_val``, and the
        validation RMSE computed on ``np.round(preds)``. Callers that need hard
        labels must round ``preds`` themselves.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` flag of
    # mean_squared_error is deprecated (and later removed) in scikit-learn,
    # whereas this form works on every version and gives the same number.
    rmse = np.sqrt(mse(y_val, np.round(preds)))
    train_rmse = np.sqrt(mse(y_train, np.round(model.predict(X_train))))

    print("Train RMSE: ", train_rmse)
    print("Val RMSE:", rmse)
    return model, preds, rmse


CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.82 µs


In [39]:
def get_pipe(model, cols=enc_cols):
    """Wrap ``model`` in a two-step pipeline: ordinal encoder, then the model.

    A fresh ``OrdinalEncoder`` is created per call and applied to every input
    column. NOTE(review): ``cols`` is accepted but never used -- confirm whether
    the encoder was meant to be restricted to those columns.
    """
    return make_pipeline(OrdinalEncoder(), model)

In [40]:
# m, p, e = train_eval(XGBRegressor(n_estimators=500))

In [41]:
#  It's better to cv!

def cv_it(model, X_train=X_, y_train=y, X_test=X_test, splits=10, cv_predict=False):
    """Run stratified K-fold cross-validation of ``model`` via ``train_eval``.

    Parameters
    ----------
    model : estimator or pipeline; it is refitted on every fold.
    X_train, y_train : pandas objects (rows are selected with ``.iloc``).
    X_test : data predicted by each fold's model; only used when ``cv_predict``.
    splits : number of folds.
    cv_predict : when True, collect and return predictions for ensembling.

    Returns
    -------
    ``None`` when ``cv_predict`` is False. Otherwise ``(cvpreds_test,
    cvpreds_train)``: an array of shape ``(len(X_test), splits)`` with one
    column of unrounded test predictions per fold, and an array of length
    ``len(X_train)`` holding the unrounded out-of-fold predictions.

    Per-fold and mean/std RMSE are printed.
    """
    # The seed is stored under RANDOM_SEED. `config.RANDOM_STATE` was never set,
    # and the dot-dict returns None for unknown keys, so the shuffle used to be
    # unseeded and the folds changed from run to run.
    skf = StratifiedKFold(n_splits=splits, random_state=config.RANDOM_SEED, shuffle=True)

    rmse_list = []

    if cv_predict:
        # Sized from the data actually passed in rather than a fixed row count.
        cvpreds_test = np.zeros(shape=(len(X_test), splits))
        cvpreds_train = np.zeros(shape=(len(X_train)))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):

        print(f'-----------------------Fold-{fold}-------------------------')
        X_train_subset, y_train_subset = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_val_subset, y_val_subset = X_train.iloc[val_idx], y_train.iloc[val_idx]

        model, val_preds, rmse = train_eval(
            model=model,
            X_train=X_train_subset,
            y_train=y_train_subset,
            X_val=X_val_subset,
            y_val=y_val_subset,
        )
        rmse_list.append(rmse)

        if cv_predict:
            # save predictions for ensembling
            cvpreds_test[:, fold] = model.predict(X_test)
            cvpreds_train[val_idx] = val_preds

    print()
    print("Mean RMSE:", np.mean(rmse_list), "std:", np.std(rmse_list))

    if cv_predict:
        return cvpreds_test, cvpreds_train


In [42]:
# expanding_severity : Mean RMSE: 0.8326926618960704 std: 0.010792728721340388
# expndng_sev_by_reg : Mean RMSE: 0.8326043320130792 std: 0.017949794678433155
# expanding_sev_rs : Mean RMSE: 0.831528416210013 std: 0.0188192039517455

In [43]:
# Reminder of the feature columns used by the runs below.
req_cols

['region', 'month', 'year', 'season', 'expanding_sev_rs']

In [45]:
%%time

# 10-fold CV of a 1000-tree XGBoost regressor, keeping per-fold predictions.
# Author's note on the score: (0.81 to 0.85)
xgb_pipe = get_pipe(
    XGBRegressor(random_state=config.RANDOM_SEED, n_estimators=1000)
)
xgbreg_cv_test, xgbreg_cv_train = cv_it(
    xgb_pipe,
    X_test=X_test_,
    splits=10,
    cv_predict=True,
)
xgbreg_cv_test.shape, xgbreg_cv_train.shape

-----------------------Fold-0-------------------------
Train RMSE:  0.8129792501470918
Val RMSE: 0.8084402361800127
-----------------------Fold-1-------------------------
Train RMSE:  0.8103312329208492
Val RMSE: 0.8284931730958041
-----------------------Fold-2-------------------------
Train RMSE:  0.8096879851648457
Val RMSE: 0.8330792891393748
-----------------------Fold-3-------------------------
Train RMSE:  0.8083999541482391
Val RMSE: 0.8497982658338993
-----------------------Fold-4-------------------------
Train RMSE:  0.8102910448962791
Val RMSE: 0.8242371868649339
-----------------------Fold-5-------------------------
Train RMSE:  0.8096879851648457
Val RMSE: 0.8327274076921499
-----------------------Fold-6-------------------------
Train RMSE:  0.8090442259812447
Val RMSE: 0.83938793086151
-----------------------Fold-7-------------------------
Train RMSE:  0.8094064035235137
Val RMSE: 0.8407834229818512
-----------------------Fold-8-------------------------
Train RMSE:  0.8108

((6510, 10), (17060,))

In [48]:
# Scratch arithmetic: difference of two hand-copied numbers
# (presumably a mean CV RMSE minus its std -- TODO confirm their source).
0.8295434138937197 - 0.011592887474493057

0.8179505264192266

In [64]:
# Persist the raw (unrounded) XGB predictions for later ensembling.

# Out-of-fold predictions, paired by position with the uids of `all_train`.
xgb_cv_train = pd.DataFrame({'uid': all_train.uid, 'xgbreg1k': xgbreg_cv_train})
xgb_cv_train.to_csv('xgbreg1k_cv_train.csv', index=False)

# Test predictions averaged across folds, paired with the uids of `test_data`.
xgb_cv_test = pd.DataFrame(
    {'uid': test_data.uid, 'xgbreg1k_cvpreds': np.mean(xgbreg_cv_test, axis=1)}
)
xgb_cv_test.to_csv('xgbreg1k_cv_test.csv', index=False)

In [65]:
# 10-fold CV of a 1000-iteration CatBoost regressor, keeping per-fold predictions.
# Author's note on the score: (.82 to 0.83)
catreg_pipe = get_pipe(
    CatBoostRegressor(random_seed=config.RANDOM_SEED, n_estimators=1000, verbose=1000)
)
catreg_cv_test, catreg_cv_train = cv_it(
    catreg_pipe,
    X_test=X_test_,
    splits=10,
    cv_predict=True,
)
catreg_cv_test.shape, catreg_cv_train.shape

-----------------------Fold-0-------------------------
Learning rate set to 0.063038
0:	learn: 1.1585140	total: 56.8ms	remaining: 56.7s
999:	learn: 0.7541295	total: 2.16s	remaining: 0us
Train RMSE:  0.8143400197945571
Val RMSE: 0.8048067785263842
-----------------------Fold-1-------------------------
Learning rate set to 0.063038
0:	learn: 1.1585479	total: 2.33ms	remaining: 2.32s
999:	learn: 0.7526989	total: 2.18s	remaining: 0us
Train RMSE:  0.8139400295843681
Val RMSE: 0.806625553226778
-----------------------Fold-2-------------------------
Learning rate set to 0.063038
0:	learn: 1.1583104	total: 2.16ms	remaining: 2.16s
999:	learn: 0.7516737	total: 2.1s	remaining: 0us
Train RMSE:  0.8115760746181075
Val RMSE: 0.8292003803719037
-----------------------Fold-3-------------------------
Learning rate set to 0.063038
0:	learn: 1.1581960	total: 2.25ms	remaining: 2.25s
999:	learn: 0.7507795	total: 2.26s	remaining: 0us
Train RMSE:  0.8104116029912597
Val RMSE: 0.8459960173843882
--------------

((6510, 10), (17060,))

In [84]:
# Scratch arithmetic: sum of two hand-copied numbers
# (presumably a CV RMSE std plus its mean -- TODO confirm their source).
0.01434931216301096 + 0.8277672088852217

0.8421165210482326

In [77]:

# Persist the raw (unrounded) CatBoost predictions for later ensembling.

# Out-of-fold predictions, paired by position with the uids of `all_train`.
cat_cv_train = pd.DataFrame({'uid': all_train.uid, 'catreg1k': catreg_cv_train})
cat_cv_train.to_csv('catreg1k_cv_train.csv', index=False)

# Test predictions averaged across folds, paired with the uids of `test_data`.
cat_cv_test = pd.DataFrame(
    {'uid': test_data.uid, 'catreg1k_cvpreds': np.mean(catreg_cv_test, axis=1)}
)
cat_cv_test.to_csv('catreg1k_cv_test.csv', index=False)

In [67]:
# 10-fold CV of a 500-tree LightGBM regressor, keeping per-fold predictions.
# Author's note on the score: (.82 to 0.84)
lgbreg_pipe = get_pipe(
    LGBMRegressor(random_state=config.RANDOM_SEED, n_estimators=500)
)
lgbreg_cv_test, lgbreg_cv_train = cv_it(
    lgbreg_pipe,
    X_test=X_test_,
    splits=10,
    cv_predict=True,
)
lgbreg_cv_test.shape, lgbreg_cv_train.shape

-----------------------Fold-0-------------------------
Train RMSE:  0.8095673193110924
Val RMSE: 0.8404347671829151
-----------------------Fold-1-------------------------
Train RMSE:  0.8105321431544972
Val RMSE: 0.8421766027658272
-----------------------Fold-2-------------------------
Train RMSE:  0.8126988088549715
Val RMSE: 0.8156585957987273
-----------------------Fold-3-------------------------
Train RMSE:  0.8088026841857492
Val RMSE: 0.8546130254168797
-----------------------Fold-4-------------------------
Train RMSE:  0.8120173339157567
Val RMSE: 0.8160178376552988
-----------------------Fold-5-------------------------
Train RMSE:  0.8143000296150552
Val RMSE: 0.7996922033908588
-----------------------Fold-6-------------------------
Train RMSE:  0.8125785901010008
Val RMSE: 0.8203164688258816
-----------------------Fold-7-------------------------
Train RMSE:  0.8113352865770215
Val RMSE: 0.8327274076921499
-----------------------Fold-8-------------------------
Train RMSE:  0.80

((6510, 10), (17060,))

In [76]:
# Persist the raw (unrounded) LightGBM predictions for later ensembling.
# NOTE(review): names say "1k" although the model above uses n_estimators=500.

# Out-of-fold predictions, paired by position with the uids of `all_train`.
lgb_cv_train = pd.DataFrame({'uid': all_train.uid, 'lgbreg1k': lgbreg_cv_train})
lgb_cv_train.to_csv('lgbreg1k_cv_train.csv', index=False)

# Test predictions averaged across folds, paired with the uids of `test_data`.
lgb_cv_test = pd.DataFrame(
    {'uid': test_data.uid, 'lgbreg1k_cvpreds': np.mean(lgbreg_cv_test, axis=1)}
)
lgb_cv_test.to_csv('lgbreg1k_cv_test.csv', index=False)

In [337]:
%%time

# Same CV protocol with a classifier instead of a regressor (scores only).
# Note: this rebinds `xgb_pipe`, which previously held the regressor pipeline.
xgb_pipe = get_pipe(XGBClassifier(n_estimators=500))
cv_it(
    xgb_pipe,
    X_test=X_test_,
    splits=10,
    cv_predict=False,
)

-----------------------Fold-0-------------------------
Train RMSE:  0.9375094980197083
Val RMSE: 0.92604619079152
-----------------------Fold-1-------------------------
Train RMSE:  0.9392099923154746
Val RMSE: 0.9651002657032545
-----------------------Fold-2-------------------------
Train RMSE:  0.9406997304165573
Val RMSE: 0.9507200948283536
-----------------------Fold-3-------------------------
Train RMSE:  0.941806843391582
Val RMSE: 0.9314109605814221
-----------------------Fold-4-------------------------
Train RMSE:  0.9447068420015152
Val RMSE: 0.9547192586499162
-----------------------Fold-5-------------------------
Train RMSE:  0.9392099923154746
Val RMSE: 0.9705511420546027
-----------------------Fold-6-------------------------
Train RMSE:  0.9369188094720533
Val RMSE: 0.9726626777266816
-----------------------Fold-7-------------------------
Train RMSE:  0.9396606273374397
Val RMSE: 0.9638847737024281
-----------------------Fold-8-------------------------
Train RMSE:  0.94325

In [338]:
from sklearn.linear_model import LinearRegression, LogisticRegression

# Linear baselines under the same CV protocol, with default hyper-parameters.
linreg, logreg = LinearRegression(), LogisticRegression()
for baseline in (linreg, logreg):
    cv_it(get_pipe(baseline))

-----------------------Fold-0-------------------------
Train RMSE:  0.8736440208755917
Val RMSE: 0.863144011735086
-----------------------Fold-1-------------------------
Train RMSE:  0.8736812947634801
Val RMSE: 0.8661945989416229
-----------------------Fold-2-------------------------
Train RMSE:  0.8734203441437306
Val RMSE: 0.868559870437602
-----------------------Fold-3-------------------------
Train RMSE:  0.8733830591190974
Val RMSE: 0.866871049336748
-----------------------Fold-4-------------------------
Train RMSE:  0.8708439324113645
Val RMSE: 0.8915389700135127
-----------------------Fold-5-------------------------
Train RMSE:  0.8731593155368121
Val RMSE: 0.8709187182702693
-----------------------Fold-6-------------------------
Train RMSE:  0.8739421674659196
Val RMSE: 0.8617847289834202
-----------------------Fold-7-------------------------
Train RMSE:  0.8727489699917136
Val RMSE: 0.8746126256907026
-----------------------Fold-8-------------------------
Train RMSE:  0.87200

In [None]:
# %%time
# def train_eval_density(model, X_train=X_train_trans, y_train=yd_train, X_val=X_val_trans, y_val=yd_val):
#     model.fit(X_train, y_train)
#     preds = model.predict(X_val)
#     hard_preds = pd.Series(preds).map(dens_to_sev)
#     print("Compe RMSE: ", mse(ys_val+1, hard_preds, squared=False))
#     print("RMSE: ", mse(y_val, preds, squared=False))
#     return None

In [None]:
# train_eval_density(XGBRegressor(n_estimators=1000, verbose=0))
# print('-------------------')
# train_eval_density(CatBoostRegressor(n_estimators=1000, verbose=1000))

# Generate predictions

In [85]:
# Average the per-fold test predictions, then round to the nearest level.
# NOTE(review): the rounded values are not clipped to the 0..4 range the
# model was trained on -- confirm whether that is needed.
xgbreg_cv_preds = np.mean(xgbreg_cv_test, axis=1).round()
xgbreg_cv_preds.shape

(6510,)

In [86]:
# Distinct predicted levels (still on the zero-based scale).
np.unique(xgbreg_cv_preds)

array([0., 1., 2., 3., 4.])

In [87]:
# Write the predictions into the submission frame.
# The predictions are in `test_data` row order (X_test_ is a column slice of
# it), so they are matched to the submission rows by uid rather than by
# position. That gives the same result when both frames share an order and
# stays correct when they do not. A uid with no prediction becomes NaN and
# makes the int cast fail loudly.
pred_by_uid = pd.Series(xgbreg_cv_preds, index=test_data['uid'].values)

# +1 converts from the zero-based training target back to severity levels.
sub_format['severity'] = sub_format['uid'].map(pred_by_uid).astype(int) + 1
sub_format

Unnamed: 0,uid,region,severity
0,aabn,west,4
1,aair,west,4
2,aajw,northeast,1
3,aalr,midwest,2
4,aalw,west,4
...,...,...,...
6505,zzpn,northeast,2
6506,zzrv,west,4
6507,zzsx,south,2
6508,zzvv,west,4


In [88]:
# Distribution of predicted severity levels in the submission.
sub_format.severity.value_counts()

2    2672
4    2029
1     987
3     820
5       2
Name: severity, dtype: int64

In [89]:
# Write the submission file, without the index column.
sub_format.to_csv('xgbreg_cvpreds_on_expanding_sev_rs_preds.csv', index=False)
# expecting (0.81 to 0.84)

# So....

- regressing severity is better than classifying it. 
> Probably because they optimize different objectives: classifiers optimize log loss, whereas regressors optimize RMSE, which is similar to the task. But is this the same for NNs?
- Last time I felt cheated, with trees at 0.84/0.89 — but why that discrepancy?
- So this time the 10-fold CV average stands at 0.83 — still doubtful, though!


DIFF MTDTA:

-  expanding_severity : Mean RMSE: 0.8326926618960704 std: 0.010792728721340388

- expndng_sev_by_reg : Mean RMSE: 0.8326043320130792 std: 0.017949794678433155

- expanding_sev_rs : Mean RMSE: 0.831528416210013 std: 0.0188192039517455

All seem to be about the same! But can't they learn an identity function?