**Mission :** *Setup ML framework to train and generate submissions*

# Imports

In [2]:
import warnings
import sys
import os
import time
import joblib
import random
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold


from tick_tick_bloom_utils import my_keras_rmse, comp_metric

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor



warnings.filterwarnings('ignore')

In [3]:
# # wandb stuff for tracking
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# wandb_login = user_secrets.get_secret("wandb_bloom_tracker")

# import wandb
# wandb.login(key=wandb_login)

# Config

In [4]:
class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


# Config
config = {}
config = dotdict(config)
config['RANDOM_SEED'] = 18952


config['unique_id'] = int(time.time())
print(f'unique_id: {config.unique_id}')
config['name'] = f'trees-{config.unique_id}'   

config['PROJECT_NAME'] = 'tick-tick-bloom'
# config['DATA_DIR'] = '../data/'
# config['MODEL_DIR'] = '../models/'
config['SAVE_MODEL'] = True


# Img config
config['IMG_SIZE'] = (136, 136)
config['CHANNELS'] = 3


config['desc'] = 'test run for kaggle ml nb setup'

unique_id: 1673175409


In [5]:
# seed everything
random.seed(config.RANDOM_SEED)
np.random.seed(config.RANDOM_SEED)
os.environ['PYTHONHASHSEED'] = str(config.RANDOM_SEED)

# Load data

In [6]:
INPUT_DIR = '/kaggle/input/ticktickbloomdataset'

metadata = pd.read_csv(os.path.join(INPUT_DIR, 'metadata.csv'))
sub_format = pd.read_csv(os.path.join(INPUT_DIR, 'submission_format.csv'))
train_labels = pd.read_csv(os.path.join(INPUT_DIR, 'train_labels.csv'))


metadata.date = pd.to_datetime(metadata.date)

In [7]:
IMG_DIR = "/kaggle/input/pull-landsat-data-v1-500m/landsat8_500m_v1"

img_files = os.listdir(IMG_DIR)
img_file_names = [f.split('.')[0] for f in img_files]

# get only data for those only in dataset
metadata_subset = metadata[metadata['uid'].isin(img_file_names)]
data = metadata_subset[metadata_subset.split == 'train']
data = data.merge(train_labels, on='uid')

In [8]:
len(os.listdir(IMG_DIR))

22306

In [9]:
data

Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density
0,aabm,39.080319,-86.430867,2018-05-14,train,midwest,1,585.0
1,aacd,35.875083,-78.878434,2020-11-19,train,south,1,290.0
2,aaee,35.487000,-79.062133,2016-08-24,train,south,1,1614.0
3,aaff,38.049471,-99.827001,2019-07-23,train,midwest,3,111825.0
4,aafl,39.474744,-86.898353,2021-08-23,train,midwest,4,2017313.0
...,...,...,...,...,...,...,...,...
16116,zzrl,36.170000,-79.052197,2018-06-28,train,south,1,4293.0
16117,zzsv,38.707825,-75.080867,2018-06-27,train,south,3,113125.0
16118,zzuq,35.794000,-79.015368,2015-08-06,train,south,3,175726.0
16119,zzwq,35.794000,-79.012551,2015-03-24,train,south,1,1271.0


In [10]:
def get_imgs(uids) :
    imgs = []
    for uid in uids:
        arr = joblib.load(IMG_DIR + f'/{uid}.npy')
        img_arr = arr[:11]
        # img_arr = np.transpose(img_arr, (2, 1, 0))
        # resize img
        img_arr = cv2.resize(img_arr, config.IMG_SIZE)
        img_arr = img_arr / 255   # normalizeee bro... other wise it's blowing up the networks...
        imgs.append(img_arr)
    return np.array(imgs) 


def get_np_data(split : float = 0.2):
    """Return np data for training and testing."""

    print("Loading data...")
    x_train_uids, x_test_uids, y_train, y_test = train_test_split(
        data['uid'],
        data.severity,
        test_size=split,
        random_state=config.RANDOM_SEED,
        stratify=data.severity
    )

    x_train = get_imgs(x_train_uids)
    x_test = get_imgs(x_test_uids)

    return x_train, y_train, x_test, y_test

In [11]:
# x_train, y_train, x_test, y_test = get_np_data()
# print(y_train.value_counts(normalize=True))
# print(y_test.value_counts(normalize=True))
# print('Done')

In [55]:
# only use metadata for now!
train_data = metadata[metadata.split == 'train']
train  = pd.merge(train_data, train_labels, on='uid')
train.severity = train.severity-1
train

Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density
0,aabm,39.080319,-86.430867,2018-05-14,train,midwest,0,585.0
1,aacd,35.875083,-78.878434,2020-11-19,train,south,0,290.0
2,aaee,35.487000,-79.062133,2016-08-24,train,south,0,1614.0
3,aaff,38.049471,-99.827001,2019-07-23,train,midwest,2,111825.0
4,aafl,39.474744,-86.898353,2021-08-23,train,midwest,3,2017313.0
...,...,...,...,...,...,...,...,...
17055,zzsv,38.707825,-75.080867,2018-06-27,train,south,2,113125.0
17056,zzuq,35.794000,-79.015368,2015-08-06,train,south,2,175726.0
17057,zzwo,39.792190,-99.971050,2017-06-19,train,midwest,1,48510.0
17058,zzwq,35.794000,-79.012551,2015-03-24,train,south,0,1271.0


In [56]:
seasons = {
    1: 1,
    2: 1,
    3: 2,
    4: 2,
    5: 2,
    6: 3,
    7: 3,
    8: 3,
    9: 4,
    10: 4,
    11: 4,
    12: 1
}

In [57]:
train['month'] = train.date.dt.month
train['year'] = train.date.dt.year
train['season'] = train.month.map(seasons)

In [58]:
# lets leave last year completely to avoid data leak

my_test = train[train.year == 2021]
train.drop(my_test.index, inplace=True)
train.shape, my_test.shape

((15462, 11), (1598, 11))

In [59]:
test_data = metadata[metadata.split == 'test']
test_data = pd.merge(test_data, sub_format, on='uid')
test_data['month'] = test_data.date.dt.month
test_data['year'] = test_data.date.dt.year
test_data['season'] = test_data.month.map(seasons)

In [60]:
X, y = train[['region', 'month', 'year', 'season']], train[['severity', 'density']]
X.shape, y.shape

((15462, 4), (15462, 2))

In [61]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=config.RANDOM_SEED)
X_train.shape, X_val.shape,

((11596, 4), (3866, 4))

In [62]:
ys_train = y_train['severity']
ys_val = y_val['severity'] 

yd_train = y_train['density']
yd_val = y_val['density'] 

In [63]:
X_mytest, y_mytest = my_test[['region', 'month', 'year', 'season']], my_test[['severity', 'density']] 
X_mytest.shape, y_mytest.shape

((1598, 4), (1598, 2))

In [64]:
from category_encoders import TargetEncoder
te = TargetEncoder(cols=['region', 'month', 'year', 'season'])

te.fit(X_train, ys_train)
X_train_trans = te.transform(X_train)
X_val_trans = te.transform(X_val)
X_mytest_trans = te.transform(X_mytest)

In [65]:
X_test = test_data[['region', 'month', 'year', 'season']]
X_test_trans = te.transform(X_test)
X_test_trans

Unnamed: 0,region,month,year,season
0,2.730225,1.029610,1.040975,1.060384
1,2.730225,1.193900,1.310193,1.150000
2,0.766839,1.029610,1.084379,1.060384
3,1.155232,1.029610,1.225997,1.060384
4,2.730225,1.459794,1.077724,1.375000
...,...,...,...,...
6505,0.766839,1.181660,1.225997,1.060384
6506,2.730225,1.179574,1.225997,1.150000
6507,0.566529,1.181660,1.225997,1.060384
6508,2.730225,1.154639,1.310193,1.375000


# Preprocess

In [132]:
22766+544

23310

# Engine Train eval

In [47]:
def dens_to_sev(x: float)-> int:
    """takes in density value in cells/ml and returns severity category"""
    if (x < 20_000) : return 1
    elif (x >= 20_000) and (x < 100_000) : return 2
    elif (x >= 100_000) and (x < 1_000_000) : return 3
    elif (x >= 1_000_000) and (x < 10_000_000) : return 4
    elif x > 10_000_000 : return 5

In [104]:
%%time

def train_eval(model, model_type='reg', X_train=X_train_trans, X_val=X_val_trans, y_train=ys_train, y_val=ys_val):
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    if model_type:
        rmse = mse(y_val, np.round(preds), squared=False)
        print("val RMSE:", rmse)
        print("TEST RMSE:", mse(y_mytest.severity, np.round(model.predict(X_mytest_trans)), squared=False))
        return model, preds, rmse

    

CPU times: user 7 µs, sys: 1 µs, total: 8 µs
Wall time: 11.7 µs


In [105]:
%%time
xgb_reg = XGBRegressor(n_estimators=500)
xgb_reg, xgb_reg_preds, _ = train_eval(xgb_reg)

val RMSE: 0.8473811639639326
TEST RMSE: 0.7902725247211632
CPU times: user 6.73 s, sys: 60 ms, total: 6.79 s
Wall time: 1.71 s


In [97]:
mse(y_mytest.severity, np.round(xgb_reg.predict(X_mytest_trans)), squared=False)

0.7902725247211632

In [98]:
np.round(xgb_reg.predict(X_mytest_trans)).max(), np.round(xgb_reg.predict(X_mytest_trans)).min()

(3.0, 0.0)

In [106]:
%%time
xgb_clf = XGBClassifier(n_estimators=100)
xgb_clf, xgb_clf_preds, _ = train_eval(xgb_clf, 'clf')

val RMSE: 0.9672602253310643
TEST RMSE: 0.8512671847162431
CPU times: user 7.95 s, sys: 111 ms, total: 8.06 s
Wall time: 2.03 s


In [107]:
xgb_clf_preds.max()

3

In [108]:
lgb_reg, lgb_reg_preds, _ = train_eval(LGBMRegressor(n_estimators=2000))
lgb_clf, lgb_clf_preds, _ = train_eval(LGBMClassifier(n_estimators=2000), 'clf')

val RMSE: 0.8472285239259539
TEST RMSE: 0.7886872221724037
val RMSE: 0.9665914426850204
TEST RMSE: 0.8512671847162431


In [109]:
cat_reg, cat_reg_preds, _ = train_eval(CatBoostRegressor(n_estimators=2000, verbose=1000))
cat_clf, cat_clf_preds, _ = train_eval(CatBoostClassifier(n_estimators=2000, verbose=1000), 'clf')

Learning rate set to 0.034324
0:	learn: 1.1840464	total: 2.61ms	remaining: 5.21s
1000:	learn: 0.7610108	total: 1.9s	remaining: 1.9s
1999:	learn: 0.7593144	total: 3.82s	remaining: 0us
val RMSE: 0.8450886685633006
TEST RMSE: 0.7815135663044483
Learning rate set to 0.049915
0:	learn: 1.5398243	total: 6.22ms	remaining: 12.4s
1000:	learn: 0.8384896	total: 4.58s	remaining: 4.57s
1999:	learn: 0.8334819	total: 9.24s	remaining: 0us
val RMSE: 0.9665914426850204
TEST RMSE: 0.8512671847162431


In [110]:
rf_reg, rf_reg_preds, _ = train_eval(RandomForestRegressor(n_estimators=2000))
rf_clf, rf_clf_preds, _ = train_eval(RandomForestClassifier(n_estimators=2000), 'clf')

val RMSE: 0.8472285239259539
TEST RMSE: 0.7902725247211632
val RMSE: 0.9712632506797811
TEST RMSE: 0.8629489272626913


In [126]:
%%time
def train_eval_density(model, X_train=X_train_trans, y_train=yd_train, X_val=X_val_trans, y_val=yd_val):
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    hard_preds = pd.Series(preds).map(dens_to_sev)
    print("Compe RMSE: ", mse(ys_val+1, hard_preds, squared=False))
    print("RMSE: ", mse(y_val, preds, squared=False))
    return None

CPU times: user 7 µs, sys: 1 µs, total: 8 µs
Wall time: 12.2 µs


In [127]:
train_eval_density(XGBRegressor(n_estimators=1000, verbose=0))
print('-------------------')
train_eval_density(CatBoostRegressor(n_estimators=1000, verbose=1000))

Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Compe RMSE:  1.1205183192045756
RMSE:  3481549.4073781334
-------------------
Learning rate set to 0.060303
0:	learn: 8057831.1141807	total: 3.6ms	remaining: 3.59s
999:	learn: 7618120.0822832	total: 1.95s	remaining: 0us
Compe RMSE:  1.163885779893981
RMSE:  3432895.860905185


In [114]:
# why my_test is easy?? 

train.year.isin([2021]).sum()

0

In [128]:
# how does linear algorithm fares?

from sklearn.linear_model import LinearRegression, LogisticRegression

lr = LinearRegression()
train_eval(lr)
train_eval_density(lr)

val RMSE: 0.8883624209738096
TEST RMSE: 0.8167520154970682
Compe RMSE:  1.4533618732474771
RMSE:  2104962.6374228154


In [129]:
log_reg = LogisticRegression()
train_eval(log_reg)

val RMSE: 1.0567204192644475
TEST RMSE: 1.0503590184239295


(LogisticRegression(), array([0, 0, 3, ..., 3, 0, 0]), 1.0567204192644475)

# Generate predictions

In [115]:
xgb_val = np.round(xgb_reg.predict(X_val_trans))
lgb_val = np.round(lgb_reg.predict(X_val_trans))
cat_val = np.round(cat_reg.predict(X_val_trans))
rf_val = np.round(rf_reg.predict(X_val_trans))

In [117]:
mse(y_val.severity, np.round(np.mean([xgb_val, lgb_val, cat_val, rf_val], axis=0)), squared=False)
# expected...

0.8473811639639326

In [119]:
xgb = np.round(xgb_reg.predict(X_test_trans))
lgb = np.round(xgb_reg.predict(X_test_trans))
cat = np.round(xgb_reg.predict(X_test_trans))
rf = np.round(xgb_reg.predict(X_test_trans))

In [122]:
sub_format.severity = np.round(np.mean([xgb, lgb, cat, rf], axis=0))
sub_format.severity = sub_format.severity.astype(int) + 1
sub_format

Unnamed: 0,uid,region,severity
0,aabn,west,4
1,aair,west,4
2,aajw,northeast,2
3,aalr,midwest,2
4,aalw,west,4
...,...,...,...
6505,zzpn,northeast,2
6506,zzrv,west,4
6507,zzsx,south,2
6508,zzvv,west,4


In [123]:
sub_format.describe()

Unnamed: 0,severity
count,6510.0
mean,2.584946
std,1.079319
min,1.0
25%,2.0
50%,2.0
75%,4.0
max,4.0


In [130]:
sub_format.to_csv('xgbrflgbcat_metadata_preds_corrected.csv', index=False)
# 0.8473811639639326

# So....

- regressing severity is better than classifying it.
- Ensemeble tree regressors at 0.8412016191892476 rmse in classifying severity
- Same classifeires stand at 0.9561303408034513 in severity
- Modelling density gives 1.1205183192045756