# Main ...

 
``config, load, preprocess, train, eval  for  Tick tick bloom``

**Yo DON'T rerun this unless you want to overwrite past models, always fork and do your stuff and 
DON'T forget to change the name**

**``Mission: NNs on expanding avg of severity metadata``**

- wondering how nns perform on metadata!

# Load imports and dependencies

In [3]:
import warnings
import sys
import os
import time
import joblib
import random
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder

import tensorflow as tf
from tensorflow.keras import layers, activations, losses, metrics, models, optimizers, callbacks
from category_encoders.target_encoder import TargetEncoder

warnings.filterwarnings('ignore')

In [4]:
# local utilities imports
from tick_tick_bloom_utils import comp_metric, den2sev_map

In [5]:
# wandb stuff for tracking
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_login = user_secrets.get_secret("wandb_bloom_tracker")

import wandb
wandb.login(key=wandb_login)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Config

In [8]:
# seed everything
def seed_everything(seed=config.RANDOM_SEED):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1' 
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['PYTHONHASHSEED'] = str(seed)

# tf.keras.utils.set_random_seed(config.RANDOM_SEED)  # supposedly sets seed for python, numpy, tf

seed_everything()

# Utils

In [49]:
def keras_rmse_clf(y_true, y_pred):
    """
    valid competetion metric for clf type settings.
    Can be trusted!
    y_true and y_pred should be [0-4]
    """
    y_pred = tf.argmax(y_pred, axis=1)
    y_pred = tf.cast(y_pred, tf.float16)
    y_true = tf.cast(y_true, tf.float16)
    squared_difference = tf.square(y_true - y_pred)
    return tf.sqrt(tf.reduce_mean(squared_difference, axis=-1))

def keras_rmse_reg(y_true, y_pred):
    """
    valid competetion metric for reg type settings.
    Can be trusted!
    y_true and y_pred should be [0-4]
    """
    y_pred = tf.math.round(y_pred)
    y_pred = tf.cast(y_pred, tf.float16)
    y_true = tf.cast(y_true, tf.float16)
    squared_difference = tf.square(y_true - y_pred)
    return tf.sqrt(tf.reduce_mean(squared_difference, axis=-1))


def rmse_loss(y_true, y_pred):
    """loss func to use in reg type settings"""
    return tf.sqrt(losses.mean_squared_error(y_true, y_pred))

# Load data

In [10]:
INPUT_METADATA_DIR = '/kaggle/input/ticktickbloomdataset'

metadata = pd.read_csv(os.path.join(INPUT_METADATA_DIR, 'metadata.csv'))
sub_format = pd.read_csv(os.path.join(INPUT_METADATA_DIR, 'submission_format.csv'))
train_labels = pd.read_csv(os.path.join(INPUT_METADATA_DIR, 'train_labels.csv'))

In [11]:
# IMG_DIR = '/kaggle/input/pull-landsat-data-v1-500m/landsat8_500m_v1'   # landsat 8 data with raw 
# img_files = os.listdir(IMG_DIR)
# img_file_names = [f.split('.')[0] for f in img_files]

# # get only data for those 1k imgs
# metadata_subset = metadata[metadata['uid'].isin(img_file_names)]
# data = metadata_subset[metadata_subset.split == 'train']
# data = data.merge(train_labels, on='uid')

# test_data = metadata[metadata.split == 'train']

In [12]:
# def get_imgs(uids) :
#     imgs = []
#     for uid in uids:
#         img_arr = np.load(IMG_DIR + f'/{uid}.npy')
#         img_arr = np.transpose(img_arr, (2, 1, 0))
#         # resize img
#         img_arr = cv2.resize(img_arr, config.IMG_SIZE)
#         img_arr = img_arr / 255   # normalizeee bro... other wise it's blowing up the networks...
#         imgs.append(img_arr)
#     return np.array(imgs) 


# def get_np_data(split : float = 0.2, task='train'):
#     """Return np data for training and validation."""
#     if task == 'train':
#         print("Loading train and validation data...")
#         x_train_uids, x_val_uids, y_train, y_val = train_val_split(
#             data['uid'],
#             data.severity,
#             val_size=split,
#             random_state=config.RANDOM_SEED,
#             stratify=data.severity
#         )

#         x_train = get_imgs(x_train_uids)
#         x_val = get_imgs(x_val_uids)

#         return x_train, y_train, x_val, y_val


#     if task == 'test':
#         test_ids = test_data.uids
#         x_test
#         return x_test

In [13]:
# x_train, y_train, x_val, y_val = get_np_data()
# print(y_train.value_counts(normalize=True))
# print(y_val.value_counts(normalize=True))
# print('Done')

In [14]:
#  get data
metadata.date = pd.to_datetime(metadata.date)

region = pd.concat((train_labels, sub_format[['region', 'uid']]), axis=0)

data = pd.merge(metadata, region, on='uid', how='left')
print(data.shape)

(23570, 8)


In [15]:
# seasons
seasons = {
    1: 1,
    2: 1,
    3: 2,
    4: 2,
    5: 2,
    6: 3,
    7: 3,
    8: 3,
    9: 4,
    10: 4,
    11: 4,
    12: 1
}


#  most of the samples are collected in the months of June, July, August.

# add date time fts.
data['month'] = data.date.dt.month
data['year'] = data.date.dt.year
data['week'] = data.date.dt.isocalendar().week
# data['day_of_year'] = data.date.dt.
data['season'] = data.month.map(seasons)

In [16]:
data.sort_values(by='date', inplace=True)

In [17]:
data['expanding_severity'] = data.severity.expanding().mean()
data['expanding_severity'] = data['expanding_severity'].apply(np.round)
data

Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density,month,year,week,season,expanding_severity
4387,evep,44.847993,-93.476318,2013-01-04,train,midwest,1.0,115.0,1,2013,1,1,1.0
13644,paev,44.822478,-93.367962,2013-01-04,train,midwest,1.0,1884.0,1,2013,1,1,1.0
5566,gdxr,44.877646,-93.557842,2013-01-04,train,midwest,1.0,1416.0,1,2013,1,1,1.0
6144,guny,44.878889,-93.490833,2013-01-04,train,midwest,1.0,558.0,1,2013,1,1,1.0
5317,fwbt,44.850500,-93.515700,2013-01-04,train,midwest,1.0,476.0,1,2013,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.736800,-121.734000,2021-12-29,test,west,,,12,2021,52,1,2.0
17559,thki,36.725400,-121.730000,2021-12-29,test,west,,,12,2021,52,1,2.0
17452,teuu,36.772300,-121.788000,2021-12-29,test,west,,,12,2021,52,1,2.0
14254,prfi,36.751800,-121.742000,2021-12-29,test,west,,,12,2021,52,1,2.0


In [18]:
train_data = data[data.split == 'train']
test_data = data[data.split == 'test']

In [19]:
# expanding avg of severity
mse(train_data.severity, train_data.expanding_severity, squared=False)

1.2085810811762927

In [20]:
grp_by_region = data.groupby('region').severity.expanding(1).mean()
grp_by_region = grp_by_region.map(np.round)

grp_by_region['west'].fillna(2, inplace=True)
grp_by_region['northeast'].fillna(2, inplace=True)
print(grp_by_region.isna().sum())   # 5 --> 0.89416

print(mse(train_data.severity.sort_index(), grp_by_region.droplevel(0).loc[train_data.index].sort_index(), squared=False))

data['expndng_sev_by_reg'] = np.nan

south = data.region == 'south'
midwest = data.region == 'midwest'
northeast = data.region == 'northeast'
west = data.region == 'west'

data.loc[south , 'expndng_sev_by_reg'] = grp_by_region['south']
data.loc[midwest , 'expndng_sev_by_reg'] = grp_by_region['midwest']
data.loc[northeast , 'expndng_sev_by_reg'] = grp_by_region['northeast']
data.loc[west , 'expndng_sev_by_reg'] = grp_by_region['west']

print(data.shape)
data.isna().sum()

data.sort_index()

0
0.894165010958815
(23570, 14)


Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density,month,year,week,season,expanding_severity,expndng_sev_by_reg
0,aabm,39.080319,-86.430867,2018-05-14,train,midwest,1.0,585.0,5,2018,20,2,2.0,2.0
1,aabn,36.559700,-121.510000,2016-08-31,test,west,,,8,2016,35,3,2.0,4.0
2,aacd,35.875083,-78.878434,2020-11-19,train,south,1.0,290.0,11,2020,47,4,2.0,2.0
3,aaee,35.487000,-79.062133,2016-08-24,train,south,1.0,1614.0,8,2016,34,3,2.0,2.0
4,aaff,38.049471,-99.827001,2019-07-23,train,midwest,3.0,111825.0,7,2019,30,3,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,west,,,12,2014,49,1,2.0,4.0
23566,zzwo,39.792190,-99.971050,2017-06-19,train,midwest,2.0,48510.0,6,2017,25,3,2.0,2.0
23567,zzwq,35.794000,-79.012551,2015-03-24,train,south,1.0,1271.0,3,2015,13,2,2.0,1.0
23568,zzyb,35.742000,-79.238600,2016-11-21,train,south,1.0,9682.0,11,2016,47,4,2.0,2.0


In [21]:
# # MY ASSUMPTION: less the missing values --> more inital samples in the group are not test --> less imputations/ffills needed --> much realiable score!


In [22]:
grp_by_rs = data.groupby(['region', 'season']).severity.expanding(1).mean()
grp_by_rs = grp_by_rs.map(np.round)
print(grp_by_rs.isna().sum()) # 5 --> .86

data['expanding_sev_rs'] =  grp_by_rs.droplevel(0).droplevel(0).sort_index()
# fillna with expanding sev by region
data['expanding_sev_rs'] = np.where(data.expanding_sev_rs.isna(), data.expndng_sev_by_reg, data.expanding_sev_rs)

print(mse(train_data.severity.sort_index(), data['expanding_sev_rs'].sort_index()[data.split == 'train'], squared=False))

# #  make submission for expanding severity by region and season

# expanding_sev_rs = data[data.split == 'test'][['uid', 'expanding_sev_rs']]          # picking up only uids and expanding_sev_rs from test samples
# expanding_sev_rs.expanding_sev_rs = expanding_sev_rs.expanding_sev_rs.astype(int)   # casting to int
# expanding_sev_rs.sort_values(by='uid', inplace=True)                                # sorting by uid -- safest option
# expanding_sev_rs.reset_index(drop=True, inplace=True)                               # matching indexes with submissoin

# sub_format.severity = expanding_sev_rs.expanding_sev_rs
# sub_format.severity.value_counts()  # expected 0.8594349134502333

# sub_format.to_csv('expanding_sev_rs_preds.csv', index=False)

5
0.8594349134502333


In [23]:
train_labels.severity.value_counts(normalize=True)
# since test and train dists are almost similar my ideal model should follow this dist!

1    0.439449
4    0.207913
2    0.189859
3    0.159379
5    0.003400
Name: severity, dtype: float64

In [24]:
data.head()

Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density,month,year,week,season,expanding_severity,expndng_sev_by_reg,expanding_sev_rs
4387,evep,44.847993,-93.476318,2013-01-04,train,midwest,1.0,115.0,1,2013,1,1,1.0,1.0,1.0
13644,paev,44.822478,-93.367962,2013-01-04,train,midwest,1.0,1884.0,1,2013,1,1,1.0,1.0,1.0
5566,gdxr,44.877646,-93.557842,2013-01-04,train,midwest,1.0,1416.0,1,2013,1,1,1.0,1.0,1.0
6144,guny,44.878889,-93.490833,2013-01-04,train,midwest,1.0,558.0,1,2013,1,1,1.0,1.0,1.0
5317,fwbt,44.8505,-93.5157,2013-01-04,train,midwest,1.0,476.0,1,2013,1,1,1.0,1.0,1.0


In [25]:
data.isna().sum()

uid                      0
latitude                 0
longitude                0
date                     0
split                    0
region                   0
severity              6510
density               6510
month                    0
year                     0
week                     0
season                   0
expanding_severity       0
expndng_sev_by_reg       0
expanding_sev_rs         0
dtype: int64

In [26]:
all_train = data[data.split == 'train']
all_train.sort_values(by='uid', inplace=True)
all_train.reset_index(drop=True, inplace=True)
all_train.shape

(17060, 15)

In [27]:
all_train

Unnamed: 0,uid,latitude,longitude,date,split,region,severity,density,month,year,week,season,expanding_severity,expndng_sev_by_reg,expanding_sev_rs
0,aabm,39.080319,-86.430867,2018-05-14,train,midwest,1.0,585.0,5,2018,20,2,2.0,2.0,1.0
1,aacd,35.875083,-78.878434,2020-11-19,train,south,1.0,290.0,11,2020,47,4,2.0,2.0,2.0
2,aaee,35.487000,-79.062133,2016-08-24,train,south,1.0,1614.0,8,2016,34,3,2.0,2.0,2.0
3,aaff,38.049471,-99.827001,2019-07-23,train,midwest,3.0,111825.0,7,2019,30,3,2.0,2.0,2.0
4,aafl,39.474744,-86.898353,2021-08-23,train,midwest,4.0,2017313.0,8,2021,34,3,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17055,zzsv,38.707825,-75.080867,2018-06-27,train,south,3.0,113125.0,6,2018,26,3,2.0,2.0,2.0
17056,zzuq,35.794000,-79.015368,2015-08-06,train,south,3.0,175726.0,8,2015,32,3,2.0,1.0,2.0
17057,zzwo,39.792190,-99.971050,2017-06-19,train,midwest,2.0,48510.0,6,2017,25,3,2.0,2.0,2.0
17058,zzwq,35.794000,-79.012551,2015-03-24,train,south,1.0,1271.0,3,2015,13,2,2.0,1.0,1.0


In [28]:
all_train.expanding_sev_rs.value_counts(normalize=False)

2.0    8558
1.0    4736
4.0    3565
3.0     201
Name: expanding_sev_rs, dtype: int64

In [29]:
all_train.expndng_sev_by_reg.value_counts(normalize=False)

2.0    10241
4.0     3769
1.0     3050
Name: expndng_sev_by_reg, dtype: int64

In [30]:
sum(all_train.expanding_sev_rs == all_train.expndng_sev_by_reg)

12866

In [31]:
test_data = metadata[metadata.split == 'test']
test_data = test_data.merge(sub_format, on='uid')

test_data['month'] = test_data.date.dt.month
test_data['year'] = test_data.date.dt.year
test_data['season'] = test_data.month.map(seasons)

In [32]:
all_train.columns

Index(['uid', 'latitude', 'longitude', 'date', 'split', 'region', 'severity',
       'density', 'month', 'year', 'week', 'season', 'expanding_severity',
       'expndng_sev_by_reg', 'expanding_sev_rs'],
      dtype='object')

In [33]:
req_cols = ['region', 'month', 'year', 'season', 'expanding_sev_rs']

In [34]:
X_ , y_ = all_train[req_cols], all_train['severity']
X_.shape, y_.shape

((17060, 5), (17060,))

In [35]:
X_.isna().sum().sum(), y_.isna().sum()

(0, 0)

In [36]:
X_train_,X_val_, y_train_, y_val_ = train_test_split(X_, y_, test_size=0.20, random_state=config.RANDOM_SEED, stratify=y_)
X_train_.shape, y_train_.shape, X_val_.shape, y_val_.shape

((13648, 5), (13648,), (3412, 5), (3412,))

In [37]:
test_data['expanding_severity'] = data[data.split == 'test']['expanding_severity'].sort_index().values
test_data['expndng_sev_by_reg'] = data[data.split == 'test']['expndng_sev_by_reg'].sort_index().values
test_data['expanding_sev_rs'] = data[data.split == 'test']['expanding_sev_rs'].sort_index().values

In [38]:
X_test_ = test_data[req_cols]
X_test_.shape

(6510, 5)

# Preprocess

In [39]:
# # change labels to 0-3(model works this way) instead of 1-4 given range(given severity)
# -1 for to make labels look like sparse encoded labels

y_train = y_train_  -1
y_val = y_val_ - 1

y_train.value_counts(normalize=True), y_val.value_counts(normalize=True)  # guessing alwyas 0 gives 43% acc

(0.0    0.439478
 3.0    0.207943
 1.0    0.189845
 2.0    0.159364
 4.0    0.003370
 Name: severity, dtype: float64,
 0.0    0.439332
 3.0    0.207796
 1.0    0.189918
 2.0    0.159437
 4.0    0.003517
 Name: severity, dtype: float64)

In [40]:
y = y_ - 1
y.value_counts()

0.0    7497
3.0    3547
1.0    3239
2.0    2719
4.0      58
Name: severity, dtype: int64

In [41]:
# #  target encode the cat fts.

# te = TargetEncoder(cols=['region', 'month', 'year', 'season'])
# te.fit(X_train_, y_train)
# X_train =  te.transform(X_train_)
# X_val = te.transform(X_val_)

# X_test = te.transform(X_test_)
# X_test

In [42]:
X_train_.dtypes

region               object
month                 int64
year                  int64
season                int64
expanding_sev_rs    float64
dtype: object

In [43]:
#  Encode region
# from category_encoders.ordinal import OrdinalEncoder as COE

oe = OrdinalEncoder()
X_train = oe.fit_transform(X_train_)
X_test = oe.transform(X_test_)
X_val = oe.transform(X_val_)

In [44]:
X_train = pd.DataFrame(X_train, columns=X_train_.columns, index=X_train_.index)
X_test = pd.DataFrame(X_test, columns=X_test_.columns, index=X_test_.index)
X_val = pd.DataFrame(X_val, columns=X_val_.columns, index=X_val_.index)

In [45]:
# Normalize values




# Model

In [60]:
def get_model(mdtype='reg'):
    print(f'Loading {mdtype} type model...')
    #     input_shape = (*config.IMG_SIZE, config.CHANNELS)

    if mdtype == 'clf':
        loss = losses.SparseCategoricalCrossentropy()
        comp_metric = keras_rmse_clf
        last_layer = layers.Dense(5, activation='softmax')

    if mdtype == 'reg':
        loss = rmse_loss
        comp_metric = keras_rmse_reg
        last_layer = layers.Dense(1)
    
    
    
    input_imgs = layers.Input(shape=(X_train.shape[1],))
    #     x = layers.Conv2D(32, (3, 3), activation='relu')(input_imgs)
    #     x = layers.MaxPooling2D((2, 2))(x)
    #     x = layers.Flatten()(x)
    x = layers.Dense(128, activation='relu')(input_imgs)
    output = last_layer(x)

    model = models.Model(inputs=input_imgs, outputs=output, name=config.name)
    
    

    model.compile(optimizer=optimizers.Adam(learning_rate=config.train.lr),
                    loss = loss,
                    metrics=[
                        comp_metric,
                        metrics.SparseCategoricalAccuracy(name='acc')
                    ])

    return model

In [67]:
model = get_model('reg')
model.summary()

Loading reg type model...
Model: "d128_clf_mtdata-1673523296"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        [(None, 5)]               0         
_________________________________________________________________
dense_22 (Dense)             (None, 128)               768       
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 129       
Total params: 897
Trainable params: 897
Non-trainable params: 0
_________________________________________________________________


In [69]:
# model.fit(X_train, y_train)

# Train and eval

In [108]:
def train_(model, config=config, x_train=X_train, y_train=y_train, debug=None):
    """fits given model to x_train and y_train"""
    
    train_config = config['train']
    my_callbacks = []
    
    earlystopping = callbacks.EarlyStopping(patience=15, monitor='val_loss', restore_best_weights=True)
    my_callbacks.append(earlystopping)
    
    try:
        wandb_callback = wandb.keras.WandbCallback(
            monitor='val_loss',
            log_weights=True,
            log_gradients=True,
            save_model=False,
            training_data=(x_train, y_train),
            log_batch_frequency=None,
        )

        my_callbacks.append(wandb_callback)
    except:
        print('wandb not tracking')
        
    print(f'Training model... {config.name}')
    if debug == True:
        epochs = 1000
    else:
        epochs = train_config.epochs
    history = model.fit(
                x_train, y_train,
                epochs=epochs,
                batch_size=train_config.batch_size, 
                callbacks=my_callbacks, 
                validation_split=0.2, 
                shuffle=True, 
                verbose=1 
            )

    return model, history


def eval_(model, x_val=X_val, y_val=y_val):
    print('Evaluating model....')
    model.evaluate(x_val, y_val, return_dict=True)


In [109]:
def train_eval(model=None, mdtype='reg', X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, debug=False):
    
    if model is None:
        print('Getting New model')
        model = get_model()
    
    # train
    model, history = train_(model, config, X_train, y_train=y_train, debug=debug)  # try to overfit thsi batch
    # eval
    eval_(model)

    # classification report
    y_pred = model.predict(X_val)
    if mdtype == 'clf':
        y_pred_hard = np.argmax(y_pred, axis=1)             # fuck this shit forgot to 
    if mdtype == 'reg':
        y_pred_hard = np.round(y_pred)
        
    print(y_pred_hard)
    error = mse(y_val, y_pred_hard, squared=False)
    print("Comp Metric: ", error)
    cr = classification_report(y_val, y_pred_hard)     # +1 to account for 0-4 as it should be 1-5 originallly
    print(cr)


In [98]:
model = get_model('clf')
train_eval(model, 'clf', y_train=y_train, y_val=y_val)

Loading clf type model...
wandb not tracking
Training model... d128_clf_mtdata-1673523296
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/

In [106]:
preds = np.argmax(model.predict(X_val), axis=1)
mse(y_val, preds.ravel(), squared=False)

1.1679081208141078

In [112]:
config.train.epochs = 300
config.desc = "overfitting dense model on mtdata sev by rs log-loss"

In [113]:
with wandb.init(project=config.PROJECT_NAME, config=config, name=config.name):
    model = get_model('clf')
    train_eval(model, 'clf', y_train=y_train, y_val=y_val)

Loading clf type model...
Training model... d128_clf_mtdata-1673523296


2023-01-12 11:58:17.497585: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-01-12 11:58:17.497871: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2023-01-12 11:58:17.504628: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1137] Optimization results for grappler item: graph_to_optimize
  function_optimizer: function_optimizer did nothing. time = 0.017ms.
  function_optimizer: function_optimizer did nothing. time = 0.002ms.



Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/3

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
acc,▁▂▅▅▅▆▆▇▇▇▇█████████████████████████████
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
keras_rmse_clf,█▆▃▁▁▁▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃
loss,█▇▆▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▂▅▅▅▆▆▇▇▇▇▇████████████████████████████
val_keras_rmse_clf,█▆▂▁▁▁▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃
val_loss,█▇▆▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
GFLOPS,0.0
acc,0.63876
best_epoch,299.0
best_val_loss,0.95489
epoch,299.0
keras_rmse_clf,1.72016
loss,0.95613
val_acc,0.64396
val_keras_rmse_clf,1.7265
val_loss,0.95489


In [117]:
pd.Series(np.argmax(model.predict(X_val), axis=1)).value_counts(normalize=True)

0    0.687866
3    0.221864
2    0.090270
dtype: float64

In [114]:
y_val.value_counts(normalize=True)

0.0    0.439332
3.0    0.207796
1.0    0.189918
2.0    0.159437
4.0    0.003517
Name: severity, dtype: float64

In [64]:
# this is intresting...
# rmse going up while log loss is going down!
# When regressing severity the rmse loss is going down but acc is not going up! there is definetly some disconnect between loss func and my metric
# predicting all zeros (ones) --> 1.65, have to fix the metrics ig

In [None]:
# history.history

In [None]:
# model = models.load_model('/kaggle/working/d128_rmse_lndsat8_raw_v1-1673283452.h5', custom_objects={'comp_loss': comp_loss})
# preds = model.predict(X_val)
# int_preds = np.round(preds)
# mse(y_val, int_preds, squared=False)

# Save something..

In [118]:
# save model
if config.SAVE_MODEL:
    model.save(config.name + '.h5')
    print("Model saved as ",config.name + '.h5')

Model saved as  d128_clf_mtdata-1673523296.h5


# Make submission

In [None]:
model.summary()

In [121]:
X_test.shape

(6510, 5)

In [122]:
# test_preds = np.round(model.predict(X_test)).ravel()
test_preds = np.argmax(model.predict(X_test), axis=1)
test_preds = test_preds + 1
sub_format.severity = test_preds
sub_format.severity = sub_format.severity.astype(int) 
sub_format.severity.value_counts()

1    2972
4    2387
3    1151
Name: severity, dtype: int64

In [123]:
sub_format.to_csv(f'{config.name}_preds.csv', index=False) # expect @ 0.98

# So...

- Training even simple NNs is really hard!!
- NNs with log_loss not at all improving mostly coz of loss -func! --> I thought but
- NNs with log loss is better compared to rmse-loss??

# ToDos:

- Does adding data provide any value??
- **Try to beat expanding avg_severity_by_region with the help of imgs, Other wise no use for img data**