# Main Notebook 

 
``config, load, preprocess, train, eval models for  Tick tick bloom``

**Yo DON'T rerun this unless you want to overwrite past models, always fork and do your stuff and 
DON'T forget to change the name**

# Load imports and dependencies

In [2]:
import warnings
import sys
import os
import time
import joblib
import random
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold

import tensorflow as tf
from tensorflow.keras import layers, activations, losses, metrics, models, optimizers


warnings.filterwarnings('ignore')

In [3]:
# local utilities imports
from tick_tick_bloom_utils import my_keras_rmse, comp_metric

In [4]:
# wandb stuff for tracking
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_login = user_secrets.get_secret("wandb_bloom_tracker")

import wandb
wandb.login(key=wandb_login)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Config

In [5]:
# dot dictionary
class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


# Config
config = {}
config = dotdict(config)
config['RANDOM_SEED'] = 18952


config['unique_id'] = int(time.time())
print(f'unique_id: {config.unique_id}')
config['name'] = f'conv2d_32_64_img1k -  {config.unique_id}'   # 'conv2d_32_64_img_1k'

config['PROJECT_NAME'] = 'tick-tick-bloom'
# config['DATA_DIR'] = '../data/'
# config['MODEL_DIR'] = '../models/'
config['SAVE_MODEL'] = True


# Img config
config['IMG_SIZE'] = (136, 136)
config['CHANNELS'] = 3


config['train'] =  dotdict({
                        'epochs': 20,
                        'batch_size': 128,
                        'validation_split': 0.2,
                        'shuffle': True,
                        'verbose': 1
                        })

config['desc'] = 'test run for kaggle nb setup'


unique_id: 1672510178


In [6]:
# seed everything
random.seed(config.RANDOM_SEED)
np.random.seed(config.RANDOM_SEED)
tf.random.set_seed(config.RANDOM_SEED)
os.environ['TF_CUDNN_DETERMINISTIC'] = '1' 
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['PYTHONHASHSEED'] = str(config.RANDOM_SEED)

# tf.keras.utils.set_random_seed(config.RANDOM_SEED)  # supposedly sets seed for python, numpy, tf

# Load data

In [7]:
INPUT_DIR = '/kaggle/input/ticktickbloomdataset'

metadata = pd.read_csv(os.path.join(INPUT_DIR, 'metadata.csv'))
sub_format = pd.read_csv(os.path.join(INPUT_DIR, 'submission_format.csv'))
train_labels = pd.read_csv(os.path.join(INPUT_DIR, 'train_labels.csv'))

In [8]:
IMG_DIR = '/kaggle/input/pull-landsat-data/imgs'
img_files = os.listdir(IMG_DIR)
img_file_names = [f.split('.')[0] for f in img_files]

# get only data for those 1k imgs
metadata_subset = metadata[metadata['uid'].isin(img_file_names)]
data = metadata_subset[metadata_subset.split == 'train']
data = data.merge(train_labels, on='uid')

In [9]:
# drop that 5th severity for noww
data = data[data.severity != 5]  # omg!! this one sample messed up the network  (especially the activation at output layer)

In [10]:
def get_imgs(uids) :
    imgs = []
    for uid in uids:
        img_arr = np.load(IMG_DIR + f'/{uid}.npy')
        img_arr = np.transpose(img_arr, (2, 1, 0))
        # resize img
        img_arr = cv2.resize(img_arr, config.IMG_SIZE)
        img_arr = img_arr / 255   # normalizeee bro... other wise it's blowing up the networks...
        imgs.append(img_arr)
    return np.array(imgs) 


def get_np_data(split : float = 0.2):
    """Return np data for training and testing."""

    print("Loading data...")
    x_train_uids, x_test_uids, y_train, y_test = train_test_split(
        data['uid'],
        data.severity,
        test_size=split,
        random_state=config.RANDOM_SEED,
        stratify=data.severity
    )

    x_train = get_imgs(x_train_uids)
    x_test = get_imgs(x_test_uids)

    return x_train, y_train, x_test, y_test

In [11]:
x_train, y_train, x_test, y_test = get_np_data()
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))
print('Done')

Loading data...
1    0.461140
4    0.202073
2    0.174439
3    0.162349
Name: severity, dtype: float64
1    0.462069
4    0.206897
2    0.172414
3    0.158621
Name: severity, dtype: float64
Done


# Preprocess

In [12]:
# change labels to 0-3(model works this way) instead of 1-4 given range(given severity)
y_train_03 = y_train - 1
y_test_03 = y_test - 1

In [13]:
#  No img processing for now!

# Model

In [14]:
def get_model():
    print('Loading model...')
    input_shape = (*config.IMG_SIZE, config.CHANNELS)

    input_imgs = layers.Input(shape=input_shape)
    x = layers.Conv2D(32, (3, 3), activation='relu')(input_imgs)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(32, (3, 3), activation='relu')(input_imgs)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Flatten()(x)
    x = layers.Dense(128, activation='relu')(x)
    output = layers.Dense(4, activation='softmax')(x)

    model = models.Model(inputs=input_imgs, outputs=output)

    model.compile(optimizer=optimizers.Adam(learning_rate=0.0001),
                    loss=losses.sparse_categorical_crossentropy,
                    metrics=[my_keras_rmse,
                            metrics.SparseCategoricalAccuracy(name='acc')])

    return model

In [15]:
model = get_model()
model.summary()

Loading model...
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 136, 136, 3)]     0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 134, 134, 32)      896       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 67, 67, 32)        0         
_________________________________________________________________
flatten (Flatten)            (None, 143648)            0         
_________________________________________________________________
dense (Dense)                (None, 128)               18387072  
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 516       
Total params: 18,388,484
Trainable params: 18,388,484
Non-trainable params: 0
________________________________

# Train 

In [16]:
# np and tf metrics are aggreable, but why evaluate and final epoch's loss and metric each, are different?!
# 1.5447352189134953 (np) 1.5447352189134953 (tf) , 1.5175646543502808(evaluate), 

In [17]:
def train(config=config, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test):
    
    with wandb.init(project=config.PROJECT_NAME, config=config, name=config.name):
        model = get_model()
        print(f'Training model...{config.name}')
        model.summary()
        
        train_config = config['train']
        
        wandb_callback = wandb.keras.WandbCallback(
            monitor='val_loss',
            log_weights=False,
            log_gradients=False,
            save_model=False,
            training_data=(x_train, y_train),
            validation_data=(x_test, y_test),
            log_batch_frequency=None,
        )

        callbacks = [wandb_callback]

        history = model.fit(
                    x_train, y_train,
                    epochs=train_config.epochs,
                    batch_size=train_config.batch_size, 
                    callbacks=callbacks, 
                    validation_split=0.2, 
                    shuffle=True, 
                    verbose=1   
                )
        
        return model, history

In [18]:
train(y_train=y_train_03, y_test=y_test_03)

[34m[1mwandb[0m: Currently logged in as: [33mk_loki[0m. Use [1m`wandb login --relogin`[0m to force relogin


Loading model...
Training model...conv2d_32_64_img1k -  1672510178
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 136, 136, 3)]     0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 134, 134, 32)      896       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 67, 67, 32)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 143648)            0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               18387072  
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 516       
Total params: 18,388,484
Trainable params: 18,388,484
Non-

2022-12-31 18:10:02.988810: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2022-12-31 18:10:02.989155: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2022-12-31 18:10:03.013803: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1137] Optimization results for grappler item: graph_to_optimize
  function_optimizer: function_optimizer did nothing. time = 1.622ms.
  function_optimizer: function_optimizer did nothing. time = 0.006ms.

2022-12-31 18:10:05.830936: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
acc,▁▂▃▃▄▄▅▅▆▅▆▇▆▇▇▇▇███
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
loss,█▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁
my_keras_rmse,▅▁▂▇▆▄█▄█▅▆▇▆▆▇▆▆▇▇▆
val_acc,▃▆▁▇▇▂█▅█▆▇▇▇██▇▆█▆▇
val_loss,█▆█▅▅▅▄▄▄▃▂▂▁▂▂▃▂▃▁▂
val_my_keras_rmse,▄▁█▃▅▇▄▆▄▆▅▅▅▆▄▅▆▅▅▆

0,1
GFLOPS,0.03419
acc,0.71274
best_epoch,18.0
best_val_loss,1.16784
epoch,19.0
loss,0.74744
my_keras_rmse,1.61424
val_acc,0.49138
val_loss,1.20669
val_my_keras_rmse,1.75626


(<keras.engine.functional.Functional at 0x7faa0c211650>,
 <keras.callbacks.History at 0x7faa0c1fc410>)

# Eval 

In [19]:
model.evaluate(x_test, y_test_03)

y_pred = model.predict(x_test)
y_pred_hard = np.argmax(y_pred, axis=1)
error = mse(y_test_03, y_pred_hard, squared=False)
print("Comp Metric: ", error)

Comp Metric:  1.2401334744626316


In [20]:
pd.Series(y_pred_hard).value_counts()

1    122
2     21
0      2
dtype: int64

In [21]:
# classification report
from sklearn.metrics import classification_report
y_pred_hard = np.argmax(y_pred, axis=1)
cr = classification_report(y_test_03+1, y_pred_hard+1)
print(cr)

              precision    recall  f1-score   support

           1       1.00      0.03      0.06        67
           2       0.19      0.92      0.31        25
           3       0.10      0.09      0.09        23
           4       0.00      0.00      0.00        30

    accuracy                           0.19       145
   macro avg       0.32      0.26      0.12       145
weighted avg       0.51      0.19      0.10       145



# Save anything..

In [22]:
# save model
if config.SAVE_MODEL:
    model.save(config.name + '.h5')
    print("Model saved as ",config.name + '.h5')

Model saved as  conv2d_32_64_img1k -  1672510178.h5
