# Main ...

 
``config, load, preprocess, train, eval  for  Tick tick bloom``

**Yo DON'T rerun this unless you want to overwrite past models, always fork and do your stuff and 
DON'T forget to change the name**

**``Mission: NNs on landsat8-500x500m-v1``**

# Load imports and dependencies

In [1]:
# ! pip uninstall tensorflow -y
# ! pip install tensorflow==2.11

In [2]:
import warnings
import sys
import os
import time
import glob
import joblib
import random
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder

import tensorflow as tf
from tensorflow.keras import layers, activations, losses, metrics, models, optimizers, callbacks
from category_encoders.target_encoder import TargetEncoder

warnings.filterwarnings('ignore')

In [3]:
# local utilities imports
from tick_tick_bloom_utils import comp_metric, den2sev_map

In [4]:
# wandb stuff for tracking
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_login = user_secrets.get_secret("wandb_bloom_tracker")

import wandb
wandb.login(key=wandb_login)

[34m[1mwandb[0m: Currently logged in as: [33mk_loki[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Config

In [5]:
# dot dictionary
class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


# Config
config = {}
config = dotdict(config)
config['RANDOM_SEED'] = 18952


config['unique_id'] = int(time.time())
print(f'unique_id: {config.unique_id}')
config['name'] = f'conv2d64_d128-{config.unique_id}'

config['PROJECT_NAME'] = 'tick-tick-bloom'
# config['DATA_DIR'] = '../data/'
# config['MODEL_DIR'] = '../models/'
config['SAVE_MODEL'] = True


# # Img config
config['IMG_SIZE'] = (35, 35)
config['CHANNELS'] = 10

# training configuration
config['train'] =  dotdict({
                        'epochs': 100,
                        'batch_size': 64,
                        'validation_split': 0.2,
                        'shuffle': True,
                        'verbose': 1,
                        'lr' : 1e-5
                        })

config['desc'] = """simple nns on landsat8-500x500m-v1 --> Not expecting much since data looks soo noisy..."""

unique_id: 1674063395


In [6]:
# seed everything
def seed_everything(seed=config.RANDOM_SEED):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
#     os.environ['TF_CUDNN_DETERMINISTIC'] = '1' 
#     os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['PYTHONHASHSEED'] = str(seed)

# tf.keras.utils.set_random_seed(config.RANDOM_SEED)  # supposedly sets seed for python, numpy, tf

seed_everything()

# Utils

In [7]:
def keras_rmse_clf(y_true, y_pred):
    """
    valid competetion metric for clf type settings.
    Can be trusted!
    y_true and y_pred should be [0-4]
    """
    y_pred = tf.argmax(y_pred, axis=1)
    y_pred = tf.cast(y_pred, tf.float16)
    y_true = tf.cast(y_true, tf.float16)
    squared_difference = tf.square(y_true - y_pred)
    return tf.sqrt(tf.reduce_mean(squared_difference, axis=-1))

def keras_rmse_reg(y_true, y_pred):
    """
    valid competetion metric for reg type settings.
    Can be trusted!
    y_true and y_pred should be [0-4]
    """
    y_pred = tf.math.round(y_pred)
    y_pred = tf.cast(y_pred, tf.float16)
    y_true = tf.cast(y_true, tf.float16)
    squared_difference = tf.square(y_true - y_pred)
    return tf.sqrt(tf.reduce_mean(squared_difference, axis=-1))


def rmse_loss(y_true, y_pred):
    """loss func to use in reg type settings"""
    return tf.sqrt(losses.mean_squared_error(y_true, y_pred))

# Load data

In [8]:
INPUT_METADATA_DIR = '/kaggle/input/ticktickbloomdataset'

metadata = pd.read_csv(os.path.join(INPUT_METADATA_DIR, 'metadata.csv'))
sub_format = pd.read_csv(os.path.join(INPUT_METADATA_DIR, 'submission_format.csv'))
train_labels = pd.read_csv(os.path.join(INPUT_METADATA_DIR, 'train_labels.csv'))

In [9]:
part1 = '/kaggle/input/pull-landsat-sr-v1-part-1'
part2 = '/kaggle/input/pull-landsat-sr-v1-part-2'

part1_imgs = os.path.join(part1, "landsat8_sr_500m_v1")
part2_imgs = os.path.join(part2, "landsat8_sr_500m_v1")

p1imgs = [name.split('.')[0] for name in os.listdir(part1_imgs)]
p2imgs = [name.split('.')[0] for name in os.listdir(part2_imgs)]

img_uids = p1imgs + p2imgs
len(img_uids)                           # 1754 imgs missing!

21816

In [10]:
metadata_subset = metadata[metadata['uid'].isin(img_uids)]
data = metadata_subset[metadata_subset.split == 'train']
data = data.merge(train_labels, on='uid')

test_data = metadata[metadata.split == 'test']

data.shape, test_data.shape

((15724, 8), (6510, 5))

In [11]:
missing_data = metadata[~metadata.uid.isin(metadata_subset.uid)]
missing_data.split.value_counts()  # 181 test samples are missing.

train    1336
test      418
Name: split, dtype: int64

In [12]:
img_file_list1 = glob.glob(f"{part1_imgs}/*.npy")
img_file_list2 = glob.glob(f"{part2_imgs}/*.npy")

img_files = img_file_list1 + img_file_list2
len(img_files)

21816

In [13]:
img_files[0]

'/kaggle/input/pull-landsat-sr-v1-part-1/landsat8_sr_500m_v1/rpgg.npy'

In [14]:
sample_img = joblib.load(img_files[0])
sample_img.shape

(15,)

In [15]:
# sample_img[:10]          # 10 bands

In [16]:
%%time
# not so efficient method!!
# get train and test uids from present in downloaded data

train_uids = metadata[metadata.split == 'train'].uid.values
test_uids = metadata[metadata.split == 'test'].uid.values

train_files = [x for x in img_files if x.split('/')[-1].split('.')[0] in train_uids]
test_files = [x for x in img_files if x.split('/')[-1].split('.')[0] in test_uids]

len(train_files), len(test_files)

CPU times: user 11.1 s, sys: 293 ms, total: 11.4 s
Wall time: 11.2 s


(15724, 6092)

In [17]:
train_files_, val_files_ = train_test_split(train_files, random_state=config.RANDOM_STATE, test_size=0.2)
len(train_files_), len(val_files_)

(12579, 3145)

In [38]:
%%time
train_data = tf.data.Dataset.list_files(train_files_)
val_data = tf.data.Dataset.list_files(val_files_)
test_data = tf.data.Dataset.list_files(test_files)

# num_parallel_calls=tf.data.AUTOTUNE

CPU times: user 116 ms, sys: 70.3 ms, total: 186 ms
Wall time: 185 ms


In [39]:
def read_train_val_files(item):
    file_path = item.numpy()
    file_name = str(file_path).split('/')[-1]
    uid = file_name.split('.')[0]   
    # load file 
    arr = joblib.load(file_path)

    arr = arr[:10]
    arr = [a/a.max() for a in arr ]                        # Figuure out better way to normalize
    arr = [cv2.resize(a, config.IMG_SIZE) for a in arr]    # resizing all images to same shape

    arr = np.dstack((arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7], arr[8], arr[9]))
    # load target
    severity = train_labels[train_labels.uid == uid]['severity'] - 1
        
    return (arr, severity)

def read_test_files(item):
    file_path = item.numpy()
    file_name = str(file_path).split('/')[-1]
    uid = file_name.split('.')[0]   
    # load file 
    arr = joblib.load(file_path)

    arr = arr[:10]
    arr = [a/a.max() for a in arr ]                        # Figuure out better way to normalize
    arr = [cv2.resize(a, config.IMG_SIZE) for a in arr]    # resizing all images to same shape

    arr = np.dstack((arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7], arr[8], arr[9]))

    return (arr, )

In [40]:
train_data = train_data.map(
        lambda item: tuple(tf.py_function(read_train_val_files, [item], (tf.float64, tf.int32))))


val_data = val_data.map(
        lambda item: tuple(tf.py_function(read_train_val_files, [item], [tf.float32, tf.int32])))

test_data = test_data.map(
        lambda item: tuple(tf.py_function(read_test_files, [item], [tf.float32])))

In [41]:
for x in train_data.take(8):
    print(x[0].shape)
    print(x[1].shape)
    break

(35, 35, 10)
(1,)


In [42]:
config.train.batch_size = 32

In [43]:
train_data = train_data.batch(config.train.batch_size, drop_remainder=True)
val_data = val_data.batch(config.train.batch_size, drop_remainder=True)
test_data = test_data.batch(config.train.batch_size)

In [44]:
for x in train_data.take(100):
    print(x[0].shape)
    print(x[1].shape)
    break

(32, 35, 35, 10)
(32, 1)


In [45]:
def reshape_data(x, y):
    x = tf.squeeze(x)
    y = tf.reshape(y, (config.train.batch_size, ))
    return x, y

In [46]:
train_data = train_data.map(reshape_data)
val_data = val_data.map(reshape_data)

In [47]:
for x in train_data.take(100):
    print(x[0].shape)
    print(x[1].shape)
    break

(32, 35, 35, 10)
(32,)


In [None]:
# # load img from img paths
# def get_img(uid: str) -> np.ndarray:
#     """return data arr for given uid 
#     Only give uids already present in the downloaded data"""
#     try:
#         arr = joblib.load(part1_imgs + f'/{uid}.npy')
#     except Exception as e:
#         arr = joblib.load(part2_imgs + f'/{uid}.npy')
#     return arr

# def normalize_band(img_band):
#     # temp normailze to 0 and 1
#     m = img_band.max()
#     return img_band/m

# def resize_band(norm_img_band):
#     return cv2.resize(norm_img_band, config.IMG_SIZE)

# # seperate img_arr from data and resize all
# def get_img_arr(arr: np.ndarray, start: int = 0, end: int = 5) -> np.ndarray:
#     img_arr = arr[start:end]                       # just few bands for now!!
#     return img_arr

# def norm_resize_bands(arr_bands: np.ndarray):
#     finished_bands = []
#     for band in arr_bands:
#         nb = normalize_band(band)
#         rb = resize_band(nb)
#         finished_bands.append(rb)
#     return np.array(finished_bands)

# #  do that for all samples in metadata_subset (and test_data)
# def get_all_imgs(uid_list: list) -> np.ndarray:
#     data_list = []
#     # add tqdm..
#     for uid in tqdm(uid_list): 
#         arr = get_img(uid)
#         img_arr = get_img_arr(arr)
#         normalized_img_arr = norm_resize_bands(img_arr)
#         data_list.append(normalized_img_arr)
#     return np.array(data_list)


# #  make into tf or np datasets



# def get_np_data(split : float = 0.2, task='train'):
#     """Return np data for training and validation."""
#     if task == 'train':
#         print("Loading train and validation data...")
#         x_train_uids, x_val_uids, y_train, y_val = train_test_split(
#             data['uid'],
#             data.severity,
#             test_size=split,
#             random_state=config.RANDOM_SEED,
#             stratify=data.severity
#         )

#         x_train = get_all_imgs(x_train_uids)
#         x_val = get_all_imgs(x_val_uids)

#         return x_train, y_train, x_val, y_val

#     if task == 'test':
#         test_ids = test_data.uids
#         x_test
#         return x_test


# %%time
# x_train_, y_train_, x_val_, y_val_ = get_np_data()
# print(y_train_.value_counts(normalize=True))
# print(y_val_.value_counts(normalize=True))
# print('Done')

In [None]:
config

# Preprocess

In [29]:
# # # change labels to 0-3(model works this way) instead of 1-4 given range(given severity)
# # -1 for to make labels look like sparse encoded labels

# y_train = y_train_-1
# y_val = y_val_-1


# X_train = x_train_.transpose([0, 2, 3, 1])
# X_val = x_val_.transpose([0, 2, 3, 1])

# X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [30]:
# train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(500).batch(config.train.batch_size).prefetch(tf.data.AUTOTUNE)
# val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(config.train.batch_size)

# Model

In [31]:
def get_model(mdtype='clf'):
    print(f'Loading {mdtype} type model...')
    input_shape = (*config.IMG_SIZE, config.CHANNELS)

    if mdtype == 'clf':
        loss = losses.SparseCategoricalCrossentropy()
        comp_metric = keras_rmse_clf
        last_layer = layers.Dense(5, activation='softmax')

    if mdtype == 'reg':
        loss = rmse_loss
        comp_metric = keras_rmse_reg
        last_layer = layers.Dense(1)
    
    
    input_imgs = layers.Input(shape=input_shape)
    x = layers.Conv2D(64, (3, 3), activation='relu')(input_imgs)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Flatten()(x)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dense(128, activation='relu')(x)
    output = last_layer(x)

    model = models.Model(inputs=input_imgs, outputs=output, name=config.name)
    

    model.compile(optimizer=optimizers.Adam(learning_rate=config.train.lr),
                    loss = loss,
                    metrics=[
                        comp_metric,
                        metrics.SparseCategoricalAccuracy(name='acc')
                    ])

    return model

In [32]:
my_model = get_model('clf')
my_model.summary()

Loading clf type model...
Model: "conv2d64_d128-1674063395"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 35, 35, 10)]      0         
_________________________________________________________________
conv2d (Conv2D)              (None, 33, 33, 64)        5824      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 16, 16, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 16384)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               4194560   
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense (Dense)   

In [33]:
my_model.fit(train_data, epochs=1)

# for x, y in train_data.take(2):
#     x = x.numpy()
#     x = np.expand_dims(x, axis=0)
#     print(x.shape)
#     model.fit(x, y)
#     break



InvalidArgumentError:  Input to reshape is a tensor with 3 values, but the requested shape has 32
	 [[{{node Reshape}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_1100]

Function call stack:
train_function


In [48]:
my_model.evaluate(val_data)



[1.6077656745910645, 1.1836735010147095, 0.42059949040412903]

# Train and eval

In [None]:
def train_(model, config=config, train_data=(X_train, y_train), val_data=None, debug=None):
    """fits given model to x_train and y_train"""
    
    train_config = config['train']
    my_callbacks = []
    
    earlystopping = callbacks.EarlyStopping(patience=15, monitor='val_loss', restore_best_weights=True)
    my_callbacks.append(earlystopping)
    reduce_lr_on_plateau = callbacks.ReduceLROnPlateau(
                                            monitor="val_acc",
                                            factor=0.5,
                                            patience=5,
                                            verbose=1,
                                            mode="auto",
                                            min_delta=0.01,
                                            cooldown=0,
                                            min_lr=0)
    my_callbacks.append(reduce_lr_on_plateau)
    
    try:
        wandb_callback = wandb.keras.WandbCallback(
            monitor='val_loss',
            log_weights=True,
            log_gradients=True,
            save_model=False,
            training_data=train_data,
            log_batch_frequency=None,
        )

        my_callbacks.append(wandb_callback)
    except:
        print('wandb not tracking')
        
    print(f'Training model... {config.name}')
    if debug == True:
        epochs = 1000
    else:
        epochs = train_config.epochs
    history = model.fit(
                train_data,
                epochs=epochs,
                batch_size=train_config.batch_size, 
                callbacks=my_callbacks, 
                validation_data=val_data,
                shuffle=True, 
                verbose=1 
            )

    return model, history


def eval_(model, val_data=(X_val, y_val)):
    print('Evaluating model....')
    model.evaluate(val_data, return_dict=True)

In [None]:
def train_eval(model=None, mdtype='clf',train_data=(X_train, y_train), val_data=(X_val, y_val), debug=False):
    
    if model is None:
        print('Getting New model')
        model = get_model()
    
    # train
    model, history = train_(model, config, train_data=train_dataset, val_data=val_data, debug=debug)  # try to overfit thsi batch
    # eval
    eval_(model, val_dataset)

    # classification report
    y_pred = model.predict(X_val)
    if mdtype == 'clf':
        y_pred_hard = np.argmax(y_pred, axis=1)             
    if mdtype == 'reg':
        y_pred_hard = np.round(y_pred)
        
    print(y_pred_hard)
    error = mse(y_val, y_pred_hard, squared=False)
    print("Comp Metric: ", error)
    cr = classification_report(y_val, y_pred_hard)     # +1 to account for 0-4 as it should be 1-5 originallly
    print(cr)
    
    return model, history

In [None]:
config.train.epochs = 50
config.train.lr = 1e-5
config.train

In [None]:
with wandb.init(project=config.PROJECT_NAME, config=config, name=config.name):
    print(config)
    model = get_model('clf')
    model, history = train_eval(model, 'clf', train_data=train_dataset, val_data=val_dataset)

In [None]:
tf.config.list_physical_devices('GPU')

In [None]:
preds = np.argmax(model.predict(X_train), axis=1)
print("Training Error:", mse(y_train, preds.ravel(), squared=False))

In [None]:
preds = np.argmax(model.predict(X_val), axis=1)
print("Testing error:", mse(y_val, preds.ravel(), squared=False))

In [None]:
pd.Series(preds).value_counts(normalize=True)

In [None]:
y_val.value_counts(normalize=True)

In [None]:
list(history.history.keys())

In [None]:
def vis_metrics(history):
    metrics = list(history.history.keys())[:-3]
    fig, axes = plt.subplots(2, int(len(metrics)/2), figsize=(15, 8))
    epochs = [x for x in range(config.train.epochs)]
    for n, ax in enumerate(axes.ravel()):
        ax.plot(epochs, history.history[metrics[n]])
        ax.set_xlabel(metrics[n])

In [None]:
vis_metrics(history)

In [None]:
y_val.values[0]

In [None]:
def plot_pred(n, axis):
    sample = X_val[n]
    rgb = sample.transpose()[2:5]
    axis.imshow(rgb.transpose())
    axis.set_title(f"label: {y_val.values[n]+1}")
    sample = np.expand_dims(sample, axis=0)
    pred = np.argmax(model.predict(sample))
    axis.set_xlabel(f"Predicted: {pred}");
    
    
# randomly viz some of the preds and their rgb_imgs
def viz_preds():
    rand_n = np.random.randint(low=0, high=3144, size=25)
    fig, axis = plt.subplots(5, 5, figsize=(30, 25))
    for n, ax in zip(rand_n, axis.ravel()):
        plot_pred(n, ax)

In [None]:
viz_preds()

In [None]:
viz_preds()

In [None]:
viz_preds()

In [None]:
#  No wayy! even I can figure out what is what!!

In [None]:
# model = models.load_model('/kaggle/working/d128_rmse_lndsat8_raw_v1-1673283452.h5', custom_objects={'comp_loss': comp_loss})
# preds = model.predict(X_val)
# int_preds = np.round(preds)
# mse(y_val, int_preds, squared=False)

# Save something..

In [None]:
# save model
if config.SAVE_MODEL:
    model.save(config.name + '.h5')
    print("Model saved as ",config.name + '.h5')

# Make submission

In [None]:
model.summary()

In [None]:
# X_test.shape

In [None]:
# # test_preds = np.round(model.predict(X_test)).ravel()
# test_preds = np.argmax(model.predict(X_test), axis=1)
# test_preds = test_preds + 1
# sub_format.severity = test_preds
# sub_format.severity = sub_format.severity.astype(int) 
# sub_format.severity.value_counts()

In [None]:
# save_file_to = f'{config.name}_preds.csv'
# print(f'saving file to {save_file_to}')
# sub_format.to_csv(save_file_to, index=False) # expect @ 0.979 0.98

# So...

- Model is not learning at alll - just better than random guessing!
- What could be the reason???

> Such terrible data?  (most probbale)

> bad Normalization or what?

> Atleast it tried to learn? (40--> 50) acc

- OOM erros :(

* # ToDos:

- if terrible data 

> Download much wider bbox

> more informative bands (band processing/indexes)

> Try different satellite imgs

> Try same dataset with gbs and see if they peform any bettter or do they learn anything!

- Fix oom error
- Best way to normalize bands?
- Try with target as density.

**MOST PROBABLY THIS COMPETITION WINNER WILL BE THE ONE WHO HAS BETTER DATA HANDLING TECHNIQUES..**

>