# <h1><center>Classify MoA based on biomarkers</center></h1>

### Import Libraries

In [None]:
import math
import numpy as np 
import pandas as pd
import os

import sys
sys.path.append('../input/stratifiers')
from ml_stratifiers import MultilabelStratifiedKFold

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.feature_selection import SelectFromModel

# from kerashypetune.kerashypetune import KerasGridSearch
from matplotlib import pyplot as plt

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
from tensorflow.keras import Sequential
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout, GaussianNoise, Lambda, Input, Concatenate, Embedding, Flatten
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from tensorflow_addons.layers import WeightNormalization
from tensorflow_addons.optimizers import Lookahead

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

### Read Data

In [None]:
# Training inputs: feature matrix and the scored target labels.
train_df = pd.read_csv('../input/lish-moa/train_features.csv')
train_target_df = pd.read_csv('../input/lish-moa/train_targets_scored.csv')

# Inference inputs: test features and the submission template.
test_df = pd.read_csv('../input/lish-moa/test_features.csv')
sub = pd.read_csv('../input/lish-moa/sample_submission.csv')

# Every column after the first one of the targets file is a target label.
target_cols = train_target_df.columns[1:]

In [None]:
train_df

In [None]:
# Columns whose name contains 'g-'.
is_ge_col = train_df.columns.str.contains('g-')
ge_cols = train_df.columns[is_ge_col]

In [None]:
train_target_df

### Basic Setup and Helpers

In [None]:
# Reproducibility / cross-validation settings.
SEED = 43253
FOLDS = 6

# Training settings passed to model.fit.
EPOCHS = 20
BATCH_SIZE = 64

# Network settings.
ACTIVATION = 'relu'
N_TARGETS = len(target_cols)  # width of the sigmoid output layer

In [None]:
def seed_everything(seed):
    """Seed the NumPy and TensorFlow global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to every generator.
    """
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so this
    # presumably only influences child processes - confirm if hash
    # determinism of the current process matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)

In [None]:
def multi_log_loss(y_true, y_pred):
    """Return the mean column-wise binary log loss.

    The loss is computed directly with NumPy instead of calling ``log_loss``:
    in this file that name is rebound to a Keras metric that clips
    predictions to [0.001, 0.999] and returns tensors, which is not the
    scorer this helper is meant to apply. Computing it here also works for
    target columns that contain a single class.

    Args:
        y_true: DataFrame of 0/1 labels, one column per target.
        y_pred: DataFrame of predicted probabilities containing at least the
            columns of ``y_true``; rows are matched by position.

    Returns:
        The average over target columns of the per-column log loss.
    """
    eps = 1e-15  # keeps log() finite for predictions of exactly 0 or 1
    truth = y_true.to_numpy(dtype=float)
    preds = y_pred.loc[:, y_true.columns].to_numpy(dtype=float)
    preds = np.clip(preds, eps, 1 - eps)

    element_loss = -(truth * np.log(preds) + (1 - truth) * np.log(1 - preds))
    per_column = element_loss.mean(axis=0)
    return np.mean(per_column)

In [None]:
def log_loss(y_true, y_pred):
    """Keras metric: binary cross-entropy on clipped predictions.

    Predictions are clipped to [0.001, 0.999] before the cross-entropy is
    evaluated. The function name becomes the metric name, so the callbacks
    in this file that monitor ``val_log_loss`` depend on it. Note that this
    definition rebinds the ``log_loss`` name imported from
    ``sklearn.metrics`` at the top of the file.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted probabilities.

    Returns:
        The result of ``tf.keras.losses.binary_crossentropy`` on the clipped
        predictions.
    """
    clipped_pred = tf.clip_by_value(y_pred, 0.001, 0.999)
    bce = tf.keras.losses.binary_crossentropy(y_true, clipped_pred)
    return bce

### Encode Categoricals to Binary

In [None]:
def preprocess_df(df):
    """Encode the treatment columns of ``df`` in place and return it.

    * ``cp_type`` becomes 1 where it equals ``'trt_cp'`` and 0 otherwise.
    * ``cp_time_oh`` is added as the string form of the original ``cp_time``.
    * ``cp_time`` is divided by 24 and cast to int.

    ``cp_dose`` is left untouched.

    Args:
        df: Feature frame with ``cp_type`` and ``cp_time`` columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    is_treated = df['cp_type'] == 'trt_cp'
    raw_time = df['cp_time']

    df['cp_type'] = is_treated.astype(int)
    # Keep the raw time as a string category before cp_time is rescaled.
    df['cp_time_oh'] = raw_time.astype(str)
    df['cp_time'] = raw_time.div(24).astype(int)
    return df

In [None]:
def handle_cats(df, cols=None):
    """Replace categorical columns with their one-hot encoding.

    The indicator columns are appended after the remaining columns of
    ``df`` and the encoded source columns are dropped. Indicators are
    emitted as ``uint8`` so that ``.values`` of the result stays numeric
    regardless of the pandas default indicator dtype.

    Args:
        df: Input frame; it is not modified.
        cols: Columns to encode. Defaults to the module-level
            ``one_hot_cols``.

    Returns:
        A new DataFrame with ``cols`` replaced by indicator columns.
    """
    if cols is None:
        cols = one_hot_cols
    indicators = pd.get_dummies(df.loc[:, cols], dtype=np.uint8)
    encoded = pd.concat([df, indicators], axis=1)
    return encoded.drop(columns=cols)

In [None]:
def ctrl_normalize(df, dose, time, cols=None):
    """Standardise treated rows against the controls of one dose/time group.

    For the rows with ``cp_dose == dose`` and ``cp_time == time``, the rows
    with ``cp_type == 1`` have the mean of the ``cp_type == 0`` rows
    subtracted and are divided by the standard deviation of those rows,
    column by column. The ``cp_type == 0`` rows themselves are left as they
    are.

    The group is skipped when it has fewer than two control rows (the
    standard deviation would be NaN) or no treated rows. Columns whose
    control standard deviation is exactly zero are only mean-centred, so no
    infinities are written.

    Args:
        df: Frame with ``cp_type``, ``cp_dose``, ``cp_time`` and the feature
            columns; it is modified in place.
        dose: Value of ``cp_dose`` selecting the group.
        time: Value of ``cp_time`` selecting the group.
        cols: Feature columns to normalise. Defaults to the module-level
            ``bio_cols``.

    Returns:
        The same DataFrame object.
    """
    if cols is None:
        cols = bio_cols

    in_group = (df['cp_time'] == time) & (df['cp_dose'] == dose)
    control_rows = in_group & (df['cp_type'] == 0)
    treated_rows = in_group & (df['cp_type'] == 1)

    if control_rows.sum() < 2 or not treated_rows.any():
        return df

    ctrl_mean = df.loc[control_rows, cols].mean()
    # A zero spread would turn the division into +/-inf; divide by 1 instead.
    ctrl_std = df.loc[control_rows, cols].std().replace(0, 1.0)

    centred = df.loc[treated_rows, cols].subtract(ctrl_mean, axis=1)
    df.loc[treated_rows, cols] = centred.divide(ctrl_std, axis=1)

    return df

In [None]:
# Targets and preprocessed features, all without the sig_id identifier.
y_train = train_target_df.drop(columns="sig_id")
x_train = preprocess_df(train_df.drop(columns="sig_id"))
x_test = preprocess_df(test_df.drop(columns="sig_id"))

# Split the feature columns by whether their name contains a dash.
has_dash = x_train.columns.str.contains('-')
bio_cols = x_train.columns[has_dash]
non_bio_cols = x_train.columns[~has_dash]

In [None]:
# Normalise treated rows against their controls for every dose/time group.
for i in ['D1', 'D2']:
    for j in [1, 2, 3]:
        x_train = ctrl_normalize(x_train, i, j)
        x_test = ctrl_normalize(x_test, i, j)

# Fit the scaler on the training features only, then apply the same
# transformation to both frames so the model sees train and test features on
# the same scale.
scaler = StandardScaler().fit(x_train.loc[:, bio_cols].values)
x = scaler.transform(x_train.loc[:, bio_cols].values)
x_train.loc[:, bio_cols] = x
x_test.loc[:, bio_cols] = scaler.transform(x_test.loc[:, bio_cols].values)

In [None]:
def augment_data(df, n_components=100):
    """Append PCA components of the ``bio_cols`` features to ``df``.

    A new PCA is fitted on the frame that is passed in, so components
    produced by separate calls come from separate fits.

    Args:
        df: Frame containing the module-level ``bio_cols`` columns.
        n_components: Number of principal components to append.

    Returns:
        A new DataFrame: ``df`` followed by one integer-named column per
        component.
    """
    # Worth ~0.00015 reduction in log loss
    pca = PCA(n_components=n_components)
    pca_x = pca.fit_transform(df.loc[:, bio_cols].values)
    # Reuse df's index so concat pairs rows correctly even when df does not
    # carry a default RangeIndex.
    components = pd.DataFrame(data=pca_x, index=df.index)
    return pd.concat([df, components], axis=1)

In [None]:
# x_train = augment_data(x_train)
# x_test = augment_data(x_test)

In [None]:
one_hot_cols = ['cp_time_oh', 'cp_dose'] # 'cp_time'

x_train = handle_cats(x_train)
x_test = handle_cats(x_test)

# The two frames are encoded independently; align the test frame to the
# training columns (same set, same order, absent indicators filled with 0)
# because the model consumes the features positionally.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

N_FEATURES = x_train.shape[1]

x_train

### Define Model Architecture

In [None]:
def create_model(params):
    """Build and compile the feed-forward multi-label classifier.

    The network is: Gaussian noise -> batch norm -> for each hidden width a
    weight-normalised Dense layer, dropout and batch norm -> a
    weight-normalised sigmoid layer with ``N_TARGETS`` units.

    Args:
        params: Dict with the keys
            ``learning_rate`` (Adam learning rate),
            ``hidden_layers`` (iterable of hidden layer widths),
            ``dropout`` (dropout rate after every hidden layer),
            ``label_smoothing`` (passed to ``BinaryCrossentropy``),
            ``noise`` (standard deviation of the input ``GaussianNoise``).

    Returns:
        The compiled Keras ``Model``; it reports the ``log_loss`` and
        ``accuracy`` metrics.
    """
    learning_rate = params['learning_rate']
    hidden_layers = params['hidden_layers']
    dropout = params['dropout']
    label_smoothing = params['label_smoothing']
    noise = params['noise']

    # shape must be a tuple: one feature vector of length N_FEATURES per row.
    inputs = Input(shape=(N_FEATURES,))
    x = GaussianNoise(noise)(inputs)
    x = BatchNormalization()(x)

    for units in hidden_layers:
        x = WeightNormalization(Dense(units, activation=ACTIVATION))(x)
        x = Dropout(dropout)(x)
        x = BatchNormalization()(x)

    x = WeightNormalization(Dense(N_TARGETS, activation='sigmoid'))(x)

    model = Model(inputs, x)
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=[log_loss, 'accuracy'])

    return model

### Main CV and Model Training Function

In [None]:
# Both callbacks watch the validation value of the `log_loss` metric, so they
# need validation data to be supplied to model.fit.

# Multiply the learning rate by 0.1 after 3 epochs without improvement,
# never going below 1e-5.
cb_lr = ReduceLROnPlateau(
    monitor='val_log_loss',
    patience=3,
    factor=0.1,
    min_lr=1e-5)

# Stop after 5 epochs without an improvement of at least 1e-5 and roll the
# model back to its best weights.
es = EarlyStopping(
    monitor='val_log_loss',
    patience=5,
    min_delta=1e-5,
    restore_best_weights=True)

In [None]:
# !pip install keras-hypetune

# Candidate hyper-parameter values; every entry is a list of options.
param_grid = dict(
    learning_rate=[0.0008],
    hidden_layers=[(2048, 2048)],
    dropout=[0.65],
    epochs=[50],
    batch_size=[BATCH_SIZE],
    noise=[0.3, 0.4],
    label_smoothing=[0.0005, 0.0001],
)

# kgs = KerasGridSearch(create_model, param_grid, monitor='val_loss', greater_is_better=False)
# Xt1, Xv1, Xt2, Xv2, yt, yv = train_test_split(non_embed_data, embed_data, y_train, test_size=0.33, random_state=42)
# kgs.search([Xt1, Xt2], yt, validation_data=([Xv1, Xv2], yv), callbacks=[es, cb_lr])

# print(f'Best Score: {kgs.best_score}; Best Params: {kgs.best_params}')

In [None]:
def build_train(folds = 5):
    """Train one model per (time window, stratified fold) pair.

    The training rows are grouped by ``cp_time`` (1, 2, 3) and the groups
    are walked with ``TimeSeriesSplit(n_splits=2)``. Only the training side
    of every split is used; the held-out group is just printed. Inside each
    window the rows are split with ``MultilabelStratifiedKFold`` and a fresh
    model is fitted on each fold, using the held-out part of the fold as
    validation data for the early-stopping and learning-rate callbacks.

    Args:
        folds: Number of stratified folds per time window.

    Returns:
        A ``(models, oof_preds)`` tuple. ``models`` is the list of fitted
        models. ``oof_preds`` has the index and columns of ``y_train`` and
        holds, for every row, the mean of the predictions made for it while
        it was held out; rows that were never held out stay at 0.
    """
    models = []
    # Start from zeros (not from the labels) so the out-of-fold frame only
    # ever contains model output.
    oof_preds = pd.DataFrame(0.0, index=y_train.index, columns=y_train.columns)
    # Number of held-out predictions accumulated per row.
    oof_counts = pd.Series(0, index=y_train.index)

    seed_everything(SEED)

    # turn x_train into array broken out by time: one row of index labels
    # per cp_time value, resized to the length of the cp_time == 1 group so
    # the three rows stack into a rectangular array.
    group_size = x_train.loc[x_train['cp_time'] == 1].shape[0]
    labels_by_time = np.array([
        x_train.loc[x_train['cp_time'] == 1].index.values,
        np.resize(x_train.loc[x_train['cp_time'] == 2].index.values, group_size),
        np.resize(x_train.loc[x_train['cp_time'] == 3].index.values, group_size)
    ])

    tfold = TimeSeriesSplit(n_splits=2)

    for t_fold, (train_idx, test_idx) in enumerate(tfold.split(labels_by_time)):
        print('\n')
        print('-'*50)
        print(f'Training tfold {t_fold + 1} ({train_idx} and {test_idx})')

        # Positional mask of the rows that belong to this time window;
        # applied to both frames so features and targets stay paired.
        in_window = x_train.index.isin(labels_by_time[train_idx].flatten())

        x = x_train[in_window]
        y = y_train[in_window]
        x_values = x.values
        y_values = y.values

        kfold = MultilabelStratifiedKFold(folds, shuffle=True, random_state=SEED)

        for k_fold, (kt_idx, kte_idx) in enumerate(kfold.split(x, y)):
            print('\n')
            print('-'*50)
            print(f'Training fold {k_fold + 1} ({kt_idx} and {kte_idx})')

            model = create_model({
                'learning_rate': 0.0008,
                'hidden_layers': [2048, 2048],
                'dropout': 0.65,
                'label_smoothing': 0.0001,
                'noise': 0.4
            })

            x_val = x_values[kte_idx]
            y_val = y_values[kte_idx]

            model.fit(
                x_values[kt_idx],
                y_values[kt_idx],
                # The callbacks monitor val_log_loss, which only exists when
                # validation data is supplied.
                validation_data=(x_val, y_val),
                callbacks = [cb_lr, es],
                epochs=EPOCHS,
                batch_size=BATCH_SIZE)

            # kte_idx holds positions inside the window subset; translate
            # them to index labels before writing into the full-size frame.
            val_labels = x.index[kte_idx]
            oof_preds.loc[val_labels] += model.predict(x_val)
            oof_counts.loc[val_labels] += 1
            models.append(model)

    # Average by the number of predictions each row actually received.
    predicted = oof_counts > 0
    oof_preds.loc[predicted] = oof_preds.loc[predicted].div(
        oof_counts[predicted], axis=0)

    return models, oof_preds

In [None]:
# Train with FOLDS stratified folds per time window; keeps the fitted models
# and the out-of-fold prediction frame.
models, oof_preds = build_train(folds=FOLDS)

In [None]:
# Score of the out-of-fold predictions before any post-processing.
print(f"OOF Log Loss: {multi_log_loss(y_train, oof_preds)}")

In [None]:
# oof_preds has no sig_id column, so every column is a target: clip all of
# them rather than skipping the first one.
oof_preds.loc[:, target_cols] = np.clip(oof_preds.loc[:, target_cols], 0.001, 0.999)
# Rows with cp_type == 0 are forced to an all-zero prediction.
oof_preds.loc[x_train['cp_type'] == 0, target_cols] = 0

print(f"OOF Log Loss: {multi_log_loss(y_train, oof_preds)}")

### Make Test Predictions and Save Submission

In [None]:
test_preds = sub.copy()

# Accumulate the ensemble mean in a float array so float predictions are
# never added in place to integer-initialised DataFrame columns.
test_features = x_test.values
mean_pred = np.zeros((len(test_features), len(target_cols)), dtype=float)
for model in models:
    mean_pred += model.predict(test_features)
mean_pred /= len(models)

test_preds[target_cols] = np.clip(mean_pred, 0.001, 0.999)
# Rows with cp_type == 0 are forced to an all-zero prediction.
test_preds.loc[x_test['cp_type'] == 0, target_cols] = 0
test_preds.to_csv('submission.csv', index=False)

K.clear_session()