# Kaggle - Mechanisms of Action (MoA) Prediction

In [None]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.metrics import log_loss
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer

import plotly.graph_objects as go
import plotly.express as px

from IPython.display import clear_output
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

random_seed = 7

In [None]:
def preprocess_features(df, filter_ctl = False):
    """Encode the categorical columns and rescale the ``g-`` / ``c-`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing ``cp_type``, ``cp_dose`` and ``cp_time`` plus
        the columns whose names contain ``'g-'`` or ``'c-'``.
    filter_ctl : bool, default False
        If True, rows whose ``cp_type`` equals ``'ctl_vehicle'`` are removed
        and the ``cp_type`` column is dropped.  If False, ``cp_type`` is
        encoded as 0 (``'trt_cp'``) / 1 (``'ctl_vehicle'``).

    Returns
    -------
    pandas.DataFrame
        The encoded categorical columns followed by the rescaled ``g-`` and
        ``c-`` columns, indexed like the kept rows of ``df``.  Rows that
        contain any missing value are dropped.

    Notes
    -----
    ``cp_dose`` (and ``cp_type`` when kept) are mapped through a fixed
    dictionary, so a value outside the mapping becomes NaN and its row is
    removed by the final ``dropna``.
    """

    def _rescale(block):
        # Map each column onto a normal distribution with a fresh transformer.
        if block.shape[1] == 0:
            # No column matched the name filter: nothing to fit.
            return block
        transformer = QuantileTransformer(n_quantiles=100, random_state=random_seed, output_distribution='normal')
        return pd.DataFrame(transformer.fit_transform(block), index=block.index, columns=block.columns)

    # split (copy so the in-place column edits below never touch `df`)
    df_A = df.loc[:, [c for c in df.columns if c in ['cp_type', 'cp_dose', 'cp_time']]].copy()
    df_g = df.loc[:, [c for c in df.columns if 'g-' in c]]
    df_c = df.loc[:, [c for c in df.columns if 'c-' in c]]

    # cp_type
    if filter_ctl:
        # Positional row mask, reused for the g-/c- blocks so that all three
        # parts keep exactly the same rows and no NaN padding is introduced
        # when they are concatenated.
        keep = (df_A['cp_type'] != 'ctl_vehicle').to_numpy()
        df_A = df_A.loc[keep, :].drop(columns='cp_type')
    else:
        keep = np.ones(len(df_A), dtype=bool)
        df_A['cp_type'] = df_A['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})

    # cp_dose
    df_A['cp_dose'] = df_A['cp_dose'].map({'D1': 0, 'D2': 1})

    # cp_time: one-hot encoded as floats so the final frame stays purely numeric
    df_A = pd.get_dummies(df_A, columns = ['cp_time'], dtype=float)

    # g- and c- : rescale using QuantileTransformer.  The transformer is fitted
    # on every row of `df` (control rows included) and filtered afterwards.
    df_g = _rescale(df_g).loc[keep, :]
    df_c = _rescale(df_c).loc[keep, :]

    list_df = [df_A, df_g, df_c]

    return pd.concat(list_df, axis = 1).dropna()

def kaggle_replace(p):
    """Clip probabilities to the closed interval [1e-15, 1 - 1e-15].

    Parameters
    ----------
    p : float or array-like of float
        Probability value(s) to bound.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        ``p`` with every value below ``10**-15`` raised to ``10**-15`` and
        every value above ``1 - 10**-15`` lowered to ``1 - 10**-15``.  Array
        inputs are clipped element-wise and keep their shape.
    """
    # np.clip is the element-wise form of max(min(p, upper), lower), so scalar
    # results are unchanged while arrays are now supported as well.
    return np.clip(p, 10**-15, 1-10**(-15))

def kaggle_score(Y_true, Y_pred):
    """Return the mean of the per-column binary log losses.

    Both frames are sorted by row index and by column name first; the loss of
    each column of ``Y_true`` is then computed against the column of the same
    name in ``Y_pred`` and the losses are averaged.
    """
    truth = Y_true.sort_index().sort_index(axis = 1)
    predicted = Y_pred.sort_index().sort_index(axis = 1)

    # labels is passed explicitly so a column with a single class still works.
    per_target = [
        log_loss(truth.loc[:, name], predicted.loc[:, name], labels = [0,1])
        for name in truth.columns
    ]
    return np.mean(per_target)

## Load data

In [None]:
# Directory holding the competition CSV files; the read paths below are built from it.
data_folder = 'lish-moa'

In [None]:
# Read every CSV of the data folder, in a fixed order, into a name -> frame map.
loaded = {}
for stem in ('train_features',
             'test_features',
             'train_targets_scored',
             'train_targets_nonscored',
             'sample_submission'):
    file_name = f"{data_folder}/{stem}.csv"
    loaded[stem] = pd.read_csv(file_name)

# These two tables keep their default integer index.
train_targets_nonscored = loaded['train_targets_nonscored']
sample_submission = loaded['sample_submission']

# set id
train_targets_scored = loaded['train_targets_scored'].set_index('sig_id')
train_features = loaded['train_features'].set_index('sig_id')
test_features = loaded['test_features'].set_index('sig_id')

In [None]:
# Encode and rescale both feature tables, discarding the control-vehicle rows.
# Each table is preprocessed on its own (train first, then test).
train_features, test_features = (
    preprocess_features(frame, filter_ctl=True)
    for frame in (train_features, test_features)
)

# Keep only the target rows whose sig_id survived preprocessing, in the same order.
train_targets_scored = train_targets_scored.loc[train_features.index, :]

### PCA

In [None]:
# Number of principal components added as extra features.
latent_dim = 50

# The projection is fitted on the train and test rows stacked together.
X = pd.concat([train_features, test_features], axis = 0)

# Seed the decomposition: PCA's random_state is used by its randomized/arpack
# solvers, so without it repeated runs may yield different components.
pca = PCA(n_components=latent_dim, random_state=random_seed)
pca_features = pca.fit_transform(X)

pca_features = pd.DataFrame(pca_features, index = X.index, columns=[f'pca_{i}' for i in range(pca_features.shape[1])])

# Rescale the components to a normal distribution, like the raw g-/c- features.
transformer = QuantileTransformer(n_quantiles=100, random_state=random_seed, output_distribution='normal')
pca_features = pd.DataFrame(transformer.fit_transform(pca_features), index=pca_features.index, columns=pca_features.columns)

# Split the components back into their train and test rows.
pca_train_features = pca_features.loc[train_features.index, :]
pca_test_features = pca_features.loc[test_features.index, :]

In [None]:
# Append the PCA components as extra columns next to the existing features.
train_features = pd.concat((train_features, pca_train_features), axis = 'columns')
test_features = pd.concat((test_features, pca_test_features), axis = 'columns')

### Select K-best

In [None]:
# Number of features to keep: 90% of the current feature count.
k_best = int(0.9*train_features.shape[1])

target_names = train_targets_scored.columns
n_targets = len(target_names)

# One row per target, one column per feature; 1 marks a feature that
# SelectKBest retained for that target.  A preallocated integer matrix avoids
# growing an object-dtype DataFrame one row at a time.
votes = np.zeros((n_targets, train_features.shape[1]), dtype=int)

for i,c in enumerate(target_names):
    print(f"{i}/{n_targets} : {c}", end = '\r')
    X_new = SelectKBest(f_classif, k=k_best).fit(train_features, train_targets_scored[c])
    votes[i, :] = X_new.get_support(indices=False)

df_k_best = pd.DataFrame(votes, index=target_names, columns=train_features.columns)

# Rank features by the number of targets that selected them.  The stable sort
# makes the order of equally-voted features deterministic.
k_best_features =  df_k_best.sum().sort_values(ascending = False, kind = 'stable').index[0:k_best]

# NOTE(review): the visible code never restricts train_features/test_features
# to k_best_features - confirm whether the selection is meant to be applied.

## Shuffle and convert to arrays

In [None]:
# Shuffle the features and their targets with one shared, seeded permutation
# so the rows of both tables stay aligned.
train_features, train_targets_scored = shuffle(
    train_features,
    train_targets_scored,
    random_state=random_seed,
)

In [None]:
# Plain arrays for Keras: training inputs, training targets and test inputs.
X_train, Y_train, X_test = (
    frame.to_numpy()
    for frame in (train_features, train_targets_scored, test_features)
)

### Train

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation, Dropout, BatchNormalization, InputLayer
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.losses import categorical_crossentropy, BinaryCrossentropy

from tensorflow_addons.optimizers import Lookahead

In [None]:
# Network width at both ends: feature count in, target count out.
input_dim, output_dim = X_train.shape[1], Y_train.shape[1]

In [None]:
# Model construction
def get_model(size_layer = 64, dropout = 0.35,
              loss = None, opt = None):
    """Build and compile the dense multi-label classifier.

    The network reads ``input_dim`` features and emits ``output_dim`` sigmoid
    outputs (both are module-level values).  It stacks three
    Dense -> ReLU -> BatchNormalization blocks; the first two are followed by
    Dropout.

    Parameters
    ----------
    size_layer : int, default 64
        Number of units of each hidden Dense layer.
    dropout : float, default 0.35
        Dropout rate applied after the first two hidden blocks.
    loss : Keras loss instance, loss class or None
        Loss passed to ``compile``.  ``None`` (default) means a new
        ``BinaryCrossentropy()``; a loss class is instantiated without
        arguments.
    opt : Keras optimizer or None
        Optimizer passed to ``compile``.  ``None`` (default) means a new
        ``Adam(1e-3)`` created for this model.

    Returns
    -------
    tensorflow.keras.Sequential
        The compiled model.
    """
    # Build the defaults per call: an optimizer created in the signature would
    # be one shared instance reused by every model built with the default.
    if opt is None:
        opt = Adam(1e-3)
    if loss is None:
        loss = BinaryCrossentropy()
    elif isinstance(loss, type):
        # A loss class was passed instead of an instance.
        loss = loss()

    # The input shape is given as a tuple, which is what InputLayer expects.
    layers = [InputLayer(input_shape=(input_dim,))]
    for with_dropout in (True, True, False):
        layers += [Dense(size_layer), Activation('relu'), BatchNormalization()]
        if with_dropout:
            layers.append(Dropout(dropout))
    layers += [Dense(output_dim), Activation('sigmoid')]

    model = Sequential(layers)

    model.compile(optimizer=opt,
                  loss=loss)

    return model

In [None]:
tf.random.set_seed(random_seed)

# Repeated K-fold: n_seed independent splits of n_splits folds each.
n_seed = 10
n_splits = 10

# Accumulators for the out-of-fold and test predictions.  They are created as
# float frames: zeroing a copy of the integer target table can leave integer
# columns (depending on the pandas version), into which probabilities cannot
# be added safely.
results = pd.DataFrame(0.0,
                       index=train_targets_scored.index,
                       columns=train_targets_scored.columns)

# .loc also checks that every test id exists in the sample submission.
submission_template = sample_submission.set_index('sig_id').loc[test_features.index, :]
submission = pd.DataFrame(0.0,
                          index=submission_template.index,
                          columns=submission_template.columns)

epochs = 5000          # upper bound; EarlyStopping ends each training run
batch_size = 16
patience = 10
size_layer = 128
dropout = 0.5
loss = BinaryCrossentropy(label_smoothing=0.001)

for seed in range(n_seed):
    print(f'\n### Seed {seed} ###')
    folds = MultilabelStratifiedKFold(n_splits=n_splits,
                                      random_state=seed,
                                      shuffle=True)
    for n, (train_idx, val_idx) in enumerate(folds.split(train_features,
                                                         train_targets_scored)):

        print(f'\nFold {n}')
        checkpoint_path = f'Seed:{seed}_Fold:{n}.hdf5'

        # Release the global Keras state left by the previous fold's model so
        # memory does not keep growing over the n_seed * n_splits models.
        tf.keras.backend.clear_session()

        # A new optimizer and model for every fold.
        opt = Lookahead(Adam(1e-3))
        model = get_model(size_layer, dropout, loss, opt)

        es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience)
        mc = ModelCheckpoint(checkpoint_path, monitor='val_loss', mode='min', verbose=1, save_best_only=True)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-8)
        model.fit(X_train[train_idx], Y_train[train_idx],
                  validation_data = (X_train[val_idx], Y_train[val_idx]),
                  epochs=epochs,
                  batch_size=batch_size,
                  verbose=0,
                  callbacks=[es, mc, reduce_lr])

        # Go back to the weights of the epoch with the lowest val_loss.
        model.load_weights(checkpoint_path)

        val_predict = model.predict(X_train[val_idx])
        test_predict = model.predict(X_test)

        # Predictions are added by position, so the model's output order
        # (columns of train_targets_scored) is assumed to match the column
        # order of the sample submission - TODO confirm.
        results.iloc[val_idx, :] += val_predict
        submission += test_predict


# Every training row is a validation row once per seed; every test row is
# predicted by all n_splits * n_seed models.
results /= n_seed
submission /= n_splits * n_seed

## Validation score

In [None]:
# Validation
score_val = kaggle_score(train_targets_scored, results)

print(f"Validation score : {score_val}")

## Predict

In [None]:
# Test 
Y_test_pred = submission

Y_test_pred_complete = pd.DataFrame(0, index = [i for i in sample_submission['sig_id'] if i not in Y_test_pred.index], columns = Y_test_pred.columns)
Y_test_pred = pd.concat([Y_test_pred, Y_test_pred_complete], axis = 0)

Y_test_pred = Y_test_pred.sort_index().reset_index().rename(columns = {'index' : 'sig_id'}).loc[:, sample_submission.columns]

In [None]:
# Export
file_name = "submission.csv"
Y_test_pred.to_csv(file_name, sep = ',', index = False)