In [1]:
import numpy as np
import pandas as pd
import os

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.backend as K

from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.losses import BinaryCrossentropy
import tensorflow_addons as tfa

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn import preprocessing


from tqdm.notebook import tqdm

import math

In [35]:
# Load data
train_feature = pd.read_csv("../../Data/Mechanisms of Action (MoA) Prediction/train_features.csv")
test_feature = pd.read_csv("../../Data/Mechanisms of Action (MoA) Prediction/test_features.csv")
train_targets_scored = pd.read_csv("../../Data/Mechanisms of Action (MoA) Prediction/train_targets_scored.csv")
train_targets_nonscored = pd.read_csv("../../Data/Mechanisms of Action (MoA) Prediction/train_targets_nonscored.csv")
sub = pd.read_csv('../../Data/Mechanisms of Action (MoA) Prediction/sample_submission.csv')
data = train_feature.append(test_feature)

In [3]:
# data encoding
def preprocess(df):
#   df.loc[:,'cp_type'] =  df.loc[:,'cp_type'].map({'trt_cp' : 0,'ctl_vehicle' : 1})
    df.drop(['cp_type'],axis=1,inplace=True)
    df.loc[:,'cp_dose'] = df.loc[:,'cp_dose'].map({'D1':0,'D2':1})
    df.loc[:,'cp_time'] = df.loc[:,'cp_time'].map({24:0,48:1,72:2})
    df = pd.get_dummies(df,columns=['cp_dose','cp_time'])
    del df['sig_id']
    return df

In [4]:
train = preprocess(train_feature)
test = preprocess(test_feature)

In [5]:
# fit scaler to train and test data
scaler = preprocessing.MinMaxScaler()
scaler.fit(train)

train_transient = scaler.transform(train)
test_transient = scaler.transform(test)

train = pd.DataFrame(train_transient, columns=train.columns)
test = pd.DataFrame(test_transient, columns=test.columns)

In [6]:
y_train = train_targets_scored.drop(['sig_id'], axis=1)

In [7]:
def loss_fn(yt,yp):
    return log_loss(yt, yp, eps=1e-15, labels=[0,1])

In [8]:
def create_model(num_input, activation='relu'):
    model=tf.keras.Sequential()
    model.add(L.Input(num_input))
    model.add(L.BatchNormalization())
    model.add(L.Dropout(0.2))
    model.add(tfa.layers.WeightNormalization(L.Dense(1024,activation = activation)))
    model.add(L.BatchNormalization())
    model.add(L.Dropout(0.2))
    model.add(tfa.layers.WeightNormalization(L.Dense(1024,activation = activation)))
    model.add(L.BatchNormalization())
    model.add(tfa.layers.WeightNormalization(L.Dense(206,activation = 'sigmoid')))
    
    model.compile(optimizer = tfa.optimizers.AdamW(lr=1e-3, weight_decay=1e-5
                                                   ,clipvalue=786)
                  , loss=BinaryCrossentropy(label_smoothing=1e-15))
    
    return model


In [9]:
# Use All feats as top feats
top_feats = [i for i in range(train.shape[1])]
print("Top feats length:",len(top_feats))

Top feats length: 877


In [27]:
def metric(y_true,y_pred):
    metrics=[]
    for _target in train_targets_scored.columns[1:]:
        metrics.append(loss_fn(y_true.loc[:,_target],y_pred.loc[:,_target].astype(float)))
        
    return np.mean(metrics)

In [11]:
SEED = 1
EPOCHS = 20
BATCH_SIZE = 128
FOLDS = 5
REPEATS = 1
LR = 0.0005
N_TARGETS = len(train_targets_scored.columns)

In [12]:
def seed_everything(seed):
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)

In [13]:
def build_train(resume_models=None, repeat_number=0, folds=5, skip_folds=0):
    
    models=[]
    preds = y_train.copy()
    
    kfold = KFold(folds, shuffle=True)
    for fold,(train_indices,val_indices) in enumerate(kfold.split(train)):
        print('\n')
        print('-'*50)
        print(f'Training fold {fold+1}')
        print(fold)
        cb_lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(
            monitor = 'val_loss', factor = 0.4, patience = 2, 
            verbose = 1, min_delta = 0.0001, mode = 'auto')
        
        checkpoint_path = f'Repeat{repeat_number}_Fold{fold}.hdf5'
        cb_checkpt = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, 
                                                        monitor = 'val_loss', 
                                                        verbose = 0, 
                                                        save_best_only = True, 
                                                        save_weights_only = True, 
                                                        mode = 'min')
        
        model = create_model(len(top_feats))
        model.fit(train.values[train_indices],
              y_train.values[train_indices],
              validation_data=(train.values[val_indices], y_train.values[val_indices]),
              callbacks = [cb_lr_schedule, cb_checkpt],
              epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=2
             )
        
        model.load_weights(checkpoint_path)
        preds.loc[val_indices, :] = model.predict(train.values[val_indices])
        models.append(model)
        
    return models,preds
    

In [14]:
models = []
oof_preds = []
# seed everything
seed_everything(SEED)
for i in range(REPEATS):
    m, oof = build_train(repeat_number = i, folds=FOLDS)
    models = models + m
    oof_preds.append(oof)



--------------------------------------------------
Training fold 1
0
Epoch 1/20
149/149 - 11s - loss: 0.2779 - val_loss: 0.0391
Epoch 2/20
149/149 - 11s - loss: 0.0262 - val_loss: 0.0230
Epoch 3/20
149/149 - 11s - loss: 0.0194 - val_loss: 0.0193
Epoch 4/20
149/149 - 11s - loss: 0.0176 - val_loss: 0.0175
Epoch 5/20
149/149 - 11s - loss: 0.0166 - val_loss: 0.0165
Epoch 6/20
149/149 - 11s - loss: 0.0156 - val_loss: 0.0172
Epoch 7/20
149/149 - 11s - loss: 0.0150 - val_loss: 0.0160
Epoch 8/20
149/149 - 11s - loss: 0.0142 - val_loss: 0.0160
Epoch 9/20
149/149 - 11s - loss: 0.0135 - val_loss: 0.0158
Epoch 10/20
149/149 - 11s - loss: 0.0128 - val_loss: 0.0158
Epoch 11/20

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0004000000189989805.
149/149 - 11s - loss: 0.0118 - val_loss: 0.0159
Epoch 12/20
149/149 - 11s - loss: 0.0103 - val_loss: 0.0157
Epoch 13/20

Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.00016000000759959222.
149/149 - 11s - loss: 0.0095 - val_loss: 0.

Epoch 18/20
149/149 - 11s - loss: 0.0102 - val_loss: 0.0157
Epoch 19/20

Epoch 00019: ReduceLROnPlateau reducing learning rate to 4.09600033890456e-06.
149/149 - 11s - loss: 0.0102 - val_loss: 0.0157
Epoch 20/20
149/149 - 11s - loss: 0.0102 - val_loss: 0.0157


In [16]:
target_cols = train_targets_scored.columns[1:]
N_TARGETS = len(target_cols)

In [26]:
target_cols

Index(['5-alpha_reductase_inhibitor', '11-beta-hsd1_inhibitor',
       'acat_inhibitor', 'acetylcholine_receptor_agonist',
       'acetylcholine_receptor_antagonist', 'acetylcholinesterase_inhibitor',
       'adenosine_receptor_agonist', 'adenosine_receptor_antagonist',
       'adenylyl_cyclase_activator', 'adrenergic_receptor_agonist',
       ...
       'tropomyosin_receptor_kinase_inhibitor', 'trpv_agonist',
       'trpv_antagonist', 'tubulin_inhibitor', 'tyrosine_kinase_inhibitor',
       'ubiquitin_specific_protease_inhibitor', 'vegfr_inhibitor', 'vitamin_b',
       'vitamin_d_receptor_agonist', 'wnt_inhibitor'],
      dtype='object', length=206)

In [32]:
mean_oof_preds = y_train.copy()
mean_oof_preds.loc[:, target_cols] = 0
for i, p in enumerate(oof_preds):
    print(f"Repeat {i + 1} OOF Log Loss: {metric(y_train, p)}")
    mean_oof_preds.loc[:, target_cols] += p[target_cols]

mean_oof_preds.loc[:, target_cols] /= len(oof_preds)
print(f"Mean OOF Log Loss: {metric(y_train, mean_oof_preds)}")
# mean_oof_preds.loc[train_feature['cp_type'] == 0, target_cols] = 0
# print(f"Mean OOF Log Loss (ctl adjusted): {metric(y_train, mean_oof_preds)}")

Repeat 1 OOF Log Loss: 0.01567695950269504
Mean OOF Log Loss: 0.01567695950269504


In [37]:
test_preds = sub.copy()
test_preds[target_cols] = 0
for model in models:
    test_preds.loc[:,target_cols] += model.predict(test)
test_preds.loc[:,target_cols] /= len(models)
# test_preds.loc[x_test['cp_type'] == 0, target_cols] = 0
test_preds.to_csv('submission_1.csv', index=False)

In [36]:
test

Unnamed: 0,g-0,g-1,g-2,g-3,g-4,g-5,g-6,g-7,g-8,g-9,...,c-95,c-96,c-97,c-98,c-99,cp_dose_0,cp_dose_1,cp_time_0,cp_time_1,cp_time_2
0,0.320196,0.544506,0.494816,0.402475,0.483780,0.569124,0.564588,0.660641,0.512204,0.695735,...,0.771086,0.742021,0.769870,0.774235,0.668236,1.0,0.0,1.0,0.0,0.0
1,0.343589,0.553916,0.593975,0.346656,0.366779,0.559432,0.468355,0.672281,0.510785,0.573060,...,0.738396,0.743125,0.736893,0.657387,0.671496,1.0,0.0,0.0,0.0,1.0
2,0.367318,0.519358,0.501866,0.383110,0.301240,0.592842,0.557376,0.625567,0.566231,0.603974,...,0.701264,0.804606,0.852368,0.800275,0.710308,1.0,0.0,1.0,0.0,0.0
3,0.386502,0.550529,0.546426,0.401450,0.353320,0.509085,0.611539,0.634660,0.579049,0.584374,...,0.710122,0.800674,0.763657,0.805774,0.682296,0.0,1.0,1.0,0.0,0.0
4,0.329730,0.414718,0.634583,0.387780,0.353265,0.577676,0.606519,0.687423,0.543289,0.489167,...,0.865460,0.718534,0.732645,0.838464,0.857515,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,0.384845,0.479092,0.719832,0.336155,0.441365,0.672607,0.580627,0.605468,0.465982,0.613961,...,0.743008,0.646575,0.686475,0.730936,0.676566,1.0,0.0,1.0,0.0,0.0
3978,0.317443,0.508742,0.673291,0.396525,0.415975,0.596615,0.534028,0.645363,0.543310,0.699729,...,0.879741,0.759822,0.752167,0.761719,0.717610,1.0,0.0,1.0,0.0,0.0
3979,0.329691,0.517966,0.539813,0.332335,0.390018,0.606359,0.575976,0.694765,0.531297,0.658632,...,0.816810,0.743880,0.819313,0.730646,0.713481,1.0,0.0,0.0,0.0,1.0
3980,0.284729,0.367669,0.501486,0.438430,0.307533,0.565866,0.556101,0.562141,0.583205,0.610135,...,0.790432,0.721081,0.813015,0.819388,0.720869,0.0,1.0,0.0,1.0,0.0


In [38]:
pwd

'/home/kratagya/Desktop/ML_AI/python files/MOA prediction'