In [1]:
import os
import random
from datetime import datetime

import kerastuner
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from kerastuner.engine.hyperparameters import HyperParameters
from kerastuner.tuners import RandomSearch
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras import backend
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tqdm import tqdm

In [2]:
# Use float64 as the default Keras float type for this notebook.
tf.keras.backend.set_floatx('float64')

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
# Looping over the device list (rather than indexing [0]) keeps this cell working on
# CPU-only machines, where the list is empty, and applies the same setting to every GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
# Run identifier used to name this run's log directory, formatted like '2020-10-20-1238'.
# strftime is used instead of slicing str(datetime.now()): the string form omits the
# fractional seconds when microsecond == 0, so a fixed-width slice would cut into the date.
dir_name = datetime.now().strftime('%Y-%m-%d-%H%M')

In [4]:
# Show the generated run name (used in the next cell to build the log directory path).
dir_name

'2020-10-20-1238'

In [5]:
# Log directory for this run, passed to the tuner. os.path.join picks the separator
# for the current OS instead of hard-coding the Windows backslash.
directory = os.path.join('logs', dir_name)

In [6]:
# Load the raw features and the scored targets.
# The control filtering below relies on both files sharing the same row order / index.
train_raw = pd.read_csv('train_features.csv')
labels = pd.read_csv('train_targets_scored.csv')

# On Kaggle, read from the competition input directory instead:
#train_raw = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
#labels = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')

# scale: standardise every column from the fifth onwards
# (presumably the first four are sig_id, cp_type, cp_time, cp_dose -- confirm against the CSV header).
# NOTE(review): the scaler is fit on all rows, including the control rows dropped below and
# the rows that end up in the hold-out split; fit on the training rows only if that matters.
feature_cols = train_raw.columns[4:]
ss = StandardScaler()
train_raw[feature_cols] = ss.fit_transform(train_raw[feature_cols])

#drop controls
# .copy() makes all_features an independent frame, so the column assignments below
# no longer trigger SettingWithCopyWarning.
is_treated = train_raw['cp_type'] == 'trt_cp'
all_features = train_raw[is_treated].copy()

#decode
all_features['cp_time'] = all_features['cp_time'].map({ 24: 0, 48: 0.5, 72: 1})
all_features['cp_dose'] = all_features['cp_dose'].map({'D1': 0, 'D2': 1})

# drop controls from labels (same row mask as the features; the original index is kept
# so the split below can select label rows by the sampled feature index)
labels = labels[is_treated]

#train_test_split: 85% train / 15% hold-out, seeded so the split is repeatable across runs
SPLIT_SEED = 42
train_features = all_features.sample(int(len(all_features)*.85), random_state=SPLIT_SEED)
test_features = all_features.drop(train_features.index)
train_labels = labels.loc[train_features.index]
test_labels = labels.drop(train_features.index)

train_features = train_features.reset_index(drop=True)
test_features = test_features.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
test_labels = test_labels.reset_index(drop=True)

# Sanity check: features and labels must line up row-for-row in both splits.
train_ids_match = list(train_features['sig_id']) == list(train_labels['sig_id'])
test_ids_match = list(test_features['sig_id']) == list(test_labels['sig_id'])

print('Check train / test split is correct: \n')
print('train sig_ids match:')
print(train_ids_match)
print('test sig_ids match:')
print(test_ids_match)

# Stop the notebook here rather than train on misaligned rows.
assert train_ids_match, 'train features and labels are misaligned'
assert test_ids_match, 'test features and labels are misaligned'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Check train / test split is correct: 

train sig_ids match:
True
test sig_ids match:
True


In [7]:
# Clipping bounds applied to predictions before taking logs, so log(0) never occurs.
p_min = 0.0005
p_max = 0.9995
def logloss(y_true, y_pred): #training logloss function
    """Mean binary log loss over all elements, with predictions clipped to [p_min, p_max].

    Args:
        y_true: Ground-truth labels (0/1), same shape as ``y_pred``.
        y_pred: Predicted probabilities.

    Returns:
        A scalar tensor: the negative mean of
        ``y_true*log(y_pred) + (1-y_true)*log(1-y_pred)``.
    """
    y_pred = tf.clip_by_value(y_pred,p_min,p_max)
    # Integer labels would raise a dtype mismatch in the products below when this
    # function is called directly, so bring y_true to the prediction dtype first.
    y_true = tf.cast(y_true, y_pred.dtype)
    return -backend.mean(y_true*backend.log(y_pred) + (1-y_true)*backend.log(1-y_pred))

In [8]:
def create_model(hp):
    """Build and compile one candidate network for the Keras Tuner search.

    Architecture: BatchNorm -> weight-normalised Dense (as wide as the feature count)
    -> BatchNorm -> Dropout, followed by 1-5 tunable Dense/BatchNorm/Dropout stacks and
    a sigmoid output with one unit per target column.

    The number of hidden stacks is declared through ``hp.Int`` rather than drawn with
    ``random.choice``: a depth the tuner cannot see is not recorded with the trial, so
    rebuilding the model from the best hyperparameters could produce a network of a
    different depth than the one that was trained.

    Args:
        hp: The Keras Tuner ``HyperParameters`` object for the current trial.

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam optimiser, label-smoothed binary
        cross-entropy loss, ``logloss`` as the tracked metric).
    """
    # Feature / target counts: every column after the first two of train_features
    # (presumably sig_id and cp_type) and after the first of labels (presumably sig_id).
    n_inputs = len(train_features.columns[2:])
    n_targets = len(labels.columns[1:])

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.BatchNormalization())

    model.add(tfa.layers.WeightNormalization(tf.keras.layers.Dense(n_inputs, activation='relu')))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dropout(hp.Float('Dropout_0', min_value=.2, max_value=.4, step=.05)))

    # Depth is a registered hyperparameter, so it is stored with each trial and
    # reproduced when the best model is rebuilt.
    num_dense_layers = hp.Int('num_dense_layers', min_value=1, max_value=5)
    for i in range(num_dense_layers):
        model.add(tf.keras.layers.Dense(hp.Int(f'{i}_dense_layer',
                                                min_value=512,
                                                max_value=2432,
                                                step=64),activation='relu'))
        model.add(tf.keras.layers.BatchNormalization())
        model.add(tf.keras.layers.Dropout(hp.Float(f'Dropout_{i+1}', min_value=.2, max_value=.4, step=.05)))

    model.add(tf.keras.layers.Dense(n_targets, activation='sigmoid'))

    # The metric keeps the function name 'logloss', so its validation value is
    # reported as 'val_logloss' (the quantity the tuner and callbacks monitor).
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.0005),
                  metrics=[logloss])

    return model

In [9]:
# Random search over the hyperparameters declared in create_model, ranked by the
# lowest validation value of the custom logloss metric. Results are written under
# `directory`, in the 'NN' project folder.
tuner = RandomSearch(
    create_model,
    project_name='NN',
    directory=directory,
    executions_per_trial=1,  # fits per hyperparameter set (repeat fits of one set can score differently)
    max_trials=10,  # number of hyperparameter sets to try
    objective=kerastuner.Objective("val_logloss", direction="min"),
)

In [10]:
# Both callbacks watch the validation value of the custom logloss metric, where lower
# is better; the direction is spelled out on both so they cannot disagree.
# restore_best_weights=True leaves a stopped model at its best epoch rather than at the
# weights from `patience` epochs later.
early_stop = EarlyStopping(monitor='val_logloss', mode='min', patience=7, verbose=0,
                           restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_logloss', factor=0.3, patience=5, mode='min', min_lr=1e-5)
callbacks_list = [early_stop, reduce_lr]

# Run the search on the training split; 15% of it is held back for validation.
tuner.search(x=train_features.drop(['sig_id','cp_type'],axis=1),
             y=train_labels.drop(['sig_id'],axis=1),
             verbose=0,
             epochs=10000, # upper bound only; EarlyStopping ends each fit long before this
             batch_size=256,
             callbacks=callbacks_list,
             validation_split=0.15
            )







KeyboardInterrupt: 

In [None]:
# Hyperparameter values of the first (top-ranked) entry returned by the tuner.
tuner.get_best_hyperparameters()[0].values

In [None]:
# Retrieve the single best model found by the search.
best_model = tuner.get_best_models(num_models=1)[0]

In [None]:
# Display the returned model object.
best_model

In [None]:
# Continue training the best tuned model on the training split, with the same batch
# size, validation fraction and callbacks that were used during the search.
final_inputs = train_features.drop(columns=['sig_id','cp_type'])
final_targets = train_labels.drop(columns=['sig_id'])

best_model.fit(
    final_inputs,
    final_targets,
    epochs=10000,
    batch_size=256,
    validation_split=0.15,
    callbacks=callbacks_list,
    verbose=1,
)

In [None]:
# Print the layer-by-layer summary of the best model.
best_model.summary()

In [None]:
# Use the sample submission as a template, trimmed to one row per hold-out sample.
# .copy() makes the trimmed frame independent of the full one, so the column
# assignments done when scoring do not act on a slice (SettingWithCopyWarning).
samp_sub = pd.read_csv('sample_submission.csv')
samp_sub = samp_sub.iloc[:len(test_features)].copy()

In [None]:
# actual logloss function, imported under an alias so it does not replace the Keras
# `logloss` metric defined earlier in the notebook (create_model still refers to that one)
from logloss import logloss as competition_logloss

#for scoring on fake test set
test_inputs = test_features.drop(['sig_id','cp_type'],axis=1)
test_targets = test_labels.drop(['sig_id'],axis=1)

# One batched predict call replaces the per-row loop: no hard-coded feature count in a
# reshape, and no re-dropping of columns on every iteration.
preds = best_model.predict(test_inputs)
# One label row (as a Series) per hold-out sample, in the same order as preds.
true = [test_targets.iloc[i] for i in range(len(test_targets))]

samp_sub[samp_sub.columns[1:]] = preds
samp_sub['sig_id'] = test_features['sig_id']

# Score each sample separately, then average the per-sample scores.
scores = [competition_logloss(row_true, row_pred) for row_true, row_pred in zip(true, preds)]
print(f'Log loss {np.mean(scores)}')

In [None]:
# Save the model under the relative logs folder, named after its hold-out log loss.
# A relative, OS-independent path replaces the hard-coded absolute user directory so the
# notebook runs on other machines; the folder is created if it does not exist yet.
os.makedirs('logs', exist_ok=True)
best_model.save(os.path.join('logs', f'{np.mean(scores)}.h5'))