In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from tensorflow.keras import layers

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score

from scipy.linalg import pinv2
import kerastuner as kt
import keras
from sklearn.metrics import confusion_matrix
import seaborn as sns

from numpy import asarray
from sklearn.datasets import make_multilabel_classification
from keras.models import Sequential

from keras.layers import Dense

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/lish-moa/train_targets_scored.csv
/kaggle/input/lish-moa/sample_submission.csv
/kaggle/input/lish-moa/train_drug.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv
/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/test_features.csv


In [2]:
def convertTypes(df):
    """Replace the string codes in ``cp_type`` and ``cp_dose`` with 0/1.

    ``cp_type``: 'trt_cp' -> 0, 'ctl_vehicle' -> 1.
    ``cp_dose``: 'D1' -> 0, 'D2' -> 1.

    The frame is modified in place and the same object is returned.
    """
    type_codes = {'trt_cp': 0, 'ctl_vehicle': 1}
    dose_codes = {'D1': 0, 'D2': 1}

    df['cp_type'] = df['cp_type'].replace(type_codes)
    df['cp_dose'] = df['cp_dose'].replace(dose_codes)
    return df

def prepareData(threshold = None
                , PCA_components=None
                , to_csv=False
                , X_train=None
                , X_test=None
                , internal=None
               ):
    """Load and preprocess the MoA features/targets and return train/test frames.

    Parameters
    ----------
    threshold : float or None
        If given, every non-categorical feature whose absolute correlation with
        an earlier feature exceeds this value is dropped.
    PCA_components : int, float or None
        If given, the non-categorical features are standardised and projected
        with PCA. A float is interpreted as a ratio of the available feature
        count and converted to an integer number of components.
    to_csv : bool
        When true, the resulting frames are written to ``/kaggle/working/``.
    X_train, X_test : pandas.DataFrame or None
        Raw feature frames (with a ``sig_id`` column). When omitted they are read
        from the competition CSV files. The frames passed in are not modified.
    internal : float or None
        If given, it is passed as ``test_size`` to ``train_test_split`` and the
        test set is carved out of the training data (so ``Y_test`` is available).
        Otherwise the external test features are used and ``Y_test`` is ``None``.

    Returns
    -------
    ((X_train, X_test), (Y_train, Y_test))
    """
    category_cols = ['cp_type', 'cp_time', 'cp_dose']

    if X_train is None:
        X_train = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')

    # Copy before touching the columns so a caller-supplied frame is left intact.
    X_train = X_train.copy()
    # To avoid errors, let's get rid of - sign
    X_train.columns = X_train.columns.str.replace('-', '_')
    X_train = X_train.set_index("sig_id")
    X_train = convertTypes(X_train)

    to_drop = []
    if threshold is not None:
        corr = X_train.loc[:, ~X_train.columns.isin(category_cols)].corr().abs()
        # Keep only the strict upper triangle so each pair is considered once
        # (builtin bool: the np.bool alias no longer exists in current NumPy).
        corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
        to_drop = [column for column in corr.columns if (corr[column] > threshold).any()]
        X_train = X_train.drop(columns=to_drop)

    pca = None
    scaler = None
    n_components = PCA_components
    if PCA_components is not None:
        # NOTE(review): scaler and PCA are fitted before the internal split below,
        # so held-out rows influence the projection — confirm this is acceptable.
        ds = X_train.loc[:, ~X_train.columns.isin(category_cols)]
        scaler = StandardScaler().fit(ds)

        ds = scaler.transform(ds)
        if isinstance(PCA_components, float):
            # Convert ratio to amount of components
            n_components = int(PCA_components * ds.shape[1])
        pca = PCA(n_components=n_components).fit(ds)

        ds = pca.transform(ds)
        X_train = pd.concat([X_train[category_cols], pd.DataFrame(ds, index=X_train.index)], axis=1)

        # Make sure columns aren't digits
        X_train.columns = ['_{}'.format(x) if isinstance(x, (int, np.integer)) else x
                           for x in X_train.columns]

    Y_train = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
    Y_train = Y_train.set_index("sig_id")

    Y_train.columns = Y_train.columns.str.replace('-', '_')
    Y_train.columns = Y_train.columns.map('_{}'.format)

    Y_test = None

    if internal is not None:
        # Internal hold-out: split the processed training rows and align the targets.
        X_train, X_test = train_test_split(X_train, test_size=internal, random_state=0)
        Y_test = Y_train.loc[X_test.index]
        Y_train = Y_train.loc[X_train.index]

    else:
        if X_test is None:
            X_test = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')

        X_test = X_test.copy()
        # Apply the same column renaming and feature selection as for the training
        # frame; otherwise the fitted scaler/PCA would see different features.
        X_test.columns = X_test.columns.str.replace('-', '_')
        X_test = X_test.set_index("sig_id")
        X_test = convertTypes(X_test)
        if to_drop:
            X_test = X_test.drop(columns=to_drop)

        if pca is not None:
            # Apply PCA
            ds = X_test.loc[:, ~X_test.columns.isin(category_cols)]
            ds = scaler.transform(ds)
            ds = pca.transform(ds)
            X_test = pd.concat([X_test[category_cols], pd.DataFrame(ds, index=X_test.index)], axis=1)
            X_test.columns = ['_{}'.format(x) if isinstance(x, (int, np.integer)) else x
                              for x in X_test.columns]

    if to_csv:
        # Save file; the name encodes the training shape and the options used.
        name = '_' + str(X_train.shape[0]) + 'x' + \
            str(X_train.shape[1]) + '_' + \
            (f'thres_{threshold}_' if threshold is not None else '') + \
            (f'PCA_{n_components}' if n_components is not None else '') + \
            '.csv'
        path = '/kaggle/working/'
        X_train.to_csv(path + 'X_train' + name)
        X_test.to_csv(path + 'X_test' + name)
        Y_train.to_csv(path + 'Y_train' + name)
        # There are no test labels when the external test set is used.
        if Y_test is not None:
            Y_test.to_csv(path + 'Y_test' + name)

    return ((X_train, X_test), (Y_train, Y_test))

## Train and test split

In [3]:
# Correlation filter at 0.9, PCA keeping 60% of the remaining feature count,
# and an internal 20% hold-out so that Y_test is available.
(X_train, X_test), (Y_train, Y_test) = prepareData(
    threshold=0.9,
    PCA_components=0.6,
    internal=0.2,
)

In [4]:
# Column counts of the processed training features and of the hold-out targets.
n_inputs = X_train.shape[1]
n_outputs = Y_test.shape[1]
n_inputs

509

In [5]:
# NumPy views of the training features and targets used by both models below.
X, y = X_train.to_numpy(), Y_train.to_numpy()

## Model

In [6]:
def get_model(hidden_neurons, kernel_initializer_val, activation_fn, lr):
    """Build and compile the dense Keras classifier.

    The network has a 20-unit input layer (sized from the global ``X``), two
    hidden layers of ``hidden_neurons`` units, and a sigmoid output layer with
    one unit per column of the global ``y``.

    Parameters
    ----------
    hidden_neurons : width of the two hidden layers.
    kernel_initializer_val : kernel initializer for the first layer.
    activation_fn : activation used by every layer except the output.
    lr : learning rate for the Adam optimizer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    network = Sequential([
        Dense(20,
              input_dim=X.shape[1],
              kernel_initializer=kernel_initializer_val,
              activation=activation_fn),
        Dense(hidden_neurons, activation=activation_fn),
        Dense(hidden_neurons, activation=activation_fn),
        Dense(y.shape[1], activation='sigmoid'),
    ])

    network.compile(
        loss='BinaryCrossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        metrics=['accuracy', 'AUC'],
    )
    return network
    

In [10]:
def train_nn(hidden_neurons, kernel_initializer_val, activation_fn, lr, threshold=0.5):
    """Train one Keras model configuration and score it on the hold-out set.

    Parameters
    ----------
    hidden_neurons, kernel_initializer_val, activation_fn, lr
        Forwarded to ``get_model``.
    threshold : float
        Probability above which a label is predicted as 1.

    Returns
    -------
    (new_data, history, yhat)
        ``new_data`` is a dict with the configuration and its accuracy on
        ``Y_test``, ``history`` is the object returned by ``model.fit`` and
        ``yhat`` is the 0/1 prediction array for ``X_test``.
    """
    model = get_model(hidden_neurons, kernel_initializer_val, activation_fn, lr)
    history = model.fit(X, y, verbose=0, epochs=300, validation_split=0.2, shuffle=True)

    probabilities = model.predict(X_test)
    # The output layer is a sigmoid, so values lie in [0, 1]. Casting straight to
    # int truncates almost every probability to 0; threshold instead, as
    # elm_algorithm does.
    yhat = (probabilities > threshold).astype(int)
    accuracy_score_val = accuracy_score(Y_test, yhat)

    new_data = {
        'lr': lr,
        'activation': activation_fn,
        'kernal_initializer': kernel_initializer_val,
        'hidden_layers': hidden_neurons,
        'accuracy': accuracy_score_val
    }
    return new_data, history, yhat

In [None]:
# Candidate hyperparameters. The i-th entries of the four lists together form
# the i-th configuration that is trained (this is not a full grid search).
parameter = {
    'lr': [0.001, 0.01, 0.0001],
    'activation': ['relu', 'tanh', 'sigmoid'],
    'kernal_initializer': ['he_uniform', 'normal', 'glorot_normal'],
    'hidden_layers': [100, 200, 224]
}

accuracy_calculated = 0
model_data = None
model_history = None
yhat = None
# Derive the number of configurations from the lists instead of hard-coding it.
n_configs = min(len(values) for values in parameter.values())
for i in range(n_configs):
    new_values, history, y_pred = train_nn(parameter['hidden_layers'][i], parameter['kernal_initializer'][i],
                          parameter['activation'][i],
                          parameter['lr'][i])
    print(new_values)
    # Always keep the first run: with a strict "> 0" comparison alone, a search in
    # which every accuracy is 0 would leave model_history/yhat as None and break
    # the metric and plotting cells that follow.
    if model_data is None or new_values['accuracy'] > accuracy_calculated:
        accuracy_calculated = new_values['accuracy']
        model_data = new_values
        model_history = history
        yhat = y_pred


print(model_data)

In [None]:
print("Done")

# Micro-averaged precision/recall of the best Keras model on the hold-out set.
precision = precision_score(Y_test, yhat, average='micro')
recall = recall_score(Y_test, yhat, average='micro')
# F1 is the harmonic mean of precision and recall; define it as 0 when both are 0
# (e.g. no positive label predicted) instead of evaluating 0/0.
if (precision + recall) > 0:
    f1_measure = 2 * ((precision * recall) / (precision + recall))
else:
    f1_measure = 0.0
print('Calculated Accuracy score: {}, Precision: {}, Recall: {}, F1 measure: {} '.format(
        accuracy_calculated, precision, recall, f1_measure))

## Plot Accuracy

In [None]:
# Accuracy per epoch for the best model found by the search.
plt.title('Accuracy Keras Model')
plt.plot(model_history.history['accuracy'], label='train')
# The model is fitted with a validation split, so a validation curve is normally
# recorded as well; plot it when present.
if 'val_accuracy' in model_history.history:
    plt.plot(model_history.history['val_accuracy'], label='validation')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()

## Plot loss

In [None]:
# Loss per epoch for the best model found by the search.
plt.title('Loss Keras Model')
# history['loss'] is the training loss, so label it as such; the validation loss
# is stored under 'val_loss'.
plt.plot(model_history.history['loss'], label='train')
if 'val_loss' in model_history.history:
    plt.plot(model_history.history['val_loss'], label='validation')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

## AUC

In [None]:
# AUC per epoch for the best model found by the search.
# Keras may suffix the metric name ('auc_1', 'auc_2', ...) for models built after
# the first one in the same session, so look the key up instead of assuming 'auc'.
auc_key = next((key for key in model_history.history if key.startswith('auc')), 'auc')

plt.title('AUC Keras Model')
plt.plot(model_history.history[auc_key], label='train')
if 'val_' + auc_key in model_history.history:
    plt.plot(model_history.history['val_' + auc_key], label='validation')
plt.xlabel('epoch')
plt.ylabel('AUC')
plt.legend()
plt.show()

In [None]:
%matplotlib inline

def create_cm(y_pred, y_true=None):
    """Plot a confusion-matrix heatmap over all (sample, label) cells.

    Parameters
    ----------
    y_pred : array-like
        Predicted 0/1 labels, one row per sample and one column per target.
    y_true : array-like or None
        True labels with the same layout. Defaults to the global ``Y_test``.
    """
    if y_true is None:
        y_true = Y_test

    # Flatten both matrices row by row so every (sample, label) cell becomes one
    # binary observation. Uses the y_pred argument rather than the global yhat.
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    cf_matrix = confusion_matrix(actual, predicted)
    sns.heatmap(cf_matrix, annot=True, fmt='', cmap="Blues")
    plt.show()

In [None]:
# Confusion matrix for the predictions kept from the Keras hyperparameter search.
create_cm(yhat)

# Extreme Learning Machine

## Hyperparameters

In [None]:
def relu(x):
    """Apply the rectifier max(x, 0) element-wise.

    The result is written back into ``x`` (the array is modified in place) and
    that same array is returned.
    """
    np.maximum(x, 0, out=x)
    return x

In [None]:
def hidden_nodes(X, input_weights, biases):
    """Compute the hidden-layer activations ``relu(X . input_weights + biases)``."""
    pre_activation = np.dot(X, input_weights) + biases
    return relu(pre_activation)

In [None]:
def predict(X, input_weights, biases, output_weights):
    """Return the raw network output: hidden activations times ``output_weights``."""
    hidden = hidden_nodes(X, input_weights, biases)
    return np.dot(hidden, output_weights)

## Model Training

In [None]:
def elm_algorithm(hidden_size, seed=None):
    """Fit an Extreme Learning Machine on the global ``X``/``y`` and score it.

    Input weights and biases are drawn from a standard normal distribution; the
    output weights are then solved in closed form as ``pinv(H) . y``, where ``H``
    is the hidden-layer activation matrix of the training data.

    Parameters
    ----------
    hidden_size : int
        Number of hidden nodes.
    seed : int or None
        Optional seed for the random weights. With ``None`` the global NumPy
        random state is used, as before.

    Returns
    -------
    The 0/1 predictions for the global ``X_test`` (threshold 0.5). Accuracy,
    micro precision/recall and F1 against ``Y_test`` are printed.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Size the input weights from the training matrix itself rather than from a
    # separately maintained global.
    input_weights = rng.normal(size=[X.shape[1], hidden_size])
    biases = rng.normal(size=[hidden_size])

    # scipy.linalg.pinv2 no longer exists in current SciPy releases; NumPy's
    # pseudo-inverse is likewise SVD-based.
    hidden_train = hidden_nodes(X, input_weights, biases)
    output_weights = np.dot(np.linalg.pinv(hidden_train), y)

    X_test_data = X_test.to_numpy()
    prediction = predict(X_test_data, input_weights, biases, output_weights)

    yhat = np.where(prediction > 0.5, 1, 0)

    accuracy_score_val = accuracy_score(Y_test, yhat)
    precision = precision_score(Y_test, yhat, average='micro')
    recall = recall_score(Y_test, yhat, average='micro')
    # Define F1 as 0 when precision and recall are both 0 instead of computing 0/0.
    if (precision + recall) > 0:
        f1_measure = 2 * ((precision * recall) / (precision + recall))
    else:
        f1_measure = 0.0
    print('For Hidden size: {}, Calculated Accuracy score: {}, Precision: {}, Recall: {}, F1 measure: {} '.format(
        hidden_size, accuracy_score_val, precision, recall, f1_measure))
    return yhat

In [None]:
# Train the ELM with 3000 hidden nodes. This rebinds `yhat`, replacing the
# predictions kept from the Keras search above.
yhat = elm_algorithm(3000)

In [None]:
# Confusion matrix for the ELM predictions produced in the previous cell.
create_cm(yhat)