## Prepare data

In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Read the drug dataset from a relative path into the DataFrame used by every later cell.
data = pd.read_csv('./data/drug200.csv')

In [122]:
# Preview the first rows to check the columns and their raw values.
data.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [123]:
# Integer class id for every drug label found in the 'Drug' column.
# The ids double as output-unit indices of the softmax classifiers below.
drug_encoding = {
    'drugA': 0,
    'drugB': 1,
    'drugC': 2,
    'drugX': 3,
    'DrugY': 4,
}

In [124]:
# Features: every column except the last one, with categorical columns one-hot
# encoded and everything cast to float32.
x = data.iloc[:, :-1].pipe(pd.get_dummies).astype(np.float32)

# Targets: the drug label translated to its integer class id (stored as float32).
# An unknown label raises KeyError rather than being silently dropped.
y = data['Drug'].apply(lambda label: drug_encoding[label]).astype(np.float32)

## Split the data

In [125]:
# Three-way split: 60% training, 20% cross-validation, 20% test.
# The held-out CV and test sets let us measure how well a model generalises,
# i.e. detect overfitting to the training data.

# Step 1: keep 60% for training and set the remaining 40% aside.
x_train, x_, y_train, y_ = train_test_split(
    x, y, test_size=0.4, random_state=1
)

# Step 2: cut the 40% remainder in half -> CV set and test set.
x_cv, x_test, y_cv, y_test = train_test_split(
    x_, y_, test_size=0.5, random_state=1
)

## Fit

In [126]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [212]:
# Seed TensorFlow's global random generator (same seed value that get_models
# uses) so this cell does not start from a different random state on every
# fresh run of the notebook.
tf.random.set_seed(20)

# Two ReLU hidden layers followed by a 5-unit softmax: one output unit per
# class id in drug_encoding.
model = Sequential([
        Dense(25, activation='relu'),
        Dense(15, activation='relu'),
        Dense(5, activation='softmax')
    ])

# The targets are integer class ids rather than one-hot vectors, hence the
# sparse variant of categorical cross-entropy.
model.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3),
)
model.fit(x_train, y_train, epochs=2000, verbose=0)

<keras.src.callbacks.History at 0x14130a85050>

## Evaluate

In [213]:
def tf_predict(x_in, model):
    """Return the most probable class index for every sample in ``x_in``.

    Args:
        x_in: Feature matrix, forwarded unchanged to ``model.predict``.
        model: Any object exposing ``predict(x_in)`` that returns one row of
            per-class scores (e.g. softmax probabilities) per sample.

    Returns:
        numpy.ndarray of integer class indices, one per sample. If a row
        contains exact ties, the lowest tied index is returned (``np.argmax``
        semantics).
    """
    class_scores = np.asarray(model.predict(x_in))

    # np.argmax raises on a zero-length axis; keep "no samples -> no labels".
    if class_scores.size == 0:
        return np.empty(0, dtype=np.intp)

    # One vectorised argmax over the class axis replaces a Python-level loop
    # that fully sorted every row just to read off its largest entry.
    return np.argmax(class_scores, axis=-1)

In [214]:
# Predicted class ids of the single model on the training, CV and test splits
# (evaluated in that order).
y_pred, y_pred_cv, y_pred_test = (
    tf_predict(split_features, model)
    for split_features in (x_train, x_cv, x_test)
)



In [215]:
def eval(y_pred, y_train, name):
    """Print the misclassification rate of ``y_pred`` as a percentage.

    Note: this name shadows Python's built-in ``eval``; it is kept so the
    existing calls in this notebook continue to work.

    Args:
        y_pred: Predicted class labels.
        y_train: True class labels for the same samples. Despite the name,
            this is used for any split (train, cv or test).
        name: Split label printed in front of the error value.
    """
    error_rate = np.mean(y_pred != y_train)

    # The rate is still rounded to a whole percent, but the value is now
    # formatted with one fixed decimal: multiplying by 100 can introduce
    # binary floating-point noise (0.07 * 100 == 7.000000000000001) that
    # would otherwise be printed verbatim.
    print(f'{name} error\t: {round(error_rate, 2) * 100:.1f}%')

In [216]:
# Report the misclassification rate of the single model on each split.
eval(y_pred=y_pred, y_train=y_train, name='train')
eval(y_pred=y_pred_cv, y_train=y_cv, name='cv')
eval(y_pred=y_pred_test, y_train=y_test, name='test')

train error	: 0.0%
cv error	: 2.0%
test error	: 8.0%


We can see that the model does very well on the training set, but makes some misclassifications on the CV and test sets. Let's try an ensemble of models and compare their CV errors.

## Model ensemble

In [203]:
def get_models():
    """Build three uncompiled candidate classifiers of increasing depth.

    Every network is a stack of ReLU ``Dense`` layers closed by a 5-unit
    softmax layer. TensorFlow's global seed is set to 20 before any layer is
    created.

    Returns:
        list: ``[model1, model2, model3]`` as ``Sequential`` instances, named
        'model1', 'model2' and 'model3'.
    """
    tf.random.set_seed(20)

    # Hidden-layer widths per network, listed in construction order.
    hidden_widths = {
        'model1': (25, 15),
        'model2': (20, 12, 12, 20),
        'model3': (32, 16, 8, 4, 12),
    }

    candidates = []
    for model_name, widths in hidden_widths.items():
        stack = [Dense(units, activation = 'relu') for units in widths]
        stack.append(Dense(5, activation = 'softmax'))
        candidates.append(Sequential(stack, name=model_name))

    return candidates

In [218]:
models = get_models()

# Train every candidate with the same loss/optimizer settings and report its
# error on all three splits. Note that `model`, `y_pred`, `y_pred_cv` and
# `y_pred_test` are reassigned on each pass, so after the loop they refer to
# the last candidate.
for i, model in enumerate(models):
    model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    )
    model.fit(x_train, y_train, epochs=1000, verbose=0)

    # Predict on train, CV and test (in that order) before printing anything.
    y_pred, y_pred_cv, y_pred_test = (
        tf_predict(split_features, model)
        for split_features in (x_train, x_cv, x_test)
    )

    print(f'Model {i}')
    eval(y_pred, y_train, 'train')
    eval(y_pred_cv, y_cv, 'cv')
    eval(y_pred_test, y_test, 'test')
    print()

Model 0
train error	: 0.0%
cv error	: 5.0%
test error	: 8.0%

Model 1
train error	: 0.0%
cv error	: 8.0%
test error	: 12.0%

Model 2
train error	: 0.0%
cv error	: 5.0%
test error	: 5.0%



Different runs can produce different results. In this particular run, the 3rd model (printed as `Model 2`) had the lowest test error and tied with the 1st model (`Model 0`) for the lowest CV error, so we can choose this model. Other runs might favour a different model.