In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_csv('./data/drug200.csv')

In [2]:
data.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [4]:
category = dict({
    'drugA': 0,
    'drugB': 1,
    'drugC': 2,
    'drugX': 3,
    'DrugY': 4,
    'M': 5,
    'F': 10,
    'HIGH': 30,
    'LOW': 20,
    'NORMAL': 25
})

def encode_x(x_data):
    x_train = []
    for x in x_data:
        train = []
        for j in range(len(x)):
            train.append(x[j] if x[j] not in category else category[x[j]])
        x_train.append(train)
    return np.array(x_train)


In [8]:
x_data = np.array(data[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']])
x_train = encode_x(x_data)
y_train = np.array(list(map(lambda y: category[y], data['Drug'])))

In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [10]:
def softmax_to_category(sm_out):
    return np.argsort(sm_out)[-1]
    
def predict(x_train, model):
    m, n = x_train.shape
    y_pred = []
    ten_pc = m/10
    for i, x in enumerate(x_train):
        [prediction] = model.predict(x.reshape(1, n), verbose=0)
        y_pred.append(softmax_to_category(prediction))

        if i%ten_pc == 0:
            # print(f'{round(i/ten_pc * 10)}%', end=' ')
            print('▒▒▒', end=(' '))
    return np.array(y_pred)

In [11]:
def evaluate(y_pred, y_train):
    m, = y_pred.shape
    t = 0
    for i in range(m):
        if y_pred[i] == y_train[i]:
            t += 1
    print(f'Accuracy: {round(t/m*100, 2)}%')

    return t/m

In [12]:
model = Sequential([
        Dense(25, activation='relu'),
        Dense(15, activation='relu'),
        Dense(5, activation='softmax')
    ])
    
model.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2),
)
model.fit(x_train, y_train, epochs=200, verbose=0)
y_pred = predict(x_train, model)
evaluate(y_pred, y_train)

▒▒▒ ▒▒▒ ▒▒▒ ▒▒▒ ▒▒▒ ▒▒▒ ▒▒▒ ▒▒▒ ▒▒▒ ▒▒▒ Accuracy: 90.0%


0.9

Pretty good so far. Let's see how much accuracy we can get with a different encoding.

## One-hot encoding

In [13]:
def encode_onehot(x_data):
    x_train = []

    for x in x_data:
        x_i = []
        for j, x_j in enumerate(x):
            # 0 is Age and 4 is Na_to_K
            if j == 0 or j == 4:
                x_i.append(x_j)
            # 1 is sex: M or F
            elif j == 1:
                x_i.append(1 if x_j == 'M' else 0)
            # BP: LOW, HIGH, NORMAL
            elif j == 2:
                x_i.append(1 if x_j == 'LOW' else 0)
                x_i.append(1 if x_j == 'NORMAL' else 0)
                x_i.append(1 if x_j == 'HIGH' else 0)
            # Cholesterol
            elif j == 3:
                x_i.append(1 if x_j == 'NORMAL' else 0)
                x_i.append(1 if x_j == 'HIGH' else 0)
            else:
                pass
        x_train.append(x_i)

    return np.array(x_train)

In [14]:
x_1h = encode_onehot(x_data)

In [15]:
model = Sequential([
        Dense(25, activation='relu'),
        Dense(15, activation='relu'),
        Dense(5, activation='softmax')
    ])
    
model.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2),
)
model.fit(x_1h, y_train, epochs=200, verbose=0)
y_pred_1h = predict(x_1h, model)
evaluate(y_pred_1h, y_train)

▒▒▒ ▒▒▒ ▒▒▒ ▒▒▒ ▒▒▒ ▒▒▒ ▒▒▒ ▒▒▒ ▒▒▒ ▒▒▒ Accuracy: 97.5%


0.975

We're able to achive higher accuracy on training set with one-hot encoding.