In [99]:
import warnings
warnings.filterwarnings('ignore')

import os

import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.keras import losses
from tensorflow.keras.layers import Dense, InputLayer, Activation
from tensorflow.keras.models import Sequential

### Load and read the data

In [100]:
data = pd.read_csv("mayo_clinic_data.csv")
data.head()

Unnamed: 0,COUGH,MUSCLE_ACHES,TIREDNESS,SORE_THROAT,RUNNY_NOSE,STUFFY_NOSE,FEVER,NAUSEA,VOMITING,DIARRHEA,SHORTNESS_OF_BREATH,DIFFICULTY_BREATHING,LOSS_OF_TASTE,LOSS_OF_SMELL,ITCHY_NOSE,ITCHY_EYES,ITCHY_MOUTH,ITCHY_INNER_EAR,SNEEZING,PINK_EYE,TYPE
0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,ALLERGY
1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,1,1,ALLERGY
2,0,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,ALLERGY
3,0,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,1,1,ALLERGY
4,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,1,0,1,1,1,ALLERGY


In [101]:
data['TYPE'].unique()

array(['ALLERGY', 'COLD', 'COVID', 'FLU'], dtype=object)

In [None]:
data.columns

Index(['COUGH', 'MUSCLE_ACHES', 'TIREDNESS', 'SORE_THROAT', 'RUNNY_NOSE',
       'STUFFY_NOSE', 'FEVER', 'NAUSEA', 'VOMITING', 'DIARRHEA',
       'SHORTNESS_OF_BREATH', 'DIFFICULTY_BREATHING', 'LOSS_OF_TASTE',
       'LOSS_OF_SMELL', 'ITCHY_NOSE', 'ITCHY_EYES', 'ITCHY_MOUTH',
       'ITCHY_INNER_EAR', 'SNEEZING', 'PINK_EYE', 'TYPE'],
      dtype='object')

In [None]:
data.shape

(44453, 21)

In [102]:
label_encoder = preprocessing.LabelEncoder()
 
# Encode labels in column 'TYPE'.
data['TYPE']= label_encoder.fit_transform(data['TYPE'])
 
data['TYPE'].unique() 

array([0, 1, 2, 3])

In [111]:
labels = data['TYPE']
labels

0        0
1        0
2        0
3        0
4        0
        ..
44448    3
44449    3
44450    3
44451    3
44452    3
Name: TYPE, Length: 44453, dtype: int64

In [112]:
data.drop(['TYPE'], axis=1, inplace=True)
features = data
features.head()

Unnamed: 0,COUGH,MUSCLE_ACHES,TIREDNESS,SORE_THROAT,RUNNY_NOSE,STUFFY_NOSE,FEVER,NAUSEA,VOMITING,DIARRHEA,SHORTNESS_OF_BREATH,DIFFICULTY_BREATHING,LOSS_OF_TASTE,LOSS_OF_SMELL,ITCHY_NOSE,ITCHY_EYES,ITCHY_MOUTH,ITCHY_INNER_EAR,SNEEZING,PINK_EYE
0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1
1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,1,1
2,0,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1
3,0,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,1,1
4,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,1,0,1,1,1


### Data Split

In [113]:
SEED = 100

In [114]:
X = features

# create training and validation sets with 80-20 split
X_train, X_validation, y_train, y_validation = train_test_split(X, labels, test_size=0.2, random_state = SEED)

# split the validation sets to get a holdout dataset (for testing) 50-50 split
X_validation, X_test, y_validation, y_test = train_test_split(X_validation, y_validation, test_size=0.5, random_state = SEED)

print(X_train.shape)
print(X_validation.shape)
print(y_train.shape)
print(y_validation.shape)
print(X_test.shape)
print(y_test.shape)

(35562, 20)
(4445, 20)
(35562,)
(4445,)
(4446, 20)
(4446,)


### Obtain data points from test to run inference on


In [115]:
user_input_1 = X_test.iloc[[0]].to_numpy()
user_input_1

array([[0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0]])

In [116]:
user_input_2 = X_test.iloc[[500]].to_numpy()
user_input_2

array([[0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])

In [117]:
user_input_3 = X_test.iloc[[3000]].to_numpy()
user_input_3

array([[1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0]])

In [118]:
user_input_4 = X_test.iloc[[4000]].to_numpy()
user_input_4

array([[0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0]])

### Prepare data for training

In [119]:
AUTOTUNE = tf.data.AUTOTUNE
BATCH_SIZE = 32
BUFFER_SIZE = 2000

train_numeric_ds = Dataset.from_tensor_slices((X_train, y_train))

train_numeric_ds = train_numeric_ds.batch(BATCH_SIZE).shuffle(BUFFER_SIZE).prefetch(AUTOTUNE)

# val dataset
val_numeric_ds = Dataset.from_tensor_slices((X_validation, y_validation))
val_numeric_ds = val_numeric_ds.batch(BATCH_SIZE).shuffle(BUFFER_SIZE).prefetch(AUTOTUNE)

# test dataset 
test_numeric_ds = Dataset.from_tensor_slices((X_test, y_test))
test_numeric_ds = test_numeric_ds.batch(BATCH_SIZE).shuffle(BUFFER_SIZE).prefetch(AUTOTUNE)

print(train_numeric_ds.element_spec)

(TensorSpec(shape=(None, 20), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))


### Modelling

In [120]:
model = Sequential([
                    InputLayer(input_shape=X_train.shape[1:]),
                    Dense(300, activation='relu'),
                    Dense(100, activation='relu'),
                    Dense(4, activation="softmax") # 4 neurons, 1 per class
                  ]) 

model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='sgd',
    metrics=['accuracy'])

history = model.fit(
    train_numeric_ds, validation_data=val_numeric_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [121]:
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 300)               6300      
                                                                 
 dense_10 (Dense)            (None, 100)               30100     
                                                                 
 dense_11 (Dense)            (None, 4)                 404       
                                                                 
Total params: 36,804
Trainable params: 36,804
Non-trainable params: 0
_________________________________________________________________
None


In [122]:
model_loss, model_accuracy = model.evaluate(test_numeric_ds)

print("Model accuracy: {:2.2%}".format(model_accuracy))

Model accuracy: 92.29%


### Run inference on new data

In [123]:
CONDITION = {
    0: 'ALLERGY',
    1: 'COLD',
    2: 'COVID',
    3: 'FLU'
}

In [124]:
"""
A function to find the label with the maximum score.
"""
class_values = tf.constant([0, 1, 2, 3])

def get_label(user_input):
  predicted_scores_batch = model.predict(user_input)
  predicted_int_labels = tf.argmax(predicted_scores_batch, axis=1)
  predicted_labels = tf.gather(class_values, predicted_int_labels)
  return predicted_labels

In [125]:
"""
Now, the model can take user input (symptoms) and predict a score for each label using Model.predict. 
"""

predicted_condition_1 = get_label(user_input_1)
print("Predicted condition: ", CONDITION[predicted_condition_1[0].numpy()])

predicted_condition_2 = get_label(user_input_2)
print("Predicted condition: ", CONDITION[predicted_condition_2[0].numpy()])

predicted_condition_3 = get_label(user_input_3)
print("Predicted condition: ", CONDITION[predicted_condition_3[0].numpy()])

predicted_condition_4 = get_label(user_input_4)
print("Predicted condition: ", CONDITION[predicted_condition_4[0].numpy()])

Predicted condition:  ALLERGY
Predicted condition:  FLU
Predicted condition:  FLU
Predicted condition:  FLU
