In [2]:
import tensorflow as tf

import numpy as np
import os
import time
import pandas as pd

In [124]:
data=pd.read_excel("/content/output.xlsx")

In [126]:
data.head()

Unnamed: 0,Query,Disease Label
0,I've been feeling really tired and thirsty all...,Diabetes
1,"I have a high fever and a persistent cough, co...","Fever, Pneumonia"
2,I've lost my sense of taste and smell.,COVID-19
3,I have a runny nose and a sore throat.,Common Cold
4,I've been coughing a lot and it's hard to brea...,"Pneumonia, COVID-19"


In [127]:
# How many examples of each class?
data["Disease Label"].value_counts()

Diabetes               57
Common Cold            52
Fever                  51
COVID-19               47
Pneumonia              47
COVID-19, Fever         4
Fever, Pneumonia        2
Pneumonia, COVID-19     2
Fever\t                 1
Name: Disease Label, dtype: int64

In [128]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization # after TensorFlow 2.6

# Before TensorFlow 2.6
# from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# Note: in TensorFlow 2.6+, you no longer need "layers.experimental.preprocessing"
# you can use: "tf.keras.layers.TextVectorization", see https://github.com/tensorflow/tensorflow/releases/tag/v2.6.0 for more

# Use the default TextVectorization variables
text_vectorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None) # how long should the output seque

In [130]:
train_sentences=data["Query"]
train_label=data["Disease Label"]

In [131]:
# Find average number of tokens (words) in training Tweets
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

11

In [132]:
# Setup text vectorization with custom variables
max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = 11 # max length our sequences will be (e.g. how many words from a Tweet does our model see?)

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [16]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(train_sentences)

In [18]:
# Create sample sentence and tokenize it
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 11), dtype=int64, numpy=array([[159,   2,   1,  31,   8,   1,   0,   0,   0,   0,   0]])>

In [20]:
# Choose a random sentence from the training dataset and tokenize it
import random
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
I've been having difficulty breathing, fatigue, and lost my sense of taste.      

Vectorized version:


<tf.Tensor: shape=(1, 11), dtype=int64, numpy=array([[ 6,  7, 87, 39, 40, 71,  3, 18,  8, 46, 17]])>

In [21]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 216
Top 5 most common words: ['', '[UNK]', 'a', 'and', 'i']
Bottom 5 least common words: ['cake', 'bring', 'bread', 'beach', 'aching']


In [22]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1")

embedding

<keras.layers.core.embedding.Embedding at 0x7c1b26a9f700>

In [23]:
# Get a random sentence from training set
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
I have a sore throat and a runny nose.      

Embedded version:


<tf.Tensor: shape=(1, 11, 128), dtype=float32, numpy=
array([[[ 0.03194851,  0.02795464,  0.01128917, ...,  0.03184073,
         -0.04434466, -0.04310882],
        [-0.03914752,  0.00077748,  0.00243609, ...,  0.01046237,
          0.03381581, -0.02784684],
        [-0.00235378, -0.04919804,  0.03766878, ..., -0.0480589 ,
          0.04337915,  0.02073364],
        ...,
        [ 0.04253621,  0.03385509, -0.04488031, ..., -0.01310171,
          0.01251353,  0.01199418],
        [ 0.04884959, -0.03452299,  0.00630269, ..., -0.01382089,
         -0.00966058, -0.0242269 ],
        [ 0.04884959, -0.03452299,  0.00630269, ..., -0.01382089,
         -0.00966058, -0.0242269 ]]], dtype=float32)>

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
])

# Fit the pipeline to the training data
model_0.fit(train_sentences, train_label)

In [26]:
baseline_score = model_0.score(train_sentences,train_label)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 96.58%


In [68]:
# Make predictions
test=[""]
baseline_preds = model_0.predict_log_proba(test)

In [69]:
import numpy as np
print(model_0.predict(test))
# Assume log_probs is the output of predict_log_proba
# Convert log probabilities to probabilities
probs = np.exp(baseline_preds)

# Find the classes with more than 40 percent probability
classes_over_40 = np.where(probs > 0.4)
train_label[classes_over_40[0]]

['Diabetes']


Series([], Name: Disease Label, dtype: object)

AttributeError: ignored

In [143]:
# Build model with the Functional API
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the numerized numbers
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding (try running the model without this layer and see what happens)
outputs = layers.Dense(1, activation="sigmoid")(x) # create the output layer, want binary outputs so use sigmoid activation
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense") # construct the model