In [1]:
import tensorflow as tf

import numpy as np
import os
import time
import pandas as pd

In [2]:
data=pd.read_excel("/content/Query_Label_Data.xlsx")

In [3]:
data.head()

Unnamed: 0,Query,Label
0,Symptoms of coronavirus,Yes
1,How to prevent influenza,Yes
2,Is the common cold contagious?,Yes
3,Treatment for pneumonia,Yes
4,Difference between SARS and coronavirus,Yes


In [4]:
# How many examples of each class?
data["Label"].value_counts()

No     446
Yes    313
Name: Label, dtype: int64

In [5]:
data = data.sample(frac=1).reset_index(drop=True)

In [6]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization # after TensorFlow 2.6

# Before TensorFlow 2.6
# from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# Note: in TensorFlow 2.6+, you no longer need "layers.experimental.preprocessing"
# you can use: "tf.keras.layers.TextVectorization", see https://github.com/tensorflow/tensorflow/releases/tag/v2.6.0 for more

# Use the default TextVectorization variables
text_vectorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None) # how long should the output seque

In [7]:
train_sentences=data["Query"]
train_label=data["Label"]

In [8]:
# Find average number of tokens (words) in training Tweets
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

7

In [9]:
# Setup text vectorization with custom variables
max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = 7 # max length our sequences will be (e.g. how many words from a Tweet does our model see?)

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [10]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(train_sentences)

In [11]:
import random
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
How to improve and protect joint health?      

Vectorized version:


<tf.Tensor: shape=(1, 7), dtype=int64, numpy=array([[ 4,  2, 13,  6, 42, 67, 11]])>

In [12]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 554
Top 5 most common words: ['', '[UNK]', 'to', 'the', 'how']
Bottom 5 least common words: ['antiviral', 'animals', 'anemia', 'afternoon', 'acid']


In [13]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1")

embedding

<keras.layers.core.embedding.Embedding at 0x7c572e87d450>

In [14]:
# Get a random sentence from training set
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
What are the flu transmission routes?      

Embedded version:


<tf.Tensor: shape=(1, 7, 128), dtype=float32, numpy=
array([[[-4.80573177e-02, -4.52473536e-02, -2.24440098e-02,
         -2.51319539e-02, -3.95250805e-02, -1.94970258e-02,
          2.98757292e-02, -5.22388145e-03, -2.69570835e-02,
          2.58469470e-02, -3.93499956e-02,  3.81974913e-02,
          3.68589647e-02, -1.65615566e-02, -2.26228721e-02,
          1.55936964e-02, -3.84149551e-02, -3.49549763e-02,
          4.42284979e-02, -8.86403024e-04, -4.54434268e-02,
          2.92804129e-02, -1.99053288e-02, -4.86453287e-02,
         -3.04196011e-02,  2.31040828e-02, -5.26245683e-03,
         -1.02571361e-02,  3.50415707e-04,  1.16495118e-02,
         -2.65319478e-02, -1.47844478e-03,  1.33084320e-02,
         -1.81552544e-02, -7.74149969e-03, -3.27155478e-02,
          2.04986073e-02, -2.25756411e-02, -1.31188259e-02,
         -7.45774433e-03, -2.92692315e-02, -3.74942794e-02,
          4.49532755e-02, -5.41917980e-05, -1.23392828e-02,
         -4.27115560e-02, -4.36036959e-02, -1.0

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", svm.NuSVC()) # model the text
])

# Fit the pipeline to the training data
model_0.fit(train_sentences, train_label)

In [31]:
baseline_score = model_0.score(train_sentences,train_label)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 99.60%


In [38]:
# Make predictions
test=["i am having frequent heartattack"]
baseline_preds = model_0.predict(test)

In [37]:
import numpy as np
print(model_0.predict(test))
# Assume log_probs is the output of predict_log_proba
# Convert log probabilities to probabilities


['No']


In [27]:
train_label.value_counts()

No     446
Yes    313
Name: Label, dtype: int64