In [1]:
# %pip install pandas
# %pip install keras
# %pip install numpy
# %pip install nltk
# %pip install tensorflow
# %pip install scikit-learn

In [2]:
# %pip install joblib

In [9]:
import pandas as pd

path2Data = "..\data\intentClassification.csv"
df = pd.read_csv(path2Data)

df.head()

Unnamed: 0,sno,user_query,intent
0,1,What are the symptoms of the flu?,Symptom Inquiry
1,2,Can you suggest remedies for a sore throat?,Treatment Inquiry
2,3,What are the side effects of ibuprofen?,Medication Information
3,4,How can I relieve muscle pain after exercising?,Medical Advice
4,5,Tell me a joke!,Non-medical Queries


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   sno         600 non-null    int64 
 1   user_query  600 non-null    object
 2   intent      600 non-null    object
dtypes: int64(1), object(2)
memory usage: 14.2+ KB


In [11]:
df.describe()

Unnamed: 0,sno
count,600.0
mean,300.5
std,173.349358
min,1.0
25%,150.75
50%,300.5
75%,450.25
max,600.0


In [12]:
# Data cleaning
df.drop("sno", axis=1, inplace=True)

df.head(10)

Unnamed: 0,user_query,intent
0,What are the symptoms of the flu?,Symptom Inquiry
1,Can you suggest remedies for a sore throat?,Treatment Inquiry
2,What are the side effects of ibuprofen?,Medication Information
3,How can I relieve muscle pain after exercising?,Medical Advice
4,Tell me a joke!,Non-medical Queries
5,What is asthma?,Symptom Inquiry
6,How is arthritis treated?,Treatment Inquiry
7,Is it safe to take two different medications t...,Medication Information
8,Do you have any tips for managing stress?,Medical Advice
9,What's the capital of France?,Non-medical Queries


In [13]:
df["intent"].value_counts()

intent
Symptom Inquiry           120
Treatment Inquiry         120
Medication Information    120
Medical Advice            120
Non-medical Queries       120
Name: count, dtype: int64

In [14]:
# Test-train split
from sklearn.model_selection import train_test_split

# random_state parameter is the random seed to keep the output same
trainSet, testSet = train_test_split(df, test_size=0.2, random_state=42)

print(len(trainSet), len(testSet), sep=" <--TrainSet ||| TestSet--> ")
print(trainSet.head(), testSet.head(), sep="\n\n\n")

480 <--TrainSet ||| TestSet--> 120
                                            user_query               intent
145  I have a sore throat and cough. What should I do?      Symptom Inquiry
9                        What's the capital of France?  Non-medical Queries
375  What are the symptoms of a urinary tract infec...      Symptom Inquiry
523        How can I improve my cardiovascular health?       Medical Advice
188  How can I improve my mental focus and concentr...       Medical Advice


                                            user_query                  intent
110  What are the symptoms of a urinary tract infec...         Symptom Inquiry
419           What's the latest technology innovation?     Non-medical Queries
565        What are the symptoms of a sinus infection?         Symptom Inquiry
77   Are there any side effects of birth control pi...  Medication Information
181               How can I relieve back pain at home?       Treatment Inquiry


In [15]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Use the default TextVectorisation variables
text_vectorizer = TextVectorization(max_tokens=None, # how many words in the
                                                     # vocabulary (all of the
                                                     # different words in your
                                                     # text)
                                    # how to process text
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to
                                                       # numbers
                                    output_sequence_length=None) # how long
                                                                 # should the
                                                                 # output sequence

In [16]:
X_train = trainSet["user_query"]
y_train = trainSet["intent"]

print(len(X_train))
print(len(y_train))

480
480


In [17]:
# Find average number of tokens (words) in training tweets
maxLen = round(sum([len(i.split()) for i in X_train]) / len(X_train))
maxLen

8

In [18]:
# Setup text vectorization with custom variables
max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = maxLen # max length our sequences will be (e.g. how many words from a
                    # Tweet does our model see?)

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [19]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(X_train)

In [20]:
# Create sample sentence and tokenize it
sample_sentence = "What's the latest sports news?"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 8), dtype=int64, numpy=array([[ 19,   2,  48, 175, 119,   0,   0,   0]], dtype=int64)>

In [21]:
# Choose a random sentence from the training dataset and tokenize it
import random
random_sentence = random.choice(X_train)
print(f"Original text:\n{random_sentence}\
      \n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
How is diabetes managed?      

Vectorized version:


<tf.Tensor: shape=(1, 8), dtype=int64, numpy=array([[10, 28, 44, 35,  0,  0,  0,  0]], dtype=int64)>

In [22]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[ : 5] # most common tokens (notice the [UNK] token
                                 # for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 253
Top 5 most common words: ['', '[UNK]', 'the', 'can', 'are']
Bottom 5 least common words: ['book', 'blood', 'be', 'attack', 'aspirin']


In [23]:
from tensorflow.keras import layers

tf.random.set_seed(42)
embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default,
                                                               # intialize
                                                               # randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1")

embedding

<keras.src.layers.core.embedding.Embedding at 0x17f25ed8d90>

In [24]:
# Get a random sentence from training set
random_sentence = random.choice(X_train)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
I have a sore throat and fever. Should I see a doctor?      

Embedded version:


<tf.Tensor: shape=(1, 8, 128), dtype=float32, numpy=
array([[[ 0.02657813,  0.02375447, -0.02775292, ...,  0.03366859,
          0.00890239,  0.04800744],
        [ 0.04463801,  0.03169176, -0.03659219, ...,  0.02845874,
         -0.01232802, -0.03836731],
        [ 0.0463779 , -0.03603455,  0.011273  , ...,  0.01538581,
         -0.0157498 , -0.04347318],
        ...,
        [ 0.04070635,  0.01097725, -0.03088051, ...,  0.04258109,
         -0.00198812,  0.03329751],
        [ 0.04153654,  0.00226883, -0.01576544, ..., -0.03825042,
          0.02903656,  0.04963911],
        [-0.01371101,  0.00128777,  0.02418203, ...,  0.02150855,
          0.03496372, -0.00992824]]], dtype=float32)>

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
intentClassifierModel = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers
                                                  # using tfidf
                    ("clf", MultinomialNB()) # model the text
])

# Fit the pipeline to the training data
intentClassifierModel.fit(X_train, y_train)

In [26]:
baseline_score = intentClassifierModel.score(X_train, y_train)
print(f"Our baseline model achieves an accuracy of: \
{baseline_score * 100:.4f}%")

Our baseline model achieves an accuracy of: 98.7500%


In [27]:
# Make predictions
baseline_preds = intentClassifierModel.predict_log_proba(testSet)

In [28]:
import numpy as np
print(intentClassifierModel.predict(testSet))

# Assume log_probs is the output of predict_log_proba
# Convert log probabilities to probabilities
probs = np.exp(baseline_preds)

# Find the classes with more than 40 percent probability
classes_over_40 = np.where(probs > 0.4)
y_train[classes_over_40[0]]

['Medication Information' 'Medication Information']


Series([], Name: intent, dtype: object)

In [30]:
# Save model to load later
import pickle as pkl

pkl.dump(intentClassifierModel, open("../custom_trained_models/intentClassifierModel0.pkl",
                                    'wb'))