<a href="https://colab.research.google.com/github/jahnavikolli/Intent-Classification-for-Conversational-AI-Systems/blob/main/intent_Classification_for_Conversational_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Google Sheets document id and the name of the tab (sheet) to read.
sheet_id = "1BG4GYGscyd4inQ2RuZUunzhxil2q0OklbktNRCOuNLg"
sheet_name = "sofmattress_train"

# Export URL that asks Google Sheets to return the selected tab as CSV (tqx=out:csv).
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
# Read the CSV straight from the URL into a DataFrame (needs network access).
df = pd.read_csv(url)

# Preview the first rows; later cells rely on the 'sentence' and 'label' columns.
print(df.head())

                                         sentence label
0                    You guys provide EMI option?   EMI
1  Do you offer Zero Percent EMI payment options?   EMI
2                                         0% EMI.   EMI
3                                             EMI   EMI
4                           I want in installment   EMI


In [None]:
# First extraction of the raw text (forced to str) and the labels as arrays.
# NOTE: both names are reassigned later in the notebook (sentences from the
# cleaned text, labels from the same 'label' column).
sentences = df['sentence'].astype(str).values
labels = df['label'].values


# Preprocessing (Cleaning and Label Encoding)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

Extract the sentences and clean them (lowercasing, removing digits, punctuation and English stopwords), using the NLTK library for the stopword list.

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopwords corpus (common words such as "the", "and", "is").
nltk.download('stopwords')
# Keep the English stopwords in a set; clean_text tests each word against it.
stop_words = set(stopwords.words('english'))

def clean_text(text, stopword_set=None):
    """Normalize one sentence before tokenization.

    Steps, in order: lowercase, remove digit runs, remove ASCII punctuation,
    remove stopwords, and collapse whitespace to single spaces.

    Args:
        text: The raw sentence. Non-string values are converted with ``str``;
            missing values (``None`` or float NaN) are cleaned to ``""``
            instead of raising ``AttributeError`` on ``.lower()``.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the notebook-level ``stop_words`` set.

    Returns:
        The cleaned sentence as a single-space-separated string (may be empty).
    """
    # Missing cells arrive from pandas as None or float NaN; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    # split() + join drops the stopwords and also trims/collapses whitespace,
    # so a separate strip() is not needed.
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Run every raw sentence through clean_text and store the result as a new column.
df["cleaned_sentence"] = df["sentence"].apply(clean_text)
# From here on `sentences` holds the cleaned text (as an array), not the raw text.
sentences = df["cleaned_sentence"].values


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Raw intent labels, one per sentence
labels = df["label"].values

# Learn the label -> integer mapping, then apply it to the same labels
label_encoder = LabelEncoder()
label_encoder.fit(labels)
encoded_labels = label_encoder.transform(labels)  # Integer class id per sentence

# One-hot encode the class ids (the model is trained with categorical_crossentropy);
# the width of each row is the number of distinct labels
num_classes = len(label_encoder.classes_)
one_hot_labels = to_categorical(encoded_labels, num_classes=num_classes)


# Tokenization and Padding

In [None]:
# Build the vocabulary from the cleaned sentences; "<OOV>" stands in for unseen words.
# NOTE(review): the tokenizer is fitted on every sentence, i.e. before the train/test split.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
# +1 because word indices start at 1, which leaves index 0 free for padding
vocab_size = 1 + len(tokenizer.word_index)

# Replace each word by its integer index
sequences = tokenizer.texts_to_sequences(sentences)

# Pad at the end with zeros up to the longest sequence, so every row has the same length
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')


In [None]:
# Inspect the fitted vocabulary.
print(tokenizer.word_index)  # word-to-index mapping
print(f"Vocabulary size: {vocab_size}")  # len(word_index) + 1


{'<OOV>': 1, 'mattress': 2, 'order': 3, 'want': 4, 'sof': 5, 'size': 6, 'available': 7, 'need': 8, 'emi': 9, 'cost': 10, 'ergo': 11, 'trial': 12, 'get': 13, 'buy': 14, 'back': 15, 'price': 16, 'features': 17, 'ortho': 18, 'product': 19, 'warranty': 20, 'status': 21, 'cod': 22, 'know': 23, 'pillows': 24, 'offer': 25, 'pincode': 26, 'delivery': 27, 'tell': 28, 'products': 29, 'cancel': 30, 'option': 31, 'night': 32, 'return': 33, 'call': 34, 'paisa': 35, 'deliver': 36, 'variants': 37, 'offers': 38, 'help': 39, 'please': 40, 'store': 41, 'finance': 42, 'custom': 43, 'days': 44, 'sizes': 45, 'showroom': 46, 'shop': 47, 'pain': 48, 'difference': 49, 'exchange': 50, 'inches': 51, 'show': 52, 'touch': 53, 'x': 54, 'payment': 55, 'possible': 56, 'problem': 57, 'looking': 58, 'mattresses': 59, 'comparison': 60, 'details': 61, 'check': 62, 'chart': 63, 'agent': 64, 'state': 65, 'nearby': 66, 'different': 67, 'month': 68, 'discount': 69, 'provide': 70, 'options': 71, 'installments': 72, 'cash': 7

In [None]:
# Sanity check on the padding step.
print(padded_sequences[:5])  # First 5 padded sequences; all rows should have the same length
print(f"Padded shape: {padded_sequences.shape}")  # (number of sequences, max_length)


[[107  70   9  31   0   0   0   0   0   0   0   0]
 [ 25 108 109   9  55  71   0   0   0   0   0   0]
 [  9   0   0   0   0   0   0   0   0   0   0   0]
 [  9   0   0   0   0   0   0   0   0   0   0   0]
 [  4 110   0   0   0   0   0   0   0   0   0   0]]
Padded shape: (328, 12)


# Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set. random_state pins the shuffle so the
# split is repeatable; stratify is given the one-hot label rows so that both parts
# keep a similar label distribution.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, one_hot_labels, test_size=0.2, random_state=42, stratify=one_hot_labels
)


In [None]:
# Confirm the split sizes: X_* are (samples, max_length), y_* are (samples, num_classes).
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (262, 12)
y_train shape: (262, 21)
X_test shape: (66, 12)
y_test shape: (66, 21)


# Embedding

In [None]:
# Downloading the GloVe model
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip


--2025-02-20 17:42:57--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-02-20 17:42:57--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-02-20 17:42:57--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
import numpy as np

# Read the 100-dimensional GloVe file into a {word: vector} lookup.
# Each line holds a word followed by its whitespace-separated coefficients.
embedding_index = {}
with open("glove.6B.100d.txt", encoding="utf-8") as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        # fields[0] is the word, the remaining fields are its embedding values
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype="float32")


In [None]:
# Width of the GloVe vectors; must match the loaded file (glove.6B.100d.txt -> 100)
embed_dim = 100
# Row r holds the vector of the word whose tokenizer index is r. Rows for words
# that GloVe does not contain (and the unused row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, embed_dim))
for token, row in tokenizer.word_index.items():
    if token in embedding_index:
        embedding_matrix[row] = embedding_index[token]



In [None]:
# Expected shape: (vocab_size, embed_dim)
print(embedding_matrix.shape)


(219, 100)


# Building LSTM Model

In [None]:
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Input

# Sequence length fed to the model. Taken from the padding step (max_length) instead of
# a hard-coded number, so it always matches the width of the padded sequences.
input_length = max_length


model = keras.models.Sequential([
    Input(shape=(input_length,)),  # One padded sequence of word indices per sample
    # Frozen lookup table initialised with the GloVe vectors. output_dim reuses embed_dim
    # so it cannot disagree with the embedding matrix. The deprecated `input_length`
    # argument is omitted because the Input layer already fixes the sequence length.
    Embedding(input_dim=vocab_size, output_dim=embed_dim,
              weights=[embedding_matrix], trainable=False),
    # Reads the sequence in both directions; return_sequences=False keeps only the final output
    Bidirectional(LSTM(64, dropout=0.1, return_sequences=False)),
    Dense(64, activation='relu'),
    Dropout(0.4),
    # One probability per intent class
    Dense(num_classes, activation='softmax')
])

# Compile model: categorical cross-entropy matches the one-hot encoded labels
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Model summary to check that the layers and output shapes are as expected
model.summary()




In [None]:
# Quick inspection of the raw (string) label array before training.
print(labels.shape)
print(type(labels))
print(labels[:5])  # Preview first 5 labels


(328,)
<class 'numpy.ndarray'>
['EMI' 'EMI' 'EMI' 'EMI' 'EMI']


In [None]:
# Train for 15 epochs in mini-batches of 16; the returned object is kept in `training`.
# NOTE(review): the test split is also passed as validation data, so the evaluate()
# call later in the notebook scores data that was already monitored during training.
training = model.fit(X_train, y_train, epochs=15, batch_size=16, validation_data=(X_test, y_test))


Epoch 1/15
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 57ms/step - accuracy: 0.0640 - loss: 3.0306 - val_accuracy: 0.2727 - val_loss: 2.9336
Epoch 2/15
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.2358 - loss: 2.9129 - val_accuracy: 0.4394 - val_loss: 2.7842
Epoch 3/15
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.3650 - loss: 2.6852 - val_accuracy: 0.4091 - val_loss: 2.4209
Epoch 4/15
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.4009 - loss: 2.2508 - val_accuracy: 0.4394 - val_loss: 2.0510
Epoch 5/15
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.4837 - loss: 1.8750 - val_accuracy: 0.5152 - val_loss: 1.7623
Epoch 6/15
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.5917 - loss: 1.6026 - val_accuracy: 0.6212 - val_loss: 1.5044
Epoch 7/15
[1m17/17[0m [32m━━━━

In [None]:
# Score the trained model on the held-out split; evaluate() returns the loss
# followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.7846 - loss: 0.8036
Test Accuracy: 0.79


In [None]:
from sklearn.metrics import classification_report

# Class probabilities for every test sample (one column per class)
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)  # Convert probabilities to class ids
y_true_classes = y_test.argmax(axis=1)  # Convert one-hot encoded labels to class ids

# Report per-intent metrics under the original label names instead of bare integer ids.
# Passing the full id list keeps names and rows aligned even if some class is
# absent from the test split.
class_ids = list(range(num_classes))
class_names = [str(name) for name in label_encoder.classes_]
print(classification_report(y_true_classes, y_pred_classes,
                            labels=class_ids, target_names=class_names))


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 220ms/step
              precision    recall  f1-score   support

           0       0.75      0.75      0.75         4
           1       0.67      1.00      0.80         2
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         2
           4       1.00      0.50      0.67         2
           5       1.00      0.50      0.67         2
           6       0.50      0.50      0.50         2
           7       1.00      0.86      0.92         7
           8       0.83      1.00      0.91         5
           9       1.00      0.50      0.67         2
          10       1.00      0.75      0.86         4
          11       1.00      0.80      0.89         5
          12       1.00      1.00      1.00         2
          13       0.75      0.75      0.75         4
          14       0.80      1.00      0.89         4
          15       1.00      1.00      1.00         2
        

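Checking for any unpredicted classes
* If `y_pred` and `y_test_classes` contain the same number of distinct classes, the model most likely predicted every class in the test set at least once (equal counts are a quick sanity check, not a proof).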

In [None]:
# Class probabilities for each test sample (one column per class)
y_pred_prob = model.predict(X_test)
# The column with the highest probability is the predicted class id
y_pred = y_pred_prob.argmax(axis=1)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step


In [None]:
# True class ids of the test samples (one-hot rows back to integers)
y_test_classes = np.argmax(y_test, axis=1)
print(len(np.unique(y_pred)))  # Number of distinct classes the model predicted
print(len(np.unique(y_test_classes)))  # Number of distinct classes in the test set

# Equal counts alone do not show which classes were hit, so list the test-set
# classes the model never predicted (an empty list means none were missed).
missing_ids = sorted(set(np.unique(y_test_classes)) - set(np.unique(y_pred)))
print(f"Unpredicted classes: {[str(label_encoder.classes_[i]) for i in missing_ids]}")


21
21
