In [1]:
import numpy as np
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


In [2]:
import pandas as pd

df = pd.read_csv('voice_upi_dataset.csv')
df.head()

Unnamed: 0,text,intent
0,send 500 to mobile number 9912381230,transfer_money
1,use upi to send 250 to dad,transfer_money
2,i need to pay ravi 250 via upi,transfer_money
3,do i have money for a 500 purchase,check_balance
4,request for money,request_money


In [3]:
import re
from sklearn.model_selection import train_test_split

def preprocess_text(text):
    text=text.lower()  # Convert to lowercase
    text=re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
df['clean_text'] = df['text'].apply(preprocess_text)
df.head()
df.drop('text',axis=1,inplace=True)
df.head()

Unnamed: 0,intent,clean_text
0,transfer_money,send 500 to mobile number 9912381230
1,transfer_money,use upi to send 250 to dad
2,transfer_money,i need to pay ravi 250 via upi
3,check_balance,do i have money for a 500 purchase
4,request_money,request for money


In [4]:
X = df['clean_text'].astype(str)
df[' intent'] = df[' intent'].str.strip()
y= df[' intent']

In [5]:
print(df.columns)

Index([' intent', 'clean_text'], dtype='object')


In [6]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)
print(f"\nEncoded classes: {list(label_encoder.classes_)}")


Encoded classes: ['check_balance', 'request_money', 'transfer_money']


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [8]:
tokenizer = Tokenizer(num_words=5000, oov_token="<unk>")
tokenizer.fit_on_texts(X_train)

In [9]:
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)


In [10]:
max_len = 20
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len, padding='post')



In [11]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
max_len = 20

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])




In [12]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [13]:
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)
history = model.fit(X_train_padded, y_train, epochs=20, batch_size=32, validation_split=0.1, callbacks=[early_stopping])


Epoch 1/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.4766 - loss: 1.0705 - val_accuracy: 0.9706 - val_loss: 0.8697
Epoch 2/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8882 - loss: 0.7965 - val_accuracy: 1.0000 - val_loss: 0.2792
Epoch 3/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9452 - loss: 0.3020 - val_accuracy: 1.0000 - val_loss: 0.0538
Epoch 4/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9771 - loss: 0.1022 - val_accuracy: 1.0000 - val_loss: 0.0159
Epoch 5/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9894 - loss: 0.0616 - val_accuracy: 1.0000 - val_loss: 0.0064
Epoch 6/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9966 - loss: 0.0210 - val_accuracy: 1.0000 - val_loss: 0.0031
Epoch 7/20
[1m20/20[0m [32m━━━━━━━━━

In [14]:
loss, accuracy = model.evaluate(X_test_padded, y_test)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9044 - loss: 0.3955 


In [15]:
def predict_intent(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    prediction_probs = model.predict(padded_sequence)
    predicted_class_index = np.argmax(prediction_probs, axis=1)[0]
    predicted_intent = label_encoder.inverse_transform([predicted_class_index])[0]
    confidence = prediction_probs[0][predicted_class_index]
    return predicted_intent, confidence


In [16]:
print(predict_intent("What is the balance in my account?"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
('check_balance', 0.8535873)


In [17]:
print(predict_intent("Send 1000 to jen"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
('transfer_money', 0.7671452)


In [18]:
print(predict_intent("need 1000 from jen?"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
('request_money', 0.69674635)


In [19]:
print(predict_intent("whats the weather like today?"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
('check_balance', 0.3769995)


In [20]:
print(predict_intent("How are you?"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
('check_balance', 0.53907394)


In [21]:
print(predict_intent("Who are you?"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
('check_balance', 0.49556902)


In [22]:
print(predict_intent("you are an asshole"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
('check_balance', 0.42078373)


In [23]:
print(predict_intent("Go fuck yourself"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
('check_balance', 0.40268916)


In [24]:
print(predict_intent("my balance"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
('check_balance', 0.8201141)


In [25]:
print(predict_intent("please check my balance "))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
('check_balance', 0.91106904)


In [27]:
import pickle

# Save the Keras model
model.save('intent_model.h5')

# Save tokenizer and label encoder
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Save max_len for preprocessing
with open('max_len.pkl', 'wb') as f:
    pickle.dump(max_len, f)

print("Model and preprocessors saved successfully!")



Model and preprocessors saved successfully!
