apply preprocessing on utterance 
apply one hot encoding on intent(which is converted into sentence)

do the prediction and display the intent and not the one hot encoded message 



In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import nltk

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM

# Assuming necessary NLTK data files are already downloaded
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Load the CSV file into a pandas DataFrame
file_path = '20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.csv'
df = pd.read_csv(file_path)

# Drop the specified columns
columns_to_drop = ['category', 'flags']
df.drop(columns=columns_to_drop, inplace=True)

# Define the mapping dictionary
intent_mapping = {
    'create_account': "To create a new account, go to the sign-up page and fill in your details.",
    'delete_account': "To delete your account, navigate to your account settings and select 'Delete Account'.",
    'edit_account': "To update your account details, go to your profile and select 'Edit Profile'.",
    'recover_password': "To recover your password, click on 'Forgot Password' and follow the instructions.",
    'registration_problems': "If you have registration issues, check the provided information and ensure all required fields are filled.",
    'switch_account': "To switch accounts, log out of your current account and log in with the other account credentials.",
    'check_cancellation_fee': "To check the cancellation fee, visit the cancellation policy section in our terms and conditions.",
    'contact_customer_service': "To contact customer service, use the 'Contact Us' form on our website.",
    'contact_human_agent': "To chat with a human agent, use the live chat feature available on our support page.",
    'delivery_options': "To view delivery options, go to the shipping section during checkout.",
    'delivery_period': "To check the delivery period, view the estimated delivery date provided at checkout.",
    'complaint': "To file a complaint, fill out the complaint form available on our support page.",
    'review': "To leave a review, go to the product page and click on 'Write a Review'.",
    'check_invoices': "To check your invoices, log in to your account and go to the 'Invoices' section.",
    'get_invoice': "To get a copy of your invoice, access the 'Orders' section in your account and select 'View Invoice'.",
    'newsletter_subscription': "To subscribe to our newsletter, enter your email in the subscription box at the bottom of the homepage.",
    'cancel_order': "To cancel your order, go to your order history and select 'Cancel Order'.",
    'change_order': "To change your order, go to your order details and select 'Edit Order'.",
    'place_order': "To place an order, add items to your cart and proceed to checkout.",
    'track_order': "To track your order, enter your order number in the tracking section on our website.",
    'check_payment_methods': "To view available payment methods, go to the payment options section during checkout.",
    'payment_issue': "If you have a payment issue, check your payment details and try again.",
    'check_refund_policy': "To view our refund policy, visit the 'Refund Policy' page on our website.",
    'get_refund': "To request a refund, go to your order details and select 'Request Refund'.",
    'track_refund': "To track your refund, go to the 'Refunds' section in your account.",
    'change_shipping_address': "To update your shipping address, go to your account settings and select 'Shipping Address'.",
    'set_up_shipping_address': "To set up a new shipping address, go to your account settings and add a new address in the 'Shipping Address' section."
}

# Replace the intent values with the descriptive sentences
df['intent'] = df['intent'].replace(intent_mapping)

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define a function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join the tokens back into a string
    return ' '.join(tokens)

# Apply the preprocessing function to the 'utterance' column
df['utterance'] = df['utterance'].apply(preprocess_text)

# Create a TF-IDF representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['utterance'])
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Split the data into training and testing sets for TF-IDF
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(df_tfidf, df['intent'], test_size=0.2, random_state=42)

# Encode labels
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train_tfidf)
y_test_encoded = encoder.transform(y_test_tfidf)

# Models to evaluate
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Naive Bayes': MultinomialNB(),
    'SVC': SVC(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    # 'Gradient Boosting': GradientBoostingClassifier()
}

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train_encoded)
    accuracy = model.score(X_test_tfidf, y_test_encoded)
    print(f'TF-IDF {model_name} Model Accuracy: {accuracy:.4f}')

# Deep Learning Models
# Assuming `len(intent_mapping)` gives the number of classes for DL models

def build_dl_model(input_shape):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_shape=(input_shape,)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(intent_mapping), activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

def build_dl_lstm_model(input_shape):
    model = Sequential()
    model.add(Embedding(input_dim=input_shape, output_dim=128))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(len(intent_mapping), activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Train and evaluate deep learning models
dl_models = {
    'Dense Neural Network': build_dl_model(X_train_tfidf.shape[1]),
    'LSTM': build_dl_lstm_model(X_train_tfidf.shape[1])
}

for model_name, model in dl_models.items():
    model.fit(X_train_tfidf, y_train_encoded, epochs=10, batch_size=32, validation_split=0.1, verbose=1)
    _, accuracy = model.evaluate(X_test_tfidf, y_test_encoded, verbose=0)
    print(f'TF-IDF Deep Learning {model_name} Model Accuracy: {accuracy:.4f}')



TF-IDF Logistic Regression Model Accuracy: 0.9842
TF-IDF Decision Tree Model Accuracy: 0.9765
TF-IDF Naive Bayes Model Accuracy: 0.9587
TF-IDF SVC Model Accuracy: 0.9835
TF-IDF Random Forest Model Accuracy: 0.9872
TF-IDF XGBoost Model Accuracy: 0.9796
TF-IDF AdaBoost Model Accuracy: 0.2584


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
TF-IDF Deep Learning Dense Neural Network Model Accuracy: 0.9879
Epoch 1/10
 22/485 [>.............................] - ETA: 5:08:37 - loss: 3.0030 - accuracy: 0.1804

KeyboardInterrupt: 

In [4]:
df_tfidf.head()

Unnamed: 0,aa,aaddress,aan,aand,abck,abill,aboutgetting,aboutr,aboutt,aboutthe,...,yourpayment,youshow,youtell,youy,yoy,yu,yuou,yyou,zccount,zccounts
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import nltk

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM

# Assuming necessary NLTK data files are already downloaded
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Load the CSV file into a pandas DataFrame
file_path = '20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.csv'
df = pd.read_csv(file_path)

# Drop the specified columns
columns_to_drop = ['category', 'flags']
df.drop(columns=columns_to_drop, inplace=True)

# Define the mapping dictionary
intent_mapping = {
    'create_account': "To create a new account, go to the sign-up page and fill in your details.",
    'delete_account': "To delete your account, navigate to your account settings and select 'Delete Account'.",
    'edit_account': "To update your account details, go to your profile and select 'Edit Profile'.",
    'recover_password': "To recover your password, click on 'Forgot Password' and follow the instructions.",
    'registration_problems': "If you have registration issues, check the provided information and ensure all required fields are filled.",
    'switch_account': "To switch accounts, log out of your current account and log in with the other account credentials.",
    'check_cancellation_fee': "To check the cancellation fee, visit the cancellation policy section in our terms and conditions.",
    'contact_customer_service': "To contact customer service, use the 'Contact Us' form on our website.",
    'contact_human_agent': "To chat with a human agent, use the live chat feature available on our support page.",
    'delivery_options': "To view delivery options, go to the shipping section during checkout.",
    'delivery_period': "To check the delivery period, view the estimated delivery date provided at checkout.",
    'complaint': "To file a complaint, fill out the complaint form available on our support page.",
    'review': "To leave a review, go to the product page and click on 'Write a Review'.",
    'check_invoices': "To check your invoices, log in to your account and go to the 'Invoices' section.",
    'get_invoice': "To get a copy of your invoice, access the 'Orders' section in your account and select 'View Invoice'.",
    'newsletter_subscription': "To subscribe to our newsletter, enter your email in the subscription box at the bottom of the homepage.",
    'cancel_order': "To cancel your order, go to your order history and select 'Cancel Order'.",
    'change_order': "To change your order, go to your order details and select 'Edit Order'.",
    'place_order': "To place an order, add items to your cart and proceed to checkout.",
    'track_order': "To track your order, enter your order number in the tracking section on our website.",
    'check_payment_methods': "To view available payment methods, go to the payment options section during checkout.",
    'payment_issue': "If you have a payment issue, check your payment details and try again.",
    'check_refund_policy': "To view our refund policy, visit the 'Refund Policy' page on our website.",
    'get_refund': "To request a refund, go to your order details and select 'Request Refund'.",
    'track_refund': "To track your refund, go to the 'Refunds' section in your account.",
    'change_shipping_address': "To update your shipping address, go to your account settings and select 'Shipping Address'.",
    'set_up_shipping_address': "To set up a new shipping address, go to your account settings and add a new address in the 'Shipping Address' section."
}

# Replace the intent values with the descriptive sentences
df['intent'] = df['intent'].replace(intent_mapping)

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define a function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join the tokens back into a string
    return ' '.join(tokens)

# Apply the preprocessing function to the 'utterance' column
df['utterance'] = df['utterance'].apply(preprocess_text)

# Create a BoW representation
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(df['utterance'])
df_bow = pd.DataFrame(bow_matrix.toarray(), columns=bow_vectorizer.get_feature_names_out())

# Split the data into training and testing sets for BoW
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(df_bow, df['intent'], test_size=0.2, random_state=42)

# Models to evaluate
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Naive Bayes': MultinomialNB(),
    'SVC': SVC(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    # 'Gradient Boosting': GradientBoostingClassifier()
}

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train_bow, y_train_bow)
    accuracy = model.score(X_test_bow, y_test_bow)
    print(f'BoW {model_name} Model Accuracy: {accuracy:.4f}')

# Deep Learning Models
def build_dl_model(input_shape):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_shape=(input_shape,)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(intent_mapping), activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Example of a deeper architecture using embeddings and LSTM
def build_dl_lstm_model(input_shape):
    model = Sequential()
    model.add(Embedding(input_dim=input_shape, output_dim=128))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(len(intent_mapping), activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Train and evaluate deep learning models
dl_models = {
    'Dense Neural Network': build_dl_model(X_train_bow.shape[1]),
    'LSTM': build_dl_lstm_model(X_train_bow.shape[1])
}

for model_name, model in dl_models.items():
    model.fit(X_train_bow, y_train_bow, epochs=10, batch_size=32, validation_split=0.1, verbose=1)
    _, accuracy = model.evaluate(X_test_bow, y_test_bow, verbose=0)
    print(f'BoW Deep Learning {model_name} Model Accuracy: {accuracy:.4f}')


BoW Logistic Regression Model Accuracy: 0.9854
BoW Decision Tree Model Accuracy: 0.9770
BoW Naive Bayes Model Accuracy: 0.9759


In [4]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib  # For saving the model as .pkl

# Load the CSV file into a pandas DataFrame
file_path = '20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.csv'
df = pd.read_csv(file_path)

# Drop the specified columns
columns_to_drop = ['category', 'flags']
df.drop(columns=columns_to_drop, inplace=True)

# Define the mapping dictionary
intent_mapping = {
    'create_account': "To create a new account, go to the sign-up page and fill in your details.",
    'delete_account': "To delete your account, navigate to your account settings and select 'Delete Account'.",
    'edit_account': "To update your account details, go to your profile and select 'Edit Profile'.",
    'recover_password': "To recover your password, click on 'Forgot Password' and follow the instructions.",
    'registration_problems': "If you have registration issues, check the provided information and ensure all required fields are filled.",
    'switch_account': "To switch accounts, log out of your current account and log in with the other account credentials.",
    'check_cancellation_fee': "To check the cancellation fee, visit the cancellation policy section in our terms and conditions.",
    'contact_customer_service': "To contact customer service, use the 'Contact Us' form on our website.",
    'contact_human_agent': "To chat with a human agent, use the live chat feature available on our support page.",
    'delivery_options': "To view delivery options, go to the shipping section during checkout.",
    'delivery_period': "To check the delivery period, view the estimated delivery date provided at checkout.",
    'complaint': "To file a complaint, fill out the complaint form available on our support page.",
    'review': "To leave a review, go to the product page and click on 'Write a Review'.",
    'check_invoices': "To check your invoices, log in to your account and go to the 'Invoices' section.",
    'get_invoice': "To get a copy of your invoice, access the 'Orders' section in your account and select 'View Invoice'.",
    'newsletter_subscription': "To subscribe to our newsletter, enter your email in the subscription box at the bottom of the homepage.",
    'cancel_order': "To cancel your order, go to your order history and select 'Cancel Order'.",
    'change_order': "To change your order, go to your order details and select 'Edit Order'.",
    'place_order': "To place an order, add items to your cart and proceed to checkout.",
    'track_order': "To track your order, enter your order number in the tracking section on our website.",
    'check_payment_methods': "To view available payment methods, go to the payment options section during checkout.",
    'payment_issue': "If you have a payment issue, check your payment details and try again.",
    'check_refund_policy': "To view our refund policy, visit the 'Refund Policy' page on our website.",
    'get_refund': "To request a refund, go to your order details and select 'Request Refund'.",
    'track_refund': "To track your refund, go to the 'Refunds' section in your account.",
    'change_shipping_address': "To update your shipping address, go to your account settings and select 'Shipping Address'.",
    'set_up_shipping_address': "To set up a new shipping address, go to your account settings and add a new address in the 'Shipping Address' section."
}

# Replace the intent values with the descriptive sentences
df['intent'] = df['intent'].replace(intent_mapping)

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define a function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join the tokens back into a string
    return ' '.join(tokens)

# Apply the preprocessing function to the 'utterance' column
df['utterance'] = df['utterance'].apply(preprocess_text)

# Create a TF-IDF representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['utterance'])
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Split the data into training and testing sets for TF-IDF
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(df_tfidf, df['intent'], test_size=0.2, random_state=42)

# Train the Random Forest classifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_tfidf, y_train_tfidf)

# Evaluate the model
accuracy = rf_classifier.score(X_test_tfidf, y_test_tfidf)
print(f'TF-IDF Random Forest Model Accuracy: {accuracy:.4f}')

# Save the trained model as a .pkl file
joblib.dump(rf_classifier, 'tfidf_random_forest_model.pkl')
print("Model saved as tfidf_random_forest_model.pkl")

# Save the TF-IDF vectorizer as a .pkl file
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
print("TF-IDF Vectorizer saved as tfidf_vectorizer.pkl")


TF-IDF Random Forest Model Accuracy: 0.9870
Model saved as tfidf_random_forest_model.pkl
TF-IDF Vectorizer saved as tfidf_vectorizer.pkl


In [5]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import joblib  # For loading the model and vectorizer

# Function to preprocess text
def preprocess_text(text):
    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join the tokens back into a string
    return ' '.join(tokens)

# Load TF-IDF Vectorizer and Random Forest model
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
rf_classifier = joblib.load('tfidf_random_forest_model.pkl')

# Example input
input_text = "How do I delete my account?"

# Preprocess the input text
preprocessed_input = preprocess_text(input_text)

# Transform the preprocessed text using the loaded TF-IDF vectorizer
input_vector = tfidf_vectorizer.transform([preprocessed_input])

# Predict the intent using the loaded Random Forest model
predicted_intent = rf_classifier.predict(input_vector)[0]

# Output the predicted intent
print(f"Predicted Intent: {predicted_intent}")


Predicted Intent: To delete your account, navigate to your account settings and select 'Delete Account'.


