In [1]:
import pandas as pd

# Load the Bitext customer-service sample. The path is relative, so the CSV
# must sit in the notebook's working directory.
file_path = 'Bitext_Sample_Customer_Service_Training_Dataset.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
print(data.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() appended a stray "None".
data.info()


  flags                                          utterance category  \
0    BM            I have problems with canceling an order    ORDER   
1   BIM  how can I find information about canceling ord...    ORDER   
2     B          I need help with canceling the last order    ORDER   
3   BIP  could you help me cancelling the last order I ...    ORDER   
4     B            problem with cancelling an order I made    ORDER   

         intent  
0  cancel_order  
1  cancel_order  
2  cancel_order  
3  cancel_order  
4  cancel_order  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8175 entries, 0 to 8174
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   flags      8175 non-null   object
 1   utterance  8175 non-null   object
 2   category   8175 non-null   object
 3   intent     8175 non-null   object
dtypes: object(4)
memory usage: 255.6+ KB
None


In [2]:
# Report how many missing values each column holds
missing_per_column = data.isnull().sum()
print(missing_per_column)

# Keep only fully populated rows (nothing is dropped when no value is missing)
data_cleaned = data.dropna()

# Function to clean text data
import re

def clean_text(text):
    """Normalise an utterance to lowercase letters and whitespace.

    Every character that is not an ASCII letter or whitespace (digits,
    punctuation, symbols) is deleted, not replaced by a space. The result is
    then lowercased and stripped of leading/trailing whitespace.

    Args:
        text: The raw utterance string.

    Returns:
        The cleaned string; empty if nothing but removed characters remained.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_only.lower().strip()

# Add the cleaned text as a new column. assign() returns a new DataFrame
# instead of writing into the dropna() result in place, which can trigger
# SettingWithCopyWarning once dropna() has actually removed rows.
data_cleaned = data_cleaned.assign(
    utterance_cleaned=data_cleaned['utterance'].apply(clean_text)
)

# Display the cleaned text data
print(data_cleaned['utterance_cleaned'].head())


flags        0
utterance    0
category     0
intent       0
dtype: int64
0              i have problems with canceling an order
1    how can i find information about canceling orders
2            i need help with canceling the last order
3    could you help me cancelling the last order i ...
4              problem with cancelling an order i made
Name: utterance_cleaned, dtype: object


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF featuriser: at most 1000 terms, English stop words removed
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

# Learn the vocabulary and vectorise every cleaned utterance in one pass.
# NOTE(review): the vectoriser is fitted on all rows before the train/test
# split performed later, so test-set text influences the features — confirm
# this is acceptable for the reported accuracies.
cleaned_utterances = data_cleaned['utterance_cleaned']
X = tfidf_vectorizer.fit_transform(cleaned_utterances)

# Report the shape of the TF-IDF matrix
print(X.shape)


(8175, 612)


In [4]:
from sklearn.preprocessing import LabelEncoder

# Turn the intent names into integer class labels
label_encoder = LabelEncoder()
intent_names = data_cleaned['intent']
y = label_encoder.fit_transform(intent_names)

# Show the intent names known to the encoder
print(label_encoder.classes_)


['cancel_order' 'change_order' 'change_shipping_address'
 'check_cancellation_fee' 'check_invoice' 'check_payment_methods'
 'check_refund_policy' 'complaint' 'contact_customer_service'
 'contact_human_agent' 'create_account' 'delete_account'
 'delivery_options' 'delivery_period' 'edit_account' 'get_invoice'
 'get_refund' 'newsletter_subscription' 'payment_issue' 'place_order'
 'recover_password' 'registration_problems' 'review'
 'set_up_shipping_address' 'switch_account' 'track_order' 'track_refund']


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition
for split_name, split_matrix in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} set size:", split_matrix.shape)


Training set size: (6540, 612)
Testing set size: (1635, 612)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic-regression classifier, allowing up to 200 solver iterations
log_reg_model = LogisticRegression(max_iter=200)
log_reg_model.fit(X_train, y_train)

# Predict the held-out utterances and compute overall accuracy
y_pred = log_reg_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy * 100:.2f}%")

# Per-intent precision / recall / F1, labelled with the intent names
log_reg_report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print(log_reg_report)


Logistic Regression Accuracy: 97.74%
                          precision    recall  f1-score   support

            cancel_order       0.98      0.98      0.98        62
            change_order       1.00      1.00      1.00        70
 change_shipping_address       1.00      0.98      0.99        60
  check_cancellation_fee       0.98      0.98      0.98        66
           check_invoice       0.93      0.65      0.77        63
   check_payment_methods       1.00      0.96      0.98        68
     check_refund_policy       0.98      1.00      0.99        59
               complaint       1.00      1.00      1.00        52
contact_customer_service       0.98      1.00      0.99        61
     contact_human_agent       1.00      1.00      1.00        57
          create_account       0.98      0.95      0.97        62
          delete_account       0.98      1.00      0.99        53
        delivery_options       0.96      1.00      0.98        55
         delivery_period       0.98   

In [8]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier, fitted on the training split.
# accuracy_score and classification_report come from the import in the
# logistic-regression cell, so that cell must have been run first.
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Score the held-out set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"SVC Accuracy: {accuracy * 100:.2f}%")

# Per-intent precision / recall / F1
svc_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(svc_report)


SVC Accuracy: 97.86%
                          precision    recall  f1-score   support

            cancel_order       1.00      1.00      1.00        62
            change_order       0.99      1.00      0.99        70
 change_shipping_address       1.00      0.98      0.99        60
  check_cancellation_fee       1.00      1.00      1.00        66
           check_invoice       0.97      0.62      0.76        63
   check_payment_methods       0.99      1.00      0.99        68
     check_refund_policy       1.00      1.00      1.00        59
               complaint       1.00      1.00      1.00        52
contact_customer_service       0.98      1.00      0.99        61
     contact_human_agent       0.98      1.00      0.99        57
          create_account       0.98      0.94      0.96        62
          delete_account       0.96      1.00      0.98        53
        delivery_options       1.00      1.00      1.00        55
         delivery_period       1.00      1.00      1.0

In [32]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees and a fixed seed for repeatable results
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the held-out set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"RandomForestClassifier Accuracy: {accuracy * 100:.2f}%")

# Per-intent precision / recall / F1
rf_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(rf_report)


RandomForestClassifier Accuracy: 96.94%
                          precision    recall  f1-score   support

            cancel_order       1.00      0.98      0.99        62
            change_order       0.99      1.00      0.99        70
 change_shipping_address       1.00      0.98      0.99        60
  check_cancellation_fee       1.00      1.00      1.00        66
           check_invoice       0.75      0.67      0.71        63
   check_payment_methods       0.99      1.00      0.99        68
     check_refund_policy       0.98      1.00      0.99        59
               complaint       1.00      1.00      1.00        52
contact_customer_service       0.98      1.00      0.99        61
     contact_human_agent       0.97      0.98      0.97        57
          create_account       0.98      0.92      0.95        62
          delete_account       0.96      1.00      0.98        53
        delivery_options       1.00      1.00      1.00        55
         delivery_period       0.98

In [33]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with its default settings
model = MultinomialNB()
model.fit(X_train, y_train)

# Score the held-out set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"MultinomialNB Accuracy: {accuracy * 100:.2f}%")

# Per-intent precision / recall / F1
nb_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(nb_report)




MultinomialNB Accuracy: 97.19%
                          precision    recall  f1-score   support

            cancel_order       0.98      0.98      0.98        62
            change_order       1.00      1.00      1.00        70
 change_shipping_address       1.00      0.98      0.99        60
  check_cancellation_fee       0.98      0.98      0.98        66
           check_invoice       0.88      0.68      0.77        63
   check_payment_methods       1.00      0.96      0.98        68
     check_refund_policy       0.98      0.98      0.98        59
               complaint       1.00      1.00      1.00        52
contact_customer_service       0.95      1.00      0.98        61
     contact_human_agent       1.00      0.98      0.99        57
          create_account       1.00      0.95      0.98        62
          delete_account       1.00      1.00      1.00        53
        delivery_options       0.96      1.00      0.98        55
         delivery_period       1.00      1.0

In [9]:
# Canned chatbot reply for each supported intent, keyed by intent name.
# Intents without an entry here receive the default message chosen in
# get_response(); add further intents as keyword arguments as needed.
responses = dict(
    cancel_order="Your order has been canceled.",
    complaint="I'm sorry to hear that you have a complaint. Let me help you with that.",
    contact_customer_service="You can reach customer service via email or phone.",
    track_order="You can track your order by visiting the tracking page.",
    payment_issue="It looks like there was a payment issue. Please check your payment method.",
    get_refund="You can request a refund by filling out the form on our website.",
)

# Function to predict intent and respond
def get_response(user_input):
    """Predict the intent of a user message and return the chatbot's reply.

    The message is cleaned with clean_text(), vectorised with the fitted
    tfidf_vectorizer, classified by log_reg_model, and the predicted class is
    mapped back to its intent name through label_encoder.

    Args:
        user_input: The raw text typed by the user.

    Returns:
        The entry of `responses` for the predicted intent, or the fallback
        message when the intent has no entry or when the message contains no
        term from the fitted vocabulary.
    """
    fallback = "Sorry, I didn't understand that."

    # Clean the user input the same way the training utterances were cleaned
    user_input_cleaned = clean_text(user_input)

    # Convert the input into the same format as our training data
    user_input_vectorized = tfidf_vectorizer.transform([user_input_cleaned])

    # No stored entries means the message had no known term (empty input,
    # digits/punctuation only, only stop words, ...). The classifier would
    # still return some intent for an all-zero vector, so answer with the
    # fallback instead of an arbitrary reply.
    if user_input_vectorized.nnz == 0:
        return fallback

    # Predict the intent using the Logistic Regression model
    predicted_intent_index = log_reg_model.predict(user_input_vectorized)[0]

    # Convert the predicted index back to the intent label
    predicted_intent = label_encoder.inverse_transform([predicted_intent_index])[0]

    # Get the chatbot response based on the predicted intent
    return responses.get(predicted_intent, fallback)


In [10]:
# Announce the chatbot and how to leave it
print("Chatbot is running. Type 'exit' to end the conversation.")

# Read the first message, then keep answering until the user types 'exit'
# (the comparison is case-insensitive).
user_input = input("You: ")
while user_input.lower() != 'exit':
    # Answer the current message, then prompt for the next one
    response = get_response(user_input)
    print(f"Chatbot: {response}")
    user_input = input("You: ")

print("Chatbot: Goodbye!")


Chatbot is running. Type 'exit' to end the conversation.


You:  where is my order


Chatbot: You can track your order by visiting the tracking page.


You:  exit


Chatbot: Goodbye!


In [10]:
from sklearn.svm import SVC

# Re-fit the linear-kernel SVC. This rebinds `model`, so the save cell that
# follows (which dumps `model`) persists this classifier.
model = SVC(kernel='linear').fit(X_train, y_train)

# Score the held-out set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"SVC Accuracy: {accuracy * 100:.2f}%")

# Per-intent precision / recall / F1
svc_report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print(svc_report)


SVC Accuracy: 97.86%
                          precision    recall  f1-score   support

            cancel_order       1.00      1.00      1.00        62
            change_order       0.99      1.00      0.99        70
 change_shipping_address       1.00      0.98      0.99        60
  check_cancellation_fee       1.00      1.00      1.00        66
           check_invoice       0.97      0.62      0.76        63
   check_payment_methods       0.99      1.00      0.99        68
     check_refund_policy       1.00      1.00      1.00        59
               complaint       1.00      1.00      1.00        52
contact_customer_service       0.98      1.00      0.99        61
     contact_human_agent       0.98      1.00      0.99        57
          create_account       0.98      0.94      0.96        62
          delete_account       0.96      1.00      0.98        53
        delivery_options       1.00      1.00      1.00        55
         delivery_period       1.00      1.00      1.0

In [12]:
import joblib

# Save the fitted classifier currently bound to `model`.
# NOTE(review): `model` is rebound by each of the SVC, random-forest and
# naive-Bayes cells; in top-to-bottom order it is the linear SVC fitted in
# the preceding cell, not `log_reg_model`, which is what get_response()
# uses — confirm which classifier is meant to be persisted.
joblib.dump(model, 'model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Save the Label Encoder for intent mapping
joblib.dump(label_encoder, 'label_encoder.pkl')


['label_encoder.pkl']