<a href="https://colab.research.google.com/github/jahnavikolli/Intent-Classification-for-Conversational-AI-Systems/blob/main/intent_classification_with_SVM_and_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas scikit-learn




# Loading Dataset

In [None]:
import pandas as pd

from urllib.parse import quote

# Google Sheets document id and the tab (worksheet) to export as CSV.
sheet_id = "1BG4GYGscyd4inQ2RuZUunzhxil2q0OklbktNRCOuNLg"
sheet_name = "sofmattress_train"  # Change this to match your sheet name

# Percent-encode the tab name so that names containing spaces, '&', '#', etc.
# still produce a valid query string; plain names pass through unchanged.
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={quote(sheet_name, safe='')}"
df = pd.read_csv(url)

# Fail early with a clear message if the export does not have the two columns
# the rest of the notebook relies on.
missing_columns = {"sentence", "label"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"Sheet {sheet_name!r} is missing expected column(s): {sorted(missing_columns)}"
    )

print(df.head())

                                         sentence label
0                    You guys provide EMI option?   EMI
1  Do you offer Zero Percent EMI payment options?   EMI
2                                         0% EMI.   EMI
3                                             EMI   EMI
4                           I want in installment   EMI


In [None]:
# Number of training sentences available for each intent label.
label_counts = df['label'].value_counts()
print(label_counts)

label
DISTRIBUTORS             34
EMI                      25
MATTRESS_COST            22
LEAD_GEN                 21
PRODUCT_VARIANTS         21
ORDER_STATUS             21
WHAT_SIZE_TO_ORDER       20
100_NIGHT_TRIAL_OFFER    18
ORTHO_FEATURES           17
RETURN_EXCHANGE          14
COD                      12
COMPARISON               11
ERGO_FEATURES            11
ABOUT_SOF_MATTRESS       11
DELAY_IN_DELIVERY        11
CANCEL_ORDER             10
PILLOWS                  10
CHECK_PINCODE            10
WARRANTY                 10
OFFERS                   10
SIZE_CUSTOMIZATION        9
Name: count, dtype: int64


In [None]:
# How many distinct intent labels the dataset contains.
n_intents = df['label'].nunique()
print(n_intents)


21


# Text Preprocessing

In [None]:
import re
from sklearn.preprocessing import LabelEncoder

def clean_text(text):
    """Return `text` lower-cased with punctuation stripped.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is removed. Whitespace itself is left
    untouched, so removed punctuation is not replaced by anything.
    """
    # Lower-case first, then drop everything outside \w and \s.
    return re.sub(r'[^\w\s]', '', text.lower())

# Normalise every sentence in place; values are cast to str first so that
# clean_text always receives a string.
df['sentence'] = df['sentence'].astype(str).map(clean_text)

# Encode labels into numbers
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['label_encoded'] = label_encoder.transform(df['label'])

print(df.head())



                                        sentence label  label_encoded
0                    you guys provide emi option   EMI              8
1  do you offer zero percent emi payment options   EMI              8
2                                          0 emi   EMI              8
3                                            emi   EMI              8
4                          i want in installment   EMI              8


# Convert Text to Features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned sentences into a TF-IDF matrix (vocabulary capped at 5000 terms).
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the
# train/test split done later in the notebook, so the vocabulary and IDF
# weights also reflect the held-out sentences. Confirm this is acceptable.
sentences = df['sentence']
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(sentences)

# Integer-encoded intent labels are the prediction target.
y = df['label_encoded']

# Shape of the extracted feature matrix
tfidf_shape = X.shape
print("TF-IDF Shape:", tfidf_shape)


TF-IDF Shape: (328, 278)


# Training


SVM and Random Forest Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the rows for testing; stratifying on y keeps the label
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Linear-kernel SVM with class weights balanced against label frequency.
svm_model = SVC(kernel='linear', class_weight='balanced').fit(X_train, y_train)

# Random forest of 100 trees, seeded for repeatability.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)



# Testing (Accuracy, Precision, Recall)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predictions on the held-out test split.
svm_preds = svm_model.predict(X_test)
rf_preds = rf_model.predict(X_test)

# Encoded ids of every known class. Passing them explicitly keeps the report
# rows aligned with target_names even if a class is absent from the test split.
all_labels = list(range(len(label_encoder.classes_)))

# Same report for each model: overall accuracy, then per-class
# precision / recall / F1.
for title, preds in (("SVM Results", svm_preds), ("Random Forest Results", rf_preds)):
    print(title)
    print("Accuracy:", accuracy_score(y_test, preds))
    print(
        classification_report(
            y_test,
            preds,
            labels=all_labels,
            target_names=label_encoder.classes_,
            # A class that is never predicted has undefined precision; report it
            # as 0 without emitting an UndefinedMetricWarning for each metric.
            zero_division=0,
        )
    )



SVM Results
Accuracy: 0.7727272727272727
                       precision    recall  f1-score   support

100_NIGHT_TRIAL_OFFER       1.00      0.75      0.86         4
   ABOUT_SOF_MATTRESS       0.50      0.50      0.50         2
         CANCEL_ORDER       1.00      1.00      1.00         2
        CHECK_PINCODE       1.00      1.00      1.00         2
                  COD       0.67      1.00      0.80         2
           COMPARISON       1.00      1.00      1.00         2
    DELAY_IN_DELIVERY       1.00      0.50      0.67         2
         DISTRIBUTORS       0.54      1.00      0.70         7
                  EMI       1.00      0.60      0.75         5
        ERGO_FEATURES       0.00      0.00      0.00         2
             LEAD_GEN       0.50      1.00      0.67         4
        MATTRESS_COST       1.00      1.00      1.00         5
               OFFERS       1.00      1.00      1.00         2
         ORDER_STATUS       1.00      0.50      0.67         4
       ORTHO_

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import numpy as np
# Encoded labels that occur in the test split but that the SVM never predicted.
missing_labels_svm = np.setdiff1d(ar1=y_test, ar2=svm_preds, assume_unique=False)
print("SVM Unpredicted Labels:", missing_labels_svm)


SVM Unpredicted Labels: [9]


In [None]:
# Encoded labels that occur in the test split but that the random forest never predicted.
missing_labels_rf = np.setdiff1d(ar1=y_test, ar2=rf_preds, assume_unique=False)
print("Random Forest Unpredicted Labels:", missing_labels_rf)


Random Forest Unpredicted Labels: []
