## Title: Clustering Queries For Enhanced Customer Support
### submitted by <br/> 1) Lakshya Mahawar, 20116049, ECE <br/> 2) Pranav Arya, 20116064, ECE

Importing necessary libraries

In [16]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import davies_bouldin_score
import string
import warnings

Importing data from 'queries.csv' file

In [17]:
df = pd.read_csv('queries.csv')
df.head()

Unnamed: 0,QuestionID,QuestionType,Category,AskerID,QuestionTime,QuestionText
0,C1Q1,yes/no,Automotive,A365S8H55GGXPD,"July 19, 2013",will they fit 2013 f350 dually
1,C1Q2,yes/no,Automotive,AXOOEUYEJ87ZB,"May 7, 2014",will they fit 2014 mazda 3 sport?
2,C1Q3,open-ended,Automotive,AN2AMELSNPN99,"June 20, 2014",Do they fit a 1998 GMC Sierra 3 door?
3,C1Q4,yes/no,Automotive,A367QVRWPWFTLT,"September 3, 2013",will this work on 95 bonneville and 94 camaro
4,C1Q5,yes/no,Automotive,A15Q5XTXEKWK0E,"November 3, 2013",will this fit a 1996 4 winns 238 vista 5.0 w/v...


Removing unnecessary columns

In [18]:
df.drop(['QuestionID', 'Category', 'AskerID', 'QuestionTime'], axis = 1, inplace = True)
df.head()

Unnamed: 0,QuestionType,QuestionText
0,yes/no,will they fit 2013 f350 dually
1,yes/no,will they fit 2014 mazda 3 sport?
2,open-ended,Do they fit a 1998 GMC Sierra 3 door?
3,yes/no,will this work on 95 bonneville and 94 camaro
4,yes/no,will this fit a 1996 4 winns 238 vista 5.0 w/v...


Tokenizing and removing stopwords

In [19]:
english_stop_words = stopwords.words('english')

def cleaning(message):
    Test_punc_removed = [char for char in message if char not in string.punctuation]
    Test_punc_removed_join = ''.join(Test_punc_removed)
    Test_punc_removed_join=Test_punc_removed_join.split()
    Test_punc_removed_join_clean = [word+" " for word in Test_punc_removed_join if word.lower() not in english_stop_words and word.isalpha()]
    Test_punc_removed_join_Text = ' '.join(Test_punc_removed_join_clean)
    return Test_punc_removed_join_Text


df['QuestionText'] = df['QuestionText'].apply(cleaning)
df.head()

Unnamed: 0,QuestionType,QuestionText
0,yes/no,fit dually
1,yes/no,fit mazda sport
2,open-ended,fit GMC Sierra door
3,yes/no,work bonneville camaro
4,yes/no,fit winns vista wvolvo sx drive


Lemmatizing the text

In [20]:
lemmatizer = WordNetLemmatizer()
def stemming(message):
    message = message.split()
    lemma = [lemmatizer.lemmatize(word) for word in message]
    lemma = ' '.join(lemma)
    return lemma
df['QuestionText'] = df['QuestionText'].apply(stemming)
df.head()

Unnamed: 0,QuestionType,QuestionText
0,yes/no,fit dually
1,yes/no,fit mazda sport
2,open-ended,fit GMC Sierra door
3,yes/no,work bonneville camaro
4,yes/no,fit winns vista wvolvo sx drive


Count Vectorization

In [21]:
warnings.filterwarnings('ignore')
transformer = CountVectorizer(min_df=0.05, lowercase=True).fit(df['QuestionText'])
vocab = transformer.get_feature_names()
for i in vocab:
    x=[]
    for j in df["QuestionText"]:
        counter=0
        if i in j:
            x.append(j.count(i))
        else:
            x.append(0)
    df[i]=x
df['QuestionText'] = df['QuestionText'].apply(cleaning)
df.head()

Unnamed: 0,QuestionType,QuestionText,come,fit,one,use,work
0,yes/no,fit dually,0,1,0,0,0
1,yes/no,fit mazda sport,0,1,0,0,0
2,open-ended,fit GMC Sierra door,0,1,0,0,0
3,yes/no,work bonneville camaro,0,0,0,0,1
4,yes/no,fit winns vista wvolvo sx drive,0,1,0,0,0


Label Encoding

In [22]:
le = LabelEncoder()
le.fit_transform(df['QuestionType'])
df['QuestionType'] = le.fit_transform(df['QuestionType'])

Preparing dataset for training

In [23]:
X = df.drop(['QuestionText'], axis = 1)
X.head()

Unnamed: 0,QuestionType,come,fit,one,use,work
0,1,0,1,0,0,0
1,1,0,1,0,0,0
2,0,0,1,0,0,0
3,1,0,0,0,0,1
4,1,0,1,0,0,0


Splitting the dataset for test and train purpose

In [24]:
X_train,X_test = train_test_split(X, test_size = 0.2, random_state = 49)
X_train.head()

Unnamed: 0,QuestionType,come,fit,one,use,work
114157,0,0,0,0,0,0
95579,0,0,0,0,0,0
60847,0,0,0,0,0,0
75620,0,0,0,0,0,0
11572,0,0,0,0,0,0


Applying KMeans algorithm

In [25]:
kmeans = KMeans(n_clusters = 2**len(vocab)).fit(X_train)
labels_K = kmeans.predict(X_test)

KMeans Score

In [26]:
print("KMeans score : ", silhouette_score(X_test, labels_K))

KMeans score :  0.9439200070448699


Applying Gaussian Mixture Model

In [27]:

GMM =  GaussianMixture(n_components = 2**len(vocab)).fit(X_train)
labels_G = GMM.predict(X_test)

Gaussian Mixture Model score

In [28]:
print("GMM score : ", silhouette_score(X_test, labels_G))

GMM score :  0.9344284119977065


## THANK YOU :)