# Customer Query Classification
### Hoori Javadnia & Fatemeh Khosravi

### Topic Modeling

In [3]:
#!pip install --upgrade emoji
#!pip install plotly
#!python -m nltk.downloader stopwords
#!pip install pyLDAvis==3.4.1

In [1]:
import pandas as pd
import numpy as np
import requests
import time
import random
from collections import OrderedDict
import string
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px
import nltk
import random
py.init_notebook_mode(connected=True)
from nltk.stem import WordNetLemmatizer
from random import randint
color = sns.color_palette()
%matplotlib inline
from collections import Counter, defaultdict
from string import punctuation
from nltk.corpus import stopwords
from nltk.metrics import ConfusionMatrix
import plotly.io as pio
from nltk.tokenize import word_tokenize
re.compile('<title>(.*)</title>')
import nltk
import os
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
sw = set(stopwords.words('english'))
import pyLDAvis
import pyLDAvis.lda_model
import pyLDAvis.gensim_models

### Read the Data

In [2]:
# Load the customer queries from a CSV file (path relative to the working
# directory) into a DataFrame. Later cells read its "Query" and "Class" columns.

customer_queries = pd.read_csv('customer_query.csv')


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [3]:
# Punctuation characters to strip from text, stored as a set for fast
# membership tests. This rebinds the `punctuation` name imported from `string`;
# re-running the cell is harmless because set(set) is a no-op copy.
punctuation = set(punctuation)


# English stopwords from NLTK.
# Kept as a set (not a list) so that `token in sw` is a constant-time lookup
# instead of a linear scan for every token of every document. Code that needs
# a list can still call list(sw).
sw = set(stopwords.words("english"))


# Pre-compiled pattern matching one or more consecutive whitespace characters;
# shared by the whitespace-normalisation and tokenisation helpers.
whitespace_pattern = re.compile(r"\s+")


#Descriptive function
#This function takes a list of tokens as input and calculates various statistics
#such as the total number of tokens, the number of unique tokens, the number of characters,
#and the lexical diversity. It also prints the top num_tokens most common tokens if specified.


def descriptive_stats(tokens, num_tokens = 10, verbose=True) :
    """
        Compute (and optionally print) summary statistics for a list of tokens.

        Parameters
        ----------
        tokens : list of str
            The tokens to summarise. May be empty.
        num_tokens : int
            How many of the most common tokens to print when `verbose` is true.
            Nothing is printed for the top tokens when this is 0 or negative.
        verbose : bool
            When true, print the statistics and the most common tokens.

        Returns
        -------
        list
            [number of tokens, number of unique tokens, number of characters,
            lexical diversity], where lexical diversity is unique / total and
            is reported as 0.0 for an empty token list.
    """

    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    total_chars = len("".join(tokens))
    # Guard the ratio so that an empty token list does not raise ZeroDivisionError.
    lexical_diversity = unique_tokens / total_tokens if total_tokens else 0.0

    if verbose :
        print(f"There are {total_tokens} tokens in the data.")
        print(f"There are {unique_tokens} unique tokens in the data.")
        print(f"There are {total_chars} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")

        if num_tokens > 0 :
            # Report the count actually requested rather than a fixed "ten".
            print(f"The top {num_tokens} most common tokens are:")
            top_tokens = Counter(tokens).most_common(num_tokens)
            top_token_df = pd.DataFrame(top_tokens, columns = ["Token", "Frequency"])
            print(top_token_df)

    return([total_tokens,
           unique_tokens,
           total_chars,
           lexical_diversity])





# Remove stop words  from a list of tokens

def remove_stop(tokens):
    """Return the tokens whose lowercased form is not in the stopword collection `sw`.

    The original casing of the surviving tokens is preserved; only the
    comparison is case-insensitive.
    """
    kept = []
    for token in tokens:
        if token.lower() in sw:
            continue
        kept.append(token)
    return kept

# Remove punctuation

#Removes punctuation marks from a given text.
def remove_punctuation(text, punct_set=punctuation):
    """Return `text` with every character found in `punct_set` dropped.

    `punct_set` defaults to the module-level `punctuation` collection as it
    exists when this function is defined.
    """
    surviving_chars = filter(lambda ch: ch not in punct_set, text)
    return "".join(surviving_chars)

#Remove white space
#Replaces multiple consecutive whitespace characters with a single space
def remove_whitespace(text):
    """Collapse each run of whitespace in `text` into a single space character."""
    return whitespace_pattern.sub(r" ", text)

# Tokenization
#Splits a given text into tokens using whitespace as the delimiter
def tokenize(text):
    """Split `text` on runs of whitespace and return the non-empty pieces.

    No other normalisation is applied, so tokens keep any digits or symbols
    they contain (e.g. '#hashtag' or '2A' stay intact).
    """
    pieces = whitespace_pattern.split(text)
    # Leading/trailing whitespace produces empty strings; drop them.
    return list(filter(None, pieces))

# pipeline function

#Takes a text and a list of transformation functions as input.
#It applies each function in the pipeline to the text sequentially,
#modifying the text along the way.


def prepare(text, pipeline):
    """Run `text` through each callable in `pipeline`, in order.

    The input is first converted with str(); each step then receives the
    previous step's output. The result of the last step is returned, so its
    type depends on the pipeline (a string when the pipeline is empty).
    """
    result = str(text)
    for step in pipeline:
        result = step(result)
    return result

def join_tokens(tokens):
    """Concatenate an iterable of string tokens into one space-separated string."""
    return " ".join(tokens)



#Overall,this code provides a set of utility functions for text preprocessing and
#descriptive analysis. It can be used to clean and analyze text data by applying various
#transformations and computing statistics on the processed text.


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [4]:
# Cleaning steps, applied in order to every query: lowercase, strip
# punctuation, split into tokens, drop stopwords, re-join into one string.
# NOTE(review): punctuation is stripped before stopwords are removed, so any
# stopword that contains an apostrophe can no longer match its de-punctuated
# form (tokens such as "wont" and "im" appear among the top topic terms in
# later cells) - confirm whether that is intended.
prep_pipeline = [
    str.lower,
    remove_punctuation,
    tokenize,
    remove_stop,
    join_tokens,
]
customer_queries["clean_queries"] = customer_queries["Query"].apply(
    prepare, pipeline=prep_pipeline
)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



### Fitting an LSA Model

In [5]:
# Build the TF-IDF document-term matrix used by the LSA and NMF models.

# The vectorizer is given the stopwords as a list built from `sw`.
# (Previously considered and left disabled: stopwords_list.extend(["ll", "ve"]))
stopwords_list = list(sw)

# min_df=5   -> ignore terms that appear in fewer than 5 documents
# max_df=0.7 -> ignore terms that appear in more than 70% of the documents
tfidf_text_vectorizer = TfidfVectorizer(
    stop_words=stopwords_list,
    min_df=5,
    max_df=0.7,
)
tfidf_text_vectors = tfidf_text_vectorizer.fit_transform(
    customer_queries["clean_queries"]
)

# Cell output: (number of documents, vocabulary size)
tfidf_text_vectors.shape



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



(40267, 4356)

In [6]:
# LSA via truncated SVD on the TF-IDF matrix: 4 components, fixed seed (7)
# so that the decomposition is reproducible.
lsa_model = TruncatedSVD(n_components=4, random_state=7)

# W: one row per document, one column per component (document-topic scores).
W_lsa_matrix = lsa_model.fit_transform(tfidf_text_vectors)

# H: one row per component, one column per vocabulary term (topic-term weights).
H_lsa_matrix = lsa_model.components_


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [7]:
# Count, for every LSA topic, how many documents have it as their
# highest-scoring component.
# (key = topic index) : (value = number of documents assigned to that topic)
#
# The number of buckets is taken from the width of W, so this keeps working
# if n_components is changed (a hard-coded {0, 1, 2, 3} dict would raise
# KeyError), and topics with no documents are still reported with a count of 0.
dominant_topics = np.argmax(W_lsa_matrix, axis=1)
topic_counts = np.bincount(dominant_topics, minlength=W_lsa_matrix.shape[1])
topic_dict = {topic_idx: int(count) for topic_idx, count in enumerate(topic_counts)}

topic_dict  # Print or use the topic_dict for further analysis


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



{0: 11945, 1: 19848, 2: 1745, 3: 6729}

In [8]:
# Label each query with the index of its highest-scoring LSA component.
customer_queries['LSA_topic'] = W_lsa_matrix.argmax(axis=1)

# Cross-tabulate the LSA topic against the labelled class (cell output).
customer_queries.groupby('LSA_topic')['Class'].value_counts()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



LSA_topic  Class          
0          Returns            9534
           Product Inquiry    1383
           Billing             818
           Tech. Support       210
1          Billing            9170
           Tech. Support      8068
           Product Inquiry    2481
           Returns             129
2          Tech. Support      1710
           Product Inquiry      34
           Returns               1
3          Product Inquiry    6169
           Returns             402
           Billing              81
           Tech. Support        77
Name: count, dtype: int64

In [9]:
def display_topics(model, features, no_top_words=5):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Any fitted model exposing `components_`, iterated as one weight
        vector per topic.
    features : sequence of str
        Term for each column of `components_`, indexable by column position.
    no_top_words : int
        Maximum number of terms to print per topic. If it exceeds the number
        of available terms, all terms are printed; 0 or a negative value
        prints only the topic headers.

    Each term is printed with the absolute value of its weight expressed as a
    percentage of the sum of all weights in that topic. When a topic has
    weights of mixed sign the percentages need not add up to 100.
    """
    # A negative count must not be turned into a "from the end" slice.
    top_n = max(no_top_words, 0)
    for topic, words in enumerate(model.components_):
        total = words.sum()
        # Indices sorted by descending weight; slicing (instead of indexing
        # range(no_top_words)) cannot run past the end of the vocabulary.
        largest = words.argsort()[::-1][:top_n]
        print("\nTopic %02d" % topic)
        for term_idx in largest:
            print("  %s (%2.2f)" % (
                features[term_idx],
                abs(words[term_idx]*100.0/total)))



#Overall, this code defines a function that takes a topic modeling model,
#a list of features (words), and an optional parameter for the number of top words
#to display. It then iterates over the topics in the model, calculates
#the total importance of words in each topic, sorts the words by importance,
#and prints the top words along with their weights for each topic.


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [10]:
# Print the top terms (with their relative weights) of each LSA topic.
# The LSA model was fit on the TF-IDF matrix, so the term names come from
# the TF-IDF vectorizer.
display_topics(
    lsa_model,
    tfidf_text_vectorizer.get_feature_names_out(),
)


Topic 00
  return (5.62)
  item (4.27)
  purchased (3.36)
  recently (2.11)
  process (1.92)

Topic 01
  wont (1.72)
  laptop (1.59)
  payment (1.51)
  computer (1.37)
  turn (1.27)

Topic 02
  wont (11.27)
  laptop (9.35)
  turn (8.69)
  computer (8.06)
  printer (6.58)

Topic 03
  product (34.39)
  warranty (6.00)
  come (4.84)
  policy (4.70)
  im (4.04)



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



### Fitting an LDA Model

In [11]:

# Build a raw term-count document-term matrix (the input used for LDA below).
# Same filtering as the TF-IDF vectorizer:
#   min_df=5   -> ignore terms that appear in fewer than 5 documents
#   max_df=0.7 -> ignore terms that appear in more than 70% of the documents
count_text_vectorizer = CountVectorizer(
    stop_words=stopwords_list,
    min_df=5,
    max_df=0.7,
)
count_text_vectors = count_text_vectorizer.fit_transform(
    customer_queries["clean_queries"]
)

# Cell output: (number of documents, vocabulary size)
count_text_vectors.shape


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



(40267, 4356)

In [12]:
# LDA with 4 topics and a fixed seed (7) for reproducibility.
lda_model = LatentDirichletAllocation(n_components=4, random_state=7)

# W: one row per document, one column per topic (document-topic matrix),
# obtained by fitting the model on the term-count matrix.
W_lda_matrix = lda_model.fit_transform(count_text_vectors)

# H: one row per topic, one column per vocabulary term (topic-term matrix).
H_lda_matrix = lda_model.components_


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [13]:
# Label each query with the index of its highest-weighted LDA topic.
customer_queries['LDA_topic'] = W_lda_matrix.argmax(axis=1)

# Cross-tabulate the LDA topic against the labelled class (cell output).
customer_queries.groupby('LDA_topic')['Class'].value_counts()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



LDA_topic  Class          
0          Returns            9672
           Product Inquiry    1003
           Billing             311
           Tech. Support        39
1          Tech. Support      4381
           Product Inquiry    3076
           Billing              39
           Returns              39
2          Billing            9640
           Product Inquiry     470
           Returns             252
           Tech. Support       138
3          Product Inquiry    5518
           Tech. Support      5507
           Returns             103
           Billing              79
Name: count, dtype: int64

In [14]:
# Print the top terms of each LDA topic.
# The LDA model was fit on the output of `count_text_vectorizer`, so the term
# names must come from that same vectorizer: the columns of
# `lda_model.components_` follow its vocabulary order. Using the TF-IDF
# vectorizer's names only lines up while both vectorizers happen to share
# identical settings and input.
display_topics(lda_model, count_text_vectorizer.get_feature_names_out())


Topic 00
  return (9.06)
  item (5.87)
  purchased (4.46)
  recently (3.45)
  product (2.56)

Topic 01
  wont (2.87)
  laptop (2.85)
  computer (2.41)
  turn (1.75)
  power (1.15)

Topic 02
  payment (3.54)
  please (2.43)
  billing (2.16)
  bill (1.63)
  order (1.60)

Topic 03
  product (4.57)
  im (2.70)
  printer (1.42)
  wifi (1.34)
  working (1.23)



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



 ### Fitting a Non-Negative Matrix Factorization Model

In [15]:
# NMF with 4 components and a fixed seed (7), fit on the TF-IDF matrix.
nmf_text_model = NMF(
    n_components=4,
    random_state=7,
)

# W: document-topic matrix; H: topic-term matrix.
W_text_matrix = nmf_text_model.fit_transform(tfidf_text_vectors)
H_text_matrix = nmf_text_model.components_


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [16]:
# Label each query with the index of its highest-weighted NMF topic.
customer_queries['NMF_topic'] = W_text_matrix.argmax(axis=1)

# Cross-tabulate the NMF topic against the labelled class (cell output).
customer_queries.groupby('NMF_topic')['Class'].value_counts()



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



NMF_topic  Class          
0          Returns            7973
           Product Inquiry     623
           Billing             186
           Tech. Support        81
1          Billing            9667
           Product Inquiry     842
           Returns             592
           Tech. Support       505
2          Tech. Support      9231
           Product Inquiry     781
           Returns              40
           Billing              26
3          Product Inquiry    7821
           Returns            1461
           Tech. Support       248
           Billing             190
Name: count, dtype: int64

In [17]:
# Print the top terms of each NMF topic; the model was fit on the TF-IDF
# matrix, so the term names come from the TF-IDF vectorizer.
display_topics(
    nmf_text_model,
    tfidf_text_vectorizer.get_feature_names_out(),
)


Topic 00
  return (10.22)
  item (8.25)
  purchased (6.14)
  recently (3.60)
  process (3.37)

Topic 01
  payment (2.92)
  please (1.92)
  billing (1.77)
  order (1.76)
  invoice (1.33)

Topic 02
  wont (4.32)
  laptop (3.75)
  turn (3.26)
  computer (3.24)
  printer (2.66)

Topic 03
  product (10.48)
  warranty (1.78)
  im (1.50)
  come (1.46)
  available (1.13)



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



### LDA Model Visualization

In [18]:
# Prepare the pyLDAvis data for the fitted LDA model from the term-count
# matrix and the vectorizer that produced it.
# sort_topics=False: presumably keeps the topics in the model's own order so
# the numbering matches the tables above - confirm against the pyLDAvis docs.
lda_display = pyLDAvis.lda_model.prepare(
    lda_model,
    count_text_vectors,
    count_text_vectorizer,
    sort_topics=False,
)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [19]:
# Show the prepared LDA visualization as this cell's output.
pyLDAvis.display(lda_display)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

