### Gensim Implementation

In [22]:
# Required Packages
# Refer to team guides for installing

import re
import pandas as pd
import numpy as np
from pprint import pprint
import spacy
from nltk.stem import WordNetLemmatizer
import nltk; nltk.download('stopwords')
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
# %matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to /Users/jihok/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Cleaning Steps

Prior to using the data sources, they must be cleaned and prepared. Two functions, utilizing regular expression, were developed and used to remove unnecessary characters found in the ServiceNow datasets. Further edits can be made to the cleaning process as needed or requested.

In [24]:
# FUNCTIONS APPLY CURRENT CLEANING STEPS INTO ENTIRE DATASET

def subject_clean(subject_line):
    """Remove all unnecessary characters from Description column for each dataset row"""
    regex_sol = re.sub(r'https?://\S+', '', subject_line) # removes URL links
    regex_sol = re.sub(r"\S*@\S*\s?", "", regex_sol) # removes email accounts
    regex_sol = regex_sol.replace("\n", "").replace("< >","").replace("\r", "") # removes newline and <> and \r
    regex_sol = re.sub(r"\d+", "", regex_sol) # removes integers 
    regex_sol = re.sub(r"([^\s\w]|_)+", " ", regex_sol) # removes non-alphanumeric characters, but maintains whitespace
    regex_sol = regex_sol.encode("ASCII", "replace").decode("utf-8").replace("?", " ") # removes all non-ASCII characters
    regex_sol = regex_sol.lower() # lower case string
    return regex_sol

def case_line_clean(case_line):
    """Remove all unnecessary characters from Case column for each dataset row"""
    regex_sol_2 = re.sub(r"[?](CS)[0-9]+", "", case_line) # removes ending "?CS###"
    regex_sol_2 = re.sub(r"\d+", "", regex_sol_2) # removes integers
    regex_sol_2 = re.sub(r"([^\s\w]|_)+", " ", regex_sol_2) # removes non-characters, but maintains whitespace
    regex_sol_2 = regex_sol_2.lower()
    return regex_sol_2

An additional removal of any instance of "Spam" and "Other Buying Inquiry" were added as part of the dataset cleaning efforts.

In [None]:
# call the dataset that pertains to you
august_dataset = pd.read_csv("SNOW August Data.csv", encoding = "ISO-8859-1")
september_dataset = pd.read_csv("SNOW September Data.csv", encoding = "ISO-8859-1")

#SPAM case removal
august_dataset = august_dataset.loc[(august_dataset['contact'] != 'Spam spam') 
                                    & (august_dataset['contact'] != 'SPAM SPAM') 
                                    & (august_dataset['resolution_code'] != 'Spam') 
                                    & (august_dataset['service_offering'] == 'Other Buying Inquiry')].reset_index(drop=True)

september_dataset = september_dataset.loc[(september_dataset['contact'] != 'Spam spam') 
                                          & (september_dataset['contact'] != 'SPAM SPAM') 
                                          & (september_dataset['resolution_code'] != 'Spam') 
                                          & (september_dataset['service_offering'] == 'Other Buying Inquiry')].reset_index(drop=True)

# Regex cleaning applied to both the "description" and "case" columns in each dataset
august_dataset['description'] = august_dataset['description'].apply(subject_clean)
september_dataset['description'] = september_dataset['description'].apply(subject_clean)

august_dataset['case'] = august_dataset['case'].apply(case_line_clean)
september_dataset['case'] = september_dataset['case'].apply(case_line_clean)

# Concat the cleaned data into one dataframe for use
# June annd July have been removed 
all_months = pd.concat([august_dataset, september_dataset]).reset_index()
# len(all_months) 631

Cases with "wav file" and "idt" were also removed as requested. Code below iterates through all_months and removes a total of 85 rows. 

In [26]:
delete_rows = []
for i in range(len(all_months)):
    curr = all_months.iloc[i]['description']
    test1 = re.findall(r"\b(wav.file)\b", curr)
    test2 = re.findall(r"\b(idt)\b", curr)
    if len(test1) > 0 or len(test2) > 0:
        delete_rows.append(i)

all_months = all_months.drop(delete_rows).reset_index(drop=True)

### Topic Modeling Work 

Resources: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

Stopwords are generated at this step in the project. The variable "domain_stop_words" is done by hand and includes words typically seen in ServiceNow cases. Additional words can be added here or from the additional_stopwords.ipynb for cleanliness. "domain_stop_words" is then added onto the downloaded stopwords from the nltk package.

In [28]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Additional stopwords
domain_stop_words = [
    'hi', 'hello', 'thank', 'thanks', 'com', 'the', 're', 'php', 'http', 'XXXXXXXXX', 'would', 'however', 
    'please', 'do', 'can', 'may', 'check', 'pende', 'tell', 'use', 'call', 'let', 'dear', 'see', 'click', 
    'still', 'unable', 'even', 'minute', 'basically', 'seem', 'expect', 'pcie', 'usd', 'go', 'could', 
    'advise', 'appreciate', 'regard', 'also', 'end', 'sure', 'copy', 'phone', 'know', 'accidently', 
    'reply', 'web', 'soon', 'regard', 'get', 'try', 'new', 'follow', 'date', 'pm' ,'back', 'note', 'us', 
    'sku', 'sincerely', 'immediately', 'notify', 'one', 'two', 'someone', 'day', 'put', 'start', 'set', 
    'reply', 'advise', 'august', 'arise', 'therewith', 'regarding', 'san', 'diego', 'uc', 'inc', 'ca',
    'go', 'able', 'say', 'like', 'wav_file', 'ref', 'monday', 'marketplace', 'try', 'time', 'use','want',
    'ucsd','pur', 'support', 'provide', 'question','darmstadt', 'germany', 'accept', 'liability','office',
    'subject', 'email','sent','confidential','attachment','say','pdf','sender', 'comments','v', 'customer',
    'services', 'abcam','kendall','square','suite','cambridge', 'usatoll','free','international','tel','fax',
    'hours','est','mon','frigoods','duties','unpaid','control','placing','agreeing','duties','applicable', 
    'wondering', 'happens', 'something', 'janelle', 'chartstream', 'needs', 'believe', 'attached', 'cce', 
    'zhu', 'rm', 'ste', 'cd', 'going', 'received', 'much', 'kind', 'regards', 'drydock', 'avenueboston', 
    'linethank', 'ab', 'from'
    ]      

# if stop_words_blk_1 is empty, cell will run lines under inside "except" 
# to add additional stopwords go to "additional_stopwords.ipynb"
try:
    # retreives additional stop words from additional_stopwords notebook
    %store -r stop_words_blk_1
    stop_words.extend(domain_stop_words+stop_words_blk_1)   
except:
    # retreives additional stop words 
    stop_words.extend(domain_stop_words)

In [30]:
print(len(stop_words))

3958


#### Tokenization of Words

The "description" column is now tokenized to have their sentences broken down and separated into a list of words. The result lists of words is stored in the variable "text_words".

In [34]:
# Dividing description into individual words aka tokenizing
alldescrip = list(all_months['description'])

#initiate blank list
text_words = []

#for loop to tokenize all words in
for sentence in alldescrip:
    temp = gensim.utils.simple_preprocess(sentence)
    text_words.append(temp)

#### Biagram Creation

Words that frequently occur together in the variable "text_words" are built using the gensim package here. 
Consult resource link above for further details.

In [36]:
# Build the bigram
bigram = gensim.models.Phrases(text_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

#### Stopword Removal, Biagram Constructed, Lemmatization Occurs

The stopwords stored earlier are now being used to remove instances of the stopwords found in the variable "text_words". A bigram model is also constructed here.

In [38]:
#Removeal of stop words
text_words_no_stops = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in text_words]

#Make texts into bigram model
text_words_no_stops_bigram = [bigram_mod[doc] for doc in text_words_no_stops]

Lemmatization process occurs here. Spacy was used in the online tutorial, but was not used for our purposes with the reason being that the results of the lemmatization being better not using spacy.

In [42]:
# Lemmatize with POS Tag
# Ex: dance and dancing == dance 
# Grouping words
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


# return POS tag for next function
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [44]:
# Lemmatizing function 
# takes in full list of descriptions with stop words removed
# returns descriptions lemmatized
def lemm(text_stop_no_bigram):
    count = 0
    out = []
    for descrip in text_stop_no_bigram:
        count += 1
        inner = []
        for word in descrip:
            pos = get_wordnet_pos(word)
            inner.append(lemmatizer.lemmatize(word, pos))
        out.append(inner)
    return out


In [46]:
# Lemmatizes 
nltk.download('wordnet')
lemmed = (lemm(text_words_no_stops_bigram))
#lemmed[0]

[nltk_data] Downloading package wordnet to /Users/jihok/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Dictionary and Corpus Creation for Topic Model

A dictionary and corpus are created as they serve as the inputs for the LDA model. Both created variables use the variable "lemmed" from previous lines of code.

In [50]:
#Create Dictionary
id2word = gensim.corpora.Dictionary(lemmed)

#Create Corpus
texts = lemmed

#Build Term Document Frequency (TDF)
corpus = [id2word.doc2bow(text) for text in texts]

In [52]:
# Read Corups/TDF per case
# Uncomment line below to observe word frequency
# WARNING: list is huge
# [[(id2word[id], freq) for id, freq in cp] for cp in corpus]

#### Building Topic Model and Visualization (Gensim)

In [54]:
# Build LDA Model
# Number of topics chosen here is 6 (changeable)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=6, 
                                           random_state=100,
                                           update_every=0,
                                           chunksize=100,
                                           passes=1,
                                           alpha='auto',
                                           per_word_topics=True)



For this particular case, 6 topics are generated and each topic (denoted with a number from 0-6) has keywords that contribute a percentage weight to the topic. \
The top 10 keywords are chosen for each topic. \
Note: Certain words appear in multiple topics (consider for future implementations).

In [55]:
# Print the Keyword in the 6 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.066*"order" + 0.023*"message" + 0.022*"product" + 0.021*"document" + '
  '0.018*"number" + 0.017*"ship" + 0.016*"purchase" + 0.015*"purchasing" + '
  '0.014*"lifespan" + 0.014*"file"'),
 (1,
  '0.105*"message" + 0.037*"inquiry" + 0.027*"update" + 0.027*"content" + '
  '0.026*"error" + 0.026*"subsidiary" + 0.026*"transmission" + 0.025*"contact" '
  '+ 0.025*"result" + 0.020*"order"'),
 (2,
  '0.045*"po" + 0.039*"order" + 0.027*"invoice" + 0.022*"contact" + '
  '0.021*"message" + 0.021*"number" + 0.017*"document" + 0.015*"payment" + '
  '0.014*"information" + 0.011*"oracle"'),
 (3,
  '0.047*"order" + 0.031*"fedex" + 0.030*"number" + 0.024*"message" + '
  '0.022*"delivery" + 0.020*"service" + 0.018*"document" + 0.015*"purchase" + '
  '0.014*"request" + 0.014*"need"'),
 (4,
  '0.079*"order" + 0.041*"message" + 0.021*"po" + 0.020*"number" + '
  '0.017*"service" + 0.017*"contact" + 0.017*"change" + 0.016*"document" + '
  '0.015*"supplier" + 0.012*"requisition_req"'),
 (5,
  '0.076*

Perplexity and Coherence scores are calculated to measure the topic model

In [56]:
# Compute Perplexity (Lower the better)
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score (Higher the beter)
coherence_model_lda = CoherenceModel(model=lda_model, texts=lemmed, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -5.1672345747669235

Coherence Score:  0.42355793021033744


We can now visualize the topics and their associated words. This was done using the pyLDAvis package downloaded early on in the project. \
Each topic is represented as a bubble on the left-hand side. We want to see larger bubbles and bubbles that do not overlap with over bubbles, if possible. \
Hovering over a bubble, the right-hand side of the visual will adjust to show the words associated with that particlar bubble/topic.

Note: Adjusting the number of topics can be done under the "Building Topic Model and Visualization (Gensim)" tab. Locate the variable "num_topics"

In [57]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis