In [1]:
import re
import pandas as pd
import numpy as np
from pprint import pprint
import spacy
from nltk.stem import WordNetLemmatizer
import nltk; nltk.download('stopwords')
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
# %matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

#test

[nltk_data] Downloading package stopwords to /Users/jihok/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Run cells prior to adding additional stop words

In [2]:
# FUNCTIONS APPLY CURRENT CLEANING STEPS INTO ENTIRE DATASET

def subject_clean(subject_line):
    regex_sol = re.sub(r'https?://\S+', '', subject_line) # removes URL links
    regex_sol = re.sub(r"\S*@\S*\s?", "", regex_sol) # removes email accounts
    regex_sol = regex_sol.replace("\n", "").replace("< >","").replace("\r", "") # removes newline and <> and \r
    regex_sol = re.sub(r"\d+", "", regex_sol) # removes integers 
    regex_sol = re.sub(r"([^\s\w]|_)+", " ", regex_sol) # removes non-alphanumeric characters, but maintains whitespace
    regex_sol = regex_sol.encode("ASCII", "replace").decode("utf-8").replace("?", " ") # removes all non-ASCII characters
    regex_sol = regex_sol.lower() # lower case string
    return regex_sol

def case_line_clean(case_line):
    regex_sol_2 = re.sub(r"[?](CS)[0-9]+", "", case_line) # removes ending "?CS###"
    regex_sol_2 = re.sub(r"\d+", "", regex_sol_2) # removes integers
    regex_sol_2 = re.sub(r"([^\s\w]|_)+", " ", regex_sol_2) # removes non-characters, but maintains whitespace
    regex_sol_2 = regex_sol_2.lower()
    return regex_sol_2

# call the dataset that pertains to you
august_dataset = pd.read_csv("SNOW August Data.csv", encoding = "ISO-8859-1")
september_dataset = pd.read_csv("SNOW September Data.csv", encoding = "ISO-8859-1")

#SPAM case removal
august_dataset = august_dataset.loc[(august_dataset['contact'] != 'Spam spam') 
                                    & (august_dataset['contact'] != 'SPAM SPAM') 
                                    & (august_dataset['resolution_code'] != 'Spam') 
                                    & (august_dataset['service_offering'] == 'Other Buying Inquiry')].reset_index(drop=True)

september_dataset = september_dataset.loc[(september_dataset['contact'] != 'Spam spam') 
                                          & (september_dataset['contact'] != 'SPAM SPAM') 
                                          & (september_dataset['resolution_code'] != 'Spam') 
                                          & (september_dataset['service_offering'] == 'Other Buying Inquiry')].reset_index(drop=True)

# Regex cleaning applied to both the "description" and "case" columns in each dataset
august_dataset['description'] = august_dataset['description'].apply(subject_clean)
september_dataset['description'] = september_dataset['description'].apply(subject_clean)

august_dataset['case'] = august_dataset['case'].apply(case_line_clean)
september_dataset['case'] = september_dataset['case'].apply(case_line_clean)

# print(len(august_dataset)) 421
# print(len(september_dataset)) 210

# Concat the cleaned data into one dataframe for use
# June annd July have been removed 
all_months = pd.concat([august_dataset, september_dataset]).reset_index()
# len(all_months) 631

In [3]:
# Removes cases where "wav file" and "idt" exists
# Current iteration removes 85 rows from all_months dataset
delete_rows = []
for i in range(len(all_months)):
    curr = all_months.iloc[i]['description']
    test1 = re.findall(r"\b(wav.file)\b", curr)
    test2 = re.findall(r"\b(idt)\b", curr)
    if len(test1) > 0 or len(test2) > 0:
        delete_rows.append(i)

all_months = all_months.drop(delete_rows).reset_index(drop=True)

In [4]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Additional stopwords
domain_stop_words = [
    'hi', 'hello', 'thank', 'thanks', 'com', 'the', 're', 'php', 'http', 'XXXXXXXXX', 'would', 'however', 
    'please', 'do', 'can', 'may', 'check', 'pende', 'tell', 'use', 'call', 'let', 'dear', 'see', 'click', 
    'still', 'unable', 'even', 'minute', 'basically', 'seem', 'expect', 'pcie', 'usd', 'go', 'could', 
    'advise', 'appreciate', 'regard', 'also', 'end', 'sure', 'copy', 'phone', 'know', 'accidently', 
    'reply', 'web', 'soon', 'regard', 'get', 'try', 'new', 'follow', 'date', 'pm' ,'back', 'note', 'us', 
    'sku', 'sincerely', 'immediately', 'notify', 'one', 'two', 'someone', 'day', 'put', 'start', 'set', 
    'reply', 'advise', 'august', 'arise', 'therewith', 'regarding', 'san', 'diego', 'uc', 'inc', 'ca',
    'go', 'able', 'say', 'like', 'wav_file', 'ref', 'monday', 'marketplace', 'try', 'time', 'use','want',
    'ucsd','pur', 'support', 'provide', 'question','darmstadt', 'germany', 'accept', 'liability','office',
    'subject', 'email','sent','confidential','attachment','say','pdf','sender', 'comments','v', 'customer',
    'services', 'abcam','kendall','square','suite','cambridge', 'usatoll','free','international','tel','fax',
    'hours','est','mon','frigoods','duties','unpaid','control','placing','agreeing','duties','applicable', 
    'wondering', 'happens', 'something', 'janelle', 'chartstream', 'needs', 'believe', 'attached', 'cce', 
    'zhu', 'rm', 'ste', 'cd', 'going', 'received', 'much', 'kind', 'regards', 'drydock', 'avenueboston', 
    'linethank', 'ab', 'from'
    ]      
stop_words.extend(domain_stop_words)
# stop_words.extend(domain_stop_words + curr_list + more_stop_words)
print(len(stop_words))

337


In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [6]:
# Dividing description into individual words aka tokenizing
alldescrip = list(all_months['description'])

#initiate blank list
text_words = []

#for loop to tokenize all words in
for sentence in alldescrip:
    temp = gensim.utils.simple_preprocess(sentence)
    text_words.append(temp)

In [7]:
# Build the bigram
bigram = gensim.models.Phrases(text_words, min_count=5, threshold=100)

#Removeal of stop words
text_words_no_stops = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in text_words]

#Make texts into bigram model
text_words_no_stops_bigram = [bigram[doc] for doc in text_words_no_stops]


data_words = [item for sublist in text_words_no_stops_bigram for item in sublist]

# remove stop words first before applying
from nltk.probability import FreqDist
fdist = FreqDist(data_words)

# uncomment line below to see list of word frequencies
# fdist.most_common()

### Start Adding Additional Stopwords Here:

In [9]:
# manual additional stopwords
manual_stop_words = [
    'team', 'details','possible','priviledged', 'milliporesigma', 'merck','kgaa', 'must', 
    'disclose', 'disclaimer','spanish', 'german_french', 'portuguese','versions', 
    'contacting', 'milliporesigma','name','dr','item','pmto','list','center','items',
    'style','aug','via','usa','today', 'original','thereto','merck','best','california',
    'days','using','rights, reserved','miltenyi','biotec','amto','llc','wrote','link',
    'way','airgas','px','bu','msd','qiagen','actual','well','care','ml','already','color',
    'wed','th','colleen','thermo, fisher','style', 'font','aug','ea','addgene','mrs',
    'gilman','asap','looking','www','br','br','size','px','shclng','trcn','ph','cc','per',
    'non','ltd'
    ]

In [None]:
# conditional stopwords added here
add_stopwords = []
for i in fdist.most_common():
    # add/adjust conditionals here
    if i[1] < 5:
        add_stopwords.append(i[0])
print(len(add_stopwords))
add_stopwords

conditional_stopwords = []
for i in add_stopwords:
    if "_" in i:
        curr = i.split("_")
        for x in curr:
            conditional_stopwords.append(x)
    else:
        conditional_stopwords.append(i)
conditional_stopwords

In [11]:
# stop_words_blk_1 is comprised of both the manual and conditional list of stopwords
stop_words_blk_1 = conditional_stopwords + manual_stop_words
print(len(stop_words_blk_1))

3620


In [12]:
# stores variable in IPython db to allow other notebooks access to the variable
# Run after any new words are added
%store stop_words_blk_1

Stored 'stop_words_blk_1' (list)


In [13]:
# run to remove stored variable 
# %store -z