In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
nltk.download('vader_lexicon')
from nltk.tokenize import word_tokenize
en_stopwords = set(stopwords.words('english'))

from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import STOPWORDS

from collections import Counter

import spacy
nlp = spacy.load("en_core_web_md")
# python -m spacy download en_core_web_md
import itertools
import re
import time
import os
import pickle
from tqdm import tqdm
tqdm.pandas(position=0,leave=True)

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
sns.set(style="whitegrid",palette='muted',font_scale=1.2)
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/ec2-user/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
all_stopwords_gensim = STOPWORDS.union(set(['thank','thanks', 'you', 'help','questions','a.m.','p.m.','friday','thursday','wednesday','tuesday','monday',\
                                            'askunum','email','askunum.com','unum','askunumunum.com','day','use', 'appreciate','available','mailtoaskunumunum.com',\
                                            'hello','hi','online','?','.','. .','phone','needs','need','let','know','service','information','time','meet','client',\
                                           'team','ask','file','date','opportunity','original','benefit','eastern','specialists','specialist','attached','experienced',\
                                            'benefits insurance','employee','click','organization','httpsbit.lycjrbm',  'received', 'billing', 'manager', 'assist', \
                                            'additional', 'response','vlif']))

In [3]:
def text_preprocess(text, extract_adj=False):
    # lemma = nltk.wordnet.WordNetLemmatizer()
    
    text = str(text)
    
    #remove http links from the email
    
    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links         = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], '')  
    
    text = re.sub("`", "'", text)
    
    #fix misspelled words

    '''Here we are not actually building any complex function to correct the misspelled words but just checking that each character 
    should occur not more than 2 times in every word. It’s a very basic misspelling check.'''

    text = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))
    
    if extract_adj:
        ADJ_word=[]
        doc=nlp(text)
        for token in doc:
            if token.pos_=="ADJ":
                ADJ_word.append(token.text)   
        text=" ".join(ADJ_word)    

    # text = [appos[word] if word in appos else word for word in text.lower().split()]
    # text = " ".join(text)
    
    ### Remove stop word
    text = [i for i in word_tokenize(text) if i not in all_stopwords_gensim]
    text = " ".join(text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    #Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    text = [w.translate(table) for w in text.split()]
    text=" ".join(text)
    
    # stem
    # ps = PorterStemmer()
    # text=" ".join(set([ps.stem(w) for w in text.split()]))
    
    return text

In [4]:
input_dir="s3://trident-retention-output/"
output_dir="s3://trident-retention-output/output/"

askunum_text=pd.read_pickle(os.path.join(input_dir,"askunum_text_v1")) ## askunum_text_v1 group text by parentID and Subtype
askunum_text['Subtype'] = askunum_text['Subtype'].fillna("").astype(str).str.lower()
askunum_text["Subtype"]=askunum_text["Subtype"].progress_apply(lambda x: x.encode("latin1").decode("cp1252"))
askunum_text["Subtype"]=askunum_text["Subtype"].str.replace("/"," or ")
askunum_text["Subtype"]=askunum_text["Subtype"].str.replace("&"," and ")
askunum_text["Subtype"]=askunum_text["Subtype"].str.replace(r"\s{2,}", " ", regex=True)

100%|██████████| 2044010/2044010 [00:03<00:00, 637048.55it/s]


In [5]:
df=askunum_text.copy()

In [6]:
df.head(2)

Unnamed: 0,ParentId,Subtype,TextBody
0,5000c00001TWN6bAAH,employee coding,"unum, the following associates have been termi..."
1,5000c00001TWN9pAAH,tax question,who pays the futa and sui taxes? do you odo we...


In [7]:
output_dir="s3://trident-retention-output/output/"
df["bag_of_word"]=df["TextBody"].progress_apply(text_preprocess)

100%|██████████| 2044010/2044010 [52:48<00:00, 645.08it/s]


In [8]:
output_dir="s3://trident-retention-output/output/"
df.to_pickle(os.path.join(output_dir,"askunum_text_bagofword"))

In [None]:
# df["adj_bag_of_word"]=df["TextBody"].progress_apply(lambda x: text_preprocess(x, extract_adj=True))

In [9]:
negative_word=[]
with open("negative-words.txt") as f:
    for curline in f:
        if curline.startswith(";"):
            continue
        if curline.strip():
            negative_word.append(curline.strip())
print("There are {:,} negative words externally".format(len(negative_word)))

There are 4,783 negative words externally


In [10]:
from random import sample
print(sample(negative_word,50))

['amiss', 'adulterate', 'anomalous', 'fried', 'seethe', 'break-ups', 'critic', 'steal', 'disturbed', 'paralize', 'anxiously', 'antipathy', 'object', 'suicide', 'disquietingly', 'discombobulate', 'sleazy', 'poorly', 'simplistically', 'jarring', 'indeterminate', 'decay', 'standstill', 'retreat', 'protests', 'hideously', 'stubbornly', 'caustic', 'undissolved', 'prohibitively', 'bewitch', 'ache', 'catastrophes', 'zealot', 'sinful', 'nauseatingly', 'bombard', 'comical', 'averse', 'hard-hit', 'judders', 'nonexistent', 'bump', 'obstinate', 'inconsequently', 'dark', 'spew', 'absence', 'egocentric', 'stupify']


In [11]:
df['negative_word_counts'] = 0
for w in tqdm(negative_word, total=len(negative_word)):
    df['negative_word_counts']+=df["bag_of_word"].apply(lambda x: w in x)

100%|██████████| 4783/4783 [1:48:47<00:00,  1.36s/it]


In [12]:
output_dir="s3://trident-retention-output/output/"
df.to_pickle(os.path.join(output_dir,"askunum_text_bagofword"))