In [9]:
# Imports and one-time setup for the whole notebook.
import os
import re
import pandas as pd
import numpy as np
import datasets
from datasets import load_dataset, load_metric, Dataset, concatenate_datasets,DatasetDict
from datasets import load_from_disk
from tqdm import tqdm
# Register tqdm with pandas; the cells below rely on `progress_apply`.
tqdm.pandas(position=0,leave=True)
import itertools
import spacy
# Load the spaCy English pipeline once; `nlp` is used below for part-of-speech tagging.
nlp = spacy.load("en_core_web_md")
from textblob import TextBlob
# TextBlob corpora have to be installed separately: python -m textblob.download_corpora
import string
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Fetch the NLTK resources (stopword list, punkt tokenizer, word list, VADER lexicon).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
nltk.download('vader_lexicon')
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import STOPWORDS

from collections import Counter

import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# pandas SettingWithCopyWarning and deprecation notices — confirm that is intended.
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/ec2-user/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# gensim's built-in stop words extended with corpus-specific tokens (greetings,
# weekday names, company / mailbox names, boilerplate from email signatures).
# NOTE(review): multi-word entries such as "benefits insurance" and ". ." are compared
# against single tokens in text_preprocess, so they presumably never match — confirm.
all_stopwords_gensim = STOPWORDS.union({
    "thank", "thanks", "you", "help", "questions", "a.m.", "p.m.",
    "friday", "thursday", "wednesday", "tuesday", "monday",
    "askunum", "email", "askunum.com", "unum", "askunumunum.com",
    "day", "use", "appreciate", "available", "mailtoaskunumunum.com",
    "hello", "hi", "online", "?", ".", ". .", "phone", "needs", "need",
    "let", "know", "service", "information", "time", "meet", "client",
    "team", "ask", "file", "date", "opportunity", "original", "benefit",
    "eastern", "specialists", "specialist", "attached", "experienced",
    "benefits insurance", "employee", "click", "organization",
    "httpsbit.lycjrbm", "received", "billing", "manager", "assist",
    "additional", "response",
})

# Lower-case contraction -> expansion lookup used by text_preprocess.
# Keys are matched against whole whitespace-separated tokens.
# Each key appears exactly once: a repeated key in a dict literal silently
# overrides the earlier entry.
appos = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    # "'d" is expanded to "would" for every pronoun in this table.
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "i have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    # Only applies when "'re" shows up as a token of its own.
    "'re": " are",
    "wasn't": "was not",
    # Was " will", which dropped the pronoun.
    "we'll": "we will",
}


In [3]:
# Matches http(s) links so they can be stripped from the email text.
# NOTE(review): inside the character class, `\+-=` is a range from "+" to "="
# (so it also admits e.g. "," and "<"); kept as-is to preserve the original matching.
_LINK_REGEX = re.compile(r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)', re.DOTALL)


def text_preprocess(text, extract_adj=False):
    """Normalise a raw email body into a space-separated bag of unique stems.

    Steps, in order: strip http(s) links, turn backticks into apostrophes,
    cap every run of a repeated character at two, lower-case and expand
    contractions via ``appos``, drop tokens found in ``all_stopwords_gensim``,
    replace digits / non-word characters with spaces, strip remaining
    punctuation, Porter-stem, and de-duplicate.

    Parameters
    ----------
    text : any
        Input text; it is converted with ``str()`` first.
    extract_adj : bool, default False
        When True, run the spaCy pipeline ``nlp`` on the processed text and
        return only the tokens tagged ``ADJ``.

    Returns
    -------
    str
        Unique stems joined by single spaces, in order of first occurrence
        (or only the adjectives among them when ``extract_adj`` is True).
    """
    text = str(text)

    # Remove http links from the email. Substituting on the regex removes each
    # match where it occurs, so a short link that is a prefix of a longer one
    # cannot leave a fragment of the longer link behind.
    text = _LINK_REGEX.sub('', text)

    text = text.replace("`", "'")

    # Very basic misspelling clean-up: no character may occur more than twice
    # in a row (e.g. "soooo" -> "soo"). No real spell checking is done.
    text = ''.join(''.join(run)[:2] for _, run in itertools.groupby(text))

    # Expand contractions on whole whitespace-separated, lower-cased tokens.
    text = " ".join(appos.get(word, word) for word in text.lower().split())

    # Remove stop words.
    text = " ".join(tok for tok in word_tokenize(text) if tok not in all_stopwords_gensim)

    # Remove special characters and digits.
    text = re.sub(r"(\d|\W)+", " ", text)

    # Remove punctuation; after the step above only "_" can still be present,
    # because "\W" does not match it.
    table = str.maketrans('', '', string.punctuation)
    text = " ".join(w.translate(table) for w in text.split())

    # Stem, then de-duplicate while keeping first-occurrence order. A plain
    # set() would give an arbitrary order that can change between kernel runs.
    ps = PorterStemmer()
    text = " ".join(dict.fromkeys(ps.stem(w) for w in text.split()))

    if extract_adj:
        # Keep only the tokens spaCy tags as adjectives.
        return " ".join(token.text for token in nlp(text) if token.pos_ == "ADJ")

    return text
    
def textblob_sentiment(text):
    """Label `text` by the sign of its TextBlob polarity.

    Returns 'positive' for a polarity above zero, 'neutral' for exactly
    zero, and 'negative' otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity == 0:
        return 'neutral'
    return 'positive' if polarity > 0 else 'negative'

def vader_sentiment(text):
    """Label `text` from the VADER compound score.

    Returns 'positive' when the compound score is >= 0.05, 'neutral' when it
    lies strictly between -0.05 and 0.05, and 'negative' otherwise.

    The analyzer is created on the first call and cached on the function, so
    applying this row by row does not rebuild it for every row.
    """
    analyzer = getattr(vader_sentiment, "_analyzer", None)
    if analyzer is None:
        analyzer = vader_sentiment._analyzer = SentimentIntensityAnalyzer()

    compound_score = analyzer.polarity_scores(text)['compound']

    # set sentiment
    if compound_score >= 0.05:
        return 'positive'
    if compound_score > -0.05:
        return 'neutral'
    return 'negative'

In [92]:
# Load the saved dataset from <current working directory>/dataset/email_all.
email_all_dir = os.path.join(os.getcwd(), "dataset", "email_all")
email_all = load_from_disk(email_all_dir)
email_all

DatasetDict({
    train: Dataset({
        features: ['Full_TextBody', 'Client_TextBody', 'Latest_TextBody', 'year', 'churn'],
        num_rows: 156414
    })
    test: Dataset({
        features: ['Full_TextBody', 'Client_TextBody', 'Latest_TextBody', 'year', 'churn'],
        num_rows: 27497
    })
})

In [5]:
# Materialise each split as a pandas DataFrame.
# set_format(type="pandas") switches the split's output format in place;
# slicing with [:] then returns all of its rows.
train_data = email_all["train"]
train_data.set_format(type="pandas")
df_train = train_data[:]

test_data = email_all["test"]
test_data.set_format(type="pandas")
df_test = test_data[:]

In [6]:
# Churn class balance of the training set: absolute counts and shares.
# rename_axis(...) + reset_index(name=...) pins the column names explicitly.
# Renaming the default 'index' / 'churn' columns depends on the pandas version:
# from pandas 2.0 on, value_counts() already names them 'churn' / 'count',
# and that rename produces two 'count' columns, so the merge fails.
tempt1 = (
    df_train["churn"]
    .value_counts(dropna=False)
    .rename_axis("churn")
    .reset_index(name="count")
)
tempt2 = (
    df_train["churn"]
    .value_counts(dropna=False, normalize=True)
    .rename_axis("churn")
    .reset_index(name="percentage")
)
(
    tempt1.merge(tempt2, on="churn", how="inner")
    .style.format({'count': '{:,}', 'percentage': '{:.2%}'})
    .set_caption("Training set churn dist")
    .set_table_styles([{'selector': 'caption', 'props': [('color', 'red'), ('font-size', '15px')]}])
)

Unnamed: 0,churn,count,percentage
0,0,133904,85.61%
1,1,22510,14.39%


In [7]:
# Churn class balance of the test set: absolute counts and shares.
# rename_axis(...) + reset_index(name=...) pins the column names explicitly.
# Renaming the default 'index' / 'churn' columns depends on the pandas version:
# from pandas 2.0 on, value_counts() already names them 'churn' / 'count',
# and that rename produces two 'count' columns, so the merge fails.
tempt1 = (
    df_test["churn"]
    .value_counts(dropna=False)
    .rename_axis("churn")
    .reset_index(name="count")
)
tempt2 = (
    df_test["churn"]
    .value_counts(dropna=False, normalize=True)
    .rename_axis("churn")
    .reset_index(name="percentage")
)
(
    tempt1.merge(tempt2, on="churn", how="inner")
    .style.format({'count': '{:,}', 'percentage': '{:.2%}'})
    .set_caption("Test set churn dist")
    .set_table_styles([{'selector': 'caption', 'props': [('color', 'red'), ('font-size', '15px')]}])
)

Unnamed: 0,churn,count,percentage
0,0,23786,86.50%
1,1,3711,13.50%


### Test Set

In [8]:
# Add a `bag_of_word` column to df_test by running text_preprocess over every
# Full_TextBody value (the recorded run took about five minutes).
df_test["bag_of_word"]=df_test["Full_TextBody"].progress_apply(text_preprocess)

100%|██████████| 27497/27497 [04:57<00:00, 92.54it/s] 


In [19]:
def extract_adj(text):
    """Return the set of distinct token texts in `text` that spaCy tags as ADJ."""
    return {token.text for token in nlp(text) if token.pos_ == "ADJ"}

# Count, over the churned test emails, in how many emails each adjective occurs
# (extract_adj returns a set, so an adjective counts at most once per email).
adj_count=Counter()
tempt_test=df_test[df_test["churn"]==1]
for body in tqdm(tempt_test["Full_TextBody"], total=tempt_test.shape[0]):
    adj_count.update(extract_adj(body))

# Rank once after the loop. Doing this inside the loop re-sorted the counter
# for every row and raised on unpacking while the counter was still empty.
top_adjectives = adj_count.most_common(50)
adj, freq = (zip(*top_adjectives) if top_adjectives else ((), ()))
for i, j in top_adjectives:
    print("{:<20}{:<20,}".format(i,j))

eastern             3,674               
unum                3,101               
additional          2,748               
great               2,622               
confidential        2,059               
further             1,841               
effective           1,702               
new                 1,647               
helpful             1,585               
intended            1,581               
privileged          1,544               
available           1,474               
current             1,420               
able                1,407               
other               1,381               
future              1,250               
below               1,249               
happy               1,228               
recent              1,186               
nice                1,185               
sure                1,180               
wonderful           1,164               
more                1,066               
next                1,024               
voluntary       

In [93]:
# Flag test emails whose whitespace-split Full_TextBody contains the exact words
# "termination" and "terminate".
# NOTE(review): issubset requires BOTH words to be present; if "either word" was
# intended the condition should be an intersection test — confirm.
tempt = df_test.copy()
tempt["set_word"] = tempt["Full_TextBody"].progress_apply(lambda body: set(body.split()))
tempt["terminate"] = tempt["set_word"].progress_apply(
    lambda words: int({"termination", "terminate"}.issubset(words))
)
tempt["terminate"].value_counts()

100%|██████████| 27497/27497 [00:03<00:00, 7282.75it/s]
100%|██████████| 27497/27497 [00:00<00:00, 703596.82it/s]


0    25130
1     2367
Name: terminate, dtype: int64

In [94]:
# Show the third email that is flagged `terminate` but labelled as non-churn.
# NOTE(review): this displays a raw email body; consider clearing the cell output
# before sharing the notebook, since it can contain personal data.
flagged_not_churned = (tempt["terminate"] == 1) & (tempt["churn"] == 0)
tempt.loc[flagged_not_churned, "Full_TextBody"].iloc[2]

"no problem looping in ask unum as they will be able to assist with this. ask unum please terminate the below mentioned employee and reply back to all once complete. employee new hire jennifer calvert signed up last month for voluntary life 00406969-0001. i believe it was to begin 4/1/2021. but ms. calvert is now no longer employed with marywood nursing care center effective 4/6/2021. i tried to terminate her in the employee changes online but she was not listed yet. could you please terminate her policy? no trees were destroyed in the sending of this e-mail. however, a great number of electrons were terribly inconvenienced..we have completed this termination request for samantha, with a termination effective date of 04/06/2021. the group will not be billed any additional premium for their coverage. please do let us know if there is anything further we may assist you with. have a great day we appreciate the opportunity to meet your benefit needs. , 8 a.m. to 8 p.m. eastern time..i sent

In [20]:
# Churned rows of the test set. Take an explicit copy: new columns are
# assigned to `tempt` in the following cells, and assigning into a filtered
# slice of df_test triggers pandas' SettingWithCopyWarning.
tempt=df_test[df_test["churn"]==1].copy()
tempt.shape

(3711, 6)

In [21]:
# Preview the first two rows of `tempt`.
tempt.head(2)

Unnamed: 0,Full_TextBody,Client_TextBody,Latest_TextBody,year,churn,bag_of_word
1,our ask unum team should be able to provide yo...,our ask unum team should be able to provide yo...,"on tue, apr 6, 2021 at 128 pm wrote thanks for...",2022,1,able provide schedule ill send census december...
23,"the secure message expires on sep 24, 2021 032...","the secure message expires on sep 24, 2021 032...",please process the below mentioned and attache...,2022,1,secure message expires sep pm gmt reply notifi...


In [22]:
# Run text_preprocess over Full_TextBody and store the result as `bag_of_word`.
# NOTE(review): df_test was given a `bag_of_word` column in an earlier cell and
# `tempt` is filtered from df_test, so this looks like a recomputation of an
# existing column — confirm whether it is needed.
tempt["bag_of_word"]=tempt["Full_TextBody"].progress_apply(text_preprocess)
tempt.head(2)

100%|██████████| 3711/3711 [00:46<00:00, 79.46it/s] 


Unnamed: 0,Full_TextBody,Client_TextBody,Latest_TextBody,year,churn,bag_of_word
1,our ask unum team should be able to provide yo...,our ask unum team should be able to provide yo...,"on tue, apr 6, 2021 at 128 pm wrote thanks for...",2022,1,able provide schedule ill send census december...
23,"the secure message expires on sep 24, 2021 032...","the secure message expires on sep 24, 2021 032...",please process the below mentioned and attache...,2022,1,secure message expires sep pm gmt reply notifi...


In [23]:
# Same preprocessing with extract_adj=True: text_preprocess then returns only
# the tokens spaCy tags as ADJ, joined by spaces.
tempt["adj_bag_of_word"]=tempt["Full_TextBody"].progress_apply(lambda x: text_preprocess(x, extract_adj=True))
tempt.head(2)

100%|██████████| 3711/3711 [04:08<00:00, 14.91it/s]


Unnamed: 0,Full_TextBody,Client_TextBody,Latest_TextBody,year,churn,bag_of_word,adj_bag_of_word
1,our ask unum team should be able to provide yo...,our ask unum team should be able to provide yo...,"on tue, apr 6, 2021 at 128 pm wrote thanks for...",2022,1,able provide schedule ill send census december...,able separate unum comprehensive voluntary com...
23,"the secure message expires on sep 24, 2021 032...","the secure message expires on sep 24, 2021 032...",please process the below mentioned and attache...,2022,1,secure message expires sep pm gmt reply notifi...,secure open open rehire rehire effective compl...


In [24]:
# Label every row with textblob_sentiment, computed on the adjective-only text.
tempt["sentiment"]=tempt["adj_bag_of_word"].progress_apply(textblob_sentiment)
tempt.head(2)

100%|██████████| 3711/3711 [00:01<00:00, 2633.76it/s]


Unnamed: 0,Full_TextBody,Client_TextBody,Latest_TextBody,year,churn,bag_of_word,adj_bag_of_word,sentiment
1,our ask unum team should be able to provide yo...,our ask unum team should be able to provide yo...,"on tue, apr 6, 2021 at 128 pm wrote thanks for...",2022,1,able provide schedule ill send census december...,able separate unum comprehensive voluntary com...,positive
23,"the secure message expires on sep 24, 2021 032...","the secure message expires on sep 24, 2021 032...",please process the below mentioned and attache...,2022,1,secure message expires sep pm gmt reply notifi...,secure open open rehire rehire effective compl...,positive


In [25]:
# Label every row with vader_sentiment, computed on the adjective-only text.
tempt["vader_sentiment"]=tempt["adj_bag_of_word"].progress_apply(vader_sentiment)
tempt.head(2)

100%|██████████| 3711/3711 [00:21<00:00, 171.43it/s]


Unnamed: 0,Full_TextBody,Client_TextBody,Latest_TextBody,year,churn,bag_of_word,adj_bag_of_word,sentiment,vader_sentiment
1,our ask unum team should be able to provide yo...,our ask unum team should be able to provide yo...,"on tue, apr 6, 2021 at 128 pm wrote thanks for...",2022,1,able provide schedule ill send census december...,able separate unum comprehensive voluntary com...,positive,positive
23,"the secure message expires on sep 24, 2021 032...","the secure message expires on sep 24, 2021 032...",please process the below mentioned and attache...,2022,1,secure message expires sep pm gmt reply notifi...,secure open open rehire rehire effective compl...,positive,positive


In [26]:
# Distribution of the TextBlob-based labels.
tempt.sentiment.value_counts()

positive    3484
neutral      167
negative      60
Name: sentiment, dtype: int64

In [27]:
# Distribution of the VADER-based labels.
tempt.vader_sentiment.value_counts()

positive    3472
neutral      185
negative      54
Name: vader_sentiment, dtype: int64

In [28]:
# Union of all adjectives appearing in the rows that VADER labelled negative.
tempt_test = tempt[tempt['vader_sentiment'] == "negative"]
neg_word_test = set()
for adjectives in tqdm(tempt_test["adj_bag_of_word"], total=tempt_test.shape[0]):
    neg_word_test.update(adjectives.split())
neg_word_test

100%|██████████| 54/54 [00:00<00:00, 19433.07it/s]


{'able',
 'accrue',
 'accurate',
 'active',
 'actual',
 'additional',
 'allocate',
 'annual',
 'applicable',
 'appropriate',
 'automatic',
 'average',
 'aware',
 'backup',
 'basic',
 'broken',
 'brown',
 'certain',
 'civil',
 'complete',
 'confidential',
 'convenient',
 'convertible',
 'correct',
 'corridor',
 'covenant',
 'criminal',
 'critical',
 'current',
 'daily',
 'deepest',
 'delete',
 'dental',
 'dependent',
 'detailed',
 'different',
 'difficult',
 'digital',
 'disconnected',
 'div',
 'double',
 'due',
 'effective',
 'electronic',
 'eligibility',
 'eligible',
 'eoi',
 'ex',
 'exact',
 'exclusive',
 'expedite',
 'faster',
 'flat',
 'flexible',
 'free',
 'future',
 'general',
 'great',
 'green',
 'gross',
 'happy',
 'hard',
 'helpful',
 'hesitate',
 'hourly',
 'immediate',
 'important',
 'incorrect',
 'individual',
 'informational',
 'initial',
 'instant',
 'internal',
 'josvany',
 'julian',
 'kayleigh',
 'large',
 'lesser',
 'limited',
 'locked',
 'long',
 'minimum',
 'mistaken

In [46]:
# Inspect Latest_TextBody of the row at position 16 of `tempt_test`.
# NOTE(review): `tempt_test` is assigned in more than one cell above; which
# subset this shows depends on the cell that ran last — confirm.
tempt_test["Latest_TextBody"].iloc[16]

"on thu, may 20, 2021 at 114 pm wrote i hope this email finds you doing well. we have been asked to send you a copy of the w-9 from unum. attached please find the document. please let us know if there is anything else we can help you with. we appreciate the opportunity to meet your benefit needs. , 8 a.m. to 8 p.m. eastern time..thank you for submitting wyatt barnes's evidence of insurability eoi application. we will begin processing your eoi form in the order received. this request will be reviewed within 5 business days. however, if additional information is needed, processing may be delayed. for town of smyrna, you can use unums custom employer link to submit eoi online. here are the basics 1. initial setup unum creates your custom access code and provides it to you. please provide the eoi link and access code to your benefit admin/tech partner if you would like it to display on your enrollment system. 2. you provide the eoi link and access code to employees that require eoi. this l

In [None]:
# text='active'
# TextBlob(text).sentiment.polarity

# from textblob import TextBlob

# testimonial = TextBlob("active")
# print(testimonial.sentiment)

# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# analyzer = SentimentIntensityAnalyzer()
# sentence = "The food was terrible!" 
# vs = analyzer.polarity_scores(sentence)
# print("{:-<65} {}".format(sentence, str(vs)))

In [None]:
# !pip install --quiet flair
# from flair.models import TextClassifier
# from flair.data import Sentence

# classifier = TextClassifier.load('en-sentiment')
# sentence = Sentence('The food was great!')
# classifier.predict(sentence)

# # print sentence with predicted labels
# print('Sentence above is: ', sentence.labels)

### Training Set

In [None]:
# Churned rows of the training set. Take an explicit copy: new columns are
# assigned to `tempt` in the following cells, and assigning into a filtered
# slice of df_train triggers pandas' SettingWithCopyWarning.
tempt=df_train[df_train["churn"]==1].copy()
tempt.shape

In [None]:
# Run text_preprocess over Full_TextBody and store the result as `bag_of_word`.
tempt["bag_of_word"]=tempt["Full_TextBody"].progress_apply(text_preprocess)
tempt.head(2)

In [None]:
# Same preprocessing with extract_adj=True: text_preprocess then returns only
# the tokens spaCy tags as ADJ, joined by spaces.
tempt["adj_bag_of_word"]=tempt["Full_TextBody"].progress_apply(lambda x: text_preprocess(x, extract_adj=True))
tempt.head(2)

In [None]:
# Label every row with textblob_sentiment, computed on the adjective-only text.
tempt["sentiment"]=tempt["adj_bag_of_word"].progress_apply(textblob_sentiment)
tempt.head(2)

In [None]:
# Label every row with vader_sentiment, computed on the adjective-only text.
tempt["vader_sentiment"]=tempt["adj_bag_of_word"].progress_apply(vader_sentiment)
tempt.head(2)

In [None]:
# Distribution of the TextBlob-based labels.
tempt.sentiment.value_counts()

In [None]:
# Distribution of the VADER-based labels.
tempt.vader_sentiment.value_counts()

In [None]:
# Keep only the rows that VADER labelled negative and preview them.
tempt_train=tempt[tempt['vader_sentiment']=="negative"]
tempt_train.head()

In [None]:
# Union of all adjectives appearing in the VADER-negative rows of `tempt_train`.
neg_word = set()
for adjectives in tqdm(tempt_train["adj_bag_of_word"], total=tempt_train.shape[0]):
    neg_word.update(adjectives.split())
neg_word