In [1]:
import os
import re
import pandas as pd
import numpy as np
import datasets
from datasets import load_dataset, load_metric, Dataset, concatenate_datasets,DatasetDict
from datasets import load_from_disk
from tqdm import tqdm
tqdm.pandas(position=0,leave=True)
import itertools
import spacy
nlp = spacy.load("en_core_web_md")
from textblob import TextBlob
# python -m textblob.download_corpora
import string
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
nltk.download('vader_lexicon')
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import STOPWORDS

from collections import Counter

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/ec2-user/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Stop-word vocabulary consumed by text_preprocess: gensim's STOPWORDS extended
# with terms that act as boilerplate in this email corpus.
# NOTE(review): 'benefits insurance' and '. .' contain a space -- confirm they
# can ever equal a single token returned by word_tokenize.
all_stopwords_gensim = STOPWORDS.union((
    # courtesy / greeting words
    'thank', 'thanks', 'you', 'help', 'questions', 'appreciate', 'hello', 'hi',
    # times and weekdays
    'a.m.', 'p.m.', 'friday', 'thursday', 'wednesday', 'tuesday', 'monday',
    'day', 'time', 'date', 'eastern',
    # company / contact artefacts
    'askunum', 'email', 'askunum.com', 'unum', 'askunumunum.com',
    'mailtoaskunumunum.com', 'online', 'phone', 'click', 'httpsbit.lycjrbm', 'vlif',
    # punctuation tokens
    '?', '.', '. .',
    # generic support wording
    'use', 'available', 'needs', 'need', 'let', 'know', 'service', 'information',
    'meet', 'client', 'team', 'ask', 'file', 'opportunity', 'original', 'benefit',
    'specialists', 'specialist', 'attached', 'experienced', 'benefits insurance',
    'employee', 'organization', 'received', 'billing', 'manager', 'assist',
    'additional', 'response',
))

In [3]:
# Matches http/https links (with "//" or backslash separators). This is the
# original pattern, written as a raw string and compiled once instead of on
# every call. NOTE(review): inside the character class `\+-=` is a range
# ('+' through '='); kept unchanged so link matching behaves as before.
_LINK_REGEX = re.compile(r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)', re.DOTALL)

# Deletes every ASCII punctuation character; built once and reused.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def text_preprocess(text, extract_adj=False):
    """Normalise a raw email body into a space-separated string of word tokens.

    Steps, in order:
      1. remove http(s) links;
      2. turn backticks into apostrophes;
      3. cap every run of one repeated character at two (crude typo clean-up,
         e.g. "loooot" -> "loot");
      4. optionally keep only tokens spaCy tags as adjectives;
      5. drop stop words from ``all_stopwords_gensim`` (case-insensitively);
      6. replace digits and non-word characters with spaces;
      7. strip leftover punctuation (in practice only underscores remain).

    Parameters
    ----------
    text : object
        Raw text; converted with ``str``. ``None`` and float NaN are treated
        as empty text.
    extract_adj : bool, default False
        When True, only adjectives (``token.pos_ == "ADJ"``) survive step 4.

    Returns
    -------
    str
        Remaining tokens joined by single spaces, original casing preserved.
    """
    # Missing values would otherwise be stringified into the tokens "None"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # A single regex substitution removes each link exactly where it matched;
    # replacing the matched strings one by one could also eat the prefix of a
    # longer link and leave its tail behind.
    text = _LINK_REGEX.sub('', text)

    text = text.replace("`", "'")

    # Not a real spell checker: just allow at most two identical characters in a row.
    text = ''.join(''.join(run)[:2] for _, run in itertools.groupby(text))

    if extract_adj:
        text = " ".join(token.text for token in nlp(text) if token.pos_ == "ADJ")

    # The stop-word list is lower-case, so compare case-insensitively; otherwise
    # capitalised words such as "Hello" or "Thanks" slip through.
    kept = [tok for tok in word_tokenize(text) if tok.lower() not in all_stopwords_gensim]
    text = " ".join(kept)

    # Replace digits and non-word characters with spaces.
    text = re.sub(r"(\d|\W)+", " ", text)

    # Strip remaining punctuation and skip tokens that become empty, so the
    # result never contains doubled spaces.
    stripped = (word.translate(_PUNCT_TABLE) for word in text.split())
    return " ".join(word for word in stripped if word)
    
def textblob_sentiment(text):
    """Classify `text` by the sign of its TextBlob polarity.

    Returns 'positive' for polarity above zero, 'neutral' for exactly zero,
    and 'negative' otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity == 0:
        return 'neutral'
    return 'positive' if polarity > 0 else 'negative'

# Shared analyzer, created on first use. Constructing one per call (as before)
# repeats the analyzer's set-up work for every row passed through progress_apply.
_VADER_ANALYZER = None


def vader_sentiment(text):
    """Classify `text` with VADER's compound score.

    Returns 'positive' when the compound score is >= 0.05, 'negative' when it
    is <= -0.05, and 'neutral' in between.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()

    compound_score = _VADER_ANALYZER.polarity_scores(text)['compound']

    if compound_score >= 0.05:
        return 'positive'
    # The upper bound is already excluded by the branch above.
    if compound_score > -0.05:
        return 'neutral'
    return 'negative'

In [4]:
# Load the saved dataset from ./dataset/email_all, resolved against the current
# working directory, and display its structure.
email_all=load_from_disk(os.path.join(os.getcwd(),"dataset","email_all"))
email_all

DatasetDict({
    train: Dataset({
        features: ['unum_id', 'policy_id', 'Full_TextBody', 'Client_TextBody', 'Latest_TextBody', 'year', 'month', 'email_counts', 'issue_counts', 'duration', 'subtype', 'churn'],
        num_rows: 143551
    })
    test: Dataset({
        features: ['unum_id', 'policy_id', 'Full_TextBody', 'Client_TextBody', 'Latest_TextBody', 'year', 'month', 'email_counts', 'issue_counts', 'duration', 'subtype', 'churn'],
        num_rows: 25246
    })
})

In [5]:
# Pull both splits out of the DatasetDict, switch them to pandas output
# format, and materialise each one as a DataFrame.
train_data, test_data = email_all['train'], email_all['test']
train_data.set_format(type="pandas")
test_data.set_format(type="pandas")
df_train, df_test = train_data[:], test_data[:]

In [6]:
# Churn class balance of the training split: absolute counts and shares.
# rename_axis + reset_index(name=...) gives ['churn', <name>] columns on both
# pandas 1.x and 2.x. The previous rename of 'index'/'churn' relied on the
# pre-2.0 naming of value_counts() output and breaks the merge on newer pandas.
tempt1=df_train["churn"].value_counts(dropna=False).rename_axis("churn").reset_index(name="count")
tempt2=df_train["churn"].value_counts(dropna=False,normalize=True).rename_axis("churn").reset_index(name="percentage")
(
    tempt1.merge(tempt2, on="churn", how="inner")
    .style.format({'count':'{:,}','percentage':'{:.2%}'})
    .set_caption("Training set churn dist")
    .set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}])
)

Unnamed: 0,churn,count,percentage
0,0,122466,85.31%
1,1,21085,14.69%


In [7]:
# Churn class balance of the test split: absolute counts and shares.
# rename_axis + reset_index(name=...) gives ['churn', <name>] columns on both
# pandas 1.x and 2.x. The previous rename of 'index'/'churn' relied on the
# pre-2.0 naming of value_counts() output and breaks the merge on newer pandas.
tempt1=df_test["churn"].value_counts(dropna=False).rename_axis("churn").reset_index(name="count")
tempt2=df_test["churn"].value_counts(dropna=False,normalize=True).rename_axis("churn").reset_index(name="percentage")
(
    tempt1.merge(tempt2, on="churn", how="inner")
    .style.format({'count':'{:,}','percentage':'{:.2%}'})
    .set_caption("Test set churn dist")
    .set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}])
)

Unnamed: 0,churn,count,percentage
0,0,21739,86.11%
1,1,3507,13.89%


In [None]:
# Add a "bag_of_word" column (cleaned Full_TextBody) to both splits, in place.
# The recorded progress bar shows this taking over 20 minutes on the training split.
for frame in (df_train, df_test):
    frame["bag_of_word"] = frame["Full_TextBody"].progress_apply(text_preprocess)

 77%|███████▋  | 110917/143551 [23:50<07:23, 73.65it/s] 

In [None]:
# Add an "adj_bag_of_word" column to both splits, in place: the same cleaning
# as bag_of_word, but restricted to adjectives.
for frame in (df_train, df_test):
    frame["adj_bag_of_word"] = frame["Full_TextBody"].progress_apply(
        lambda body: text_preprocess(body, extract_adj=True)
    )

In [None]:
def most_common_adj(df, feature):
    """Rank the words of a preprocessed text column by document frequency.

    Each row contributes at most one count per distinct word, so the frequency
    of a word is the number of rows whose `feature` text contains it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    feature : str
        Name of a column of space-separated words (e.g. "adj_bag_of_word").

    Returns
    -------
    (tuple, tuple)
        Words and their row counts, most frequent first; two empty tuples when
        no words were found.
    """
    adj_count=Counter()
    for text in tqdm(df[feature], total=df.shape[0]):
        # Split into words first: set() on the raw string would count characters.
        adj_count.update(set(text.split()))
    if not adj_count:
        # zip(*[]) yields nothing, so the two-name unpack below would raise.
        return (), ()
    adj,freq=zip(*adj_count.most_common())
    return adj,freq

In [None]:
# Snapshot copies of both frames (not referenced again in the visible cells).
# NOTE(review): this rebinds train_data/test_data, which earlier held the Dataset splits.
train_data = df_train.copy()
test_data = df_test.copy()

# Partition each split on the churn label.
train_churn = df_train[df_train['churn'] == 1]
train_no_churn = df_train[df_train['churn'] == 0]
test_churn = df_test[df_test['churn'] == 1]
test_no_churn = df_test[df_test['churn'] == 0]

# Rank adjectives by how many churn==1 emails contain them.
adj_train_churn, freq_train_churn = most_common_adj(train_churn, feature="adj_bag_of_word")
adj_test_churn, freq_test_churn = most_common_adj(test_churn, feature="adj_bag_of_word")

In [None]:
# Snapshot copies of both frames (not referenced again in the visible cells).
# NOTE(review): this rebinds train_data/test_data, which earlier held the Dataset splits.
train_data = df_train.copy()
test_data = df_test.copy()

# Partition each split on the churn label.
train_churn = df_train[df_train['churn'] == 1]
train_no_churn = df_train[df_train['churn'] == 0]
test_churn = df_test[df_test['churn'] == 1]
test_no_churn = df_test[df_test['churn'] == 0]

In [None]:
def extract_adj(text):
    """Return the set of distinct token texts that spaCy tags as adjectives in `text`."""
    return {tok.text for tok in nlp(text) if tok.pos_ == "ADJ"}

def most_common_adj(df):
    """Rank adjectives in `df['Full_TextBody']` by document frequency.

    For every row the adjectives are extracted with `extract_adj`, cleaned
    with `text_preprocess`, de-duplicated, and counted once per row.

    NOTE(review): an earlier cell defines `most_common_adj(df, feature)`; this
    definition replaces it once this cell has run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Full_TextBody' column of raw email text.

    Returns
    -------
    (tuple, tuple)
        Adjectives and their row counts, most frequent first; two empty tuples
        when nothing was counted.
    """
    adj_count=Counter()
    for body in tqdm(df['Full_TextBody'], total=df.shape[0]):
        cleaned=text_preprocess(" ".join(extract_adj(body)))
        adj_count.update(set(cleaned.split()))
    if not adj_count:
        return (), ()
    # Rank once after counting. Inside the loop this re-sorted the whole
    # counter for every row and left adj/freq unbound for an empty frame.
    adj,freq=zip(*adj_count.most_common())
    return adj,freq

In [None]:
adj_train_churn, freq_train_churn = most_common_adj(train_churn)

In [None]:
# adj_train_no_churn, freq_train_no_churn = most_common_adj(train_no_churn)

In [None]:
adj_test_churn, freq_test_churn = most_common_adj(test_churn)

In [None]:
# adj_test_no_churn, freq_test_no_churn = most_common_adj(test_no_churn)

In [None]:
len(adj_train_churn), len(adj_test_churn)

In [None]:
# Scratch check: run extract_adj row by row over df_test and stop right after
# the row whose index label equals 1, leaving that row's adjectives in `adj_word`.
# If no row carries the label 1, the loop runs over the whole frame.
for index,row in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    adj_word=extract_adj(row['Full_TextBody'])
    if index==1:
        break

In [None]:
text=" ".join(adj_word)
text

In [None]:
text=text_preprocess(text)
text

In [None]:
set(text.split())

In [None]:
# Hand-written sample of quoted, comma-separated words, flattened onto one
# line by turning every newline into a space.
text="""
'unum',
'specialist',
'great',
'additional',
'good',
'wonderful',
'confidential',
'effective',
'new'
"""
text=text.replace("\n", " ")
text

In [None]:
# Collect the distinct tokens of `text` that spaCy tags as adjectives.
doc = nlp(text)
ADJ_word = {token.text for token in doc if token.pos_ == "ADJ"}

ADJ_word

In [None]:
## data preprocessing
# NOTE(review): the same bag_of_word assignment for df_test appears in an earlier
# cell; running both repeats the full text_preprocess pass over the test split.
df_test["bag_of_word"]=df_test["Full_TextBody"].progress_apply(text_preprocess)

### Test Set

In [None]:
df_test["bag_of_word"]=df_test["Full_TextBody"].progress_apply(text_preprocess)

In [None]:
def extract_adj(text):
    """Return the set of distinct token texts that spaCy tags as adjectives in `text`."""
    return {tok.text for tok in nlp(text) if tok.pos_ == "ADJ"}

# Count, for each adjective, how many churn==1 test emails contain it, then
# print the 50 most frequent ones.
adj_count=Counter()
tempt_test=df_test[df_test["churn"]==1]
for index,row in tqdm(tempt_test.iterrows(), total=tempt_test.shape[0]):
    adj_word=extract_adj(row['Full_TextBody'])
    adj_count.update(adj_word)
# Rank once, after counting. Inside the loop the ranking was recomputed for
# every row and adj/freq stayed undefined when no row matched.
top_adj=adj_count.most_common(50)
adj,freq=zip(*top_adj) if top_adj else ((),())
for i ,j in zip(adj,freq):
    print("{:<20}{:<20,}".format(i,j))

In [None]:
# tempt=df_test.copy()
# tempt["set_word"]=tempt["Full_TextBody"].progress_apply(lambda x: set(x.split()))
# tempt["terminate"]=tempt["set_word"].progress_apply(lambda x: 1 if set(["i'll"]).issubset(x) else 0 )
# tempt[tempt["terminate"]==1]["Full_TextBody"].iloc[2]

In [None]:
# Flag test emails by the words found in their whitespace-split body.
# NOTE(review): the subset test sets the flag only when BOTH "termination" and
# "terminate" occur as exact, case-sensitive tokens -- confirm "either" was not intended.
tempt = df_test.copy()
tempt["set_word"] = tempt["Full_TextBody"].progress_apply(lambda body: set(body.split()))
tempt["terminate"] = tempt["set_word"].progress_apply(
    lambda words: int({"termination", "terminate"} <= words)
)
tempt["terminate"].value_counts()

In [None]:
tempt[(tempt["terminate"]==1) & (tempt["churn"]==0)]["Full_TextBody"].iloc[2]

In [None]:
# Churned test emails only. Take an explicit copy: later cells add columns to
# `tempt`, and assigning into a filtered slice of df_test triggers pandas'
# SettingWithCopy behaviour (silenced here by the global warnings filter).
tempt=df_test[df_test["churn"]==1].copy()
tempt.shape

In [None]:
tempt.head(2)

In [None]:
# Clean every email body in `tempt` into a bag-of-words string.
# NOTE(review): an earlier cell already stores "bag_of_word" on df_test, which
# `tempt` is filtered from -- this presumably recomputes the same values; confirm.
tempt["bag_of_word"]=tempt["Full_TextBody"].progress_apply(text_preprocess)
tempt.head(2)

In [None]:
# Same cleaning, restricted to adjectives (extract_adj=True).
# NOTE(review): an earlier cell already stores "adj_bag_of_word" on df_test --
# presumably a recomputation; confirm.
tempt["adj_bag_of_word"]=tempt["Full_TextBody"].progress_apply(lambda x: text_preprocess(x, extract_adj=True))
tempt.head(2)

In [None]:
# TextBlob polarity label computed on the adjective-only text.
# NOTE(review): the input has been reduced to adjectives with stop words removed,
# so negations such as "not" are presumably gone before scoring -- confirm this is intended.
tempt["sentiment"]=tempt["adj_bag_of_word"].progress_apply(textblob_sentiment)
tempt.head(2)

In [None]:
# VADER compound-score label computed on the same adjective-only text.
# NOTE(review): punctuation and non-adjective words are stripped before scoring;
# confirm that scoring the reduced text rather than the raw body is intended.
tempt["vader_sentiment"]=tempt["adj_bag_of_word"].progress_apply(vader_sentiment)
tempt.head(2)

In [None]:
tempt.sentiment.value_counts()

In [None]:
tempt.vader_sentiment.value_counts()

In [None]:
# Vocabulary of adjectives appearing in test emails that VADER labelled negative.
tempt_test = tempt[tempt['vader_sentiment'] == "negative"]
neg_word_test = set()
for adj_text in tqdm(tempt_test["adj_bag_of_word"], total=tempt_test.shape[0]):
    neg_word_test.update(adj_text.split())
neg_word_test

In [None]:
tempt_test["Latest_TextBody"].iloc[16]

In [None]:
# text='active'
# TextBlob(text).sentiment.polarity

# from textblob import TextBlob

# testimonial = TextBlob("active")
# print(testimonial.sentiment)

# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# analyzer = SentimentIntensityAnalyzer()
# sentence = "The food was terrible!" 
# vs = analyzer.polarity_scores(sentence)
# print("{:-<65} {}".format(sentence, str(vs)))

In [None]:
# !pip install --quiet flair
# from flair.models import TextClassifier
# from flair.data import Sentence

# classifier = TextClassifier.load('en-sentiment')
# sentence = Sentence('The food was great!')
# classifier.predict(sentence)

# # print sentence with predicted labels
# print('Sentence above is: ', sentence.labels)

### Training set

In [None]:
# Churned training emails only. Take an explicit copy: later cells add columns
# to `tempt`, and assigning into a filtered slice of df_train triggers pandas'
# SettingWithCopy behaviour (silenced here by the global warnings filter).
tempt=df_train[df_train["churn"]==1].copy()
tempt.shape

In [None]:
# Clean every email body in `tempt` into a bag-of-words string.
# NOTE(review): an earlier cell already stores "bag_of_word" on df_train, which
# `tempt` is filtered from -- this presumably recomputes the same values; confirm.
tempt["bag_of_word"]=tempt["Full_TextBody"].progress_apply(text_preprocess)
tempt.head(2)

In [None]:
# Same cleaning, restricted to adjectives (extract_adj=True).
# NOTE(review): an earlier cell already stores "adj_bag_of_word" on df_train --
# presumably a recomputation; confirm.
tempt["adj_bag_of_word"]=tempt["Full_TextBody"].progress_apply(lambda x: text_preprocess(x, extract_adj=True))
tempt.head(2)

In [None]:
# TextBlob polarity label computed on the adjective-only text.
# NOTE(review): the input has been reduced to adjectives with stop words removed,
# so negations such as "not" are presumably gone before scoring -- confirm this is intended.
tempt["sentiment"]=tempt["adj_bag_of_word"].progress_apply(textblob_sentiment)
tempt.head(2)

In [None]:
# VADER compound-score label computed on the same adjective-only text.
# NOTE(review): punctuation and non-adjective words are stripped before scoring;
# confirm that scoring the reduced text rather than the raw body is intended.
tempt["vader_sentiment"]=tempt["adj_bag_of_word"].progress_apply(vader_sentiment)
tempt.head(2)

In [None]:
tempt.sentiment.value_counts()

In [None]:
tempt.vader_sentiment.value_counts()

In [None]:
# Keep only the rows of `tempt` that VADER labelled "negative" and preview them.
tempt_train=tempt[tempt['vader_sentiment']=="negative"]
tempt_train.head()

In [None]:
# Vocabulary of adjectives appearing in the rows of tempt_train.
neg_word = set()
for adj_text in tqdm(tempt_train["adj_bag_of_word"], total=tempt_train.shape[0]):
    neg_word.update(adj_text.split())
neg_word