In [43]:
import time
import os
import re
import pandas as pd
import numpy as np
import datasets
from datasets import load_dataset, load_metric, Dataset, concatenate_datasets,DatasetDict
from datasets import load_from_disk
from tqdm import tqdm
tqdm.pandas(position=0,leave=True)
import itertools
import spacy
nlp = spacy.load("en_core_web_md")
from textblob import TextBlob
# python -m textblob.download_corpora
import string
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
nltk.download('vader_lexicon')
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import STOPWORDS

from collections import Counter

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/ec2-user/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
all_stopwords_gensim = STOPWORDS.union(set(['thank','thanks', 'you', 'help','questions','a.m.','p.m.','friday','thursday','wednesday','tuesday','monday',\
                                            'askunum','email','askunum.com','unum','askunumunum.com','day','use', 'appreciate','available','mailtoaskunumunum.com',\
                                            'hello','hi','online','?','.','. .','phone','needs','need','let','know','service','information','time','meet','client',\
                                           'team','ask','file','date','opportunity','original','benefit','eastern','specialists','specialist','attached','experienced',\
                                            'benefits insurance','employee','click','organization','httpsbit.lycjrbm',  'received', 'billing', 'manager', 'assist', \
                                            'additional', 'response','vlif']))

In [3]:
def text_preprocess(text, extract_adj=False):
    # lemma = nltk.wordnet.WordNetLemmatizer()
    
    text = str(text)
    
    #remove http links from the email
    
    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links         = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], '')  
    
    text = re.sub("`", "'", text)
    
    #fix misspelled words

    '''Here we are not actually building any complex function to correct the misspelled words but just checking that each character 
    should occur not more than 2 times in every word. It’s a very basic misspelling check.'''

    text = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))
    
    if extract_adj:
        ADJ_word=[]
        doc=nlp(text)
        for token in doc:
            if token.pos_=="ADJ":
                ADJ_word.append(token.text)   
        text=" ".join(ADJ_word)    

    # text = [appos[word] if word in appos else word for word in text.lower().split()]
    # text = " ".join(text)
    
    ### Remove stop word
    text = [i for i in word_tokenize(text) if i not in all_stopwords_gensim]
    text = " ".join(text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    #Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    text = [w.translate(table) for w in text.split()]
    text=" ".join(text)
    
    # stem
    # ps = PorterStemmer()
    # text=" ".join(set([ps.stem(w) for w in text.split()]))
    
    return text
    
def textblob_sentiment(text):
    pol_score = TextBlob(text).sentiment.polarity
    if pol_score > 0: 
        return 'positive'
    elif pol_score == 0: 
        return 'neutral'
    else: 
        return 'negative'

def vader_sentiment(text):
    
    senti = SentimentIntensityAnalyzer()
    compound_score = senti.polarity_scores(text)['compound']
    
    # set sentiment 
    if compound_score >= 0.05: 
        return 'positive'
    elif (compound_score > -0.05) and (compound_score < 0.05): 
        return 'neutral'
    else: 
        return 'negative'

In [4]:
email_all=load_from_disk(os.path.join(os.getcwd(),"dataset","email_all"))
email_all

DatasetDict({
    train: Dataset({
        features: ['unum_id', 'policy_id', 'Full_TextBody', 'Client_TextBody', 'Latest_TextBody', 'year', 'month', 'email_counts', 'issue_counts', 'duration', 'subtype', 'churn'],
        num_rows: 143551
    })
    test: Dataset({
        features: ['unum_id', 'policy_id', 'Full_TextBody', 'Client_TextBody', 'Latest_TextBody', 'year', 'month', 'email_counts', 'issue_counts', 'duration', 'subtype', 'churn'],
        num_rows: 25246
    })
})

In [5]:
train_data=email_all['train']
test_data=email_all['test']
train_data.set_format(type="pandas")
df_train=train_data[:]
test_data.set_format(type="pandas")
df_test=test_data[:]

In [6]:
tempt1=pd.DataFrame(df_train["churn"].value_counts(dropna=False)).reset_index().rename(columns={'index':'churn','churn':'count'})
tempt2=pd.DataFrame(df_train["churn"].value_counts(dropna=False,normalize=True)).reset_index().rename(columns={'index':'churn','churn':'percentage'})
tempt1.merge(tempt2, on="churn", how="inner").style.format({'count':'{:,}','percentage':'{:.2%}'}).set_caption("Training set churn dist")\
.set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}])

Unnamed: 0,churn,count,percentage
0,0,122466,85.31%
1,1,21085,14.69%


In [7]:
tempt1=pd.DataFrame(df_test["churn"].value_counts(dropna=False)).reset_index().rename(columns={'index':'churn','churn':'count'})
tempt2=pd.DataFrame(df_test["churn"].value_counts(dropna=False,normalize=True)).reset_index().rename(columns={'index':'churn','churn':'percentage'})
tempt1.merge(tempt2, on="churn", how="inner").style.format({'count':'{:,}','percentage':'{:.2%}'}).set_caption("Test set churn dist")\
.set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}])

Unnamed: 0,churn,count,percentage
0,0,21739,86.11%
1,1,3507,13.89%


In [8]:
df_train["bag_of_word"]=df_train["Full_TextBody"].progress_apply(text_preprocess)
df_test["bag_of_word"]=df_test["Full_TextBody"].progress_apply(text_preprocess)

100%|██████████| 143551/143551 [30:52<00:00, 77.47it/s] 
100%|██████████| 25246/25246 [06:10<00:00, 68.11it/s] 


In [None]:
df_train["adj_bag_of_word"]=df_train["Full_TextBody"].progress_apply(lambda x: text_preprocess(x, extract_adj=True))
df_test["adj_bag_of_word"]=df_test["Full_TextBody"].progress_apply(lambda x: text_preprocess(x, extract_adj=True))

  9%|▉         | 13010/143551 [43:21<6:51:33,  5.29it/s] 

In [23]:
df_train.head()

Unnamed: 0,unum_id,policy_id,Full_TextBody,Client_TextBody,Latest_TextBody,year,month,email_counts,issue_counts,duration,subtype,churn,bag_of_word,adj_bag_of_word
0,158757547,403324,. . you have received an encrypted message fro...,. . you have received an encrypted message fro...,. . you have received an encrypted message fro...,2020,1,12,11,10.671863,Enrollment Submission,0,encrypted message abd insurance financial serv...,financial confidential addressee financial con...
1,122323585,604582,". . hello, could we please get the eoi form th...",". . hello, could we please get the eoi form th...","re 604581,604582 hello melissa, . for amplify ...",2020,7,3,3,3.384468,EOI Submission,0,eoi form needed guarantee issue melissa meliss...,eoi direct main electronic initial eoi eoi eoi...
2,791399,217586,name sonya marsh phone 804-249-5412 response r...,name sonya marsh phone 804-249-5412 response r...,"re 217586 hi sonya, . i needed to correct the ...",2019,3,1,1,5.015174,Employee Coding,0,sonya marsh requested e mail policy division c...,mail annual mistake re incorrect effective rec...
3,611560,933183,. . this message was sent securely using zix h...,. . this message was sent securely using zix h...,"re r0364752, 933183, r0091074 greetings lisa, ...",2021,5,1,2,1.124016,GPC â Client Request,0,message sent securely zix federal process advi...,federal federal critical new effective senior ...
4,635710940,658785,". . good afternoon, please process the attache...",". . good afternoon, please process the attache...",". . good morning, this members enrollment is s...",2020,2,20,19,19.037338,Employee Coding,1,good afternoon process termination member ivan...,good effective complete individual financial s...


In [None]:
# my_folder="s3://trident-retention-output/"
# df_train.to_pickle(os.path.join(my_folder,"df_train"))
# df_test.to_pickle(os.path.join(my_folder,"df_test"))

In [52]:
my_folder="s3://trident-retention-output/"
start=time.time()
df_train=pd.read_pickle(os.path.join(my_folder,"df_train"))
df_test=pd.read_pickle(os.path.join(my_folder,"df_test"))
end=time.time()
print("It took {:0.4f} seconds to read data".format(end-start))        

It took 169.2008 seconds to read data


In [53]:
## removing non-english words from text
words = set(nltk.corpus.words.words())
df_train["adj_bag_of_word"] = df_train["adj_bag_of_word"].progress_apply(lambda x: " ".join(w for w in nltk.wordpunct_tokenize(x) if w.lower() in words ))
df_test["adj_bag_of_word"] = df_test["adj_bag_of_word"].progress_apply(lambda x: " ".join(w for w in nltk.wordpunct_tokenize(x) if w.lower() in words ))

df_train["bag_of_word"] = df_train["bag_of_word"].progress_apply(lambda x: " ".join(w for w in nltk.wordpunct_tokenize(x) if w.lower() in words ))
df_test["bag_of_word"] = df_test["bag_of_word"].progress_apply(lambda x: " ".join(w for w in nltk.wordpunct_tokenize(x) if w.lower() in words ))

100%|██████████| 143551/143551 [00:03<00:00, 36956.53it/s]
100%|██████████| 25246/25246 [00:00<00:00, 33228.29it/s]
100%|██████████| 143551/143551 [00:35<00:00, 4092.11it/s]
100%|██████████| 25246/25246 [00:07<00:00, 3261.95it/s]


In [57]:
## removing short text
df_train["adj_bag_of_word"] = df_train["adj_bag_of_word"].progress_apply(lambda x: " ".join(w for w in x.split() if len(w)>3) )
df_test["adj_bag_of_word"] = df_test["adj_bag_of_word"].progress_apply(lambda x: " ".join(w for w in x.split() if len(w)>3))

df_train["bag_of_word"] = df_train["bag_of_word"].progress_apply(lambda x: " ".join(w for w in x.split() if len(w)>3))
df_test["bag_of_word"] = df_test["bag_of_word"].progress_apply(lambda x: " ".join(w for w in x.split() if len(w)>3))

100%|██████████| 143551/143551 [00:01<00:00, 102790.12it/s]
100%|██████████| 25246/25246 [00:00<00:00, 97284.49it/s] 
100%|██████████| 143551/143551 [00:08<00:00, 17755.73it/s]
100%|██████████| 25246/25246 [00:01<00:00, 15407.38it/s]


In [59]:
def most_common_word(df,feature):
    word_count=Counter()
    for index,row in tqdm(df.iterrows(), total=df.shape[0]):
        word_count.update(set(row[feature].split()))
    word,freq=zip(*word_count.most_common())
    return word,freq

In [60]:
train_data=df_train.copy()
test_data=df_test.copy()

train_churn,  train_no_churn=df_train[df_train['churn']==1], df_train[df_train['churn']==0]
test_churn,  test_no_churn=df_test[df_test['churn']==1], df_test[df_test['churn']==0]

adj_train_churn, freq_train_churn = most_common_word(train_churn, feature="adj_bag_of_word")
adj_test_churn, freq_test_churn = most_common_word(test_churn, feature="adj_bag_of_word")
adj_train_no_churn, freq_train_no_churn = most_common_word(train_no_churn, feature="adj_bag_of_word")
adj_test_no_churn, freq_test_no_churn = most_common_word(test_no_churn, feature="adj_bag_of_word")

100%|██████████| 21085/21085 [00:01<00:00, 18917.83it/s]
100%|██████████| 3507/3507 [00:00<00:00, 19004.25it/s]
100%|██████████| 122466/122466 [00:06<00:00, 19715.37it/s]
100%|██████████| 21739/21739 [00:01<00:00, 19612.70it/s]


In [66]:
keyword_training=[w for w in adj_train_churn if w not in adj_train_no_churn]
keyword_test=[w for w in adj_test_churn if w not in adj_test_no_churn]

dict_data={}
dict_data["training"]=keyword_training[0:50]
dict_data["test"]=keyword_test[0:50]
pd.DataFrame(dict_data).style.format().set_caption("Most common adjective keywords with churn==1")\
.set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}])

Unnamed: 0,training,test
0,topical,incumbent
1,hundredth,liquid
2,forbidden,sizeable
3,unmasked,bord
4,faith,dependable
5,lifetime,billed
6,running,choose
7,shiver,impartial
8,quorum,fixed
9,volatile,sorted


In [68]:
word_train_churn, freq_train_churn = most_common_word(train_churn, feature="bag_of_word")
word_test_churn, freq_test_churn = most_common_word(test_churn, feature="bag_of_word")
word_train_no_churn, freq_train_no_churn = most_common_word(train_no_churn, feature="bag_of_word")
word_test_no_churn, freq_test_no_churn = most_common_word(test_no_churn, feature="bag_of_word")

keyword_training=[w for w in word_train_churn if w not in word_train_no_churn]
keyword_test=[w for w in word_test_churn if w not in word_test_no_churn]

dict_data={}
dict_data["training"]=keyword_training[0:50]
dict_data["test"]=keyword_test[0:50]
pd.DataFrame(dict_data).style.format().set_caption("Most common keywords with churn==1")\
.set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}])

Unnamed: 0,training,test
0,subbing,enduring
1,hornblower,paternoster
2,elderwood,matin
3,topical,never
4,towline,sheth
5,allude,survival
6,steamboat,korona
7,gratuitously,sizeable
8,soja,compounding
9,futurism,rainer


In [94]:
def check_word(df,word):
    tempt=df.copy()
    tempt["check_word"]=tempt["bag_of_word"].progress_apply(lambda x: 1 if len(set(word).intersection(set(x.split())))!=0 else 0 )
    tempt=tempt[tempt["check_word"]==1]
    tempt.drop(columns=['check_word'],inplace=True)
    return tempt

In [95]:
tempt_train=check_word(df_train,word=["termination","terminate"])
tempt_train["churn"].value_counts()

100%|██████████| 143551/143551 [00:05<00:00, 27842.31it/s]


0    36089
1     7156
Name: churn, dtype: int64

In [98]:
tempt_train=check_word(df_train,word=["frustration","frustrated","frustrate","unacceptable","apologies","apologize"])
tempt_train["churn"].value_counts()

100%|██████████| 143551/143551 [00:05<00:00, 27425.34it/s]


0    24345
1     4907
Name: churn, dtype: int64

In [75]:
tempt=tempt[(tempt["terminate"]==1) & (tempt["churn"]==0)]
tempt.shape

(2355, 15)

In [84]:
tempt["churn"].iloc[3]

0

In [83]:
tempt["Latest_TextBody"].iloc[3]

"re 459285 hi lela, thank you for your call today i have attached the claim form for long term disability, per your request. please let me know if there is anything further that i can assist you with. have a great day thank you, dayna shaw service specialist associate client success organization 1-800-ask-unum 1-800-275-8686 askunumunum.com unum covid-19 response - how to file a claim online - . , ..re 459285 hi lela, i hope this email finds you well and thank you for your call i have attached this group's ltd contract, per your request. i have also attached separately just the portion of the contract that outlines when ltd benefits will end. please let me know if you have any questions. have a great day thank you, dayna shaw service specialist associate client success organization 1-800-ask-unum 1-800-275-8686 askunumunum.com unum covid-19 response - how to file a claim online - . , ..re 459285 inclusiv, inc hi scott, the opa lela ragbar called into ask unum today and is interested in

In [102]:
tempt=df_test.copy()
tempt=tempt.sample(100)
tempt.head(2)

Unnamed: 0,unum_id,policy_id,Full_TextBody,Client_TextBody,Latest_TextBody,year,month,email_counts,issue_counts,duration,subtype,churn,bag_of_word,adj_bag_of_word
498,1112335487,425717,hi can you please pull the 5/11 feed for me? t...,hi can you please pull the 5/11 feed for me? t...,hi thanks the attached looks off.let me know y...,2022,1,16,13,6.043507,Data Feed,0,pull feed thanks good morning feed success cov...,good good better quick accurate better quick a...
12784,617539,125781,. . i would like to have access to make on lin...,. . i would like to have access to make on lin...,"re 125781 hi joe, i hope this email finds you ...",2022,6,1,1,0.004433,"Add, Remove, or Update user access",0,like access line payment policy policy number ...,official intended confidential electronic conf...


In [114]:
tempt.columns

Index(['unum_id', 'policy_id', 'Full_TextBody', 'Client_TextBody',
       'Latest_TextBody', 'year', 'month', 'email_counts', 'issue_counts',
       'duration', 'subtype', 'churn', 'bag_of_word', 'adj_bag_of_word',
       'year_month_col'],
      dtype='object')

In [116]:
tempt=df_test.copy()
tempt=tempt.sample(100)
start = time.time()
dict_of_dataframes = dict()
tempt["year_month_col"]=pd.to_datetime(tempt.apply(lambda x: str(x['year'])+'-' + str(x['month']) ,axis=1))
counter = 0 
for id, group in tempt.groupby('policy_id'):
    print(time.time()-start, counter) if counter % 30 == 0 else None
    group = group.sort_values("year_month_col")
    group = group.set_index("year_month_col")
    dict_of_dataframes[id] = group
    counter += 1
dict_of_dataframes[0]

0.0034177303314208984 0
0.0226590633392334 30
0.042584896087646484 60
0.05961441993713379 90


KeyError: 0

In [119]:
dict_of_dataframes[24085]

Unnamed: 0_level_0,unum_id,policy_id,Full_TextBody,Client_TextBody,Latest_TextBody,year,month,email_counts,issue_counts,duration,subtype,churn,bag_of_word,adj_bag_of_word
year_month_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-01-01,710145,24085,"hi matthew, they are a rate hold for 2021 thei...","hi matthew, they are a rate hold for 2021 thei...","re 910703 hi matthew, thank you for contacting...",2022,1,3,5,3.66059,Renewal Inquiry,0,rate hold possible renewal send rate sheet cov...,possible best black grey grey great great sorr...


In [None]:
train_data=df_train.copy()
test_data=df_test.copy()

train_churn,  train_no_churn=df_train[df_train['churn']==1], df_train[df_train['churn']==0]
test_churn,  test_no_churn=df_test[df_test['churn']==1], df_test[df_test['churn']==0]

In [None]:
def extract_adj(text):
    ADJ_word=set()
    doc=nlp(text)
    for token in doc:
        if token.pos_=="ADJ":
            ADJ_word.add(token.text)
    return ADJ_word

def most_common_adj(df):
    adj_count=Counter()
    for index,row in tqdm(df.iterrows(), total=df.shape[0]):
        adj_word=extract_adj(row['Full_TextBody'])
        text=" ".join(adj_word)
        text=text_preprocess(text)
        adj_word=set(text.split())
        adj_count.update(adj_word)
        adj,freq=zip(*adj_count.most_common())
    return adj,freq

In [None]:
adj_train_churn, freq_train_churn = most_common_adj(train_churn)

In [None]:
# adj_train_no_churn, freq_train_no_churn = most_common_adj(train_no_churn)

In [None]:
adj_test_churn, freq_test_churn = most_common_adj(test_churn)

In [None]:
# adj_test_no_churn, freq_test_no_churn = most_common_adj(test_no_churn)

In [None]:
len(adj_train_churn), len(adj_test_churn)

In [None]:
for index,row in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    adj_word=extract_adj(row['Full_TextBody'])
    if index==1:
        break

In [None]:
text=" ".join(adj_word)
text

In [None]:
text=text_preprocess(text)
text

In [None]:
set(text.split())

In [None]:
text="""
'unum',
'specialist',
'great',
'additional',
'good',
'wonderful',
'confidential',
'effective',
'new'
"""
text=" ".join(text.split("\n"))
text

In [None]:
ADJ_word=set()
doc=nlp(text)
for token in doc:
    if token.pos_=="ADJ":
        ADJ_word.add(token.text)
        
ADJ_word

In [None]:
## data preprocessing
df_test["bag_of_word"]=df_test["Full_TextBody"].progress_apply(text_preprocess)

### Test Set

In [None]:
df_test["bag_of_word"]=df_test["Full_TextBody"].progress_apply(text_preprocess)

In [None]:
def extract_adj(text):
    ADJ_word=set()
    doc=nlp(text)
    for token in doc:
        if token.pos_=="ADJ":
            ADJ_word.add(token.text)
    return ADJ_word

adj_count=Counter()
tempt_test=df_test[df_test["churn"]==1]
for index,row in tqdm(tempt_test.iterrows(), total=tempt_test.shape[0]):
    adj_word=extract_adj(row['Full_TextBody'])
    adj_count.update(adj_word)
    adj,freq=zip(*adj_count.most_common(50))
for i ,j in zip(adj,freq):
    print("{:<20}{:<20,}".format(i,j))

In [None]:
# tempt=df_test.copy()
# tempt["set_word"]=tempt["Full_TextBody"].progress_apply(lambda x: set(x.split()))
# tempt["terminate"]=tempt["set_word"].progress_apply(lambda x: 1 if set(["i'll"]).issubset(x) else 0 )
# tempt[tempt["terminate"]==1]["Full_TextBody"].iloc[2]

In [None]:
tempt=df_test.copy()
tempt["set_word"]=tempt["Full_TextBody"].progress_apply(lambda x: set(x.split()))
tempt["terminate"]=tempt["set_word"].progress_apply(lambda x: 1 if set(["termination","terminate"]).issubset(x) else 0 )
tempt["terminate"].value_counts()

In [None]:
tempt[(tempt["terminate"]==1) & (tempt["churn"]==0)]["Full_TextBody"].iloc[2]

In [None]:
tempt=df_test[df_test["churn"]==1]
tempt.shape

In [None]:
tempt.head(2)

In [None]:
tempt["bag_of_word"]=tempt["Full_TextBody"].progress_apply(text_preprocess)
tempt.head(2)

In [None]:
tempt["adj_bag_of_word"]=tempt["Full_TextBody"].progress_apply(lambda x: text_preprocess(x, extract_adj=True))
tempt.head(2)

In [None]:
tempt["sentiment"]=tempt["adj_bag_of_word"].progress_apply(textblob_sentiment)
tempt.head(2)

In [None]:
tempt["vader_sentiment"]=tempt["adj_bag_of_word"].progress_apply(vader_sentiment)
tempt.head(2)

In [None]:
tempt.sentiment.value_counts()

In [None]:
tempt.vader_sentiment.value_counts()

In [None]:
tempt_test=tempt[tempt['vader_sentiment']=="negative"]
neg_word_test=set()
for index,row in tqdm(tempt_test.iterrows(), total=tempt_test.shape[0]):
    neg_word_test=neg_word_test | set(row["adj_bag_of_word"].split())
neg_word_test

In [None]:
tempt_test["Latest_TextBody"].iloc[16]

In [None]:
# text='active'
# TextBlob(text).sentiment.polarity

# from textblob import TextBlob

# testimonial = TextBlob("active")
# print(testimonial.sentiment)

# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# analyzer = SentimentIntensityAnalyzer()
# sentence = "The food was terrible!" 
# vs = analyzer.polarity_scores(sentence)
# print("{:-<65} {}".format(sentence, str(vs)))

In [None]:
# !pip install --quiet flair
# from flair.models import TextClassifier
# from flair.data import Sentence

# classifier = TextClassifier.load('en-sentiment')
# sentence = Sentence('The food was great!')
# classifier.predict(sentence)

# # print sentence with predicted labels
# print('Sentence above is: ', sentence.labels)

### Training set

In [None]:
tempt=df_train[df_train["churn"]==1]
tempt.shape

In [None]:
tempt["bag_of_word"]=tempt["Full_TextBody"].progress_apply(text_preprocess)
tempt.head(2)

In [None]:
tempt["adj_bag_of_word"]=tempt["Full_TextBody"].progress_apply(lambda x: text_preprocess(x, extract_adj=True))
tempt.head(2)

In [None]:
tempt["sentiment"]=tempt["adj_bag_of_word"].progress_apply(textblob_sentiment)
tempt.head(2)

In [None]:
tempt["vader_sentiment"]=tempt["adj_bag_of_word"].progress_apply(vader_sentiment)
tempt.head(2)

In [None]:
tempt.sentiment.value_counts()

In [None]:
tempt.vader_sentiment.value_counts()

In [None]:
tempt_train=tempt[tempt['vader_sentiment']=="negative"]
tempt_train.head()

In [None]:
neg_word=set()
for index,row in tqdm(tempt_train.iterrows(), total=tempt_train.shape[0]):
    neg_word=neg_word | set(row["adj_bag_of_word"].split())
neg_word

In [120]:
import itertools

In [122]:
list(itertools.chain([1,2,3],[5,6,7]))

[1, 2, 3, 5, 6, 7]

In [125]:
negative_word=[]
with open("negative-words.txt") as f:
    for curline in f:
        if curline.startswith(";"):
            continue
        if curline.strip():
            negative_word.append(curline.strip())
len(negative_word)

4783