In [120]:
import argparse
import time
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
from nltk.tokenize import word_tokenize
from functools import reduce
en_stopwords = set(stopwords.words('english')) 
import itertools
import re
import os
import pickle

import datasets
from datasets import load_dataset, load_metric, Dataset, concatenate_datasets
from datasets import load_from_disk
from tqdm import tqdm
tqdm.pandas(position=0,leave=True)

import textwrap

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/ec2-user/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [121]:
argparser = argparse.ArgumentParser("Data Preprocessing") 
argparser.add_argument('--year', type=str, default="unique_unum_v1")
args,_ = argparser.parse_known_args()

print(args)

my_folder="s3://trident-retention-output/"
folder = 's3://trident-retention-data/askunum/'


Namespace(year='unique_unum_v1')


In [3]:
def load_askunum_df(folder, year, usecols=None, nrows=None): 
    if year == 2018: # ['ID', 'PARENTID', 'PARENT.CREATEDDATE', 'PARENT.CLOSEDDATE']
        askunum_df = pd.read_csv(folder + 'askunum_2018.csv', encoding='latin-1', usecols=usecols, nrows=nrows)
       
    if year == 2019: 
        askunum_df = pd.concat([pd.read_csv(folder + 'askunum_2019_{}.csv'.format(i), encoding='latin-1', usecols=usecols, nrows=nrows) for i in range(1, 4)]) 
        
    if year == 2020:  
        askunum_df = pd.concat([pd.read_csv(folder + 'unnested_2020_{}_customer.csv'.format(i), encoding='latin-1', usecols=usecols, nrows=nrows) for i in range(10)])

    if year == 2021: 
        askunum_df = pd.concat([pd.read_csv(folder + 'unnested_2021_{}_customer.csv'.format(i), encoding='latin-1', usecols=usecols, nrows=nrows) for i in range(10)]) 
        
    if year == 2022: 
        askunum_df = pd.concat([pd.read_csv(folder + 'askunum_2022_{}.csv'.format(i), encoding='latin-1', usecols=usecols, nrows=nrows) for i in range(0, 4)])
        
    return askunum_df

In [None]:
args.year=2022
year=args.year

In [None]:
if year in [2018, 2019]: 
    idx = 'ID'
    parent_id = 'PARENTID'
    text_body = 'TEXTBODY'
    created_date = 'PARENT.CREATEDDATE'
    closed_date = 'PARENT.CLOSEDDATE'
    Incoming='INCOMING'
    subtype= 'PARENT.SUB_TYPE_TXT__C' 
    message_date= 'MESSAGEDATE'

if year in [2020, 2021]: 
    idx = 'Id'
    parent_id = 'ParentId'
    text_body = 'TextBody'
    created_date = 'CreatedDate' 
    closed_date = 'ClosedDate'
    Incoming='Incoming'
    subtype= 'SUB_TYPE_TXT__c'
    message_date= 'MessageDate'

if year in [2022]:
    idx = 'Id'
    parent_id = 'ParentId'
    text_body = 'TextBody'
    created_date = 'Parent.CreatedDate'  
    closed_date = 'Parent.ClosedDate'
    Incoming='Incoming'
    subtype= 'Parent.SUB_TYPE_TXT__c'
    message_date= 'MessageDate'

askunum_text = load_askunum_df(folder, year, usecols = [idx, parent_id, text_body, created_date, closed_date, Incoming, subtype,message_date], nrows=5)

In [None]:
askunum_text.head(2)

In [None]:
df=pd.DataFrame({"id":[1,2],"text":["Hello, word. I am chadadf ok nice.","It is a nice adafred day. ok nice, I know dfadad. nice."]})
df

In [None]:
## removing non-english words from text
words = set(nltk.corpus.words.words())
df['text'] = df['text'].apply(lambda x: " ".join(w for w in nltk.wordpunct_tokenize(x) if w.lower() in words or not w.isalpha()))

In [None]:
df

In [None]:
my_folder="s3://trident-retention-output/"
file_name="unique_unum_v1"
unique_unum_id=pd.read_csv(os.path.join(my_folder,file_name+".csv"))
unique_unum_id.head()

In [None]:
year=2022
my_folder="s3://trident-retention-output/"
df=pd.read_csv(os.path.join(my_folder,f"askunum_textbody_{year}"+".csv"))
df.head(2)

In [4]:
askunum_text=pd.DataFrame()
for year in [2018,2019,2020,2021,2022]:
    new_data=pd.read_csv(os.path.join(my_folder,f"askunum_textbody_{year}"+".csv"))
    askunum_text=pd.concat([askunum_text,new_data])
    # print(askunum_text.shape)
    print(f"{:<25}{:<25,}".format(year,new_data.shape[0]))

(1023585, 13)
(2173519, 13)
(3348090, 13)
(4493050, 13)
(5016545, 13)


In [7]:
print("{:<15}{:<20,}".format(year,new_data.shape[0]))

2022           523,495             


In [8]:
askunum_text.head(2)

Unnamed: 0.1,Unnamed: 0,Id,ParentId,Incoming,MessageDate,TextBody,CreatedDate,ClosedDate,Subtype,year,month,account_id,unum_id
0,0,02s0c00001KQIDDAA5,5000c00001iYULlAAO,True,2018-12-27T17:51:02.000Z,"please see attached, the form showing that cry...",2018-12-27 17:51:24+00:00,2018-12-28T19:18:23.000Z,Enrollment Submission,2018,12,001a000001lKDBx,440668861.0
1,1,02s0c00001KQg0hAAD,5000c00001iYULlAAO,False,2018-12-28T19:16:56.000Z,"good afternoon robin, thank you for contacting...",2018-12-27 17:51:24+00:00,2018-12-28T19:18:23.000Z,Enrollment Submission,2018,12,001a000001lKDBx,440668861.0


In [None]:
askunum_text.drop(['Unnamed: 0'],axis=1,inplace=True)
askunum_text['unum_id']=askunum_text['unum_id'].astype(int).astype(str)
askunum_text.sort_values(["unum_id","year","month","MessageDate"],inplace=True,ascending=True)
askunum_text.to_pickle(os.path.join(my_folder,"askunum_text_pickle"))

In [122]:
start=time.time()
askunum_text=pd.read_pickle(os.path.join(my_folder,'askunum_text_pickle'))
end=time.time()
print("It took {:0.4f} seconds to read text data".format(end-start))

It took 253.6405 seconds to read text data


In [11]:
askunum_text.shape

(5016545, 12)

In [123]:
for i in askunum_text[askunum_text["ParentId"]=='5003x00002GuQ6tAAF'].sort_values('MessageDate', ascending=False)['TextBody']:
    print('-'*200)
    print(i)

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
hello team, thank you for your email. we have completed this termination request for jonathan sanchez, with a termination effective date of . the group will receive a credit of . on the invoice for the premiums paid for their coverage. please let us know if there is anything else that we can assist you with. thank you, the ask unum team client success organization askunum askunumunum.com unum covid response click here how to file a claim online click here we appreciate the opportunity to meet your benefit needs. if you have any questions, we have experienced service specialists available to help you monday through friday, a.m. to p.m. eastern time. original message
-----------------------------------------------------------------------------------------------------------------------------

In [124]:
askunum_text.head(2)

Unnamed: 0,Id,ParentId,Incoming,MessageDate,TextBody,CreatedDate,ClosedDate,Subtype,year,month,account_id,unum_id
1136587,02s3x00001lwyxeAAA,5003x00002CFUfAAAX,False,2021-10-25T17:07:44.000+0000,fopittsburgh pa pam schon emailpambloomfieldga...,2021-10-22 17:18:20+00:00,2021-10-25T17:08:45.000+0000,Charge or Credit Clarification,2021,10,0013x00002Ipz9G,1000110382
1136585,02s3x00001lwzYbAAI,5003x00002CFUfAAAX,False,2021-10-25T17:08:22.000+0000,"hi pam, i hope this email finds you well. we a...",2021-10-22 17:18:20+00:00,2021-10-25T17:08:45.000+0000,Charge or Credit Clarification,2021,10,0013x00002Ipz9G,1000110382


In [66]:
np.unique(askunum_text["Subtype"].apply(str).values).shape

(156,)

In [15]:
unum_id=np.unique(tempt.unum_id.values)
unum_id

array(['1029314', '1032773472', '1106085736', '203278756', '218081500',
       '250346112', '521502', '667906663', '711161932', '797598268',
       '870566381', '879092819'], dtype=object)

In [17]:
np.array_split(unum_id,5)

[array(['1029314', '1032773472', '1106085736'], dtype=object),
 array(['203278756', '218081500', '250346112'], dtype=object),
 array(['521502', '667906663'], dtype=object),
 array(['711161932', '797598268'], dtype=object),
 array(['870566381', '879092819'], dtype=object)]

In [24]:
df=pd.read_csv(os.path.join(my_folder, "unique_unum_v1.csv"))["0"].values
df

array([1000110382, 1000210715, 1000233839, ...,  167994141,  167996771,
        168007991])

In [60]:
def chunks_split(data,n):
    k=len(data)//n
    for i in range(0,n-1):
        yield data[i*k:(i+1)*k]
    yield data[(n-1)*k:]
    
unum_id=np.unique(askunum_text.unum_id.values)
chunk=chunks_split(unum_id,10)

In [47]:
unum_id

array(['1000110382', '1000197507', '1000210715', ..., '999535665',
       '999536230', '999603547'], dtype=object)

In [61]:
for i in tqdm(range(4)):
    print(next(iter(chunk)))

100%|██████████| 4/4 [00:00<00:00, 10407.70it/s]

['1000110382' '1000197507' '1000210715' ... '162089611' '162095022'
 '162100821']
['162115948' '162121782' '162125981' ... '346666687' '346669776'
 '346688639']
['346715331' '346718562' '34674078' ... '463385' '463388' '463399']
['463401029' '463402159' '46343729' ... '526434' '526444' '526454']





In [51]:
tempt=next(iter(chunk))

In [56]:
pd.DataFrame(tempt,columns=["unum_id"]).reset_index(drop=True).to_csv("tempt.csv")

In [59]:
pd.read_csv("tempt.csv",usecols=["unum_id"]).head()

Unnamed: 0,unum_id
0,590992
1,590996294
2,590998
3,590999
4,591023


In [36]:
next(iter(chunk))

array(['502232', '507668553', '558933586'], dtype=object)

In [37]:
next(iter(chunk))

array(['564543503', '625762', '650410023'], dtype=object)

In [38]:
next(iter(chunk))

array(['715351897', '743568709', '743807465', '842965', '994377397'],
      dtype=object)

In [125]:
askunum_text.head(2)

Unnamed: 0,Id,ParentId,Incoming,MessageDate,TextBody,CreatedDate,ClosedDate,Subtype,year,month,account_id,unum_id
1136587,02s3x00001lwyxeAAA,5003x00002CFUfAAAX,False,2021-10-25T17:07:44.000+0000,fopittsburgh pa pam schon emailpambloomfieldga...,2021-10-22 17:18:20+00:00,2021-10-25T17:08:45.000+0000,Charge or Credit Clarification,2021,10,0013x00002Ipz9G,1000110382
1136585,02s3x00001lwzYbAAI,5003x00002CFUfAAAX,False,2021-10-25T17:08:22.000+0000,"hi pam, i hope this email finds you well. we a...",2021-10-22 17:18:20+00:00,2021-10-25T17:08:45.000+0000,Charge or Credit Clarification,2021,10,0013x00002Ipz9G,1000110382


In [126]:
askunum_df=askunum_text.copy()
askunum_df['CreatedDate'] = pd.to_datetime(askunum_df['CreatedDate'])
askunum_df['year']=askunum_df['CreatedDate'].apply(lambda x:x.year)
askunum_df['month']=askunum_df['CreatedDate'].apply(lambda x:x.month)
askunum_df.head(2)

Unnamed: 0,Id,ParentId,Incoming,MessageDate,TextBody,CreatedDate,ClosedDate,Subtype,year,month,account_id,unum_id
1136587,02s3x00001lwyxeAAA,5003x00002CFUfAAAX,False,2021-10-25T17:07:44.000+0000,fopittsburgh pa pam schon emailpambloomfieldga...,2021-10-22 17:18:20+00:00,2021-10-25T17:08:45.000+0000,Charge or Credit Clarification,2021,10,0013x00002Ipz9G,1000110382
1136585,02s3x00001lwzYbAAI,5003x00002CFUfAAAX,False,2021-10-25T17:08:22.000+0000,"hi pam, i hope this email finds you well. we a...",2021-10-22 17:18:20+00:00,2021-10-25T17:08:45.000+0000,Charge or Credit Clarification,2021,10,0013x00002Ipz9G,1000110382


In [128]:
email_counts_by_month = askunum_df.groupby(['account_id', 'year', 'month'])[['Id']].count()
issue_counts_by_month = askunum_df.drop('Id', axis=1).drop_duplicates().groupby(['account_id', 'year', 'month'])[['ParentId']].count()

In [129]:
combined_df = email_counts_by_month.join(issue_counts_by_month)
combined_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Id,ParentId
account_id,year,month,Unnamed: 3_level_1,Unnamed: 4_level_1
0010c00001sr7B7,2018,4,5,5
0010c00001sr7B7,2018,5,10,10
0010c00001sr7B7,2018,6,10,10
0010c00001sr7B7,2018,8,3,3
0010c00001sr7B7,2018,9,5,5


In [130]:
combined_df.rename({"Id":'askunum_id_count', 'ParentId':'askunum_parentid_count'}, axis=1, inplace=True)
combined_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,askunum_id_count,askunum_parentid_count
account_id,year,month,Unnamed: 3_level_1,Unnamed: 4_level_1
0010c00001sr7B7,2018,4,5,5
0010c00001sr7B7,2018,5,10,10
0010c00001sr7B7,2018,6,10,10
0010c00001sr7B7,2018,8,3,3
0010c00001sr7B7,2018,9,5,5


In [133]:
combined_df.reset_index(drop=True)
combined_df.columns

Index(['askunum_id_count', 'askunum_parentid_count'], dtype='object')

In [135]:
combined_df=combined_df.reset_index()
combined_df.head(2)

Unnamed: 0,account_id,year,month,askunum_id_count,askunum_parentid_count
0,0010c00001sr7B7,2018,4,5,5
1,0010c00001sr7B7,2018,5,10,10


In [138]:
askunum_df.head(2)

Unnamed: 0,Id,ParentId,Incoming,MessageDate,TextBody,CreatedDate,ClosedDate,Subtype,year,month,account_id,unum_id
1136587,02s3x00001lwyxeAAA,5003x00002CFUfAAAX,False,2021-10-25T17:07:44.000+0000,fopittsburgh pa pam schon emailpambloomfieldga...,2021-10-22 17:18:20+00:00,2021-10-25T17:08:45.000+0000,Charge or Credit Clarification,2021,10,0013x00002Ipz9G,1000110382
1136585,02s3x00001lwzYbAAI,5003x00002CFUfAAAX,False,2021-10-25T17:08:22.000+0000,"hi pam, i hope this email finds you well. we a...",2021-10-22 17:18:20+00:00,2021-10-25T17:08:45.000+0000,Charge or Credit Clarification,2021,10,0013x00002Ipz9G,1000110382


In [139]:
askunum_df.groupby(['account_id', 'ParentId']).agg({'CreatedDate': 'first', 'ClosedDate':'first', 'Subtype' : 'first'})

Unnamed: 0_level_0,Unnamed: 1_level_0,CreatedDate,ClosedDate,Subtype
account_id,ParentId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0010c00001sr7B7,5000c00001XJbwJAAT,2018-04-04 17:08:37+00:00,2018-04-05T15:15:13.000Z,Employee Coding
0010c00001sr7B7,5000c00001YGQaPAAX,2018-04-27 13:14:10+00:00,2018-04-27T13:29:31.000Z,Portability or Conversion
0010c00001sr7B7,5000c00001YGs9iAAD,2018-04-30 12:22:32+00:00,2018-04-30T16:50:42.000Z,Employee Coding
0010c00001sr7B7,5000c00001Z9VPfAAN,2018-05-16 12:40:47+00:00,2018-05-17T15:56:59.000Z,Renewal Inquiry
0010c00001sr7B7,5000c00001Ze7J8AAJ,2018-05-21 14:20:07+00:00,2018-05-21T20:55:53.000Z,Census Listing
...,...,...,...,...
001a000001st9BC,5000c00001tqZvyAAE,2019-09-19 19:41:49+00:00,2019-09-19T20:05:49.000Z,Employee Coding Discrepancy
001a000001st9BC,5003x00001zczNcAAI,2020-04-30 17:10:20+00:00,2020-05-02T00:42:37.000+0000,Broker of Record Change (BOR)
001a000001st9BC,5003x000020F0YFAA0,2020-06-11 16:57:52+00:00,2020-06-11T17:13:48.000+0000,Enrollment
001a000001st9BC,5003x000020Fm5PAAS,2020-06-19 19:32:43+00:00,2020-06-23T18:44:02.000+0000,Enrollment Kit


In [141]:
askunum_df = askunum_df.groupby(['account_id', 'ParentId']).agg({'CreatedDate': 'first', 'ClosedDate':'first', 'Subtype' : 'first'})
askunum_df['CreatedDate'] = pd.to_datetime(askunum_df['CreatedDate'])
askunum_df['ClosedDate'] = pd.to_datetime(askunum_df['ClosedDate'])
askunum_df['count'] = 1
askunum_df['askunum_days'] = (askunum_df['ClosedDate'] - askunum_df['CreatedDate']).apply(lambda x: (x.days * 24 + x.seconds / 3600)/24)

# per account, per subtype txt_c sum
askunum_count_pivot = pd.pivot_table(askunum_df, index='account_id', columns='Subtype', values='count', aggfunc='sum').fillna(0)
askunum_days_pivot = pd.pivot_table(askunum_df, index='account_id', columns='Subtype', values='askunum_days', aggfunc='sum').fillna(0)

In [142]:
askunum_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,CreatedDate,ClosedDate,Subtype,count,askunum_days
account_id,ParentId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0010c00001sr7B7,5000c00001XJbwJAAT,2018-04-04 17:08:37+00:00,2018-04-05 15:15:13+00:00,Employee Coding,1,0.92125
0010c00001sr7B7,5000c00001YGQaPAAX,2018-04-27 13:14:10+00:00,2018-04-27 13:29:31+00:00,Portability or Conversion,1,0.01066
0010c00001sr7B7,5000c00001YGs9iAAD,2018-04-30 12:22:32+00:00,2018-04-30 16:50:42+00:00,Employee Coding,1,0.186227
0010c00001sr7B7,5000c00001Z9VPfAAN,2018-05-16 12:40:47+00:00,2018-05-17 15:56:59+00:00,Renewal Inquiry,1,1.13625
0010c00001sr7B7,5000c00001Ze7J8AAJ,2018-05-21 14:20:07+00:00,2018-05-21 20:55:53+00:00,Census Listing,1,0.274838


In [143]:
askunum_count_pivot.head()

Subtype,1099,ACH Inquiry/Confirmation,Acquisition/Merger,"Add, Remove, or Update user access",Additional Claim Information Received,Address Change,Annual Re-Enrollment,Assignment,Attempted Self Service - Support Needed,Attempted Self-Service - Billing Support,...,Unum Employee Licensing,W-2 Tax Coupon,Website Orientation,Website Registration,Website Service Changes,Website Servicing,Write Off,eSign Declined,eSign Education,eSign Provided
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0010c00001sr7B7,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0010c00001sr7B9,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0010c00001sr7BA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0010c00001sr7BB,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
0010c00001sr7BE,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
len(askunum_count_pivot.columns), len(askunum_days_pivot.columns)

(155, 155)

In [148]:
cols = list(askunum_count_pivot.columns)
new_count_cols = ["SUB_TYPE_TXT__c_{}".format(i) for i in cols]
new_days_cols = ["SUB_TYPE_TXT__c_{}_duration".format(i) for i in cols]
askunum_count_pivot = askunum_count_pivot.rename(dict(zip(cols, new_count_cols)), axis=1) 
askunum_days_pivot = askunum_days_pivot.rename(dict(zip(cols, new_days_cols)), axis=1)
askunum_subtypes_df = askunum_count_pivot.join(askunum_days_pivot)
len(askunum_subtypes_df.columns)

310

In [136]:
my_folder="s3://trident-retention-output/"
folder = 's3://trident-retention-data/askunum/'

churn_text_pickle=pd.read_pickle(os.path.join(my_folder, "churn_text_pickle_v1"))
for i in range(2,11):
    X=pd.read_pickle(os.path.join(my_folder, f"churn_text_pickle_v{i}"))
    churn_text_pickle=pd.concat([churn_text_pickle,X])
    
print(churn_text_pickle.shape)
churn_text_pickle=churn_text_pickle[churn_text_pickle['Full_TextBody']!='original message'] ## there are 30 observations that have textbody=='original message'
print(churn_text_pickle.shape)

(184177, 12)
(184173, 12)


In [150]:
my_folder="s3://trident-retention-output/"
folder = 's3://trident-retention-data/askunum/'
