In [1]:
import argparse
import time
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
from nltk.tokenize import word_tokenize
from functools import reduce
en_stopwords = set(stopwords.words('english')) 
import itertools
import re
import os
import pickle

import datasets
from datasets import load_dataset, load_metric, Dataset, concatenate_datasets
from datasets import load_from_disk
from tqdm import tqdm
tqdm.pandas(position=0,leave=True)

import textwrap

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/ec2-user/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
argparser = argparse.ArgumentParser("Data Preprocessing") 
argparser.add_argument('--year', type=str, default="unique_unum_v1")
args,_ = argparser.parse_known_args()

print(args)

my_folder="s3://trident-retention-output/"
folder = 's3://trident-retention-data/askunum/'


Namespace(year='unique_unum_v1')


In [3]:
def load_askunum_df(folder, year, usecols=None, nrows=None): 
    if year == 2018: # ['ID', 'PARENTID', 'PARENT.CREATEDDATE', 'PARENT.CLOSEDDATE']
        askunum_df = pd.read_csv(folder + 'askunum_2018.csv', encoding='latin-1', usecols=usecols, nrows=nrows)
       
    if year == 2019: 
        askunum_df = pd.concat([pd.read_csv(folder + 'askunum_2019_{}.csv'.format(i), encoding='latin-1', usecols=usecols, nrows=nrows) for i in range(1, 4)]) 
        
    if year == 2020:  
        askunum_df = pd.concat([pd.read_csv(folder + 'unnested_2020_{}_customer.csv'.format(i), encoding='latin-1', usecols=usecols, nrows=nrows) for i in range(10)])

    if year == 2021: 
        askunum_df = pd.concat([pd.read_csv(folder + 'unnested_2021_{}_customer.csv'.format(i), encoding='latin-1', usecols=usecols, nrows=nrows) for i in range(10)]) 
        
    if year == 2022: 
        askunum_df = pd.concat([pd.read_csv(folder + 'askunum_2022_{}.csv'.format(i), encoding='latin-1', usecols=usecols, nrows=nrows) for i in range(0, 4)])
        
    return askunum_df

In [None]:
args.year=2022
year=args.year

In [None]:
if year in [2018, 2019]: 
    idx = 'ID'
    parent_id = 'PARENTID'
    text_body = 'TEXTBODY'
    created_date = 'PARENT.CREATEDDATE'
    closed_date = 'PARENT.CLOSEDDATE'
    Incoming='INCOMING'
    subtype= 'PARENT.SUB_TYPE_TXT__C' 
    message_date= 'MESSAGEDATE'

if year in [2020, 2021]: 
    idx = 'Id'
    parent_id = 'ParentId'
    text_body = 'TextBody'
    created_date = 'CreatedDate' 
    closed_date = 'ClosedDate'
    Incoming='Incoming'
    subtype= 'SUB_TYPE_TXT__c'
    message_date= 'MessageDate'

if year in [2022]:
    idx = 'Id'
    parent_id = 'ParentId'
    text_body = 'TextBody'
    created_date = 'Parent.CreatedDate'  
    closed_date = 'Parent.ClosedDate'
    Incoming='Incoming'
    subtype= 'Parent.SUB_TYPE_TXT__c'
    message_date= 'MessageDate'

askunum_text = load_askunum_df(folder, year, usecols = [idx, parent_id, text_body, created_date, closed_date, Incoming, subtype,message_date], nrows=5)

In [None]:
askunum_text.head(2)

In [None]:
df=pd.DataFrame({"id":[1,2],"text":["Hello, word. I am chadadf ok nice.","It is a nice adafred day. ok nice, I know dfadad. nice."]})
df

In [None]:
## removing non-english words from text
words = set(nltk.corpus.words.words())
df['text'] = df['text'].apply(lambda x: " ".join(w for w in nltk.wordpunct_tokenize(x) if w.lower() in words or not w.isalpha()))

In [None]:
df

In [None]:
my_folder="s3://trident-retention-output/"
file_name="unique_unum_v1"
unique_unum_id=pd.read_csv(os.path.join(my_folder,file_name+".csv"))
unique_unum_id.head()

In [None]:
year=2022
my_folder="s3://trident-retention-output/"
df=pd.read_csv(os.path.join(my_folder,f"askunum_textbody_{year}"+".csv"))
df.head(2)

In [4]:
askunum_text=pd.DataFrame()
for year in [2018,2019,2020,2021,2022]:
    new_data=pd.read_csv(os.path.join(my_folder,f"askunum_textbody_{year}"+".csv"))
    askunum_text=pd.concat([askunum_text,new_data])
    # print(askunum_text.shape)
    print(f"{:<25}{:<25,}".format(year,new_data.shape[0]))

(1023585, 13)
(2173519, 13)
(3348090, 13)
(4493050, 13)
(5016545, 13)


In [7]:
print("{:<15}{:<20,}".format(year,new_data.shape[0]))

2022           523,495             


In [8]:
askunum_text.head(2)

Unnamed: 0.1,Unnamed: 0,Id,ParentId,Incoming,MessageDate,TextBody,CreatedDate,ClosedDate,Subtype,year,month,account_id,unum_id
0,0,02s0c00001KQIDDAA5,5000c00001iYULlAAO,True,2018-12-27T17:51:02.000Z,"please see attached, the form showing that cry...",2018-12-27 17:51:24+00:00,2018-12-28T19:18:23.000Z,Enrollment Submission,2018,12,001a000001lKDBx,440668861.0
1,1,02s0c00001KQg0hAAD,5000c00001iYULlAAO,False,2018-12-28T19:16:56.000Z,"good afternoon robin, thank you for contacting...",2018-12-27 17:51:24+00:00,2018-12-28T19:18:23.000Z,Enrollment Submission,2018,12,001a000001lKDBx,440668861.0


In [None]:
askunum_text.drop(['Unnamed: 0'],axis=1,inplace=True)
askunum_text['unum_id']=askunum_text['unum_id'].astype(int).astype(str)
askunum_text.sort_values(["unum_id","year","month","MessageDate"],inplace=True,ascending=True)
askunum_text.to_pickle(os.path.join(my_folder,"askunum_text_pickle"))

In [10]:
start=time.time()
askunum_text=pd.read_pickle(os.path.join(my_folder,'askunum_text_pickle'))
end=time.time()
print("It took {:0.4f} seconds to read text data".format(end-start))

It took 233.3024 seconds to read text data


In [11]:
askunum_text.shape

(5016545, 12)

In [12]:
askunum_text.head(2)

Unnamed: 0,Id,ParentId,Incoming,MessageDate,TextBody,CreatedDate,ClosedDate,Subtype,year,month,account_id,unum_id
1136587,02s3x00001lwyxeAAA,5003x00002CFUfAAAX,False,2021-10-25T17:07:44.000+0000,fopittsburgh pa pam schon emailpambloomfieldga...,2021-10-22 17:18:20+00:00,2021-10-25T17:08:45.000+0000,Charge or Credit Clarification,2021,10,0013x00002Ipz9G,1000110382
1136585,02s3x00001lwzYbAAI,5003x00002CFUfAAAX,False,2021-10-25T17:08:22.000+0000,"hi pam, i hope this email finds you well. we a...",2021-10-22 17:18:20+00:00,2021-10-25T17:08:45.000+0000,Charge or Credit Clarification,2021,10,0013x00002Ipz9G,1000110382


In [66]:
np.unique(askunum_text["Subtype"].apply(str).values).shape

(156,)

In [15]:
unum_id=np.unique(tempt.unum_id.values)
unum_id

array(['1029314', '1032773472', '1106085736', '203278756', '218081500',
       '250346112', '521502', '667906663', '711161932', '797598268',
       '870566381', '879092819'], dtype=object)

In [17]:
np.array_split(unum_id,5)

[array(['1029314', '1032773472', '1106085736'], dtype=object),
 array(['203278756', '218081500', '250346112'], dtype=object),
 array(['521502', '667906663'], dtype=object),
 array(['711161932', '797598268'], dtype=object),
 array(['870566381', '879092819'], dtype=object)]

In [24]:
df=pd.read_csv(os.path.join(my_folder, "unique_unum_v1.csv"))["0"].values
df

array([1000110382, 1000210715, 1000233839, ...,  167994141,  167996771,
        168007991])

In [60]:
def chunks_split(data,n):
    k=len(data)//n
    for i in range(0,n-1):
        yield data[i*k:(i+1)*k]
    yield data[(n-1)*k:]
    
unum_id=np.unique(askunum_text.unum_id.values)
chunk=chunks_split(unum_id,10)

In [47]:
unum_id

array(['1000110382', '1000197507', '1000210715', ..., '999535665',
       '999536230', '999603547'], dtype=object)

In [61]:
for i in tqdm(range(4)):
    print(next(iter(chunk)))

100%|██████████| 4/4 [00:00<00:00, 10407.70it/s]

['1000110382' '1000197507' '1000210715' ... '162089611' '162095022'
 '162100821']
['162115948' '162121782' '162125981' ... '346666687' '346669776'
 '346688639']
['346715331' '346718562' '34674078' ... '463385' '463388' '463399']
['463401029' '463402159' '46343729' ... '526434' '526444' '526454']





In [51]:
tempt=next(iter(chunk))

In [56]:
pd.DataFrame(tempt,columns=["unum_id"]).reset_index(drop=True).to_csv("tempt.csv")

In [59]:
pd.read_csv("tempt.csv",usecols=["unum_id"]).head()

Unnamed: 0,unum_id
0,590992
1,590996294
2,590998
3,590999
4,591023


In [36]:
next(iter(chunk))

array(['502232', '507668553', '558933586'], dtype=object)

In [37]:
next(iter(chunk))

array(['564543503', '625762', '650410023'], dtype=object)

In [38]:
next(iter(chunk))

array(['715351897', '743568709', '743807465', '842965', '994377397'],
      dtype=object)

In [92]:
askunum_text.head(2)

Unnamed: 0,Id,ParentId,Incoming,MessageDate,TextBody,CreatedDate,ClosedDate,Subtype,year,month,account_id,unum_id
1136587,02s3x00001lwyxeAAA,5003x00002CFUfAAAX,False,2021-10-25T17:07:44.000+0000,fopittsburgh pa pam schon emailpambloomfieldga...,2021-10-22 17:18:20+00:00,2021-10-25T17:08:45.000+0000,Charge or Credit Clarification,2021,10,0013x00002Ipz9G,1000110382
1136585,02s3x00001lwzYbAAI,5003x00002CFUfAAAX,False,2021-10-25T17:08:22.000+0000,"hi pam, i hope this email finds you well. we a...",2021-10-22 17:18:20+00:00,2021-10-25T17:08:45.000+0000,Charge or Credit Clarification,2021,10,0013x00002Ipz9G,1000110382


In [70]:
df=pd.read_csv(os.path.join(my_folder, "unique_unum_v1.csv"), usecols=["unum_id"]).values
df.squeeze().shape

(8927,)

In [71]:
churn_data=pd.read_pickle(os.path.join(my_folder,'churn_data_pickle'))

In [94]:
churn_df=churn_data.copy()
text_df=askunum_text.copy()
churn_df["unum_id"]=churn_df["unum_id"].apply(str)
text_df["unum_id"]=askunum_text["unum_id"].astype(int).apply(str)

file_name="unique_unum_v1"
unique_unum_id=pd.read_csv(os.path.join(my_folder,file_name+".csv"))['unum_id'].values[0:10]
unique_unum_id=unique_unum_id.astype(str)
unique_unum_id

array(['1000110382', '1000197507', '1000210715', '1000233839',
       '1000293377', '1000354872', '1000364014', '1000433738',
       '1000437341', '1000469823'], dtype='<U21')

In [91]:
tempt=pd.DataFrame(unique_unum_id,columns=["unum_id"]).reset_index(drop=True)
tempt.to_csv(os.path.join(my_folder,"test.csv"))

In [85]:
pd.to_datetime(text_df["MessageDate"]).dt.date>=row["start_date"]

  result = libops.scalar_compare(x.ravel(), y, op)


1136587    True
1136585    True
1136586    True
1136590    True
1136589    True
           ... 
892536     True
892539     True
892540     True
455631     True
455632     True
Name: MessageDate, Length: 479, dtype: bool

In [84]:
row["start_date"]

Timestamp('2021-06-01 00:00:00')

In [105]:
row

unum_id                1000469823
policy_id                  436049
pivot_date    2022-03-01 00:00:00
year                         2022
month                           3
start_date    2021-06-01 00:00:00
end_date      2021-12-01 00:00:00
churn                           0
Name: 445511, dtype: object

In [118]:
text_df=text_df[text_df['unum_id'].isin(unique_unum_id)]
churn_df=churn_df[churn_df["unum_id"].isin(unique_unum_id)]

unum_id=[]
policy_id=[]
pivot_date=[]
year=[]
month=[]
start_date=[]
end_date=[]
Full_TextBody=[]
Client_TextBody=[]
Latest_TextBody=[]
Subtype=[]
churn=[]

# text_df.loc[:,'day']=1
# text_df.loc[:,'date']=pd.to_datetime(text_df[['year','month','day']],format="%Y%m%d")
# text_df.drop(['day'],inplace=True, axis=1)

for index,row in tqdm(churn_df.iterrows(), total=churn_df.shape[0]):

    ### Concatenate email message between start_date and end_date
    tempt=text_df[(text_df["unum_id"]==row["unum_id"]) & (pd.to_datetime(text_df["MessageDate"]).dt.date>=row["start_date"]) &  (pd.to_datetime(text_df["MessageDate"]).dt.date<=row["end_date"])]
    tempt.dropna(subset=["TextBody"],inplace=True)

    if tempt.empty:
        continue

    tempt.sort_values(["unum_id","MessageDate"],inplace=True,ascending=True)
    tempt2=tempt.drop_duplicates(subset=["unum_id"],keep="last")
    Subtype.append(tempt.drop_duplicates(subset=["unum_id"],keep="last")["Subtype"].values[0])
    
    tempt_1=tempt.groupby(["unum_id"])["TextBody"].apply(lambda x : ".".join(x)).reset_index()
    Full_TextBody.append(tempt_1["TextBody"][0])
    

    tempt_2=tempt[tempt["Incoming"]==True]
    tempt_2=tempt_2.groupby(["unum_id"])["TextBody"].apply(lambda x : ".".join(x)).reset_index()
    if tempt_2.empty:
        Client_TextBody.append(None)
    else:
        Client_TextBody.append(tempt_2["TextBody"][0])

    tempt_3=tempt.sort_values(["unum_id","ParentId","MessageDate"],ascending=True)
    tempt_3.drop_duplicates(subset=["unum_id","ParentId"],keep="last",inplace=True)
    tempt_3=tempt_3.groupby(["unum_id"])["TextBody"].apply(lambda x : ".".join(x)).reset_index()
    if tempt_3.empty:
        Latest_TextBody.append(None)
    else:
        Latest_TextBody.append(tempt_3["TextBody"][0])        

    unum_id.append(row["unum_id"])
    policy_id.append(row["policy_id"])
    pivot_date.append(row["pivot_date"])
    year.append(row["year"])
    month.append(row["month"])
    start_date.append(row["start_date"])
    end_date.append(row["end_date"])
    churn.append(row["churn"])
churn_text_data=pd.DataFrame({"unum_id":unum_id,"policy_id":policy_id,"pivot_date":pivot_date,"year":year,"month":month,\
                             "start_date":start_date,"end_date":end_date,"Full_TextBody":Full_TextBody,"Client_TextBody":Client_TextBody,\
                              "Latest_TextBody":Latest_TextBody,"Subtype":Subtype,"churn":churn})

  result = libops.scalar_compare(x.ravel(), y, op)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
100%|██████████| 46/46 [00:00<00:00, 136.58it/s]


In [119]:
churn_text_data

Unnamed: 0,unum_id,policy_id,pivot_date,year,month,start_date,end_date,Full_TextBody,Client_TextBody,Latest_TextBody,Subtype,churn
0,1000469823,436049,2022-03-01,2022,3,2021-06-01,2021-12-01,"hello, can you please assist me in obtaining t...","hello, can you please assist me in obtaining t...","broker id hello tessa, thank you for your emai...",Employee Coding,0
1,1000437341,712269,2022-02-01,2022,2,2021-05-01,2021-11-01,policy holder bridgewater homes policy number ...,policy holder bridgewater homes policy number ...,"good afternoon, congratulations on being named...",BOR Packet,0
2,1000437341,435828,2022-02-01,2022,2,2021-05-01,2021-11-01,policy holder bridgewater homes policy number ...,policy holder bridgewater homes policy number ...,"good afternoon, congratulations on being named...",BOR Packet,0
3,1000433738,435500,2022-02-01,2022,2,2021-05-01,2021-11-01,unum covid response click herehttpsbit.lycjrbm...,unum covid response click herehttpsbit.lycjrbm...,"hello, can you please reset my unum password. ...",Premium Calculation Education,0
4,1000364014,440697,2022-03-01,2022,3,2021-06-01,2021-12-01,"lucy, happy to assist hope you have been doing...","lucy, happy to assist hope you have been doing...","great, thank you thank you, lucy .. lucero rod...",Employee Coding,0
5,1000364014,440697,2021-03-01,2021,3,2020-06-01,2020-12-01,", , arthritis osteoporosis center of south tex...",,", , arthritis osteoporosis center of south tex...",Online Payment Inquiry or Setup,0
6,1000364014,427531,2020-09-01,2020,9,2019-12-01,2020-06-01,"to whom it may concern, my name is nikki marti...","to whom it may concern, my name is nikki marti...",", good afternoon nikki, thank you for your ema...","Add, Remove, or Update user access",1
7,1000354872,907863,2022-01-01,2022,1,2021-04-01,2021-10-01,"good afternoon, we see george hurt was reinsta...","good afternoon, we see george hurt was reinsta...","good afternoon, we see george hurt was reinsta...",Employee Coding,0
8,1000354872,907863,2021-01-01,2021,1,2020-04-01,2020-10-01,"hi askunum, i am hoping someone might be able ...","hi askunum, i am hoping someone might be able ...","hello ginnifer, thank you for reaching out. we...",Compensation Inquiry,0
9,1000354872,907863,2019-01-01,2019,1,2018-04-01,2018-10-01,this message originated outside of unum. use c...,this message originated outside of unum. use c...,"hello shakila, thank you for your email. pleas...",Enrollment Submission,0


In [117]:
tempt2=tempt.drop_duplicates(subset=["unum_id"],keep="last")
tempt2["Subtype"].values[0]

'Employee Coding'

In [88]:
churn_text_data=pd.DataFrame({"unum_id":unum_id,"policy_id":policy_id,"pivot_date":pivot_date,"year":year,"month":month,
                             "start_date":start_date,"end_date":end_date,"Full_TextBody":Full_TextBody,"Client_TextBody":Client_TextBody,"Latest_TextBody":Latest_TextBody,"churn":churn})

In [89]:
churn_text_data

Unnamed: 0,unum_id,policy_id,pivot_date,year,month,start_date,end_date,Full_TextBody,Client_TextBody,Latest_TextBody,churn
0,1000469823,436049,2022-03-01,2022,3,2021-06-01,2021-12-01,"hello, can you please assist me in obtaining t...","hello, can you please assist me in obtaining t...","broker id hello tessa, thank you for your emai...",0
1,1000437341,712269,2022-02-01,2022,2,2021-05-01,2021-11-01,policy holder bridgewater homes policy number ...,policy holder bridgewater homes policy number ...,"good afternoon, congratulations on being named...",0
2,1000437341,435828,2022-02-01,2022,2,2021-05-01,2021-11-01,policy holder bridgewater homes policy number ...,policy holder bridgewater homes policy number ...,"good afternoon, congratulations on being named...",0
3,1000433738,435500,2022-02-01,2022,2,2021-05-01,2021-11-01,unum covid response click herehttpsbit.lycjrbm...,unum covid response click herehttpsbit.lycjrbm...,"hello, can you please reset my unum password. ...",0
4,1000364014,440697,2022-03-01,2022,3,2021-06-01,2021-12-01,"lucy, happy to assist hope you have been doing...","lucy, happy to assist hope you have been doing...","great, thank you thank you, lucy .. lucero rod...",0
5,1000364014,440697,2021-03-01,2021,3,2020-06-01,2020-12-01,", , arthritis osteoporosis center of south tex...",,", , arthritis osteoporosis center of south tex...",0
6,1000364014,427531,2020-09-01,2020,9,2019-12-01,2020-06-01,"to whom it may concern, my name is nikki marti...","to whom it may concern, my name is nikki marti...",", good afternoon nikki, thank you for your ema...",1
7,1000354872,907863,2022-01-01,2022,1,2021-04-01,2021-10-01,"good afternoon, we see george hurt was reinsta...","good afternoon, we see george hurt was reinsta...","good afternoon, we see george hurt was reinsta...",0
8,1000354872,907863,2021-01-01,2021,1,2020-04-01,2020-10-01,"hi askunum, i am hoping someone might be able ...","hi askunum, i am hoping someone might be able ...","hello ginnifer, thank you for reaching out. we...",0
9,1000354872,907863,2019-01-01,2019,1,2018-04-01,2018-10-01,this message originated outside of unum. use c...,this message originated outside of unum. use c...,"hello shakila, thank you for your email. pleas...",0
