In [86]:
import os
import numpy as np
import pandas as pd
# from tqdm.auto import tqdm
from tqdm import tqdm
tqdm.pandas(position=0,leave=True)
import random
import argparse
import logging

from sklearn.metrics import roc_auc_score, f1_score,average_precision_score
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.metrics import roc_curve,precision_recall_curve
from sklearn.metrics import auc as auc_score

import textwrap

from datasets import load_dataset, load_metric, concatenate_datasets,DatasetDict,Dataset
from datasets import load_from_disk

import transformers
print("Transformers version is {}".format(transformers.__version__))

import torch

from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelWithLMHead,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    default_data_collator,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    get_linear_schedule_with_warmup,
    get_scheduler
)

import utils

import seaborn as sns
from pylab import rcParams
from matplotlib import pyplot as plt
from matplotlib import rc

sns.set(style="whitegrid",palette='muted',font_scale=1.2)
# rcParams['figure.figsize']=16,10

%config InlineBackend.figure_format="retina"
%matplotlib inline

Transformers version is 4.6.1


In [2]:
def seed_everything(seed):
    """Seed every random number generator this notebook uses.

    Parameters
    ----------
    seed : int
        Value handed to Python's ``random``, NumPy's legacy global RNG and
        PyTorch (CPU and all CUDA devices).

    Side effects
    ------------
    * ``PYTHONHASHSEED`` is written to ``os.environ`` as a string.
      NOTE(review): Python reads this variable at interpreter start-up, so it
      presumably only affects processes launched afterwards -- confirm.
    * cuDNN is switched to deterministic mode with auto-tuning (benchmark)
      turned off.
    """
    # Process-level setting first.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Pure-Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: CPU generator, then every CUDA device.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [60]:
class Sample_Creation:
    """Build labelled train/validation/test sets for one group of subtypes.

    Rows of ``df`` whose ``'Subtype'`` value contains any of the ``keyword``
    strings are treated as positives (label 1); every other row is a negative
    (label 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; must contain a ``'Subtype'`` column.
    subtype : sequence of str
        Candidate subtype values that the keywords are matched against.
    keyword : sequence of str
        Substrings that mark a subtype as positive.
    """
    def __init__(self, df, subtype, keyword):
        self.df=df
        self.subtype=subtype
        self.keyword=keyword

    def subtype_find(self):
        """Return the entries of ``self.subtype`` (original order) that contain at least one keyword."""
        return [s for s in self.subtype if any(k in s for k in self.keyword)]

    def data_creation(self,val_ratio, test_ratio, train_pos_neg_ratio, val_pos_neg_ratio, test_pos_neg_ratio):
        """
        Description
        -----------
        Create training, validation and test sets made of positive and
        negative samples.

        Positives and negatives are shuffled and split independently with a
        fixed seed (101); the negatives of each split are then down-sampled
        relative to the number of positives in that split.

        Parameters:
        ----------
        val_ratio : fraction of rows assigned to the validation set (e.g. 0.1)
        test_ratio : fraction of rows assigned to the test set (e.g. 0.1);
                     the training fraction is 1 - val_ratio - test_ratio
        train_pos_neg_ratio : number of negative samples drawn per positive
                              sample in the training set
        val_pos_neg_ratio :   number of negative samples drawn per positive
                              sample in the validation set
        test_pos_neg_ratio :  number of negative samples drawn per positive
                              sample in the test set

        Returns:
        --------
        (train_df, val_df, test_df): DataFrames holding the positive rows
        (label=1) followed by the sampled negative rows (label=0).  Besides the
        original columns and ``label`` each frame carries ``index`` (the row's
        index in ``self.df``) and ``level_0`` (its position inside the
        positive/negative pool); both come from the ``reset_index()`` calls and
        are kept for backward compatibility.

        Notes
        -----
        If a split has fewer negatives than requested, all available negatives
        are used instead of raising.
        """
        _subtype=self.subtype_find()
        is_positive=self.df['Subtype'].isin(_subtype)
        pos_sample=self.df[is_positive].reset_index()
        neg_sample=self.df[~is_positive].reset_index()

        def train_val_test(data,val_ratio,test_ratio):
            # A private RandomState(101) yields the same permutation as
            # np.random.seed(101) + np.random.shuffle, but leaves the global
            # NumPy RNG untouched for the rest of the notebook.
            _idx=np.arange(len(data))
            np.random.RandomState(101).shuffle(_idx)

            test_end=int(len(_idx)*test_ratio)
            val_end=int(len(_idx)*(val_ratio+test_ratio))

            train_data=data.loc[_idx[val_end:],:]
            val_data=data.loc[_idx[test_end:val_end],:]
            test_data=data.loc[_idx[:test_end],:]

            return train_data, val_data, test_data

        def draw_negatives(pool, n_positive, neg_per_pos):
            # int(): DataFrame.sample needs an integer n even when the ratio is a float.
            # min(): never ask for more rows than the pool holds (sampling is without replacement).
            wanted=int(n_positive*neg_per_pos)
            return pool.sample(n=min(wanted, len(pool)), random_state=101)

        def labelled(positive, negative):
            # assign() returns new frames, so the split frames are never mutated in place.
            return pd.concat([positive.assign(label=1), negative.assign(label=0)],axis=0).reset_index()

        train_positive, val_positive, test_positive=train_val_test(pos_sample,val_ratio,test_ratio)
        train_negative, val_negative, test_negative=train_val_test(neg_sample,val_ratio,test_ratio)

        train_negative=draw_negatives(train_negative, len(train_positive), train_pos_neg_ratio)
        val_negative=draw_negatives(val_negative, len(val_positive), val_pos_neg_ratio)
        test_negative=draw_negatives(test_negative, len(test_positive), test_pos_neg_ratio)

        train_df=labelled(train_positive, train_negative)
        val_df=labelled(val_positive, val_negative)
        test_df=labelled(test_positive, test_negative)

        return train_df, val_df, test_df

In [76]:
# Load the AskUnum text table, normalise the "Subtype" column and build the
# train / validation / test splits for the billing-related subtypes.
input_dir="s3://trident-retention-output/"
# NOTE(review): read_pickle can execute arbitrary code embedded in the file --
# only load pickles from a bucket you trust.
# The S3 URI is assembled with "/" explicitly; os.path.join is meant for local
# filesystem paths.
askunum_text=pd.read_pickle(input_dir.rstrip("/")+"/askunum_text")

# Missing subtypes become "", everything is lower-cased.
askunum_text['Subtype'] = askunum_text['Subtype'].fillna("").astype(str).str.lower()
# Re-interpret the text as cp1252 after a latin-1 round trip (presumably to repair
# mis-decoded characters -- confirm). Raises if a value contains characters
# outside latin-1 or bytes undefined in cp1252.
askunum_text["Subtype"]=askunum_text["Subtype"].progress_apply(lambda x: x.encode("latin1").decode("cp1252"))
# Literal replacements: regex=False makes the intent explicit and independent of
# the pandas version's default for `regex`.
askunum_text["Subtype"]=askunum_text["Subtype"].str.replace("/"," or ", regex=False)
askunum_text["Subtype"]=askunum_text["Subtype"].str.replace("&"," and ", regex=False)
# Collapse runs of whitespace left behind by the replacements above.
askunum_text["Subtype"]=askunum_text["Subtype"].str.replace(r"\s{2,}", " ", regex=True)

# Drop one subtype entirely before deciding positives / negatives.
df=askunum_text[~askunum_text["Subtype"].isin(["attempted self-service - billing support"])]
subtype=list(df["Subtype"].unique())
# A subtype is a positive when it contains any of these substrings.
keyword=["bill not received","bill hold","bill hide or delete"]
sample_class=Sample_Creation(df, subtype, keyword)
# 10% validation, 10% test; 3 negatives per positive in train/val, 9 per positive in test.
train_df, val_df, test_df=sample_class.data_creation(val_ratio=0.1, test_ratio=0.1, train_pos_neg_ratio=3, val_pos_neg_ratio=3, test_pos_neg_ratio=9)

100%|██████████| 1452978/1452978 [00:02<00:00, 656246.60it/s]


In [81]:
def label_distribution(df):
    """Summarise how often each value of ``df['label']`` occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'label'`` column; missing labels are counted as their
        own group (``dropna=False``).

    Returns
    -------
    pandas.DataFrame
        One row per distinct label, most frequent first, with columns
        ``label``, ``count`` and ``percentage`` (a fraction in [0, 1]).
    """
    counts=df["label"].value_counts(dropna=False)
    # rename_axis + reset_index(name=...) fixes the column names explicitly, so the
    # result does not depend on how a given pandas version names value_counts output.
    distribution=counts.rename_axis("label").reset_index(name="count")
    distribution["percentage"]=distribution["count"]/distribution["count"].sum()
    return distribution

def style_format(df,  data_type="Training set"):
    return df.style.format({'count':'{:,}','percentage':'{:.2%}'})\
           .set_caption(f"{data_type} label distribution")\
           .set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '12px')]}])

In [82]:
# Class balance of the training split: count and share of each label.
label_train=label_distribution(train_df)
style_format(label_train,  data_type="Training set")

Unnamed: 0,label,count,percentage
0,0,14316,75.00%
1,1,4772,25.00%


In [84]:
# Class balance of the validation split.
# NOTE(review): the result is stored in `label_train` although it describes val_df.
label_train=label_distribution(val_df)
style_format(label_train,  data_type="validation set")

Unnamed: 0,label,count,percentage
0,0,1791,75.00%
1,1,597,25.00%


In [83]:
# Class balance of the test split.
# NOTE(review): the result is stored in `label_train` although it describes test_df.
label_train=label_distribution(test_df)
style_format(label_train,  data_type="Test set")

Unnamed: 0,label,count,percentage
0,0,5364,90.00%
1,1,596,10.00%


In [87]:
# Print five reproducibly chosen training examples, wrapped at 150 characters.
# NOTE(review): the printed e-mail bodies may contain personal data -- clear
# this cell's output before sharing the notebook.
wrapper = textwrap.TextWrapper(width=150)
for i in range(5):
    # A private Random(101+i) makes the same pick as random.seed(101+i) followed
    # by random.choice, without reseeding the global `random` generator.
    picker = random.Random(101+i)

    j = picker.choice(train_df.index)
    emails=train_df.loc[j,"TextBody"]
    # Named `sample_subtype` so the module-level `subtype` list built in the
    # data-preparation cell is not overwritten by a single string.
    sample_subtype=train_df.loc[j,"Subtype"]
    unum_id=train_df.loc[j,"unum_id"]

    print('')
    print("*"*80)
    print(f'*  Full TextBody : unum_id={unum_id}, subtype={sample_subtype} *')
    print("*"*80)
    print('')
    print(wrapper.fill(emails))
    print('')
    print("*"*50)


********************************************************************************
*  Full TextBody : unum_id=660146, subtype=eoi status *
********************************************************************************

hi team, please see attached and below regarding eois fothe mote marine enrollment fo/1. i am unfamiliawith the unum inbox they sent this to, so i
want to ensuwe ataking action steps as needed. can you partnewith client services foresolution and follow up with the brokethank you, catherine thorpe
unum client manageflorida o 813.207.2637 c 813.734.4393 foinformation and resources regarding unum's covid-19 response and faqs, visit hello natalie,
. i have confirmed that the effective dates fothese employees have been updated to 5/1/2021 as requested. if we may assist any furtheplease let us
know. thank you, amanda altieservice specialist client success organization 1-800-ask-unum 1-800-275-8686 askunumunum.com unum covid-19 response - how
to file a claim online - . , .

**

In [53]:
def metric_table(table_name="training_output.txt"):
    """Parse a comma-separated metrics log into a DataFrame.

    Parameters
    ----------
    table_name : str
        File name resolved against the current working directory (an absolute
        path is used as is).  Every non-blank line must have at least 14
        comma-separated fields.

    Returns
    -------
    pandas.DataFrame
        Columns ``model_type, epoch, loss, true_prediction, false_prediction,
        accuracy, precision, recall, f1_score, auc, pr_auc``.  Only the first
        row of each (model_type, epoch) pair is kept, and rows are sorted by
        model_type then epoch.  The index holds each row's position among the
        parsed lines.

    Raises
    ------
    IndexError, ValueError
        If a non-blank line has too few fields or a field cannot be converted.
    """
    # (output column, field position in the line, converter).
    # Fields 9-11 are present in the file but not used here.
    field_spec=[
        ("model_type",0,str),
        ("epoch",1,int),
        ("loss",2,float),
        ("true_prediction",3,int),
        ("false_prediction",4,int),
        ("accuracy",5,float),
        ("precision",6,float),
        ("recall",7,float),
        ("f1_score",8,float),
        ("auc",12,float),
        ("pr_auc",13,float),
    ]
    columns={name:[] for name,_,_ in field_spec}

    with open(os.path.join(os.getcwd(),table_name),'r') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the log).
            if not line.strip():
                continue
            fields=line.split(",")  # split once per line instead of once per column
            for name,position,convert in field_spec:
                columns[name].append(convert(fields[position]))

    metrics=pd.DataFrame(columns)
    metrics=metrics.drop_duplicates(subset=["model_type","epoch"]).sort_values(by=['model_type','epoch'])

    return metrics

def style_format(metrics_training, metrics_test, model):
    metrics_training=metrics_training[metrics_training["model_type"]==model].reset_index(drop=True)
    metrics_training=metrics_training.sort_values('f1_score', ascending=False).head(1)
    metrics_training.drop("epoch",inplace=True,axis=1)
    metrics_training["data"]=["training set"]
    
    metrics_test=metrics_test[metrics_test["model_type"]==model].reset_index(drop=True)
    metrics_test=metrics_test.sort_values('f1_score', ascending=False).head(1)
    metrics_test.drop("epoch",inplace=True,axis=1)
    metrics_test["data"]=["test set"]
    
    metrics=pd.concat([metrics_training,metrics_test])
    first_column =  metrics.pop('data')
    metrics.insert(0, 'data', first_column)
    
    return metrics.style.format({"loss":"{:.4f}","accuracy":"{:.2%}","true_prediction":"{:,}","false_prediction":"{:,}", "precision":"{:.2%}", "recall":"{:.2%}", \
                                "f1_score":"{:.2%}", "auc":"{:.2%}","pr_auc":"{:.2%}"}) \
    .set_caption(f"Performance Summary for-- {model}") \
    .set_table_styles([{
        'selector': 'caption',
        'props': [
            ('color', 'red'),
            ('font-size', '20px')
        ]
    }])

In [55]:
# Parse the per-epoch metric logs written during training / evaluation.
# metric_table resolves both file names against the current working directory.
metric_training=metric_table(table_name="metrics_training.txt")
metric_test=metric_table(table_name="metrics_test.txt")

In [56]:
# Best-F1 rows (training vs. test) for bert_base; needs the three-argument style_format.
style_format(metric_training,metric_test, model="bert_base")

Unnamed: 0,data,model_type,loss,true_prediction,false_prediction,accuracy,precision,recall,f1_score,auc,pr_auc
0,training set,bert_base,0.6106,13766,7710,64.10%,41.59%,69.38%,52.01%,74.41%,49.50%
0,test set,bert_base,0.6145,3632,2328,60.94%,20.06%,56.54%,29.61%,72.28%,24.58%


In [57]:
# Best-F1 rows (training vs. test) for bert_large; needs the three-argument style_format.
style_format(metric_training,metric_test, model="bert_large")

Unnamed: 0,data,model_type,loss,true_prediction,false_prediction,accuracy,precision,recall,f1_score,auc,pr_auc
0,training set,bert_large,0.3085,18912,2564,88.06%,80.91%,83.89%,82.37%,95.06%,83.82%
0,test set,bert_large,0.2766,5146,814,86.34%,62.68%,81.71%,70.94%,95.56%,67.79%


In [58]:
# Best-F1 rows (training vs. test) for roberta_base; needs the three-argument style_format.
style_format(metric_training,metric_test, model="roberta_base")

Unnamed: 0,data,model_type,loss,true_prediction,false_prediction,accuracy,precision,recall,f1_score,auc,pr_auc
0,training set,roberta_base,0.2295,19281,2195,89.78%,84.42%,86.20%,85.30%,96.99%,92.88%
0,test set,roberta_base,0.2763,5277,683,88.54%,75.98%,78.02%,76.99%,97.37%,84.90%


In [59]:
# Best-F1 rows (training vs. test) for roberta_large; needs the three-argument style_format.
style_format(metric_training,metric_test, model="roberta_large")

Unnamed: 0,data,model_type,loss,true_prediction,false_prediction,accuracy,precision,recall,f1_score,auc,pr_auc
0,training set,roberta_large,0.168,19923,1553,92.77%,91.65%,87.50%,89.53%,98.34%,96.18%
0,test set,roberta_large,0.1948,5481,479,91.96%,88.77%,80.87%,84.64%,98.68%,92.01%
