In [1]:
import argparse
import pandas as pd
import numpy as np
from numpy import savez_compressed, load
import itertools
import re
import time
import os
import pickle

import datasets
from datasets import load_dataset, load_metric, Dataset, concatenate_datasets,DatasetDict
from datasets import load_from_disk
from tqdm import tqdm
tqdm.pandas(position=0,leave=True)

import transformers

from transformers import (
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    get_linear_schedule_with_warmup,
)
print("Transformers version is {}".format(transformers.__version__))

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from IPython.display import display, HTML

sns.set(style="whitegrid",palette='muted',font_scale=1.2)
rcParams['figure.figsize']=16,10

%config InlineBackend.figure_format="retina"
%matplotlib inline

pd.set_option('display.max_columns', None,'display.max_rows',None)

  from .autonotebook import tqdm as notebook_tqdm


Transformers version is 4.19.0


In [2]:
input_dir="s3://trident-retention-output/multi-class"
train_df=pd.read_csv(os.path.join(input_dir,"train_df.csv"))
val_df=pd.read_csv(os.path.join(input_dir,"val_df.csv"))
test_df=pd.read_csv(os.path.join(input_dir,"test_df.csv"))

In [3]:
train_df.drop(['Unnamed: 0'],axis=1,inplace=True)
val_df.drop(['Unnamed: 0'],axis=1,inplace=True)
test_df.drop(['Unnamed: 0'],axis=1,inplace=True)
train_df.head()

Unnamed: 0,ParentId,Subtype,TextBody
0,5003x00001yYXa5AAG,broker of record change (bor),"hi jasmine, please see attached. chad baransky..."
1,5003x00002DrO7YAAV,policy termination,the group moved coverage to anothecarrier. dia...
2,5003x00002BE03YAAT,missing information,gsc-mi kristen garrison client name industrial...
3,5003x0000268XdtAAE,policy termination,"hello, can you please process the attached can..."
4,5003x00001vXWp8AAG,policy termination,"good morning, schilleemily, please confirm if ..."


In [4]:
hf_train=Dataset.from_pandas(train_df)
hf_val=Dataset.from_pandas(val_df)
hf_test=Dataset.from_pandas(test_df)

hf_data=DatasetDict({"train":hf_train, "val":hf_val,  "test":hf_test})

In [5]:
hf_data

DatasetDict({
    train: Dataset({
        features: ['ParentId', 'Subtype', 'TextBody'],
        num_rows: 111864
    })
    val: Dataset({
        features: ['ParentId', 'Subtype', 'TextBody'],
        num_rows: 13983
    })
    test: Dataset({
        features: ['ParentId', 'Subtype', 'TextBody'],
        num_rows: 13977
    })
})

In [6]:
def label_distribution(df,col):
    tempt1=pd.DataFrame(df[col].value_counts(dropna=False)).reset_index().rename(columns={'index':col,col:'count'})
    tempt2=pd.DataFrame(df[col].value_counts(dropna=False,normalize=True)).reset_index().rename(columns={'index':col,col:'percentage'})
    return tempt1.merge(tempt2, on=col, how="inner")

def style_format(df, col, data_type="Training set"):
    return df.style.format({'count':'{:,}','percentage':'{:.2%}'})\
           .set_caption(f"{data_type} {col} distribution")\
           .set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}])

In [7]:
label_train=label_distribution(train_df,col="Subtype")
x1=label_train[label_train["Subtype"] != "other-category"]
x2=label_train[label_train["Subtype"] == "other-category"]
label_train=pd.concat([x1,x2])
style_format(label_train,col="Subtype",  data_type="Training set")

Unnamed: 0,Subtype,count,percentage
0,policy termination,28320,25.32%
1,missing information,23712,21.20%
2,broker of record change (bor),17451,15.60%
3,new plan administrator,16087,14.38%
5,premium discrepancy,6873,6.14%
6,bill not received,2841,2.54%
7,late notice or collections,2240,2.00%
8,missing or skipped payment,2200,1.97%
9,policy level discrepancy,1370,1.22%
10,less than minimum lives,1031,0.92%


In [8]:
label_test=label_distribution(test_df,col="Subtype")
x1=label_test[label_test["Subtype"] != "other-category"]
x2=label_test[label_test["Subtype"] == "other-category"]
label_test=pd.concat([x1,x2])
style_format(label_test,col="Subtype",  data_type="Test set")

Unnamed: 0,Subtype,count,percentage
0,policy termination,3540,25.33%
1,missing information,2964,21.21%
2,broker of record change (bor),2181,15.60%
3,new plan administrator,2010,14.38%
5,premium discrepancy,859,6.15%
6,bill not received,355,2.54%
7,late notice or collections,280,2.00%
8,missing or skipped payment,274,1.96%
9,policy level discrepancy,171,1.22%
10,less than minimum lives,128,0.92%


In [9]:
model_checkpoint="allenai/longformer-base-4096"
tokenizer=AutoTokenizer.from_pretrained(model_checkpoint)

Downloading: 100%|██████████| 694/694 [00:00<00:00, 1.12MB/s]
Downloading: 100%|██████████| 878k/878k [00:00<00:00, 83.9MB/s]
Downloading: 100%|██████████| 446k/446k [00:00<00:00, 87.0MB/s]
Downloading: 100%|██████████| 1.29M/1.29M [00:00<00:00, 91.7MB/s]


In [10]:
def train_test_data(df_train, df_test, feature_cols):
    train_df=Dataset.from_pandas(df_train)
    train_df=train_df.filter(lambda x: x[feature_cols]!=None)
    train_df=train_df.map(lambda x: tokenizer(x[feature_cols]),batched=True)

    test_df=Dataset.from_pandas(df_test)
    test_df=test_df.filter(lambda x: x[feature_cols]!=None)
    test_df=test_df.map(lambda x: tokenizer(x[feature_cols]),batched=True)
    
    def compute_lenth(example):
        return {"text_length":len(example["input_ids"])}
    
    train_df=train_df.map(compute_lenth)
    test_df=test_df.map(compute_lenth)
    
    return train_df, test_df

In [11]:
train_df1, test_df1=train_test_data(train_df, test_df, feature_cols="TextBody")

100%|██████████| 112/112 [00:00<00:00, 189.27ba/s]
  3%|▎         | 3/112 [00:00<00:18,  5.99ba/s]Token indices sequence length is longer than the specified maximum sequence length for this model (5557 > 4096). Running this sequence through the model will result in indexing errors
100%|██████████| 112/112 [00:15<00:00,  7.11ba/s]
100%|██████████| 14/14 [00:00<00:00, 191.14ba/s]
100%|██████████| 14/14 [00:01<00:00,  7.10ba/s]
100%|██████████| 111864/111864 [00:38<00:00, 2894.35ex/s]
100%|██████████| 13977/13977 [00:04<00:00, 2924.55ex/s]


In [12]:
def statistics_compute(hf_df1,hf_df2,p=1):

    X=[]
    X.append(np.percentile(hf_df1['text_length'],p))
    X.append(np.percentile(hf_df2['text_length'],p))
    
    result={}
    result['percentile']=X
    result["min"]=[np.min(hf_df1['text_length']),np.min(hf_df2['text_length'])]
    result["max"]=[np.max(hf_df1['text_length']),np.max(hf_df2['text_length'])]
    result["mean"]=[np.mean(hf_df1['text_length']),np.mean(hf_df2['text_length'])]
    return result

def statistics_table(hf_df1,hf_df2):
    dict_data={}
    dict_data["data_type"]=["training", "test"]
    dict_data["# of obs"]=[len(hf_df1['text_length']),len(hf_df2['text_length'])]
    dict_data["Min of tokens"]=statistics_compute(hf_df1, hf_df2)["min"]
    dict_data["1% of tokens"]=statistics_compute(hf_df1, hf_df2, p=1)['percentile']
    dict_data["5% of tokens"]=statistics_compute(hf_df1, hf_df2, p=5)['percentile']
    dict_data["10% of tokens"]=statistics_compute(hf_df1, hf_df2, p=10)['percentile']
    dict_data["25% of tokens"]=statistics_compute(hf_df1, hf_df2, p=25)['percentile']
    dict_data["Median of tokens"]=statistics_compute(hf_df1, hf_df2, p=50)['percentile']
    dict_data["Average tokens"]=statistics_compute(hf_df1, hf_df2)["mean"]
    dict_data["75% of tokens"]=statistics_compute(hf_df1, hf_df2, p=75)['percentile']
    dict_data["90% of tokens"]=statistics_compute(hf_df1, hf_df2, p=90)['percentile']
    dict_data["95% of tokens"]=statistics_compute(hf_df1, hf_df2, p=95)['percentile']
    dict_data["99% of tokens"]=statistics_compute(hf_df1, hf_df2, p=99)['percentile']
    dict_data["Max of tokens"]=statistics_compute(hf_df1, hf_df2)["max"]
    token_count_df=pd.DataFrame(dict_data)
    return token_count_df

def style_format(token_count_df,  textbody="Full_TextBody"):
    token_count_df=token_count_df.set_index("data_type")
    token_count_df[list(token_count_df.columns)] = token_count_df[list(token_count_df.columns)].astype(int)
    return token_count_df.style.format("{:,}").set_caption(f"Summary Statistics of token lengths for {textbody} ").set_table_styles([{
        'selector': 'caption',
        'props': [
            ('color', 'red'),
            ('font-size', '20px')
        ]
    }])

In [13]:
token_count_df=statistics_table(train_df1,test_df1)
style_format(token_count_df,  textbody="Full_TextBody")

Unnamed: 0_level_0,# of obs,Min of tokens,1% of tokens,5% of tokens,10% of tokens,25% of tokens,Median of tokens,Average tokens,75% of tokens,90% of tokens,95% of tokens,99% of tokens,Max of tokens
data_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
training,111864,9,62,99,125,193,288,321,398,549,655,969,24279
test,13977,9,64,99,124,190,287,322,396,548,660,996,9292
