In [10]:
import argparse
import pandas as pd
import numpy as np
from numpy import savez_compressed, load
import itertools
import re
import time
import os
import pickle

import datasets
from datasets import load_dataset, load_metric, Dataset, concatenate_datasets,DatasetDict
from datasets import load_from_disk
from tqdm import tqdm
tqdm.pandas(position=0,leave=True)

import transformers

from transformers import (
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    get_linear_schedule_with_warmup,
)
print("Transformers version is {}".format(transformers.__version__))

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from IPython.display import display, HTML

sns.set(style="whitegrid",palette='muted',font_scale=1.2)
rcParams['figure.figsize']=16,10

%config InlineBackend.figure_format="retina"
%matplotlib inline

pd.set_option('display.max_columns', None,'display.max_rows',None)

Transformers version is 4.22.1


In [11]:
# Read the train / validation / test CSVs from the shared input location.
input_dir = "s3://trident-retention-output/"
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(input_dir, f"{split}_df.csv"))
    for split in ("train", "val", "test")
)

In [13]:
# Remove the "Unnamed: 0" column (presumably an index saved by an earlier
# to_csv call -- confirm against the export step).
# Rebinding instead of inplace=True, combined with errors="ignore", keeps this
# cell safe to re-run: a second execution no longer raises KeyError because
# the column is already gone.
train_df = train_df.drop(columns=["Unnamed: 0"], errors="ignore")
val_df = val_df.drop(columns=["Unnamed: 0"], errors="ignore")
test_df = test_df.drop(columns=["Unnamed: 0"], errors="ignore")
train_df.head()

Unnamed: 0,unum_id,year,month,Subtype,TextBody,label
0,660464,2018,3,bill not received,"policy 657647 - harriston-mayo llc hello nick,...",1
1,212520822,2020,4,bill not received,can you please remove this bill hold and confi...,1
2,545813,2018,1,bill not received,"hi tina, thank you foyoucall today. i have att...",1
3,553799,2019,3,bill hide or delete,please reset ouportal fothe month of march. -0...,1
4,139295091,2018,5,bill not received,"sheila, i enjoyed speaking with you today. i w...",1


In [16]:
# Wrap each pandas split in a Hugging Face Dataset and group them by split name.
hf_train, hf_val, hf_test = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

hf_data = DatasetDict({
    "train": hf_train,
    "val": hf_val,
    "test": hf_test,
})

In [18]:
# Show the DatasetDict summary (splits, feature names, row counts).
hf_data

DatasetDict({
    train: Dataset({
        features: ['unum_id', 'year', 'month', 'Subtype', 'TextBody', 'label'],
        num_rows: 19088
    })
    val: Dataset({
        features: ['unum_id', 'year', 'month', 'Subtype', 'TextBody', 'label'],
        num_rows: 2388
    })
    test: Dataset({
        features: ['unum_id', 'year', 'month', 'Subtype', 'TextBody', 'label'],
        num_rows: 5960
    })
})

In [19]:
def label_distribution(df):
    """Tabulate how often each label occurs in ``df``.

    Args:
        df: DataFrame with a ``label`` column. Missing labels are counted as
            their own category (``dropna=False``).

    Returns:
        DataFrame with one row per distinct label and the columns ``label``,
        ``count`` (number of rows) and ``percentage`` (fraction of rows,
        between 0 and 1), ordered from most to least frequent label.
    """
    labels = df["label"]
    # Name the index and the values explicitly instead of renaming whatever
    # reset_index() happens to produce: the default names differ between
    # pandas 1.x ("index" / "label") and pandas 2.x ("label" / "count" or
    # "proportion"), and the old rename mapping breaks on the latter.
    counts = (
        labels.value_counts(dropna=False)
        .rename_axis("label")
        .reset_index(name="count")
    )
    shares = (
        labels.value_counts(dropna=False, normalize=True)
        .rename_axis("label")
        .reset_index(name="percentage")
    )
    return counts.merge(shares, on="label", how="inner")

def style_format(df,  data_type="Training set"):
    return df.style.format({'count':'{:,}','percentage':'{:.2%}'})\
           .set_caption(f"{data_type} label distribution")\
           .set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}])

In [21]:
# Label counts and shares for the training split, shown as a styled table.
label_train = label_distribution(train_df)
style_format(label_train, data_type="Training set")

Unnamed: 0,label,count,percentage
0,0,14316,75.00%
1,1,4772,25.00%


In [22]:
# Label counts and shares for the test split, shown as a styled table.
label_test = label_distribution(test_df)
style_format(label_test, data_type="Test set")

Unnamed: 0,label,count,percentage
0,0,5364,90.00%
1,1,596,10.00%


In [24]:
# Checkpoint whose tokenizer is used below to measure token lengths.
model_checkpoint = "allenai/longformer-base-4096"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [25]:
def train_test_data(df_train, df_test, feature_cols):
    """Tokenize the text column of two DataFrames and record token counts.

    Each frame goes through the same steps: convert to a ``datasets.Dataset``,
    drop rows whose text is ``None``, tokenize the text with the notebook-level
    ``tokenizer``, and add a ``text_length`` column holding the number of
    ``input_ids`` per row. The tokenizer is called with its default arguments
    (nothing is passed to it besides the text).

    Args:
        df_train: pandas DataFrame holding the training rows.
        df_test: pandas DataFrame holding the test rows.
        feature_cols: Name of the text column to tokenize (a single column
            name, despite the plural parameter name).

    Returns:
        Tuple ``(train, test)`` of tokenized datasets, each with the extra
        ``text_length`` column.
    """

    def _has_text(example):
        # Identity check: `is not None` is the idiomatic (and safe) None test.
        return example[feature_cols] is not None

    def _tokenize(batch):
        return tokenizer(batch[feature_cols])

    def _token_count(example):
        return {"text_length": len(example["input_ids"])}

    def _prepare(frame):
        # One shared pipeline so the train and test splits cannot drift apart.
        dataset = Dataset.from_pandas(frame)
        dataset = dataset.filter(_has_text)
        dataset = dataset.map(_tokenize, batched=True)
        return dataset.map(_token_count)

    return _prepare(df_train), _prepare(df_test)

In [28]:
# Tokenize the TextBody column of both splits and attach per-row token counts.
train_df1, test_df1 = train_test_data(
    df_train=train_df,
    df_test=test_df,
    feature_cols="TextBody",
)

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/19088 [00:00<?, ?ex/s]

  0%|          | 0/5960 [00:00<?, ?ex/s]

In [30]:
def statistics_compute(hf_df1,hf_df2,p=1):
    """Compute summary statistics of ``text_length`` for two datasets.

    Args:
        hf_df1: First dataset; must support ``hf_df1['text_length']``.
        hf_df2: Second dataset; must support ``hf_df2['text_length']``.
        p: Percentile (0-100) passed to ``np.percentile``.

    Returns:
        Dict with the keys ``percentile``, ``min``, ``max`` and ``mean``; each
        value is a two-element list ``[value for hf_df1, value for hf_df2]``.
    """
    # Look each column up once and reuse it for every statistic.
    length_columns = (hf_df1['text_length'], hf_df2['text_length'])
    return {
        'percentile': [np.percentile(lengths, p) for lengths in length_columns],
        "min": [np.min(lengths) for lengths in length_columns],
        "max": [np.max(lengths) for lengths in length_columns],
        "mean": [np.mean(lengths) for lengths in length_columns],
    }

def statistics_table(hf_df1,hf_df2):
    """Build a two-row summary table of token-length statistics.

    The first row is labelled "training" (from ``hf_df1``) and the second
    "test" (from ``hf_df2``). Columns, in order: ``data_type``, ``# of obs``,
    minimum, the 1/5/10/25th percentiles, median, mean, the 75/90/95/99th
    percentiles, and maximum of ``text_length``.

    Args:
        hf_df1: Dataset for the "training" row; must expose ``text_length``.
        hf_df2: Dataset for the "test" row; must expose ``text_length``.

    Returns:
        pandas DataFrame with one row per dataset.
    """
    # Extract each length column once and hand statistics_compute a plain
    # mapping, so the column lookup is not repeated for every statistic.
    train_lengths = {"text_length": np.asarray(hf_df1["text_length"])}
    test_lengths = {"text_length": np.asarray(hf_df2["text_length"])}

    def _percentile(p):
        return statistics_compute(train_lengths, test_lengths, p=p)["percentile"]

    # min / max / mean do not depend on the percentile argument.
    summary = statistics_compute(train_lengths, test_lengths)

    dict_data = {
        "data_type": ["training", "test"],
        "# of obs": [
            len(train_lengths["text_length"]),
            len(test_lengths["text_length"]),
        ],
        "Min of tokens": summary["min"],
    }
    # Insertion order defines the column order of the resulting table.
    for p in (1, 5, 10, 25):
        dict_data[f"{p}% of tokens"] = _percentile(p)
    dict_data["Median of tokens"] = _percentile(50)
    dict_data["Average tokens"] = summary["mean"]
    for p in (75, 90, 95, 99):
        dict_data[f"{p}% of tokens"] = _percentile(p)
    dict_data["Max of tokens"] = summary["max"]

    return pd.DataFrame(dict_data)

def style_format(token_count_df,  textbody="Full_TextBody"):
    token_count_df=token_count_df.set_index("data_type")
    token_count_df[list(token_count_df.columns)] = token_count_df[list(token_count_df.columns)].astype(int)
    return token_count_df.style.format("{:,}").set_caption(f"Summary Statistics of token lengths for {textbody} ").set_table_styles([{
        'selector': 'caption',
        'props': [
            ('color', 'red'),
            ('font-size', '20px')
        ]
    }])

In [31]:
# Summarise token-length statistics for both splits and render the styled table.
token_count_df = statistics_table(train_df1, test_df1)
style_format(token_count_df, textbody="Full_TextBody")

Unnamed: 0_level_0,# of obs,Min of tokens,1% of tokens,5% of tokens,10% of tokens,25% of tokens,Median of tokens,Average tokens,75% of tokens,90% of tokens,95% of tokens,99% of tokens,Max of tokens
data_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
training,19088,12,56,87,109,172,279,386,448,732,1001,1902,40068
test,5960,12,58,86,109,178,289,391,465,749,1015,1941,12749
