In [None]:
import argparse
import pandas as pd
import numpy as np
from numpy import savez_compressed, load
import itertools
import re
import time
import os
import pickle

import datasets
from datasets import load_dataset, load_metric, Dataset, concatenate_datasets,DatasetDict
from datasets import load_from_disk
from tqdm import tqdm
tqdm.pandas(position=0,leave=True)

from fuzzywuzzy import fuzz

import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
# Load the stopwords from the new directory
nltk_data_dir=os.path.join("/opt/omniai/work/instance1/jupyter/", "transformers-models","nltk_data")
# Read inside a context manager so the file handle is closed as soon as the
# lines are loaded (the original left the handle open for the kernel's lifetime).
# NOTE(review): readlines() keeps the trailing "\n" on every entry - strip
# before comparing against tokens if this list is used for filtering.
with open(nltk_data_dir + '/corpora/stopwords/english') as stopwords_file:
    stopwords_list = stopwords_file.readlines()
nltk.data.path.append(nltk_data_dir)

import spacy
# Load the spaCy pipeline from a local directory rather than by package name.
# NOTE(review): `model_name` is reassigned later in the notebook for the tokenizer.
model_name=os.path.join("/opt/omniai/work/instance1/jupyter/", "transformers-models","en_core_web_md","en_core_web_md-3.3.0")
nlp = spacy.load(model_name)

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from IPython.display import display, HTML

# Global plotting defaults: seaborn whitegrid theme and a 16x10 default figure size.
sns.set(style="whitegrid",palette='muted',font_scale=1.2)
rcParams['figure.figsize']=16,10

# Notebook magics: high-resolution inline figures.
%config InlineBackend.figure_format="retina"
%matplotlib inline

# Remove the column and row display limits so pandas shows frames in full.
pd.set_option('display.max_columns', None,'display.max_rows',None)

In [None]:
root_dir="/opt/omniai/work/instance1/jupyter/v2_new_email/datasets/raw_data"
data_name=[x for x in os.listdir(root_dir) if x.split(".")[-1]=="csv"]

# Collect one cleaned frame per CSV and concatenate once at the end; growing a
# DataFrame inside the loop re-copies every previously loaded row on each pass.
frames=[]
for data in data_name:
    x=pd.read_csv(os.path.join(root_dir,data))
    # Keep only rows with a non-empty email (dropna already removes missing values).
    x=x.dropna(subset=['email'])
    x=x[x.email.str.len()>0]
    frames.append(x)
    # Label the row count with the third "_" token of the file name; fall back to
    # the whole name so an unexpected file name cannot raise IndexError.
    name_parts=data.split("_")
    print("{:<20}{:<20,}".format(name_parts[2] if len(name_parts)>2 else data,x.shape[0]))

# ignore_index=True already yields a fresh 0..n-1 index; an empty directory
# still produces an empty frame, as before.
df=pd.concat(frames,axis=0,ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Parse the timestamp column once, then derive the calendar parts with the
# vectorised .dt accessor instead of calling a Python lambda per row.
df['time'] = pd.to_datetime(df['time'])
df['year'] = df['time'].dt.year
df['month'] = df['time'].dt.month
df['day'] = df['time'].dt.day
# Rebind instead of sorting in place so the cell can be re-run without surprises.
df = df.sort_values(by='time')

### Remove duplicate emails: keep only the most recent email per thread id

In [None]:
# Keep the most recent email of every thread.
# One vectorised sort replaces the per-thread Python loop (one DataFrame per
# thread_id), and an empty frame no longer makes pd.concat raise.
df=(
    df.dropna(subset=["thread_id"])  # groupby() dropped rows without a thread_id; keep that
      .sort_values(["thread_id","time"],ascending=[True,False])  # newest first inside each thread
      .drop_duplicates(subset="thread_id", keep="first")
      .reset_index(drop=True)
)

In [None]:
def label_distribution(df,year,month):
    """Summarise the `is_complaint` label for one calendar month.

    Parameters
    ----------
    df : DataFrame with `year`, `month` and `is_complaint` columns.
    year, month : values matched against the `year` / `month` columns.

    Returns
    -------
    DataFrame with columns year, month, is_complaint, count, percentage,
    one row per label value (missing labels included), most frequent first.
    """
    labels=df.loc[(df.year==year) & (df.month==month),"is_complaint"]
    # rename_axis + reset_index(name=...) pins the column names explicitly, so the
    # result is the same whether value_counts() names its output after the column
    # (pandas < 2) or "count" (pandas >= 2, where the old rename produced two
    # "count" columns).
    summary=labels.value_counts(dropna=False).rename_axis("is_complaint").reset_index(name="count")
    # Same figure as value_counts(normalize=True): each count over the total.
    summary["percentage"]=summary["count"]/summary["count"].sum()
    summary["year"]=year
    summary["month"]=month
    return summary.loc[:,['year','month','is_complaint','count','percentage']]

def style_format(df,  data_type="Training set"):
    return df.style.format({'count':'{:,}','percentage':'{:.2%}'})\
           .set_caption(f"label distribution\n{data_type}")\
           .set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}])

In [None]:
# Months to summarise; 2022-08 was commented out in the original and stays excluded.
year_months=[(2022,9),(2022,10),(2022,11),(2022,12),(2023,1),(2023,2),(2023,3),(2023,4)]
# Build every monthly table and concatenate once, instead of growing the frame
# from an empty DataFrame one copy-pasted line at a time.
dist_df=pd.concat([label_distribution(df,year,month) for year,month in year_months])
style_format(dist_df,  data_type="split by month")

In [None]:
# Count and share of each `state` value (missing values included).
# rename_axis("state") fixes the key column name up front: pandas >= 2 no longer
# calls it "index", which made the old merge(on=['index']) raise KeyError.
tempt1=df['state'].value_counts(dropna=False).rename_axis("state").reset_index(name="count")
tempt2=df['state'].value_counts(dropna=False,normalize=True).rename_axis("state").reset_index(name="percentage")
tempt3=tempt1.merge(tempt2, on="state", how="inner")
tempt3.style.format({'count':'{:,}','percentage':'{:.2%}'})

In [None]:
# Keep closed emails only. .copy() detaches the result from the original frame so
# the column assignments in later cells do not write into a slice.
df=df[df.state=="closed"].copy()
# Months to summarise; 2022-08 was commented out in the original and stays excluded.
year_months=[(2022,9),(2022,10),(2022,11),(2022,12),(2023,1),(2023,2),(2023,3),(2023,4)]
# One concat over all monthly tables replaces the copy-pasted concat chain.
dist_df=pd.concat([label_distribution(df,year,month) for year,month in year_months])
style_format(dist_df,  data_type="split by month")

In [None]:
## train: 09/2022 ~ 02/2023. validation: 03/2023  test: 04/2023
def set_categories(row):
    """Map a row's (year, month) to "train", "val" or "test".

    The year and month are compared together as one period. Testing them
    independently labelled 2023-09..12 and 2022-01..02 as "train", which is
    outside the documented 09/2022 ~ 02/2023 window.
    Anything that is neither train nor val is "test", as before.
    """
    period=(row["year"],row["month"])
    if (2022,9)<=period<=(2023,2):
        return "train"
    if period==(2023,3):
        return "val"
    return "test"

# Label every row with its split by applying set_categories row-wise
# (progress_apply is the tqdm-instrumented variant of DataFrame.apply).
df["data_type"]=df.progress_apply(set_categories,axis=1)

### After Data preprocessing

In [None]:
root_dir="/opt/omniai/work/instance1/jupyter/v2_new_email/datasets/split_data"
# Select files whose second-to-last "_" token is "pickle". The length guard keeps
# a file name without any underscore from raising IndexError.
data_name=[x for x in os.listdir(root_dir) if len(x.split("_"))>1 and x.split("_")[-2]=="pickle"]

# Collect the cleaned frames and concatenate once instead of re-copying inside the loop.
frames=[]
for data in data_name:
    # NOTE(review): unpickling can execute arbitrary code - only load trusted files.
    x=pd.read_pickle(os.path.join(root_dir,data))
    # Keep only rows with a non-empty email (dropna already removes missing values).
    x=x.dropna(subset=['email'])
    x=x[x.email.str.len()>0]
    frames.append(x)

df1=pd.concat(frames,axis=0,ignore_index=True) if frames else pd.DataFrame()

# Calendar parts via the vectorised .dt accessor rather than a per-row lambda.
df1['time'] = pd.to_datetime(df1['time'])
df1['year'] = df1['time'].dt.year
df1['month'] = df1['time'].dt.month
df1['day'] = df1['time'].dt.day
df1 = df1.sort_values(by='time')

# Keep the most recent email of every thread with one vectorised sort.
# Rows without a thread_id are dropped, as groupby() did.
df1=(
    df1.dropna(subset=["thread_id"])
       .sort_values(["thread_id","time"],ascending=[True,False])
       .drop_duplicates(subset="thread_id", keep="first")
       .reset_index(drop=True)
)

In [None]:
# Months to summarise; 2022-08 was commented out in the original and stays excluded.
year_months=[(2022,9),(2022,10),(2022,11),(2022,12),(2023,1),(2023,2),(2023,3),(2023,4)]
# Build every monthly table and concatenate once, instead of growing the frame
# from an empty DataFrame one copy-pasted line at a time.
dist_df=pd.concat([label_distribution(df1,year,month) for year,month in year_months])
style_format(dist_df,  data_type="split by month")

In [None]:
# Count and share of each `state` value (missing values included).
# rename_axis("state") fixes the key column name up front: pandas >= 2 no longer
# calls it "index", which made the old merge(on=['index']) raise KeyError.
tempt1=df1['state'].value_counts(dropna=False).rename_axis("state").reset_index(name="count")
tempt2=df1['state'].value_counts(dropna=False,normalize=True).rename_axis("state").reset_index(name="percentage")
tempt3=tempt1.merge(tempt2, on="state", how="inner")
tempt3.style.format({'count':'{:,}','percentage':'{:.2%}'})

In [None]:
# Keep closed emails only. .copy() detaches the result from the original frame so
# the column assignments in later cells do not write into a slice.
df1=df1[df1.state=="closed"].copy()
# Months to summarise; 2022-08 was commented out in the original and stays excluded.
year_months=[(2022,9),(2022,10),(2022,11),(2022,12),(2023,1),(2023,2),(2023,3),(2023,4)]
# One concat over all monthly tables replaces the copy-pasted concat chain.
dist_df=pd.concat([label_distribution(df1,year,month) for year,month in year_months])
style_format(dist_df,  data_type="split by month")

In [None]:
# Small reference table describing the train / val / test split.
# NOTE(review): the table says train and val share 09/22 ~ 03/23 as an 80/20
# split, while set_categories assigns splits purely by month - confirm which one
# is intended.
train_val_test_month=pd.DataFrame(
    [
        ("train","09/22 ~ 03/23","80%"),
        ("val","09/22 ~ 03/23","20%"),
        ("test","04/23",""),
    ],
    columns=["data_type","month","split"],
)
train_val_test_month

In [None]:
## train: 09/2022 ~ 02/2023. validation: 03/2023  test: 04/2023
def set_categories(row):
    """Map a row's (year, month) to "train", "val" or "test".

    The year and month are compared together as one period. Testing them
    independently labelled 2023-09..12 and 2022-01..02 as "train", which is
    outside the documented 09/2022 ~ 02/2023 window.
    Anything that is neither train nor val is "test", as before.
    """
    period=(row["year"],row["month"])
    if (2022,9)<=period<=(2023,2):
        return "train"
    if period==(2023,3):
        return "val"
    return "test"

# Label every row with its split by applying set_categories row-wise
# (progress_apply is the tqdm-instrumented variant of DataFrame.apply).
df1["data_type"]=df1.progress_apply(set_categories,axis=1)

In [None]:
def label_distribution(df,data_type):
    """Summarise the `is_complaint` label for one split.

    Parameters
    ----------
    df : DataFrame with `data_type` and `is_complaint` columns.
    data_type : value matched against the `data_type` column.

    Returns
    -------
    DataFrame with columns data_type, is_complaint, count, percentage,
    one row per label value (missing labels included), most frequent first.
    """
    labels=df.loc[df["data_type"]==data_type,"is_complaint"]
    # rename_axis + reset_index(name=...) pins the column names explicitly, so the
    # result is the same whether value_counts() names its output after the column
    # (pandas < 2) or "count" (pandas >= 2, where the old rename produced two
    # "count" columns).
    summary=labels.value_counts(dropna=False).rename_axis("is_complaint").reset_index(name="count")
    # Same figure as value_counts(normalize=True): each count over the total.
    summary["percentage"]=summary["count"]/summary["count"].sum()
    summary["data_type"]=data_type
    return summary.loc[:,['data_type','is_complaint','count','percentage']]

def style_format(df):
    return df.style.format({'count':'{:,}','percentage':'{:.2%}'})\
           .set_caption(f"label distribution")\
           .set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}])

In [None]:
# One table per split, concatenated in a single call rather than grown from an
# empty DataFrame (concatenating empty frames is deprecated in recent pandas).
dist_df=pd.concat([label_distribution(df1,split) for split in ("train","val","test")])
style_format(dist_df)

In [None]:
def feedback_distribution(df):
    """Cross-tabulate `is_complaint` within each `is_feedback` group.

    Returns a DataFrame with columns is_feedback, is_complaint, count and
    percentage, where percentage is the share inside the is_feedback group.
    Missing is_complaint values are counted as their own category.
    """
    complaint_by_feedback=df.groupby('is_feedback')['is_complaint']
    counts=complaint_by_feedback.value_counts(dropna=False).reset_index(name="count")
    shares=complaint_by_feedback.value_counts(dropna=False,normalize=True).reset_index(name="percentage")
    merged=counts.merge(shares, on=['is_feedback',"is_complaint"], how="inner")
    return merged.loc[:,['is_feedback','is_complaint','count','percentage']]

def style_format(df):
    return df.style.format({'count':'{:,}','percentage':'{:.2%}'})\
           .set_caption(f"feedback distribution")\
           .set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}])

# Complaint counts and shares per is_feedback value for df1, rendered as a styled table.
dist_df=feedback_distribution(df1)
style_format(dist_df)

### email text length

In [None]:
import datasets
from datasets import load_dataset, load_metric, Dataset, concatenate_datasets,DatasetDict
from datasets import load_from_disk
from tqdm import tqdm
tqdm.pandas(position=0,leave=True)

import transformers
from transformers import AutoTokenizer

# Load the tokenizer from the local model directory.
# NOTE(review): `model_name` also held the spaCy model path earlier in the notebook.
model_name="longformer-base-4096"
model_path=os.path.join("/opt/omniai/work/instance1/jupyter/", "transformers-models",model_name)
tokenizer=AutoTokenizer.from_pretrained(model_path)
# One print call emits the same blank line / message / blank line sequence.
print(f"\nVocabulary size : {tokenizer.vocab_size:,}\n")

In [None]:
def dataframe_hf(df):
    """Convert a labelled DataFrame into a DatasetDict keyed by split.

    Rows are routed by their `data_type` value; the result has the keys
    "train", "val" and "test", each built with Dataset.from_pandas.
    """
    return DatasetDict({
        split: Dataset.from_pandas(df[df["data_type"]==split])
        for split in ("train","val","test")
    })

# hf_v0 is built from `df` (loaded from the raw_data CSVs), hf_v1 from `df1`
# (loaded from the split_data pickles).
hf_v0=dataframe_hf(df)
hf_v1=dataframe_hf(df1)

In [None]:
def compute_lenth(example):
    """Return {"text_length": n}, where n is the number of entries in example["input_ids"]."""
    token_ids=example["input_ids"]
    return dict(text_length=len(token_ids))

# Tokenise each dataset in batches, then attach the per-example token count.
# hf_v0 tokenises the `email` column, hf_v1 the `preprocessed_email` column.
hf_v0=(
    hf_v0.map(lambda x: tokenizer(x["email"]),batched=True)
         .map(compute_lenth)
)
hf_v1=(
    hf_v1.map(lambda x: tokenizer(x["preprocessed_email"]),batched=True)
         .map(compute_lenth)
)

In [None]:
def statistics_compute(hf_df1,hf_df2,hf_df3,p=1):
    """Compute token-length statistics for three datasets.

    Each argument must support ['text_length']. Returns a dict with the keys
    'percentile' (the p-th percentile), 'min', 'max' and 'mean'; every value is
    a three-item list ordered like the arguments.
    """
    splits=(hf_df1,hf_df2,hf_df3)
    return {
        'percentile':[np.percentile(split['text_length'],p) for split in splits],
        "min":[np.min(split['text_length']) for split in splits],
        "max":[np.max(split['text_length']) for split in splits],
        "mean":[np.mean(split['text_length']) for split in splits],
    }

def statistics_table(hf_df1,hf_df2,hf_df3):
    """Build a one-row-per-split summary of token lengths.

    Each argument must support ['text_length']. The rows are labelled
    "training", "validation" and "test" in argument order; the columns are the
    observation count, min, selected percentiles, median, mean and max.

    The length column of each split is read once and reused. The original
    recomputed every statistic for each of the 12 columns, re-reading the
    column on every call.
    """
    lengths=[np.asarray(split['text_length']) for split in (hf_df1,hf_df2,hf_df3)]

    def percentile(p):
        # p-th percentile of every split, in argument order.
        return [np.percentile(values,p) for values in lengths]

    dict_data={}
    dict_data["data_type"]=["training", "validation", "test"]
    dict_data["# of obs"]=[len(values) for values in lengths]
    dict_data["Min of tokens"]=[np.min(values) for values in lengths]
    dict_data["1% of tokens"]=percentile(1)
    dict_data["5% of tokens"]=percentile(5)
    dict_data["10% of tokens"]=percentile(10)
    dict_data["25% of tokens"]=percentile(25)
    dict_data["Median of tokens"]=percentile(50)
    dict_data["Average tokens"]=[np.mean(values) for values in lengths]
    dict_data["75% of tokens"]=percentile(75)
    dict_data["90% of tokens"]=percentile(90)
    dict_data["95% of tokens"]=percentile(95)
    dict_data["99% of tokens"]=percentile(99)
    dict_data["Max of tokens"]=[np.max(values) for values in lengths]
    return pd.DataFrame(dict_data)

def style_format(token_count_df,  textbody="preprocessed_email"):
    token_count_df=token_count_df.set_index("data_type")
    token_count_df[list(token_count_df.columns)] = token_count_df[list(token_count_df.columns)].astype(int)
    return token_count_df.style.format("{:,}").set_caption(f"Summary Statistics of token lengths for {textbody} ").set_table_styles([{
        'selector': 'caption',
        'props': [
            ('color', 'red'),
            ('font-size', '15px')
        ]
    }])

In [None]:
# Token-length summary for hf_v0 (tokenised from the `email` column).
token_count_df=statistics_table(hf_v0["train"],hf_v0["val"],hf_v0["test"])
style_format(token_count_df,  textbody="email data")

In [None]:
# Token-length summary for hf_v1 (tokenised from the `preprocessed_email` column).
token_count_df=statistics_table(hf_v1["train"],hf_v1["val"],hf_v1["test"])
style_format(token_count_df,  textbody="preprocessed email")

### text length distribution for complaint email

In [None]:
# Token-length summary restricted to emails labelled is_complaint == "Y".
complaint_splits=[
    hf_v1[split].filter(lambda x : x["is_complaint"]=="Y")
    for split in ("train","val","test")
]
token_count_df=statistics_table(*complaint_splits)
style_format(token_count_df,  textbody="Complaint email")

In [None]:
# Token-length summary restricted to emails labelled is_complaint == "N".
non_complaint_splits=[
    hf_v1[split].filter(lambda x : x["is_complaint"]=="N")
    for split in ("train","val","test")
]
token_count_df=statistics_table(*non_complaint_splits)
style_format(token_count_df,  textbody="Non-complaint email")

In [None]:
def pcut_func(df,var,nbin=5):
    """Mean of `target` per quantile bin of `var`.

    Parameters
    ----------
    df : DataFrame with a numeric column `var` and a `target` column.
    var : name of the column to bin.
    nbin : number of quantile bins requested (duplicate edges are dropped).

    Returns
    -------
    DataFrame with columns "cut" (the bin interval as a string) and "target"
    (the mean of `target` inside that bin).

    The input frame is left untouched. The original cast `var` to float and
    added a "cut" column on the caller's DataFrame as a side effect.
    """
    bins=pd.qcut(df[var].astype(float),nbin,precision=2,duplicates="drop").rename("cut")
    # observed=False keeps the historical behaviour explicit (all categories are
    # reported) and avoids the pandas warning about the changing default.
    decile=df['target'].groupby(bins,observed=False).mean().reset_index()
    decile["cut"]=decile["cut"].astype(str)
    return decile

In [None]:
train_df=hf_v1["train"]
val_df=hf_v1["val"]
test_df=hf_v1["test"]

def _labelled_frame(split):
    """Switch `split` to pandas output and return it as a DataFrame with a 0/1 `target` column."""
    split.set_format("pandas")
    frame=split[:]
    # target is 1 for is_complaint == "Y" and 0 for anything else.
    frame["target"]=frame['is_complaint'].apply(lambda x : 1 if x=="Y" else 0)
    return frame

df_train=_labelled_frame(train_df)
df_val=_labelled_frame(val_df)
df_test=_labelled_frame(test_df)

In [None]:
import matplotlib.ticker as ticker

def y_formatter(x,_):
    """Format a fraction as a percentage with two decimals (the second argument is ignored)."""
    return "{:.2f}%".format(x*100)
# Complaint rate per text_length decile, one panel per split.
# The decile table lives in a local name: the original assigned it to `df`,
# overwriting the email DataFrame that earlier cells depend on.
fig, ax = plt.subplots(1,3,figsize=(15,6))
panels=[(df_train,"training set"),(df_val,"validation set"),(df_test,"test set")]
for axis,(frame,label) in zip(ax,panels):
    decile=pcut_func(frame,var="text_length",nbin=10)
    axis.plot(decile["cut"],decile["target"],color="r",marker="*",linewidth=2, markersize=12)
    axis.set_title(f"text_length\n({label})")
    axis.set_ylabel("complaint %")
    axis.tick_params(labelrotation=45)
    axis.yaxis.set_major_formatter(ticker.FuncFormatter(y_formatter))
fig.tight_layout()

#### short and long email

In [None]:
root_dir="/opt/omniai/work/instance1/jupyter/v2_new_email/datasets/split_data"
# Select files whose second-to-last "_" token is "pickle". The length guard keeps
# a file name without any underscore from raising IndexError.
data_name=[x for x in os.listdir(root_dir) if len(x.split("_"))>1 and x.split("_")[-2]=="pickle"]

# Collect the cleaned frames and concatenate once instead of re-copying inside the loop.
frames=[]
for data in data_name:
    # NOTE(review): unpickling can execute arbitrary code - only load trusted files.
    x=pd.read_pickle(os.path.join(root_dir,data))
    # Keep only rows with a non-empty email (dropna already removes missing values).
    x=x.dropna(subset=['email'])
    x=x[x.email.str.len()>0]
    frames.append(x)

df1=pd.concat(frames,axis=0,ignore_index=True) if frames else pd.DataFrame()

# Calendar parts via the vectorised .dt accessor rather than a per-row lambda.
df1['time'] = pd.to_datetime(df1['time'])
df1['year'] = df1['time'].dt.year
df1['month'] = df1['time'].dt.month
df1['day'] = df1['time'].dt.day
df1 = df1.sort_values(by='time')

### only keep emails with status=closed
# .copy() detaches the result so later column assignments do not write into a slice.
df1=df1[df1.state=="closed"].copy()

## train: 09/2022 ~ 02/2023. validation: 03/2023  test: 04/2023
def set_categories(row):
    """Map a row's (year, month) to "train", "val" or "test".

    The year and month are compared together as one period. Testing them
    independently labelled 2023-09..12 and 2022-01..02 as "train", which is
    outside the documented 09/2022 ~ 02/2023 window.
    Anything that is neither train nor val is "test", as before.
    """
    period=(row["year"],row["month"])
    if (2022,9)<=period<=(2023,2):
        return "train"
    if period==(2023,3):
        return "val"
    return "test"
# Label every row with its split by applying set_categories row-wise
# (progress_apply is the tqdm-instrumented variant of DataFrame.apply).
df1["data_type"]=df1.progress_apply(set_categories,axis=1)

In [None]:
# Flag emails longer than 512 tokens (1 = long, 0 = short) with a vectorised comparison.
# NOTE(review): assumes the reloaded pickles already carry a `text_length` column - confirm.
df1["long_short"]=(df1['text_length']>512).astype(int)

# drop() returns a new frame here; the original called drop(inplace=True) on a
# filtered slice, which triggers SettingWithCopyWarning.
df_short=df1[df1["long_short"]==0].drop(columns="long_short").reset_index(drop=True)
df_long=df1[df1["long_short"]==1].drop(columns="long_short").reset_index(drop=True)

In [None]:
def label_distribution(df,data_type):
    """Summarise the `is_complaint` label for one split.

    Parameters
    ----------
    df : DataFrame with `data_type` and `is_complaint` columns.
    data_type : value matched against the `data_type` column.

    Returns
    -------
    DataFrame with columns data_type, is_complaint, count, percentage,
    one row per label value (missing labels included), most frequent first.
    """
    labels=df.loc[df["data_type"]==data_type,"is_complaint"]
    # rename_axis + reset_index(name=...) pins the column names explicitly, so the
    # result is the same whether value_counts() names its output after the column
    # (pandas < 2) or "count" (pandas >= 2, where the old rename produced two
    # "count" columns).
    summary=labels.value_counts(dropna=False).rename_axis("is_complaint").reset_index(name="count")
    # Same figure as value_counts(normalize=True): each count over the total.
    summary["percentage"]=summary["count"]/summary["count"].sum()
    summary["data_type"]=data_type
    return summary.loc[:,['data_type','is_complaint','count','percentage']]

def style_format(df, title):
    return df.style.format({'count':'{:,}','percentage':'{:.2%}'})\
           .set_caption(f"{title}")\
           .set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}])

In [None]:
# Label distribution of the short emails, one table per split concatenated in a
# single call rather than grown from an empty DataFrame.
dist_df=pd.concat([label_distribution(df_short,split) for split in ("train","val","test")])
style_format(dist_df,title=f"label distribution for short email")

In [None]:
# Label distribution of the long emails, one table per split concatenated in a
# single call rather than grown from an empty DataFrame.
dist_df=pd.concat([label_distribution(df_long,split) for split in ("train","val","test")])
style_format(dist_df,title="label distribution for long email")