In [None]:
import argparse
import os
import pandas as pd
pd.options.mode.chained_assignment=None
from tqdm import tqdm
tqdm.pandas(position=0,leave=True)
from collections import Counter

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import HTML
import re
import textwrap
import random

import string
import nltk
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Load the stopwords from the new directory
nltk_data_dir=os.path.join("/opt/omniai/work/instance1/jupyter/", "transformers-models","nltk_data")
# Read the stopword file inside a context manager so the handle is closed as soon
# as the lines are loaded (the bare open() left it open for the kernel's lifetime).
with open(os.path.join(nltk_data_dir, "corpora", "stopwords", "english")) as stopwords_file:
    stopwords_list = stopwords_file.readlines()
# One stopword per line; strip the trailing newline / surrounding whitespace.
STOPWORDS=[x.strip() for x in stopwords_list]
# Make the same directory visible to nltk's own resource lookups.
nltk.data.path.append(nltk_data_dir)

In [None]:
def bow_preprocess(text):
    """Turn raw text into a space-separated bag-of-words string.

    Steps, in order:
      1. cast the input to ``str`` and tokenize it with ``word_tokenize``;
      2. drop tokens whose lowercase form is in ``STOPWORDS``;
      3. replace every run of digits / non-word characters with one space;
      4. delete any remaining ``string.punctuation`` characters from each
         whitespace-separated piece (in practice only "_" survives step 3).

    A piece made only of punctuation becomes an empty string but is still
    joined, so the result can contain doubled or trailing spaces.
    """
    kept_tokens = []
    for token in word_tokenize(str(text)):
        if token.lower() in STOPWORDS:
            continue
        kept_tokens.append(token)

    # Collapse digits and special characters into single spaces.
    collapsed = re.sub("(\\d|\\W)+", " ", " ".join(kept_tokens))

    # Strip whatever punctuation is left inside each piece.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return " ".join(piece.translate(punctuation_table) for piece in collapsed.split())

In [None]:
data_path=os.path.join("/opt/omniai/work/instance1/jupyter/", "v4_new_email","datasets","split_data")

# Keep only the files whose second-to-last "_"-separated token is "pickle".
# Names without an underscore are skipped instead of raising IndexError, and the
# listing is sorted so the load order does not depend on os.listdir().
data_name=[]
for file_name in sorted(os.listdir(data_path)):
    name_parts=file_name.split("_")
    if len(name_parts)>=2 and name_parts[-2]=="pickle":
        data_name.append(file_name)
if not data_name:
    raise FileNotFoundError("No file with 'pickle' as its second-to-last '_' token found in {}".format(data_path))

# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted location.
# Read every split first and concatenate once; growing the frame inside the loop
# re-copies all previously loaded rows on each iteration.
df=pd.concat([pd.read_pickle(os.path.join(data_path,file_name)) for file_name in data_name],
             axis=0,ignore_index=True)

### only keep emails with status=closed
# .copy() gives an independent frame, so the column assignments below do not write to a slice of the full frame.
df=df[df["state"]=="closed"].copy()

df['time'] = pd.to_datetime(df['time'])
df=df.sort_values(by='time')

def set_categories(row):
    """Return "train" when year is 2022/2023 and month is one of 9-12 or 1-3, otherwise "test".

    No validation label is produced by this rule.
    NOTE(review): year and month are tested independently, so e.g. 01-03/2022 or
    09-12/2023 would also be labelled "train" -- confirm the data only spans
    09/2022 onwards.
    """
    is_train = row["year"] in [2022,2023] and row["month"] in [9,10,11,12,1,2,3]
    return "train" if is_train else "test"

df["data_type"]=df.progress_apply(set_categories,axis=1)

# Binary targets: 1 when the flag equals "Y", 0 for anything else (including missing values).
df["complaint"]=(df["is_complaint"]=="Y").astype(int)
df["feedback"]=(df["is_feedback"]=="Y").astype(int)

df["bag_of_word"]=df["preprocessed_email"].progress_apply(bow_preprocess)
# Keep only tokens whose lowercase form appears in nltk's English word list.
words = set(nltk.corpus.words.words())
df["bag_of_word"] = df["bag_of_word"]\
.progress_apply(lambda x: " ".join(w for w in nltk.wordpunct_tokenize(x) if w.lower() in words ))

# df_train=df[df["data_type"]=="train"]
# df_val=df[df["data_type"]=="val"]
# df_test=df[df["data_type"]=="test"]

In [None]:
# Read the external negative-word list: lines starting with ";" are treated as
# comments, blank lines are ignored, and every other line contributes its
# stripped text.
negative_word=[]
with open("negative-words.txt") as f:
    negative_word.extend(
        entry.strip()
        for entry in f
        if not entry.startswith(";") and entry.strip()
    )

# Blank line, the summary, blank line -- same three output lines as separate prints.
print("", "There are {:,} negative words externally".format(len(negative_word)), "", sep="\n")

In [None]:
# Build the lookup set once; calling set(negative_word) inside the lambda rebuilt
# the whole lexicon set for every single row.
negative_word_lookup=frozenset(negative_word)
# For each email: the set of its bag-of-word tokens that are also in the negative-word list.
df['negative_word_set']=df["bag_of_word"].progress_apply(lambda x: set(x.split()).intersection(negative_word_lookup))
# df_val['negative_word_set']=df_val["bag_of_word"].progress_apply(lambda x: set(x.split()).intersection(set(negative_word)))
# df_test['negative_word_set']=df_test["bag_of_word"].progress_apply(lambda x: set(x.split()).intersection(set(negative_word)))


# Split by the binary complaint flag (1 / 0).
df_complaint,  df_no_complaint=df[df['complaint']==1], df[df['complaint']==0]
# val_complaint,  val_no_complaint=df_val[df_val['complaint']==1], df_val[df_val['complaint']==0]
# test_complaint,  test_no_complaint=df_test[df_test['complaint']==1], df_test[df_test['complaint']==0]

# Split by the binary feedback flag (1 / 0).
df_feedback,  df_no_feedback=df[df['feedback']==1], df[df['feedback']==0]
# val_feedback,  val_no_feedback=df_val[df_val['feedback']==1], df_val[df_val['feedback']==0]
# test_feedback,  test_no_feedback=df_test[df_test['feedback']==1], df_test[df_test['feedback']==0]

def most_common_word(df,feature):
    """Rank the words found in column ``feature`` by the number of rows containing them.

    Each row contributes a word at most once, so the frequencies are row
    (document) counts rather than raw token counts.

    Parameters
    ----------
    df : DataFrame holding the column ``feature``.
    feature : column name. A cell may be
        * a ``str``  -- split on whitespace into words;
        * a ``set`` / ``frozenset`` -- used as is;
        * a ``list`` / ``tuple`` of words -- de-duplicated first.
        Cells of any other type are ignored.

    Returns
    -------
    (word, freq) : two tuples of equal length ordered from most to least
        common. Both are empty when no word was counted, instead of the
        unpacking ValueError that ``zip(*[])`` would raise.
    """
    word_count=Counter()
    # Iterate over the single column; building a full row Series per record is not needed.
    for value in tqdm(df[feature], total=df.shape[0]):
        if isinstance(value,str):
            # The old branch tested for ``list`` but then called ``.split()``,
            # which only exists on strings.
            word_count.update(set(value.split()))
        elif isinstance(value,(set,frozenset)):
            word_count.update(value)
        elif isinstance(value,(list,tuple)):
            word_count.update(set(value))
    if not word_count:
        return (),()
    word,freq=zip(*word_count.most_common())
    return word,freq

# Rank the negative words of each label subset, in the order:
# complaint, no complaint, feedback, no feedback.
# "word_no_ccomplaint" (double "c") is the spelling the later cells refer to, so it is kept.
# The validation/test variants that were commented out here are not reproduced.
(
    (word_complaint, freq_complaint),
    (word_no_ccomplaint, freq_no_complaint),
    (word_feedback, freq_feedback),
    (word_no_feedback, freq_no_feedback),
) = [
    most_common_word(subset, feature="negative_word_set")
    for subset in (df_complaint, df_no_complaint, df_feedback, df_no_feedback)
]


In [None]:
# Top 50 negative words (by number of emails containing them) for each positive label.
dict_data={}
dict_data["complaint"]=word_complaint[0:50]
dict_data["feedback"]=word_feedback[0:50]

# Wrap each tuple in a Series so columns of different length are padded with NaN;
# passing the raw tuples raises ValueError when one label has fewer than 50 words.
pd.DataFrame({label: pd.Series(list(ranked), dtype=object) for label, ranked in dict_data.items()})\
.style.format().set_caption("Most common negative sentiment word in complaint==1 or feedback==1")\
.set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}])

In [None]:
# Negative words ranked in the complaint top-N that are absent from the non-complaint top-N.
top_n=20
word=set(word_complaint[0:top_n]).difference(set(word_no_ccomplaint[0:top_n]))
print()
print(word)
print()
# tempt["negative_word"]=tempt["bag_of_word"].progress_apply(lambda x: 1 if len(set(word).intersection(set(x.split())))!=0 else 0 )
# 1 when the email shares at least one token with `word`, else 0 (empty text gives 0).
df["negative_word"]=df["bag_of_word"].progress_apply(lambda x: 0 if word.isdisjoint(x.split()) else 1)

plt.rcParams["figure.figsize"] = [10, 6]
plt.rcParams["figure.autolayout"] = True
# order=[0, 1] pins both categories, so the two tick labels below always line up
# with the right bar even if one category is missing from the data.
ax = sns.barplot(data = df, x='negative_word',y='complaint', order=[0, 1])
# The title now reports the cut-off actually used (it said "Top 50" while the code sliced the top 20).
ax.set_title(" Top {} Negative word in email".format(top_n), fontsize=16)
ax.set_xticklabels(["no negative word", "negative word exist"])
ax.set_ylabel("complaint rate", fontsize=16)
ax.set_xlabel("")
# No hue / labelled artists exist here, so the former plt.legend() call only produced a warning.
plt.show()

In [None]:
# Negative words ranked in the feedback top-N that are absent from the non-feedback top-N.
top_n=20
word=set(word_feedback[0:top_n]).difference(set(word_no_feedback[0:top_n]))
print()
print(word)
print()
# tempt["negative_word"]=tempt["bag_of_word"].progress_apply(lambda x: 1 if len(set(word).intersection(set(x.split())))!=0 else 0 )
# 1 when the email shares at least one token with `word`, else 0 (empty text gives 0).
# This overwrites any "negative_word" column already present on df.
df["negative_word"]=df["bag_of_word"].progress_apply(lambda x: 0 if word.isdisjoint(x.split()) else 1)

plt.rcParams["figure.figsize"] = [10, 6]
plt.rcParams["figure.autolayout"] = True
# NOTE(review): the keywords come from the feedback split, yet the bar height is still the
# mean of 'complaint' and the axis says "complaint rate" -- confirm whether y='feedback'
# (and a "feedback rate" label) was intended.
# order=[0, 1] pins both categories, so the two tick labels below always line up
# with the right bar even if one category is missing from the data.
ax = sns.barplot(data = df, x='negative_word',y='complaint', order=[0, 1])
# The title now reports the cut-off actually used (it said "Top 50" while the code sliced the top 20).
ax.set_title(" Top {} Negative word in email".format(top_n), fontsize=16)
ax.set_xticklabels(["no negative word", "negative word exist"])
ax.set_ylabel("complaint rate", fontsize=16)
ax.set_xlabel("")
# No hue / labelled artists exist here, so the former plt.legend() call only produced a warning.
plt.show()

In [None]:
def label_distribution(df):
    """Summarise ``is_complaint`` values within each ``is_feedback`` group.

    Returns one row per (is_feedback, is_complaint) pair with
    * ``count``      -- number of rows for that pair;
    * ``percentage`` -- that count as a fraction of its is_feedback group.
    Missing is_complaint values are kept as their own category (dropna=False).
    """
    pair_keys = ['is_feedback', "is_complaint"]
    complaint_by_feedback = df.groupby('is_feedback')['is_complaint']

    pair_counts = complaint_by_feedback.value_counts(dropna=False).reset_index(name="count")
    pair_shares = (
        complaint_by_feedback
        .value_counts(dropna=False, normalize=True)
        .reset_index(name="percentage")
    )

    combined = pd.merge(pair_counts, pair_shares, on=pair_keys, how="inner")
    return combined.loc[:, pair_keys + ['count', 'percentage']]

def style_format(df, caption="is_feedback distribution"):
    """Return a Styler for ``df`` with formatted numbers and a red caption.

    ``count`` is rendered with thousands separators and ``percentage`` as a
    percentage with two decimals.

    Parameters
    ----------
    df : DataFrame to render.
    caption : table caption; defaults to the text this helper has always used,
        so existing ``style_format(df)`` calls are unaffected. Passing a caption
        avoids redefining the function just to change the title.
    """
    caption_style = {'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}
    return (
        df.style
        .format({'count':'{:,}','percentage':'{:.2%}'})
        .set_caption(caption)
        .set_table_styles([caption_style])
    )

# Count / share table of is_complaint within each is_feedback group,
# rendered as the cell output through the styling helper.
dist_df = df.pipe(label_distribution)
dist_df.pipe(style_format)

In [None]:
# The 50 highest-ranked entries of word_train_complaint, as a set.
# NOTE(review): word_train_complaint is not assigned by any live line in the visible
# notebook (only word_complaint is) -- confirm where it is supposed to come from.
word = set(word_train_complaint[:50])
# Blank line, the set, blank line.
print("", word, "", sep="\n")

In [None]:
# Words in the complaint top 50 that do not appear in the non-complaint top 50.
# NOTE(review): word_train_complaint, word_train_no_ccomplaint, df_train, df_val and
# df_test are not assigned by any live line in the visible notebook (the df_* splits
# only appear in commented-out code) -- confirm they exist before running this cell.
word = set(word_train_complaint[0:50]).difference(set(word_train_no_ccomplaint[0:50]))
print("", word, "", sep="\n")


def _has_keyword(text):
    """1 if any whitespace-separated token of ``text`` is in ``word``, else 0."""
    tokens = text.split()
    return 1 if (len(tokens) > 0 and any(token in word for token in tokens)) else 0


def _has_keyword_without_feedback(row):
    """1 if the row's bag_of_word hits ``word`` and its is_feedback equals "N", else 0."""
    return 1 if (_has_keyword(row["bag_of_word"]) and row['is_feedback'] == "N") else 0


# Flag 1: keyword present in the email.
for split_frame in (df_train, df_val, df_test):
    split_frame["negative_word"] = split_frame["bag_of_word"].progress_apply(_has_keyword)

# Flag 2: keyword present and the email is not marked as feedback.
for split_frame in (df_train, df_val, df_test):
    split_frame["negative_word_no_feedback"] = split_frame.progress_apply(
        _has_keyword_without_feedback, axis=1
    )


In [None]:
# Tag a copy of each split with a readable data_type label and stack them.
tempt1 = df_train.assign(data_type="training_set")
tempt2 = df_val.assign(data_type="validation_set")
tempt3 = df_test.assign(data_type="test_set")
tempt = pd.concat([tempt1, tempt2, tempt3], axis=0)

plt.rcParams.update({"figure.figsize": [10, 6], "figure.autolayout": True})

# Mean of 'target' per negative_word flag, one bar colour per split.
ax = sns.barplot(data=tempt, x='negative_word', y='target', hue="data_type")
ax.set_xlabel("")
ax.set_ylabel("complaint rate", fontsize=16)
ax.set_xticklabels(["no negative word", "negative word exist"])
ax.set_title(" Negative word (top 50) in email", fontsize=16)
plt.legend(loc="best")

In [None]:
plt.rcParams.update({"figure.figsize": [10, 6], "figure.autolayout": True})

# Same chart as for 'negative_word', but using the flag that additionally
# requires is_feedback == "N".
ax = sns.barplot(data=tempt, x='negative_word_no_feedback', y='target', hue="data_type")
ax.set_xlabel("")
ax.set_ylabel("complaint rate", fontsize=16)
ax.set_xticklabels(["no negative word", "negative word exist"])
ax.set_title(" Negative word (top 50) in email\n {is_feedback=No}", fontsize=16)
plt.legend(loc="best")

In [None]:
def negative_word_dist(df,data_type,col_name="negative_word"):
    """Count and share of every value of ``col_name``, tagged with ``data_type``.

    Parameters
    ----------
    df : DataFrame holding ``col_name``.
    data_type : label written into the ``data_type`` column of every output row.
    col_name : column to summarise; missing values are kept as their own category.

    Returns
    -------
    DataFrame with columns ``data_type``, ``col_name``, ``count`` and
    ``percentage`` (count divided by the total number of rows), ordered from
    the most to the least frequent value.
    """
    counts=df[col_name].value_counts(dropna=False)
    # Name the index and the values explicitly. Relying on reset_index() producing an
    # 'index' column breaks on pandas >= 2.0, where value_counts already names its
    # index after the column and its values 'count' / 'proportion'.
    summary=counts.rename_axis(col_name).reset_index(name="count")
    # Same quantity value_counts(normalize=True) returns: each count over the total.
    summary["percentage"]=summary["count"]/summary["count"].sum()
    summary["data_type"]=data_type
    return summary.loc[:,["data_type",col_name,'count','percentage']]

def style_format(df, caption="negative word in email"):
    """Return a Styler for ``df`` with formatted numbers and a red caption.

    ``count`` is rendered with thousands separators and ``percentage`` as a
    percentage with two decimals.

    Parameters
    ----------
    df : DataFrame to render.
    caption : table caption; defaults to the text this definition has always
        used, so existing ``style_format(df)`` calls are unaffected. Passing a
        caption avoids redefining the function just to change the title.
    """
    caption_style = {'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}
    return (
        df.style
        .format({'count':'{:,}','percentage':'{:.2%}'})
        .set_caption(caption)
        .set_table_styles([caption_style])
    )

In [None]:
# Build the per-split summaries and stack them with a single concat instead of
# growing a throwaway empty DataFrame one piece at a time.
# NOTE(review): df_train, df_val and df_test are not assigned by any live line in the
# visible notebook -- confirm they exist before running this cell.
dist_df=pd.concat([
    negative_word_dist(df_train,"train","negative_word"),
    negative_word_dist(df_val,"validation","negative_word"),
    negative_word_dist(df_test,"test","negative_word"),
])
# Keep only the "keyword present" rows; the flag column is then constant, so drop it.
dist_df=dist_df[dist_df.negative_word==1].drop(["negative_word"],axis=1)
style_format(dist_df)

In [None]:
# Build the per-split summaries and stack them with a single concat instead of
# growing a throwaway empty DataFrame one piece at a time.
# NOTE(review): df_train, df_val and df_test are not assigned by any live line in the
# visible notebook -- confirm they exist before running this cell.
dist_df=pd.concat([
    negative_word_dist(df_train,"train","negative_word_no_feedback"),
    negative_word_dist(df_val,"validation","negative_word_no_feedback"),
    negative_word_dist(df_test,"test","negative_word_no_feedback"),
])
# Keep only the "keyword present and not feedback" rows; the flag column is then constant, so drop it.
dist_df=dist_df[dist_df.negative_word_no_feedback==1].drop(["negative_word_no_feedback"],axis=1)
def style_format(df, caption="negative word in email\nis_feedback=No"):
    """Return a Styler for ``df`` with formatted numbers and a red caption.

    ``count`` is rendered with thousands separators and ``percentage`` as a
    percentage with two decimals. ``caption`` defaults to the text this
    definition has always used, so ``style_format(df)`` behaves as before.
    This definition replaces any ``style_format`` defined by an earlier cell.
    """
    caption_style = {'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}
    return (
        df.style
        .format({'count':'{:,}','percentage':'{:.2%}'})
        .set_caption(caption)
        .set_table_styles([caption_style])
    )
style_format(dist_df)