In [None]:
import sys
# Make the project's Fine-Tuning directory importable from this notebook.
sys.path.append("/opt/omniai/work/instance1/jupyter/v5_new_email/Fine-Tuning")
# Drop duplicate entries (e.g. after re-running this cell) while keeping the
# original search order: a set() round-trip would shuffle sys.path and could
# change which copy of a module gets imported.
sys.path=list(dict.fromkeys(sys.path))

In [None]:
import argparse
import pandas as pd
import numpy as np
from numpy import savez_compressed, load
import itertools
import re
import time
import os
import pickle

import datasets
from datasets import load_dataset, load_metric, Dataset, concatenate_datasets,DatasetDict
from datasets import load_from_disk
from tqdm import tqdm
tqdm.pandas(position=0,leave=True)

from fuzzywuzzy import fuzz

import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
# Load the English stopword list from the locally stored NLTK data directory
# and register that directory with NLTK's resource search path.
nltk_data_dir=os.path.join("/opt/omniai/work/instance1/jupyter/", "transformers-models","nltk_data")
# Context manager so the file handle is closed once the lines are read.
with open(nltk_data_dir + '/corpora/stopwords/english') as stopwords_file:
    # readlines() keeps the trailing newline on every entry.
    # NOTE(review): strip the entries before using them for membership tests.
    stopwords_list = stopwords_file.readlines()
nltk.data.path.append(nltk_data_dir)

import spacy
# Location of the locally stored spaCy pipeline (en_core_web_md 3.3.0).
# NOTE: `model_name` is reassigned further down to hold classifier names.
model_name=os.path.join(
    "/opt/omniai/work/instance1/jupyter/",
    "transformers-models",
    "en_core_web_md",
    "en_core_web_md-3.3.0",
)
nlp = spacy.load(model_name)

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from IPython.display import display, HTML

# Global plotting defaults: seaborn whitegrid theme with the muted palette and
# slightly enlarged fonts, and a 16x10 default figure size.
sns.set(style="whitegrid",palette='muted',font_scale=1.2)
rcParams['figure.figsize']=16,10

# IPython magics: render figures inline in the notebook, in "retina" format.
%config InlineBackend.figure_format="retina"
%matplotlib inline

# Project-local helper module; `utils.model_evaluate` is used by metrics_read below.
# NOTE(review): presumably resolved through the sys.path entry added in the first cell — confirm.
import utils

# Show every column and every row when a DataFrame is displayed (None = no limit).
pd.set_option('display.max_columns', None,'display.max_rows',None)

In [None]:
# Load the combined train/validation/test frame and derive calendar columns.
root_dir="/opt/omniai/work/instance1/jupyter/v5_new_email/datasets"
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
df1=pd.read_pickle(os.path.join(root_dir,"train_val_test_pickle"))

df1['time'] = pd.to_datetime(df1['time'])
# Vectorised datetime accessors instead of a per-row Python lambda.
df1['year'] = df1['time'].dt.year
df1['month'] = df1['time'].dt.month
df1['day'] = df1['time'].dt.day
# Chronological order; reassigned rather than sorted in place.
df1 = df1.sort_values(by='time')

In [None]:
def label_distribution(df,year,month):
    """Summarise the ``is_complaint`` label counts and shares for one month.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``year``, ``month`` and ``is_complaint``.
    year, month
        Values compared with ``==`` against the ``year`` / ``month`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``is_complaint`` value (missing values included),
        ordered by descending count, with the columns ``year``, ``month``,
        ``is_complaint``, ``count`` and ``percentage`` (a 0-1 fraction of the
        selected rows).
    """
    subset=df[(df.year==year) & (df.month==month)]
    counts=subset["is_complaint"].value_counts(dropna=False)
    # The frame is assembled directly from the counts instead of
    # reset_index()+rename(): the column names produced by reset_index() on a
    # value_counts() result differ between pandas 1.x and 2.x, and the old
    # rename mapping produced two "count" columns on pandas 2.x.
    summary=pd.DataFrame({
        'year':year,
        'month':month,
        'is_complaint':counts.index,
        'count':counts.to_numpy(),
        'percentage':(counts/counts.sum()).to_numpy(),
    })
    return summary

def style_format(df,  data_type="Training set"):
    return df.style.format({'count':'{:,}','percentage':'{:.2%}'})\
           .set_caption(f"label distribution\n{data_type}")\
           .set_table_styles([{'selector': 'caption','props': [('color', 'red'),('font-size', '15px')]}])

In [None]:
# Label distribution of May, June and July 2023 stacked into one table.
# A single concat over the per-month frames replaces growing an empty
# DataFrame step by step. Row labels are kept as produced per month, as before.
dist_df=pd.concat([label_distribution(df1,2023,month) for month in (5,6,7)])
style_format(dist_df,  data_type="split by month")

In [None]:
def metrics_read(df, model_name):
    """Evaluate one predictions table and return its metrics as a one-row frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``True_label``, ``Predicted_prob`` and ``best_threshold``.
        The first distinct ``best_threshold`` value is used as the threshold.
    model_name
        Written (formatted as a string) to the ``model_type`` column.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``model_type``, ``total complaint #``,
        ``false_positive``, ``false_negative``, ``precision``, ``recall``,
        ``f1_score``, ``roc_auc`` and ``pr_auc``, taken from the mapping
        returned by ``utils.model_evaluate``.
    """
    true_y=df["True_label"].values
    pred_prob=df["Predicted_prob"].values
    best_threshold=df['best_threshold'].unique()[0]

    # The evaluation works from probabilities plus the threshold; the
    # "Predicted_label" column is not needed and is no longer read.
    test_output=utils.model_evaluate(true_y.reshape(-1),pred_prob,best_threshold)

    # NOTE(review): the result keys mix "false positive" (space) with
    # "false_negative" (underscore); kept as-is — confirm against utils.model_evaluate.
    return pd.DataFrame({
        "model_type":[f"{model_name}"],
        "total complaint #":[test_output["total positive"]],
        "false_positive":[test_output["false positive"]],
        "false_negative":[test_output["false_negative"]],
        "precision":[test_output["precision"]],
        "recall":[test_output["recall"]],
        "f1_score":[test_output["f1_score"]],
        "roc_auc":[test_output["AUC"]],
        "pr_auc":[test_output["pr_auc"]],
    })

def metrics_df(output_dir, model_name):
    """Collect the metrics of every ``.csv`` predictions file in a directory.

    Each file is read with ``pd.read_csv`` and evaluated by ``metrics_read``.
    The one-row results are stacked in sorted file-name order, and rows that
    repeat an already seen ``recall`` value are dropped (first one kept).

    Parameters
    ----------
    output_dir : str
        Directory holding the predictions files.
    model_name
        Passed through to ``metrics_read``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct recall value.

    Raises
    ------
    FileNotFoundError
        If ``output_dir`` contains no ``.csv`` file.
    """
    # NOTE(review): sorting is lexicographic, so "predictions_100.csv" comes
    # before "predictions_97.csv"; that order decides which duplicate-recall
    # row survives below.
    data_name=sorted(x for x in os.listdir(output_dir) if x.endswith(".csv"))
    if not data_name:
        raise FileNotFoundError(f"no .csv prediction files found in {output_dir!r}")

    # Evaluate every file first and concatenate once, rather than
    # re-concatenating the growing frame on each iteration.
    per_file=[
        metrics_read(pd.read_csv(os.path.join(output_dir , name)),model_name)
        for name in data_name
    ]
    metrics=pd.concat(per_file,axis=0,ignore_index=True)

    return metrics.drop_duplicates(subset=["recall"], keep="first")

In [None]:
# High-recall operating points of the three TF-IDF models on the 05_23 test set.
test_date="05_23"
number_feature=990

# One list of precision values and one of recall values per model, in loop order.
precision=[]
recall=[]

for model_name in ("randomforest", "xgboost", "lightgbm"):
    output_dir=f"/opt/omniai/work/instance1/jupyter/v5_new_email/TFIDF/production/tfidf_model/{test_date}/{number_feature}/{model_name}/"
    df=metrics_df(output_dir, model_name)
    # Keep only the rows whose recall exceeds 0.9.
    df=df[df.recall>0.9]
    precision.append(df["precision"].tolist())
    recall.append(df["recall"].tolist())


In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

# Define precision and recall values for each model
# models = ['TFIDF_lightgbm','Roberta-Large','Deberta-v3','longformer-base','longformer-large', 'bigbird']
# markers = ['o', 's', 'x', '*', '<', 'p']
# colors = ['blue', 'green','purple','red','black','brown']

models = ['TFIDF_randomforest','TFIDF_xgboost','TFIDF_lightgbm']
markers = ['o', 's', 'x']
colors = ['blue', 'green','red']

# Precision against recall, one dotted line per model.
# `recall` / `precision` must hold one list per model, in the order of `models`.
plt.figure(figsize=(10, 8))

for i, model in enumerate(models):
    plt.plot(recall[i], precision[i], marker=markers[i],  color=colors[i], label=model, linewidth=3, linestyle=":", markersize=8)

plt.xlabel('Recall', fontsize=14)
plt.ylabel('Precision', fontsize=14)
plt.title(f'Precision-Recall Curve \n(test_set={test_date})', fontsize=16)
plt.grid(True)

# Format axis values as percentages: x ticks every 1%, y ticks every 0.1%.
ax = plt.gca()
ax.xaxis.set_major_locator(mtick.MultipleLocator(base=0.01))
ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
ax.yaxis.set_major_locator(mtick.MultipleLocator(base=0.001))
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=2))

plt.ylim(0.006,0.013)
plt.xlim(0.90,1.001)

# Horizontal benchmark line.
# NOTE(review): the value is hard-coded here, while the 06_23 / 07_23 cells
# derive it from the test data — confirm 0.0074 matches the 05_23 test set.
benchmark_precision=0.0074
plt.axhline(y=benchmark_precision, color=(0.8,0.7,0.5),linestyle='--', linewidth=3, label="Human Review")

# Legend entries come from each artist's own label, so they cannot drift out
# of step with the plotting order. The earlier plt.legend(loc='upper left')
# call was immediately replaced by this one and has been removed.
plt.legend(bbox_to_anchor=(1.35,0.5), fontsize=14)

# Show the plot
plt.show()

In [None]:
# Confusion matrix of the LightGBM TF-IDF model on the 05_23 test set,
# built from the labels stored in predictions_97.csv.
model_name="lightgbm"
test_date="05_23"
# Set explicitly, as the sibling cells do, so this cell does not depend on a
# value left behind by an earlier cell.
number_feature=990
output_dir=f"/opt/omniai/work/instance1/jupyter/v5_new_email/TFIDF/production/tfidf_model/{test_date}/{number_feature}/{model_name}/"
df=pd.read_csv(os.path.join(output_dir, "predictions_97.csv"))
tf_true, tf_pred=df["True_label"].tolist(), df["Predicted_label"].tolist()

from sklearn import metrics
confusion_matrix = metrics.confusion_matrix(tf_true, tf_pred)

# NOTE(review): the [False, True] labels assume exactly two classes — confirm.
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])

# ',' adds thousands separators to the cell counts.
cm_display.plot(values_format=',')
plt.title(f"confusion matrix for TFIDF_{model_name}")
plt.grid(False)
plt.show()

In [None]:
# Scratch check: high-recall precision/recall values of the random forest on 05_23.
# NOTE: this resets `precision` / `recall` to hold a single model's lists.
precision, recall = [], []

test_date, number_feature, model_name = "05_23", 990, "randomforest"
output_dir=f"/opt/omniai/work/instance1/jupyter/v5_new_email/TFIDF/production/tfidf_model/{test_date}/{number_feature}/{model_name}/"
df=metrics_df(output_dir, model_name)
df=df[df.recall>0.9]
precision.append(df["precision"].tolist())
recall.append(df["recall"].tolist())

# Disabled: the same extraction for xgboost.
# test_date="05_23"
# number_feature=990
# model_name="xgboost"
# output_dir=f"/opt/omniai/work/instance1/jupyter/v5_new_email/TFIDF/production/tfidf_model/{test_date}/{number_feature}/{model_name}/"
# df=metrics_df(output_dir, "xgboost")
# df=df[df.recall>0.9]
# precision.append(df["precision"].tolist())
# recall.append(df["recall"].tolist())

recall,precision

In [None]:
# Full (unfiltered) metrics table of the random forest on the 05_23 test set.
test_date, number_feature, model_name = "05_23", 990, "randomforest"
output_dir=f"/opt/omniai/work/instance1/jupyter/v5_new_email/TFIDF/production/tfidf_model/{test_date}/{number_feature}/{model_name}/"
df=metrics_df(output_dir, model_name)
df

In [None]:
# High-recall operating points of the three TF-IDF models on the 06_23 test set.
test_date="06_23"
number_feature=990

# One list of precision values and one of recall values per model, in loop order.
precision=[]
recall=[]

for model_name in ("randomforest", "xgboost", "lightgbm"):
    output_dir=f"/opt/omniai/work/instance1/jupyter/v5_new_email/TFIDF/production/tfidf_model/{test_date}/{number_feature}/{model_name}/"
    df=metrics_df(output_dir, model_name)
    # Keep only the rows whose recall exceeds 0.9.
    df=df[df.recall>0.9]
    precision.append(df["precision"].tolist())
    recall.append(df["recall"].tolist())


models = ['TFIDF_randomforest','TFIDF_xgboost','TFIDF_lightgbm']
markers = ['o', 's', 'x']
colors = ['blue', 'green','red']

# Precision against recall, one dotted line per model.
# `recall` / `precision` must hold one list per model, in the order of `models`.
plt.figure(figsize=(10, 8))

for i, model in enumerate(models):
    plt.plot(recall[i], precision[i], marker=markers[i],  color=colors[i], label=model, linewidth=3, linestyle=":", markersize=8)

plt.xlabel('Recall', fontsize=14)
plt.ylabel('Precision', fontsize=14)
plt.title(f'Precision-Recall Curve \n(test_set={test_date})', fontsize=16)
plt.grid(True)

# Format axis values as percentages: x ticks every 1%, y ticks every 0.1%.
ax = plt.gca()
ax.xaxis.set_major_locator(mtick.MultipleLocator(base=0.01))
ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
ax.yaxis.set_major_locator(mtick.MultipleLocator(base=0.001))
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=2))

plt.ylim(0.004,0.011)
plt.xlim(0.90,1.001)

# Horizontal benchmark line: share of the rarer target class in the test set.
data_name="test_data_"+str(number_feature)
input_data=f"/opt/omniai/work/instance1/jupyter/v5_new_email/TFIDF/production/tfidf_data/{test_date}/{data_name}"
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
tst=pd.read_pickle(input_data)
# value_counts() sorts by descending frequency, so `b` is the count of the
# rarer class. The unpacking needs exactly two distinct target values.
# NOTE(review): this treats the rarer class as the positive label — confirm.
a, b=tst["target_variable"].value_counts().tolist()
benchmark_precision=b/(a+b)
print("Precision of human review : {:.2%}".format(benchmark_precision))
plt.axhline(y=benchmark_precision, color=(0.8,0.7,0.5),linestyle='--', linewidth=3, label="Human Review")

# Legend entries come from each artist's own label. This also fixes the
# misspelt "Huamn Review" entry and removes the earlier
# plt.legend(loc='upper left') call that was immediately replaced.
plt.legend(bbox_to_anchor=(1.35,0.5), fontsize=14)

# Show the plot
plt.show()

In [None]:
# Confusion matrix of the LightGBM TF-IDF model on the 06_23 test set,
# built from the labels stored in predictions_97.csv.
model_name="lightgbm"
test_date="06_23"
# Set explicitly, as the sibling cells do, so this cell does not depend on a
# value left behind by an earlier cell.
number_feature=990
output_dir=f"/opt/omniai/work/instance1/jupyter/v5_new_email/TFIDF/production/tfidf_model/{test_date}/{number_feature}/{model_name}/"
df=pd.read_csv(os.path.join(output_dir, "predictions_97.csv"))
tf_true, tf_pred=df["True_label"].tolist(), df["Predicted_label"].tolist()

from sklearn import metrics
confusion_matrix = metrics.confusion_matrix(tf_true, tf_pred)

# NOTE(review): the [False, True] labels assume exactly two classes — confirm.
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])

# ',' adds thousands separators to the cell counts.
cm_display.plot(values_format=',')
plt.title(f"confusion matrix for TFIDF_{model_name}")
plt.grid(False)
plt.show()

In [None]:
# High-recall operating points of the three TF-IDF models on the 07_23 test set.
test_date="07_23"
number_feature=990

# One list of precision values and one of recall values per model, in loop order.
precision=[]
recall=[]

for model_name in ("randomforest", "xgboost", "lightgbm"):
    output_dir=f"/opt/omniai/work/instance1/jupyter/v5_new_email/TFIDF/production/tfidf_model/{test_date}/{number_feature}/{model_name}/"
    df=metrics_df(output_dir, model_name)
    # Keep only the rows whose recall exceeds 0.9.
    df=df[df.recall>0.9]
    precision.append(df["precision"].tolist())
    recall.append(df["recall"].tolist())


models = ['TFIDF_randomforest','TFIDF_xgboost','TFIDF_lightgbm']
markers = ['o', 's', 'x']
colors = ['blue', 'green','red']

# Precision against recall, one dotted line per model.
# `recall` / `precision` must hold one list per model, in the order of `models`.
plt.figure(figsize=(10, 8))

for i, model in enumerate(models):
    plt.plot(recall[i], precision[i], marker=markers[i],  color=colors[i], label=model, linewidth=3, linestyle=":", markersize=8)

plt.xlabel('Recall', fontsize=14)
plt.ylabel('Precision', fontsize=14)
plt.title(f'Precision-Recall Curve \n(test_set={test_date})', fontsize=16)
plt.grid(True)

# Format axis values as percentages: x ticks every 1%, y ticks every 0.1%.
ax = plt.gca()
ax.xaxis.set_major_locator(mtick.MultipleLocator(base=0.01))
ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
ax.yaxis.set_major_locator(mtick.MultipleLocator(base=0.001))
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=2))

plt.ylim(0.003,0.007)
plt.xlim(0.90,1.001)

# Horizontal benchmark line: share of the rarer target class in the test set.
data_name="test_data_"+str(number_feature)
input_data=f"/opt/omniai/work/instance1/jupyter/v5_new_email/TFIDF/production/tfidf_data/{test_date}/{data_name}"
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
tst=pd.read_pickle(input_data)
# value_counts() sorts by descending frequency, so `b` is the count of the
# rarer class. The unpacking needs exactly two distinct target values.
# NOTE(review): this treats the rarer class as the positive label — confirm.
a, b=tst["target_variable"].value_counts().tolist()
benchmark_precision=b/(a+b)
print("Precision of human review : {:.2%}".format(benchmark_precision))
# Labelled "Human Review" to agree with the printed description above and
# with the 05_23 / 06_23 plots; it was previously labelled "Lexican Search".
plt.axhline(y=benchmark_precision, color=(0.8,0.7,0.5),linestyle='--', linewidth=3, label="Human Review")

# Legend entries come from each artist's own label. The earlier
# plt.legend(loc='upper left') call was immediately replaced and is removed.
plt.legend(bbox_to_anchor=(1.35,0.5), fontsize=14)

# Show the plot
plt.show()

In [None]:
# Confusion matrix of the LightGBM TF-IDF model on the 07_23 test set,
# built from the labels stored in predictions_97.csv.
model_name="lightgbm"
test_date="07_23"
# Set explicitly, as the sibling cells do, so this cell does not depend on a
# value left behind by an earlier cell.
number_feature=990
output_dir=f"/opt/omniai/work/instance1/jupyter/v5_new_email/TFIDF/production/tfidf_model/{test_date}/{number_feature}/{model_name}/"
df=pd.read_csv(os.path.join(output_dir, "predictions_97.csv"))
tf_true, tf_pred=df["True_label"].tolist(), df["Predicted_label"].tolist()

from sklearn import metrics
confusion_matrix = metrics.confusion_matrix(tf_true, tf_pred)

# NOTE(review): the [False, True] labels assume exactly two classes — confirm.
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])

# ',' adds thousands separators to the cell counts.
cm_display.plot(values_format=',')
plt.title(f"confusion matrix for TFIDF_{model_name}")
plt.grid(False)
plt.show()

In [None]:
# Load the TF-IDF schema table and preview its first rows.
import os
# NOTE: os.chdir changes the working directory of the whole kernel process,
# so every relative path used after this cell resolves against this directory.
os.chdir(r"/opt/omniai/work/instance1/jupyter/v5_new_email/TFIDF/production")
# Relative path: resolved against the directory set just above.
# NOTE: this reuses the name `df`, replacing the predictions frame loaded earlier.
df=pd.read_csv("TFIDF_schemas.csv")
df.head()

In [None]:
# First 100 entries of the schema table's "name" column, followed by a blank line.
print(df["name"].values[:100].tolist(),"\n")