In [None]:
# !pip install --quiet arthurai==3.19.1

In [None]:
from arthurai import ArthurAI, ArthurModel
from arthurai.common.constants import InputType, OutputType, Stage, ValueType, Enrichment
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None,'display.max_columns', None)
from tqdm.auto import tqdm
tqdm.pandas(position=0,leave=True)
import os

In [None]:
# --- Load the labelled e-mail data, derive targets, and split into train / test ---
# NOTE(review): these are absolute, environment-specific paths; consider moving them
# into a configurable constant so the notebook can run outside this workspace.
data_path = os.path.join("/opt/omniai/work/instance1/jupyter/", "v5_new_email", "datasets")
# NOTE(security): unpickling can execute arbitrary code -- only load this file from a
# trusted location.
df = pd.read_pickle(os.path.join(data_path, "train_val_test_pickle"))

# Order rows chronologically; rebinding avoids in-place mutation.
df['time'] = pd.to_datetime(df['time'])
df = df.sort_values(by='time')

# Rows whose year is 2022/2023 AND whose month is Sep-Dec or Jan-Apr are labelled
# "train"; every other row is labelled "test". Vectorized instead of a row-wise apply.
train_years = [2022, 2023]
train_months = [9, 10, 11, 12, 1, 2, 3, 4]
in_train_window = df["year"].isin(train_years) & df["month"].isin(train_months)
df["data_type"] = np.where(in_train_window, "train", "test")

# Target is 1 when the e-mail is flagged as a complaint OR as feedback.
df['target'] = np.where((df['is_complaint'] == "Y") | (df['is_feedback'] == "Y"), 1, 0)

df_train = df[df.data_type == "train"].reset_index(drop=True)
df_test = df[df.data_type == "test"].reset_index(drop=True)

# Overwrite the test target with the ground-truth complaint label only
# (feedback no longer counts as positive here).
df_test['target'] = np.where(df_test['is_complaint'] == "Y", 1, 0)

# Model predictions produced by the fine-tuning run; keep the join keys and the score,
# renaming the score column to `pred_complaint`.
model_dir = os.path.join("/opt/omniai/work/instance1/jupyter/v5_new_email/Fine-Tuning/results/05_23/", "longformer_base_4096_customized")
output_df = pd.read_csv(os.path.join(model_dir, "predictions_95.csv"))
pred_complaint = (
    output_df.loc[:, ["snapshot_id", "thread_id", "Predicted_prob"]]
    .rename(columns={"Predicted_prob": "pred_complaint"})
)

In [None]:
# The predictions file carries the decision threshold in a column; take the first
# distinct value found there and display it.
threshold_values = output_df["best_threshold"].unique()
best_threshold = threshold_values[0]
best_threshold

In [None]:
# Keep only the columns needed for the reference set, then attach the predicted score
# by inner-joining on the (snapshot_id, thread_id) key pair.
reference_columns = ["snapshot_id", "thread_id", "time", "preprocessed_email", "target"]
join_keys = ["snapshot_id", "thread_id"]
reference_data = df_test.loc[:, reference_columns].merge(pred_complaint, on=join_keys, how="inner")

In [None]:
# Preview the first five rows of the merged reference frame.
reference_data.head(n=5)

In [None]:
# Coerce the `time` column to datetime and display the resulting dtype.
reference_data = reference_data.assign(time=pd.to_datetime(reference_data["time"]))
reference_data.dtypes["time"]

In [None]:
# Client handle and model definition used to derive the schema.
# NOTE(review): access_key / url / organization_id look like placeholder values --
# confirm, and supply real credentials from the environment rather than as literals.
arthurai = ArthurAI(
    access_key="abc123",
    url='https://abc.net/',
    organization_id='abc',
)

# Model settings gathered in one mapping; the classifier threshold is the value read
# from the predictions file.
model_settings = dict(
    partner_model_id="complaint-model-schema",
    display_name="CB-CX Complaint Language Model v1",
    input_type=InputType.NLP,
    output_type=OutputType.Multiclass,
    is_batch=False,
    classifier_threshold=best_threshold,
)
arthur_model = arthurai.model(**model_settings)

In [None]:
# Associates the prediction column name with the ground-truth value 1.
pred_to_ground_truth_map = dict(pred_complaint=1)

In [None]:
# Build the model schema from the reference frame. The id and time columns are declared
# as non-input columns, and reference data is not registered at this point.
build_options = dict(
    data=reference_data,
    ground_truth_column="target",
    pred_to_ground_truth_map=pred_to_ground_truth_map,
    non_input_columns=["snapshot_id", "thread_id", "time"],
    set_reference_data=False,
)
arthur_model.build(**build_options)

id_columns = ("snapshot_id", "thread_id")

# arthur_model.get_attribute(name="preprocessed_email").value_type = ValueType.String
# Both identifier columns are typed as strings.
for id_column in id_columns:
    arthur_model.get_attribute(name=id_column).value_type = ValueType.String

# The e-mail text attribute is flagged categorical and unique.
email_attribute = arthur_model.get_attribute(name="preprocessed_email")
email_attribute.set(categorical=True)
email_attribute.set(is_unique=True)

# Identifier columns are flagged categorical as well.
for id_column in id_columns:
    arthur_model.get_attribute(name=id_column).set(categorical=True)

# `time` becomes a non-categorical timestamp with an empty category list.
time_attribute = arthur_model.get_attribute(name="time")
time_attribute.value_type = ValueType.Timestamp
time_attribute.set(categorical=False, categories=[])

In [None]:
# Show the schema review for a visual check.
schema_overview = arthur_model.review()
schema_overview

In [None]:
# arthur_model.to_csv("schemas.csv")

In [None]:
# Persist the schema review as a CSV file next to the notebook.
schema_table = arthur_model.review()
schema_table.to_csv("language_model_schemas.csv")

In [None]:
# Serialize the model definition and print it for inspection.
# NOTE(review): to_json() presumably returns a JSON string rather than a dict -- confirm,
# since a later cell passes this value to json.dump.
json_export=arthur_model.to_json()
print(json_export)

In [None]:
import json

# Write the exported schema to disk as JSON.
# NOTE(review): if json_export is already a JSON string, the file holds a JSON-encoded
# string (double encoding); a later cell reads it back with json.load -- confirm intent.
encoded_schema = json.dumps(json_export)
with open("language_model_schemas.json", 'w') as schema_file:
    schema_file.write(encoded_schema)

In [None]:
# --- Connect to the target Arthur cluster and reload the exported schema ---
cluster_url = "https://arthur-eval.mm.mlesm.prod.aws.jpmchase.net/"
org_id = "407f1be3-3cf6-4fd2-8cad-13c5f1390ec3"

# SECURITY: the access token used to be pasted here as a string literal. It is now read
# from the environment so it never lands in the notebook or in version control.
# The previously committed token must be treated as compromised and revoked/rotated.
token_id = os.environ.get("ARTHUR_API_KEY")
if not token_id:
    raise RuntimeError(
        "Set the ARTHUR_API_KEY environment variable to a valid Arthur access token "
        "before running this cell."
    )
connection = ArthurAI(access_key=token_id, url=cluster_url, organization_id=org_id)

# Reload the schema written earlier; the context manager guarantees the file handle is
# closed (the original left it open).
with open("language_model_schemas.json") as schema_file:
    json_schema = json.load(schema_file)
arthur_model = ArthurModel.from_json(json_schema)

In [None]:
# Assign the partner-facing id under which this model will be saved.
arthur_model.partner_model_id = 'longformer-base-v1'
# Bind the connection's client to the model object.
# NOTE(review): `_client` is a private attribute; direct assignment may break across
# arthurai versions -- confirm there is no public way to attach a client.
arthur_model._client = connection.client

In [None]:
# Register the reference dataset on the model, then save it; the value returned by
# save() is kept as model_id. The order matters: reference data is set before saving.
arthur_model.set_reference_data(data = reference_data)
# (debug aid, left disabled) %pdb
model_id = arthur_model.save()