In [None]:
import os
import pandas as pd
import azureml.core
import numpy as np
import plotly.graph_objects as go
from IPython.core.display import HTML
from utils import *

from azureml.core import Workspace, Environment, Experiment, Datastore, Dataset, ScriptRunConfig
from azureml.train.automl.run import AutoMLRun
from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference
import numpy as np
import pandas as pd
import argparse
import os
import re
import time
import glob
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn import preprocessing
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification, AutoTokenizer
from transformers import EarlyStoppingCallback
from transformers.integrations import AzureMLCallback
from transformers import AutoTokenizer, DataCollatorWithPadding
# from datasets import Dataset, DatasetDict

# Report which Azure ML core SDK version this kernel is running
print(f"SDK version: {azureml.core.VERSION}")


In [None]:
# ``sys`` is not imported by any earlier cell, so import it here; otherwise this
# cell fails with NameError on a fresh kernel.
import sys

# Make the sibling ``../project`` directory importable (presumably where the
# ``train_transformer`` module imported in the next cell lives).
project_dir = os.path.join(os.getcwd(), "..", "project")
if project_dir not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(project_dir)

In [None]:
from train_transformer import get_model, adjust_tokenizer, compute_metrics, get_encode_labels, tokenize_function, generate_tokenized_dataset, get_datasets, test_model

In [None]:
# Connect to the Azure ML workspace using the SDK's config-file lookup.
# (``Workspace`` is already imported in the first cell; this import repeats it.)
from azureml.core import Workspace
ws = Workspace.from_config()


In [None]:
# Look up the registered dataset for each split by name and pinned version.
# NOTE(review): train/val/test are pinned to version 8 while the temporal test
# set is pinned to version 3 — confirm this difference is intentional.
ds_X_train = Dataset.get_by_name(ws, name="owner_g_classfication_train", version=8)
ds_X_val = Dataset.get_by_name(ws, name="owner_g_classfication_val", version=8)
ds_X_test = Dataset.get_by_name(ws, name="owner_g_classfication_test", version=8)
ds_temporal_test = Dataset.get_by_name(ws, name="owner_g_classfication_temporal_test", version=3)

In [None]:
# Print the tags and resolved version of every registered dataset, one per line
for registered_ds in (ds_X_train, ds_X_val, ds_X_test, ds_temporal_test):
    print(f'{registered_ds.tags}: V{registered_ds.version}')

In [None]:
# Materialize each registered dataset as an in-memory pandas DataFrame.
pdf_X_train = ds_X_train.to_pandas_dataframe()
pdf_X_val = ds_X_val.to_pandas_dataframe()
pdf_X_test = ds_X_test.to_pandas_dataframe()
pdf_temporal_test = ds_temporal_test.to_pandas_dataframe()

In [None]:
# Print the (rows, columns) shape of every split's DataFrame
for frame_name, frame in (
    ('pdf_X_train', pdf_X_train),
    ('pdf_X_val', pdf_X_val),
    ('pdf_X_test', pdf_X_test),
    ('pdf_temporal_test', pdf_temporal_test),
):
    print(f'{frame_name} shape: {frame.shape}')

In [None]:
# Column names handed to generate_tokenized_dataset further down
target_name = "target"          # column holding the class to predict
text_field_name = "txt_field"   # column holding the input text

# Hugging Face checkpoint id; not referenced by the visible cells below, which
# load the model from ``model_directory`` instead.
base_checkpoint = "bert-base-uncased"

In [None]:
# Load the sequence-classification model and its tokenizer from the local
# directory ``model_directory``.
model_directory = 'model_output/model'
# NOTE(review): num_labels is hard-coded to 51 — presumably the number of classes
# in the label encoder stored in the same directory; confirm the two agree.
model = AutoModelForSequenceClassification.from_pretrained(model_directory, num_labels=51)
tokenizer = AutoTokenizer.from_pretrained(model_directory)


In [None]:
# Restore the object saved as ``labelEncoder.joblib`` next to the model
# (presumably the fitted label encoder for the target column — verify).
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
le = joblib.load(f'{model_directory}/labelEncoder.joblib')
le

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

model.to(device)    # move the model onto the selected device
model.eval()        # switch to evaluation mode
model.zero_grad()   # clear any gradients held by the model
print(device)

In [None]:
fields = [text_field_name, target_name, 'labels']


def _tokenize_split(split_frame):
    """Build the (dataset, tokenized dataset) pair for one split's DataFrame.

    Thin wrapper that forwards the settings shared by every split (``fields``,
    ``le``, ``target_name``, ``text_field_name``, ``tokenizer``) to the project
    helper ``generate_tokenized_dataset`` and returns its result unchanged.
    """
    return generate_tokenized_dataset(split_frame, fields, le, target_name, text_field_name, tokenizer)


train_ds, tokenized_train_ds = _tokenize_split(pdf_X_train)
validation_ds, tokenized_validation_ds = _tokenize_split(pdf_X_val)
test_ds, tokenized_test_ds = _tokenize_split(pdf_X_test)
temporal_test_ds, tokenized_temporal_test_ds = _tokenize_split(pdf_temporal_test)

In [None]:
# Create the ``custom_model`` directory; exist_ok=True makes this a no-op when it
# already exists. (No visible cell below writes into it.)
os.makedirs('custom_model', exist_ok=True)

In [None]:
from transformers import Trainer

# Wrap the loaded model in a Trainer so it can be used for prediction below.
# No TrainingArguments are passed, so the library defaults apply.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the Trainer's prediction loop over the tokenized test split.
# The result is kept in ``a`` because later cells read it.
a = trainer.predict(tokenized_test_ds)


In [None]:
# Metrics attached to the prediction output above
a.metrics

In [None]:
# Obtain the current Azure ML run context. allow_offline=True is the SDK option
# for calling this outside a submitted run (e.g. interactively in a notebook).
# ``run`` is not referenced by the visible cells below.
from azureml.core import Run

run = Run.get_context(allow_offline=True)

In [None]:
# Evaluate on the tokenized test split through the project helper ``test_model``.
# The third argument looks like a label for the result set — TODO confirm against
# train_transformer.test_model.
test_result = test_model(trainer, tokenized_test_ds, 'test_set')

In [None]:
# Same evaluation on the tokenized temporal test split
temporal_result = test_model(trainer, tokenized_temporal_test_ds, 'temporal_test')

In [None]:
# No ``best_model`` object is created anywhere in this notebook, so predictions
# are derived from the Trainer output ``a`` (computed on tokenized_test_ds above).
# Some models return a tuple of arrays; the logits are then the first element.
test_logits = a.predictions[0] if isinstance(a.predictions, tuple) else a.predictions

# The highest-scoring class id per row is mapped back to the original label.
# Assumes ``le`` is the fitted scikit-learn LabelEncoder used at training time — TODO confirm.
pred = le.inverse_transform(np.argmax(test_logits, axis=-1))

# Assumes generate_tokenized_dataset keeps exactly one row per input row, in the
# same order as pdf_X_test — TODO confirm. The length check catches dropped rows.
assert len(pred) == len(pdf_X_test), "prediction count does not match pdf_X_test rows"
pdf_X_test['pred'] = pred

In [None]:
# Replace the in-memory test frame with the contents of a local CSV.
# NOTE(review): no visible cell writes 'pdf_X_test_v8.csv'; the file must already
# exist for this cell to run. This also discards the ``pred`` column assigned in
# the previous cell in favour of whatever the CSV contains.
pdf_X_test = pd.read_csv('pdf_X_test_v8.csv')
pdf_X_test.head()

In [None]:
# ``get_base_dataframes`` is not defined in the visible cells (presumably it
# comes from ``utils``). It returns two values: the first replaces pdf_X_test,
# the second is kept as ``df_result``.
# NOTE: ``df_result`` is reassigned for the validation split in a later cell, so
# these cells must be run in order.
pdf_X_test, df_result = get_base_dataframes(pdf_X_test)

In [None]:
# Draw the Sankey chart for the test split (helper not defined in the visible cells)
draw_sanky_chart(pdf_X_test)

In [None]:
# Per-target summary for the test split, produced by the external helper
pd_merged_test = calculate_top_OGs(pdf_X_test, df_result)

# Columns to display, ordered from the highest to the lowest percentage_matched
test_summary_columns = [
    'target', 'total_records', 'matched', 'percentage_matched',
    'top_2_groups', 'top_2_count', 'top_2_perc',
]
pd_merged_test[test_summary_columns].sort_values('percentage_matched', ascending=False)

In [None]:
# Replace the in-memory validation frame with the contents of a local CSV.
# NOTE(review): no visible cell writes 'pdf_X_val_v8.csv'; it must already exist.
pdf_X_val = pd.read_csv('pdf_X_val_v8.csv')

In [None]:
# Same external helper as for the test split. This overwrites the ``df_result``
# produced for the test split earlier.
pdf_X_val, df_result = get_base_dataframes(pdf_X_val)

In [None]:
# Draw the Sankey chart for the validation split
draw_sanky_chart(pdf_X_val)

In [None]:
# Per-target summary for the validation split, produced by the external helper
pd_merged_val = calculate_top_OGs(pdf_X_val, df_result)

# Columns to display, ordered from the highest to the lowest percentage_matched
val_summary_columns = [
    'target', 'total_records', 'matched', 'percentage_matched',
    'top_2_groups', 'top_2_count', 'top_2_perc',
]
pd_merged_val[val_summary_columns].sort_values('percentage_matched', ascending=False)

In [None]:
# Ten validation targets with the highest percentage_matched
val_top10_columns = [
    'target', 'total_records', 'matched', 'percentage_matched',
    'top_2_groups', 'top_2_count', 'top_2_perc',
]
val_summary_sorted = pd_merged_val[val_top10_columns].sort_values('percentage_matched', ascending=False)
val_summary_sorted.head(10)

In [None]:
# Replace the in-memory training frame with the contents of a local CSV.
# NOTE(review): no visible cell writes 'pdf_X_train_v8.csv'; it must already exist.
pdf_X_train = pd.read_csv('pdf_X_train_v8.csv')

In [None]:
# Same external helper as for the other splits; the second return value gets its
# own name here (``df_result_train``) instead of reusing ``df_result``.
pdf_X_train, df_result_train = get_base_dataframes(pdf_X_train)

In [None]:
# Per-target summary for the training split
pd_merged_train = calculate_top_OGs(pdf_X_train, df_result_train)

In [None]:
# Draw the Sankey chart for the training split
draw_sanky_chart(pdf_X_train)

In [None]:
# Ten test targets with the highest percentage_matched
test_top10_columns = [
    'target', 'total_records', 'matched', 'percentage_matched',
    'top_2_groups', 'top_2_count', 'top_2_perc',
]
test_summary_sorted = pd_merged_test[test_top10_columns].sort_values('percentage_matched', ascending=False)
test_summary_sorted.head(10)

In [None]:
# Compare the shapes of the validation and test summaries before merging them
pd_merged_val.shape, pd_merged_test.shape

In [None]:
# Left-join the test summary onto the validation summary, one row per validation
# target. Columns present in both frames get the "_evaluation" / "_test" suffixes.
pd_eval_test = pd.merge(
    pd_merged_val,
    pd_merged_test,
    how='left',
    on='target',
    suffixes=("_evaluation", "_test"),
)

# Absolute gap between the test and the validation percentage_matched
pd_eval_test['diff'] = (
    pd_eval_test['percentage_matched_test'] - pd_eval_test['percentage_matched_evaluation']
).abs()

In [None]:
# Validation-vs-test comparison, largest gap first
eval_test_columns = [
    'target', 'diff',
    'percentage_matched_test', 'percentage_matched_evaluation',
    'top_2_groups_evaluation', 'top_2_count_evaluation', 'top_2_perc_evaluation',
    'top_2_groups_test', 'top_2_count_test', 'top_2_perc_test',
]
pd_eval_test[eval_test_columns].sort_values('diff', ascending=False)

In [None]:
pd_train_eval_test = pd.merge(pd_merged_train, pd_eval_test, on='target', how='left')

In [None]:
pd_train_eval_test.columns

In [None]:
pd_train_eval_test['diff_val'] = abs(pd_train_eval_test['percentage_matched'] - pd_train_eval_test['percentage_matched_evaluation'])
pd_train_eval_test['diff_test'] = abs(pd_train_eval_test['percentage_matched'] - pd_train_eval_test['percentage_matched_test'])

In [None]:
# Train / validation / test comparison, highest training percentage_matched first
train_eval_test_columns = [
    'target', 'diff_val', 'diff_test',
    'percentage_matched', 'percentage_matched_test', 'percentage_matched_evaluation',
    'top_2_groups', 'top_2_count', 'top_2_perc',
    'top_2_groups_evaluation', 'top_2_count_evaluation', 'top_2_perc_evaluation',
    'top_2_groups_test', 'top_2_count_test', 'top_2_perc_test',
]
pd_train_eval_test[train_eval_test_columns].sort_values('percentage_matched', ascending=False)


In [None]:
# Replace the in-memory temporal test frame with the contents of a local CSV.
# NOTE(review): no visible cell writes 'pdf_temporal_test_v8.csv'. The file name
# says v8, while the registered temporal dataset was fetched at version 3 —
# confirm which data this file actually holds.
pdf_temporal_test = pd.read_csv('pdf_temporal_test_v8.csv')

In [None]:
pdf_temporal_test.head()

In [None]:
# Same three external helpers as for the other splits: base frames, per-target
# summary, then the Sankey chart.
pdf_temporal_test, df_result_temporal = get_base_dataframes(pdf_temporal_test)
pd_merged_temporal = calculate_top_OGs(pdf_temporal_test, df_result_temporal)
draw_sanky_chart(pdf_temporal_test)

In [None]:
# Per-target summary for the temporal test split, highest percentage_matched first
temporal_summary_columns = [
    'target', 'total_records', 'matched', 'percentage_matched',
    'top_2_groups', 'top_2_count', 'top_2_perc',
]
pd_merged_temporal[temporal_summary_columns].sort_values('percentage_matched', ascending=False)

In [None]:
# ``calculate_performance`` is not defined in the visible cells (presumably it
# comes from ``utils``). Each call passes a split's 'target' and 'pred' columns.
# The 'pred' column is expected to be present in the frames loaded from CSV above.
# Training split
calculate_performance(pdf_X_train['target'], pdf_X_train['pred'])

In [None]:
# Validation split
calculate_performance(pdf_X_val['target'], pdf_X_val['pred'])

In [None]:
# Test split
calculate_performance(pdf_X_test['target'], pdf_X_test['pred'])

In [None]:
# Temporal test split
calculate_performance(pdf_temporal_test['target'], pdf_temporal_test['pred'])

In [None]:
# Accuracy on the temporal test split ('target' as truth, 'pred' as prediction).
# ``accuracy_score`` is already imported in the first cell; this import repeats it.
from sklearn.metrics import accuracy_score

accuracy_score(pdf_temporal_test['target'], pdf_temporal_test['pred'])

In [None]:
pd_temporal_eval_test = pd.merge(pd_merged_temporal, pd_eval_test, on='target', how='left')

In [None]:
pd_temporal_eval_test['diff_val'] = abs(pd_temporal_eval_test['percentage_matched'] - pd_temporal_eval_test['percentage_matched_evaluation'])
pd_temporal_eval_test['diff_test'] = abs(pd_temporal_eval_test['percentage_matched'] - pd_temporal_eval_test['percentage_matched_test'])


In [None]:
# Temporal / validation / test comparison, lowest temporal percentage_matched first
temporal_eval_test_columns = [
    'target', 'diff_val', 'diff_test',
    'percentage_matched', 'percentage_matched_test', 'percentage_matched_evaluation',
    'top_2_groups', 'top_2_count', 'top_2_perc',
    'top_2_groups_evaluation', 'top_2_count_evaluation', 'top_2_perc_evaluation',
    'top_2_groups_test', 'top_2_count_test', 'top_2_perc_test',
]
pd_temporal_eval_test[temporal_eval_test_columns].sort_values('percentage_matched', ascending=True)
