In [None]:
%matplotlib inline
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import argparse
import re
import time
import glob
import joblib
import sys

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn import preprocessing
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification, AutoTokenizer
from transformers import EarlyStoppingCallback
from transformers.integrations import AzureMLCallback
from transformers import AutoTokenizer, DataCollatorWithPadding

# Put the sibling ``project`` directory (relative to the notebook's working
# directory) on the import path; presumably this is where the local modules
# imported right after this line live -- confirm.
project_dir = os.path.join(os.getcwd(), "..", 'project')
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_dir not in sys.path:
    sys.path.append(project_dir)
from train_transformer import get_model, adjust_tokenizer, compute_metrics, get_encode_labels, tokenize_function, generate_tokenized_dataset, get_datasets, test_model
from utils import *
# from utils import get_valid_runs, get_highest_performing_model, get_dataset

In [None]:
import azureml
import mlflow
from azureml.core import Workspace, Dataset, Environment

# Record the library versions in the cell output so a reader can see which
# environment the notebook ran against.
azureml_sdk_version = azureml.core.VERSION
print("Azure ML SDK Version: ", azureml_sdk_version)
mlflow_version = mlflow.version.VERSION
print("MLflow version:", mlflow_version)


In [None]:
# Obtain the Azure ML workspace handle used by the rest of the notebook.
ws = Workspace.from_config()
# mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())

# One summary line per workspace attribute, printed newline-separated.
# NOTE(review): this echoes the subscription id into the saved cell output --
# consider clearing outputs before sharing the notebook.
workspace_details = (
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
)
print(*workspace_details, sep='\n')


In [None]:
from azureml.core import Experiment

# Make sure the local script folder exists (no-op when it is already present).
# NOTE(review): this is ./project while the import path set up in the first
# cell points at ../project -- confirm both locations are intended.
script_folder = './project'
os.makedirs(script_folder, exist_ok=True)

# Handle to the experiment that is passed to get_valid_runs in the next cell.
experiment_name = 'transformer_hp'
exp = Experiment(workspace=ws, name=experiment_name)
# mlflow.set_experiment('transformer_hp')

In [None]:
# Metric name handed to the run-lookup and run-selection helpers.
metric_name = "temporal_test_f1_weighted"

# NOTE(review): neither counter nor best_temporal_f1_weighted is referenced in
# the visible cells -- confirm whether they are still needed.
counter = 0
best_temporal_f1_weighted = 0.0

# This might take up to 10 minutes
dic_runs = get_valid_runs(exp, metric_name)


In [None]:
# Group-count setting analysed by this cell and the two cells after it.
top_groups = '120'

# Select a run from dic_runs for this metric / group count (presumably the
# best-scoring one, going by the helper's name -- confirm).
best_performing_run = get_highest_performing_model(dic_runs, metric_name, top_groups)

# Convert the run's train and temporal-test datasets to pandas frames.
dic_datasets = get_dataset(best_performing_run)
train_dataset = dic_datasets['ds_train']
pdf_train = train_dataset.to_pandas_dataframe()
temporal_test_dataset = dic_datasets['ds_temporal_test']
pdf_temporal_test = temporal_test_dataset.to_pandas_dataframe()

# pdf_temporal_test is re-bound to whatever predict() returns for the selected run.
selected_run = best_performing_run["run"]
pdf_temporal_test = predict(
    selected_run,
    pdf_train,
    pdf_temporal_test,
    top_groups,
    'target',
)

In [None]:
# Derive the two frames used by the chart and the per-class report from the
# prediction output of the previous cell.
base_frames = get_base_dataframes(pdf_temporal_test)
pdf_temporal_test_preped, df_temp_test_result = base_frames

# Render the chart for the current top_groups setting.
draw_sanky_chart(pdf_temporal_test_preped, top_groups)


In [None]:
# Build the per-class table for the current top_groups setting (presumably
# per-class match statistics, going by the column names -- confirm).
pdf_acc_per_class = calculate_top_groups(pdf_temporal_test_preped, df_temp_test_result, pdf_train)

# Columns kept in the exported report, in output order.
report_columns = [
    'target', 'total_records',
    'matched', 'percentage_matched', 'top_2_group', 'top_2_count',
    'top_2_perc', 'count_in_training_set', 'count_ratio', 'text_length_average',
]
# Highest percentage_matched first.
pdf_acc_per_class_final = pdf_acc_per_class[report_columns].sort_values('percentage_matched', ascending=False)

# DataFrame.to_csv does not create missing parent directories, so create the
# output folder first; otherwise a fresh checkout fails on the write below.
csv_path = f'output/pdf_acc_per_class_final{top_groups}.csv'
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
pdf_acc_per_class_final.to_csv(csv_path, index=False)

plot_accuracy_count_plot(pdf_acc_per_class_final, dic_datasets['ds_train'], top_groups).show()


In [None]:
# Group-count setting analysed by this cell and the two cells after it.
top_groups = '100'

# Select a run from dic_runs for this metric / group count (presumably the
# best-scoring one, going by the helper's name -- confirm).
best_performing_run = get_highest_performing_model(dic_runs, metric_name, top_groups)

# Convert the run's train and temporal-test datasets to pandas frames.
dic_datasets = get_dataset(best_performing_run)
train_dataset = dic_datasets['ds_train']
pdf_train = train_dataset.to_pandas_dataframe()
temporal_test_dataset = dic_datasets['ds_temporal_test']
pdf_temporal_test = temporal_test_dataset.to_pandas_dataframe()

# pdf_temporal_test is re-bound to whatever predict() returns for the selected run.
selected_run = best_performing_run["run"]
pdf_temporal_test = predict(
    selected_run,
    pdf_train,
    pdf_temporal_test,
    top_groups,
    'target',
)


In [None]:
# Derive the two frames used by the chart and the per-class report from the
# prediction output of the previous cell.
base_frames = get_base_dataframes(pdf_temporal_test)
pdf_temporal_test_preped, df_temp_test_result = base_frames

# Render the chart for the current top_groups setting.
draw_sanky_chart(pdf_temporal_test_preped, top_groups)


In [None]:
# Build the per-class table for the current top_groups setting (presumably
# per-class match statistics, going by the column names -- confirm).
pdf_acc_per_class = calculate_top_groups(pdf_temporal_test_preped, df_temp_test_result, pdf_train)

# Columns kept in the exported report, in output order.
report_columns = [
    'target', 'total_records',
    'matched', 'percentage_matched', 'top_2_group', 'top_2_count',
    'top_2_perc', 'count_in_training_set', 'count_ratio', 'text_length_average',
]
# Highest percentage_matched first.
pdf_acc_per_class_final = pdf_acc_per_class[report_columns].sort_values('percentage_matched', ascending=False)

# DataFrame.to_csv does not create missing parent directories, so create the
# output folder first; otherwise a fresh checkout fails on the write below.
csv_path = f'output/pdf_acc_per_class_final{top_groups}.csv'
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
pdf_acc_per_class_final.to_csv(csv_path, index=False)

plot_accuracy_count_plot(pdf_acc_per_class_final, dic_datasets['ds_train'], top_groups).show()
