# Package Imports

In [None]:
import numpy as np
import pandas as pd
import os
import re


from datasets import load_from_disk, load_metric, concatenate_datasets, DatasetDict, load_dataset
import evaluate
from transformers import (
     AutoTokenizer,
     DataCollatorWithPadding,
     TrainingArguments,
     AutoModelForSequenceClassification,
     Trainer,
     logging,
     AdamW,
     get_scheduler,
)
import torch
from ray import tune, train
import pickle
from datetime import datetime
from sklearn.metrics import confusion_matrix
import utility.utility as util
import utility.CustomTrainer as ct
import utility.ModelConfig as mc

# Set ModelConfig

In [None]:
# Project root directory. Defaults to the current working directory; set it
# explicitly when the notebook is not started from the project directory.
path_cwd = os.getcwd()

# File name of the pickled ModelConfig to load.
_name_config_file = "ModelConfig_roberta-base_English_ConsUncons_07_02_24_23_52.pkl"

# Path of the ModelConfig file; it is joined with path_cwd when the file is opened.
path_file_modelconfig = os.path.join("modelconfigs", _name_config_file)

# Name of the dataset on the Hub that inference is run on.
_name_dataset_hub = "HalaJada/FinStmts_ConsUncons_English_EU_Sliding_Predict"

# Load ModelConfig

In [None]:
# Restore the ModelConfig object that was pickled to disk.
# NOTE(review): pickle executes arbitrary code while loading; only open
# ModelConfig files that come from a trusted source.
model_config = None
with open(os.path.join(path_cwd, path_file_modelconfig), mode="rb") as config_file:
    model_config = pickle.load(config_file)

# Filepath to trained model

In [None]:
# Build the trained-model path by joining the project root with the path
# stored in the ModelConfig.
path_trained_model = os.path.join(path_cwd, model_config.path_trained_model)

# Load Dataset

In [None]:
# Load the inference dataset through the project helper and keep its "train" split.
# NOTE(review): presumably the first argument (True) selects loading from the Hub
# and the empty third argument is an unused local path; confirm in utility.load_data.
raw_dataset = util.load_data(True, _name_dataset_hub, "")["train"]

# Load Tokenizer and tokenize dataset

In [None]:
# Load the tokenizer from the trained-model path (the same path the model is loaded from).
tokenizer = AutoTokenizer.from_pretrained(path_trained_model)

In [None]:
def tokenize_function(example):
    """Tokenize the "text" field of *example* with the module-level tokenizer.

    Truncation is enabled; no padding is applied here.

    Args:
        example: Mapping with a "text" entry (a single example or a batch).

    Returns:
        Whatever the tokenizer returns for the given text.
    """
    text = example["text"]
    return tokenizer(text, truncation=True)

In [None]:
# Apply tokenize_function to the dataset; batched=True hands it batches of examples.
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)

In [None]:
# Collator that uses the tokenizer to pad the examples of a batch when batches are built.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluation Metrics

In [None]:
# Bundle the evaluation metrics listed in the ModelConfig into one combined metric object.
clf_metrics = evaluate.combine(model_config.eval_metrics)

def compute_metrics(eval_preds):
    """Score predictions with the combined classification metrics.

    Args:
        eval_preds: Pair ``(logits, labels)``.

    Returns:
        The result of ``clf_metrics.compute`` for the arg-max class of each
        logit row compared against the labels.
    """
    scores, references = eval_preds
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return clf_metrics.compute(
        predictions=predicted_classes,
        references=references,
    )

# Model & Trainer Initialization

In [None]:
# Load the sequence-classification model from the trained-model path.
model = AutoModelForSequenceClassification.from_pretrained(path_trained_model)

In [None]:
# Wrap the model in a Trainer so its predict() method can be used for inference.
# No TrainingArguments are passed, so the library defaults apply.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Create predictions

In [None]:
# Run the model over the tokenized dataset and keep the prediction output.
predictions = trainer.predict(tokenized_dataset)

# Process predictions and merge with dataset

In [None]:
# Merge the raw predictions with the source rows via the project helper.
# NOTE(review): the string arguments are presumably column names of the dataset
# and flag_mv presumably toggles the aggregated (majority-vote?) result; confirm
# against utility.process_prediction_results.
pred_df, pred_mv_df = util.process_prediction_results(
    raw_dataset.to_pandas(),
    predictions,
    "original_id",
    "text",
    "id",
    "label",
    model_config.flag_mv,
)

# Prepare meta dataframe

In [None]:
# Collect run metadata (written to the "Meta" sheet of the result workbook).
# A single timestamp is taken so that "date" and "time" describe the same
# instant and cannot straddle a minute or day boundary.
_meta_timestamp = datetime.now()
meta_dict = {
    "date": _meta_timestamp.strftime("%d_%m_%y"),
    "time": _meta_timestamp.strftime("%H_%M"),
    "base_model": model_config.base_model,
    "trained model path": model_config.path_trained_model,
    "dataset": _name_dataset_hub,
    "modelconfig": _name_config_file,
}
# orient="index" turns every key into a row label; the single value column
# gets an empty header. (The original call was missing the comma between the
# two keyword arguments, which is a SyntaxError.)
meta_df = pd.DataFrame.from_dict(meta_dict, orient="index", columns=[""])

# Prepare column descriptions

In [None]:
#col_descr_dict = {}
#col_descr_df = pd.DataFrame(col_descr_dict, orient="index")

# Save results

In [None]:
# Timestamped result workbook inside <project root>/prediction_results.
path_excel_file = os.path.join(
    path_cwd,
    "prediction_results",
    f"pred_{datetime.now():%d_%m_%y_%H_%M}.xlsx",
)

In [None]:
# Make sure the output directory exists before writing; otherwise opening the
# workbook fails when "prediction_results" has not been created yet.
os.makedirs(os.path.dirname(path_excel_file), exist_ok=True)

# Write all result tables into one workbook, one sheet per table.
with pd.ExcelWriter(path_excel_file) as writer:
    # Metadata keeps its index because the row labels are the metadata keys.
    meta_df.to_excel(writer, sheet_name="Meta", index=True)
    pred_df.to_excel(writer, sheet_name="Predictions", index=False)
    # The second prediction table is only exported when the ModelConfig flag is set.
    if model_config.flag_mv:
        pred_mv_df.to_excel(writer, sheet_name="Predictions_MV", index=False)
    #col_descr_df.to_excel(writer, sheet_name="Data Description", index = True)