<a href="https://colab.research.google.com/github/hemnemne/master_thesis_submission/blob/main/Binary_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Functions
First, we need to set up all functions we need.

## Functions to Load the Data

In [None]:
def mount_drive():
  """
  Attach the Google Drive to the Colab runtime at /content/drive.

  Calling this function makes Colab ask for the required permissions.
  Afterwards, data can be retrieved from and stored on the Drive.
  """

  # the Colab helper is imported only when the function is actually called
  from google.colab import drive as colab_drive

  mount_point = "/content/drive"
  colab_drive.mount(mount_point)

In [None]:
import pandas as pd

def load_data(base_path:str = "/content/drive/MyDrive/Masterarbeit/Colab_Data/LeiKa/"):
  """
  Load the services and the FAQ data we use for training.

  Both tables need to be stored as separate .csv files inside base_path,
  named "services.csv" and "faq.csv".
  According to the original notes, the data is taken from the LeiKa.

  base_path:
  Directory that contains both files. A trailing slash is optional.

  Returns the tuple (df_services, df_faq) of pandas DataFrames.
  """

  import os

  # os.path.join only inserts a separator when it is missing, so the
  # function no longer depends on base_path ending with a slash
  file_path_services = os.path.join(base_path, "services.csv")
  file_path_faqs = os.path.join(base_path, "faq.csv")

  # get the dataframes
  df_services = pd.read_csv(file_path_services)
  df_faq = pd.read_csv(file_path_faqs)

  return df_services, df_faq

In [None]:
def get_possibilities():
  """
  Build the catalogue of all settings every parameter may take.

  The result is a dictionary: each parameter name (key) is mapped to the
  list of its possible settings (value). Keeping them in one place makes
  the later parameter choice easier to read and to change.
  """

  def on_off():
    # a fresh list per parameter, so no two entries share one list object
    return [True, False]

  def off_on():
    return [False, True]

  base_learning_rate = 5e-05

  possibilities = dict(
      train_test_split=[0.9, 0.0, 0.8],
      german_only=on_off(),
      learning_rate=[base_learning_rate, .1, .01, 5e-03, 5e-06],
      epochs=[8, 4, 10, 16],
      # second option is (lr / # of epochs)
      decay=[0.01, base_learning_rate/10, 0.0],
      num_attention_heads=[12, 16],
      num_hidden_layers=[12, 24],
      random_state=[100, 42, 55],
      train_bert=on_off(),
      name_only=on_off(),
      faq_q_only=on_off(),
      remove_special_chars=on_off(),
      weighted=[None, {0: 1.0, 1: 2.0}, {0: 2.0, 1: 1.0}],
      current_model=['german_standard_sota', 'german_cased', 'german_uncased',
                     'english_basic', 'german_gelectra'],
      shuffle_between_training=off_on(),
      distill=off_on(),
      sample_size=[2000, 1000, 4000],
      override=on_off(),
      explainable=off_on(),
      optimizer=['Adam', 'SGD', 'RMSprop', 'Adadelta', 'Adagrad', 'Adamax',
                 'Nadam', 'Ftrl'],
      output_attentions=off_on(),
      output_hidden_states=off_on(),
      batch_size=[8],
      shuffle_tf=off_on(),
      enrich_FAQs=off_on(),
      real_data=off_on(),
      cluster=off_on(),
  )

  return possibilities

In [None]:
import tensorflow as tf

def get_parameters(possibilities:dict):
  """
  Choose the concrete setting of every parameter.
  This is the place to adjust them.

  possibilities:
  Dictionary that maps each parameter name to its list of possible settings.

  Besides returning the chosen parameters as a dictionary, this method
  (re)creates two module-level dictionaries that the rest of the script uses:
  "models" (short model name -> pretrained model identifier) and
  "optimizers" (optimizer name -> optimizer instance built with the chosen
  learning rate and decay).
  """

  global models
  global optimizers

  # position of the chosen setting inside its list of possibilities;
  # every parameter that is not listed here takes the first entry
  chosen_position = {'weighted': 1, 'override': 1, 'explainable': 1}

  parameter_names = (
      'train_test_split', 'german_only', 'learning_rate', 'epochs', 'decay',
      'num_attention_heads', 'num_hidden_layers', 'random_state', 'train_bert',
      'name_only', 'faq_q_only', 'remove_special_chars', 'weighted',
      'current_model', 'shuffle_between_training', 'distill', 'sample_size',
      'override', 'explainable', 'optimizer', 'output_attentions',
      'output_hidden_states', 'batch_size', 'shuffle_tf', 'enrich_FAQs',
      'real_data', 'cluster')

  parameters = {}
  for name in parameter_names:
    parameters[name] = possibilities[name][chosen_position.get(name, 0)]

  # if we train on real data, we want a sample size of 4000
  if parameters["real_data"]:
    parameters['sample_size'] = possibilities["sample_size"][2]

  # all possible models
  # "sota" : state of the art
  # "cased" : upper / lower case is taken into account
  # "uncased" : upper / lower case is ignored
  models = {
      "german_standard_sota": "bert-base-german-cased",
      "german_cased": "dbmdz/bert-base-german-cased",
      "german_uncased": "dbmdz/bert-base-german-uncased",
      "english_basic": "bert-base-cased",
      "german_gelectra": "deepset/gelectra-large",
  }

  # all possible optimizers, each one built with the same two settings
  learning_rate = parameters["learning_rate"]
  decay = parameters["decay"]
  optimizer_names = ("SGD", "RMSprop", "Adam", "Adadelta", "Adagrad",
                     "Adamax", "Nadam", "Ftrl")
  optimizers = {
      name: getattr(tf.keras.optimizers, name)(learning_rate=learning_rate, decay=decay)
      for name in optimizer_names
  }

  return parameters

## Functions for Text Classification

### Pre Process

In [None]:
# This Class takes care of the Timing of the Training

import keras
import time

class TimeHistory(keras.callbacks.Callback):
    """
    Keras callback that measures how long every training epoch takes.

    After training, `times` holds one duration (in seconds) per finished
    epoch. The list is re-created in on_train_begin.
    """

    def on_train_begin(self, logs=None):
        # logs defaults to None instead of a shared mutable dictionary
        self.times = []

    def on_epoch_begin(self, batch, logs=None):
        # The first argument keeps its original name `batch`; the Keras
        # Callback API documents it as the epoch index.
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by adjustments of the system clock.
        self.epoch_time_start = time.perf_counter()

    def on_epoch_end(self, batch, logs=None):
        self.times.append(time.perf_counter() - self.epoch_time_start)

In [None]:
def nospecial(text:str):
  """
  Replaces every run of special characters in a string with a single space.

  ASCII letters, digits and the German letters ä, ö, ü, ß are kept in
  both lower and upper case. Everything else (punctuation, whitespace,
  other symbols) is collapsed into one space per run.

  Returns the cleaned string.
  """

  import re

  # the upper-case umlauts are part of the kept set; without them words
  # like "Änderung" would lose their first letter
  return re.sub(r"[^a-zA-Z0-9äöüÄÖÜßẞ]+", " ", text)

In [None]:
# for the following functions, we need the transformers library

%pip install transformers
from transformers import AutoTokenizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def pre_process(df_services:pd.DataFrame, df_faq:pd.DataFrame, parameters:dict):
  """
  This is the pre-processor.
  It takes both the service and the FAQ DataFrame as inputs.
  Depending on the given parameters, it will adjust the texts.
  It returns the complete and shuffled training data as a DataFrame
  with the columns "text" and "label" (0 = service, 1 = FAQ).

  Parameters that are read:
  cluster: read a prepared classification file instead of both DataFrames.
  name_only: use only "d115Name" instead of name plus "d115Synonym".
  faq_q_only: use only "faqQuestionMain" instead of "faqAll".
  enrich_FAQs: add FAQ questions from an additional file (faq_q_only only).
  random_state: seed for the shuffling.
  remove_special_chars: clean every text with nospecial().
  """

  if parameters["cluster"]:
    # get the validation set
    df = pd.read_csv("/content/drive/MyDrive/Masterarbeit/Colab_Data/classifications_for_clustering.csv")
    # the label is taken from the "same" column
    # (original note: 1 for agree and 0 for disagree)
    df["label"] = df["same"].to_list()
    df = df[["text","label"]]
  else:
    # if you only want to train on the name / question and not the description, set the following
    df = pd.DataFrame()
    df_2 = pd.DataFrame()
    if parameters["name_only"]:
      df["text"] = df_services["d115Name"]
    else:
      df["text"] = df_services["d115Name"] + " " + df_services["d115Synonym"]
    if parameters["faq_q_only"]:
      faqs_temp = df_faq["faqQuestionMain"].to_list()
      if parameters["enrich_FAQs"]:
        # if we want to enrich the training data, we add additional FAQs
        new_faq_test_data = pd.read_csv("/content/drive/MyDrive/Masterarbeit/Colab_Data/new_faq_data_for_enriching.csv").drop_duplicates(subset=["angeklickte FAQ-Frage"]).sample(frac=1.,random_state=100)
        additional_faqs = new_faq_test_data["angeklickte FAQ-Frage"].to_list()
        faqs_temp.extend(additional_faqs)
      df_2["text"] = faqs_temp
    else:
      df_2["text"] = df_faq["faqAll"]

    # services get the label 0, FAQs the label 1;
    # a scalar is broadcast to every row that already exists
    df["label"] = 0
    df_2["label"] = 1

    # combine the two dataframes
    # (pd.concat replaces DataFrame.append, which pandas 2.0 removed;
    # like append, it keeps the original row indices)
    df = pd.concat([df, df_2])

  # and then shuffle the df
  df = df.sample(frac=1.0,random_state=parameters["random_state"])

  # remove special chars if needed
  if parameters["remove_special_chars"]:
    df["text"] = [nospecial(i) for i in df["text"].to_list()]

  return df

In [None]:
# now, we also need transformers datasets
# after this, you must restart the runtime (watch the console output)

%pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def tokenize_df(df:pd.DataFrame, parameters:dict):
  """
  Tokenizes a full DataFrame (training and testing data).

  The DataFrame needs to have both a "text" and a "label" column.
  If the "train_test_split" parameter is greater than zero, the rows are
  split into a "train" and a "test" part before they are tokenized;
  otherwise the whole DataFrame is tokenized as one dataset.

  The tokenized transformers dataset is returned.
  It can then be turned into tensors to be fed into the model.
  """

  split = parameters["train_test_split"]

  import datasets

  # the tokenizer that belongs to the currently selected model
  tokenizer = AutoTokenizer.from_pretrained(
      models[parameters["current_model"]],
      output_attentions=parameters["output_attentions"])

  def tokenize_function(examples):
    """
    Tokenizes the "text" entries of a batch.
    It has to exist as a function so that dataset.map() can call it.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)

  if split > .0:
    # the first share of the rows is for training, the remaining rows are
    # for testing; both parts become one transformers DatasetDict
    separator = int(split*df.shape[0])
    dataset = datasets.DatasetDict({
        "train": datasets.Dataset.from_dict(df.iloc[:separator]),
        "test": datasets.Dataset.from_dict(df.iloc[separator:]),
    })
  else:
    # no split requested: everything goes into a single dataset
    dataset = datasets.Dataset.from_dict(df)

  return dataset.map(tokenize_function, batched=True)

In [None]:
# we now need the DefaultDataCollator

from transformers import DefaultDataCollator

In [None]:
def tensor_df(df:pd.DataFrame,parameters:dict, val=False):
  """
  Turns a DataFrame into tensors that can be fed into the model.

  This method can be seen as an end-to-end processor:
  It takes a DataFrame that has both a text and a label column,
  tokenizes it with tokenize_df() and converts the result to
  TensorFlow datasets, which can be used to fit() a model.

  val:
  If we want to "tensorize" our validation dataset,
  we do not want to split it since we do not train or test, we just validate.

  Returns:
  val=False: the tuple (train dataset, test dataset); the second entry is
  None when the data was not split.
  val=True: only the single dataset, because predict() cannot handle None.

  The given parameters dictionary is not modified.
  """

  # work on a private copy, so that the caller's settings stay untouched
  # even if the tokenization fails half-way
  tokenizer_parameters = dict(parameters)
  if val:
    tokenizer_parameters["train_test_split"] = .0

  batch_size = parameters["batch_size"]
  shuffle = parameters["shuffle_tf"]

  tokenized_datasets = tokenize_df(df, parameters=tokenizer_parameters)

  data_collator = DefaultDataCollator(return_tensors="tf")

  def to_tensors(dataset):
    # the same conversion settings are used for every dataset
    return dataset.to_tf_dataset(
        columns=["attention_mask", "input_ids"],
        label_cols=["labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=batch_size,
    )

  # A split result is a DatasetDict (a dict subclass) with a "train" and a
  # "test" entry; an unsplit result is a single Dataset. Checking the type
  # replaces the former try/except, which could hide real conversion errors.
  if isinstance(tokenized_datasets, dict):
    result = (to_tensors(tokenized_datasets["train"]),
              to_tensors(tokenized_datasets["test"]))
  else:
    result = to_tensors(tokenized_datasets), None

  # if we want to validate, we use this for predictions.
  # thus, we only return the first part of the tuple
  if val:
    return result[0]
  return result

### Classify

In [None]:
from transformers import TFAutoModelForSequenceClassification, AutoModelForSequenceClassification

In [None]:
def load_model(parameters:dict):
  """
  This is how we get the actual model we want to train.

  override=True: a fresh pretrained model is loaded, any stored model
  is ignored (it will be overridden by the next training).
  override=False: the model stored in "./my_model/" is loaded; if it
  cannot be loaded (OSError), the fresh pretrained model is used instead.

  For every model except "deepset/gelectra-large", the layer named 'bert'
  is made trainable or frozen according to the "train_bert" parameter.

  A keras model is returned that can then be trained and make predictions.
  """

  current_model = models[parameters["current_model"]]

  # the same output / depth options are used wherever the model comes from
  # (the stored model was previously loaded without output_hidden_states)
  shared_options = dict(
      output_attentions=parameters["output_attentions"],
      output_hidden_states=parameters["output_hidden_states"],
      num_hidden_layers=parameters["num_hidden_layers"],
  )

  def load_pretrained():
    return TFAutoModelForSequenceClassification.from_pretrained(
        current_model, num_labels=2, **shared_options)

  if parameters["override"]:
    model = load_pretrained()
  else:
    try:
      model = TFAutoModelForSequenceClassification.from_pretrained(
          "./my_model/", **shared_options)
    except OSError:
      # nothing usable is stored yet
      model = load_pretrained()

  if current_model != "deepset/gelectra-large":
    # NOTE(review): these values are set as attributes of the model object
    # itself, not of model.config - presumably they do not change the
    # loaded architecture; confirm before relying on them.
    model_attributes = {
        "attention_probs_dropout_prob": 0.1,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "hidden_size": 768,
        "initializer_range": 0.02,
        "intermediate_size": 3072,
        "layer_norm_eps": 1e-12,
        "max_position_embeddings": 512,
        "model_type": "bert",
        "num_attention_heads": parameters["num_attention_heads"],
        "num_hidden_layers": parameters["num_hidden_layers"],
        "pad_token_id": 0,
        "position_embedding_type": "absolute",
        "transformers_version": "4.21.0",
        "type_vocab_size": 2,
        "use_cache": True,
        "vocab_size": 30000,
    }
    for attribute, value in model_attributes.items():
      setattr(model, attribute, value)

    # this is how to get the layers:
    layer = model.get_layer('bert')

    # and this is how the layer is made trainable (True) or frozen (False)
    layer.trainable = parameters["train_bert"]

  return model

In [None]:
def get_test_data(parameters:dict, different_random_state=False, 
                  path="/content/drive/MyDrive/Masterarbeit/Colab_Data/log.csv"):
  """
  We use this for after the Training, to evaluate the model.
  We get test data from the LOG (the .csv file given by path).

  Kept are service selections (userQuestion "SERVICE_SELECTION_REQUEST"
  with a selectedID of length 6, label 0) and FAQ answers
  (userQuestion "FAQ_ANSWER", label 1).
  At most sample_size / 2 rows are taken per label, so the result is
  split 1:1 whenever the log contains enough rows of both kinds.
  Afterwards, it gets shuffled and a DataFrame with text and labels is returned.

  different_random_state:
  Shuffle an additional time with a fixed, different seed before sampling.
  """

  sample_size = parameters["sample_size"]
  german_only = parameters["german_only"]

  test_data = pd.read_csv(path)

  # NOTE(review): the language code is read from "searchResultsWithScore"
  # and the text from "sessionID" - presumably the column headers of the
  # log are shifted; confirm against the log file.
  if german_only:
    test_data = test_data[test_data["searchResultsWithScore"] == "de"]

  is_service = ((test_data["selectedID"].str.len() == 6)
                & (test_data["userQuestion"] == "SERVICE_SELECTION_REQUEST"))
  is_faq = test_data["userQuestion"] == "FAQ_ANSWER"

  # copy, so that the later column assignments work on an independent frame
  test_data = test_data.loc[is_service | is_faq, ["sessionID","userQuestion"]].copy()
  test_data.columns = ["text","label"]

  if parameters["remove_special_chars"]:
    test_data["text"] = [nospecial(i) for i in test_data["text"].to_list()]

  # after the filter only these two values are left, so map() covers all rows
  test_data["label"] = test_data["label"].map({"SERVICE_SELECTION_REQUEST":0,"FAQ_ANSWER":1})
  test_data = test_data.sample(frac=1.,random_state=parameters["random_state"])

  if different_random_state:
    test_data = test_data.sample(frac=1., random_state=1234)

  per_label = int(sample_size/2)
  test_data_0 = test_data[test_data["label"]==0].iloc[:per_label]
  test_data_1 = test_data[test_data["label"]==1].iloc[:per_label]

  # pd.concat replaces DataFrame.append, which pandas 2.0 removed
  test_data = pd.concat([test_data_0, test_data_1])
  test_data = test_data.sample(frac=1., random_state=parameters["random_state"])

  return test_data

In [None]:
def generate_data(parameters:dict):
  """
  Depending on the parameter settings, we get the data from the LeiKa or the Log.

  real_data=True: the data comes from the log via get_test_data(),
  shuffled with a different random state.
  real_data=False: the data comes from pre_process(), which uses the
  module-level DataFrames df_services and df_faq; they have to be loaded
  (see load_data) before this function is called.

  Returns a DataFrame with text and labels.
  """

  # Do we want to train on log data / real data? ...
  if parameters["real_data"]:
    # get_test_data reads sample_size and german_only from the parameters
    # itself; it does not accept them as separate keyword arguments
    df = get_test_data(parameters=parameters, different_random_state=True)
  # ... or do we train on the LeiKa
  else:
    df = pre_process(df_services=df_services, df_faq=df_faq,
                    parameters=parameters)

  return df

In [None]:
def train_model(model,parameters, tf_train_dataset, tf_test_dataset, train_df=None):
  """
  Trains the model.
  Depending on the "shuffle_between_training" parameter,
  we train half-and-half or all the way.
  We also track the time for each epoch.

  At the end, the model is saved to "my_model".

  train_df:
  Only used for the half-and-half training: the DataFrame that gets
  reshuffled and tensorized again for the second half.
  If it is not given, the module-level DataFrame "df" is used.

  Returns the tuple (model, hist, time_callback), where hist maps every
  metric name to its list of values, one per epoch over both halves.
  """

  time_callback = TimeHistory()

  class_weight = parameters["weighted"]

  if not parameters["shuffle_between_training"]:

    history = model.fit(tf_train_dataset, validation_data=tf_test_dataset, epochs=parameters["epochs"], class_weight=class_weight, callbacks=[time_callback])
    hist = history.history

  else:

    # if we split the epochs, we also need to split the results;
    # the second half takes the remaining epochs, so that an odd number
    # of epochs is still trained completely
    first_epochs = int(parameters["epochs"]/2)
    second_epochs = parameters["epochs"] - first_epochs

    history_1 = model.fit(tf_train_dataset, validation_data=tf_test_dataset, epochs=first_epochs, class_weight=class_weight, callbacks=[time_callback])

    # the callback starts a new list with every fit(), so keep the first half
    first_half_times = list(time_callback.times)

    # reshuffle the training data; without an explicit train_df this reads
    # the module-level "df" (it must not be assigned to inside this function)
    source_df = df if train_df is None else train_df
    reshuffled_df = source_df.sample(frac=1.0,random_state=parameters["random_state"])

    tf_train_dataset, tf_test_dataset = tensor_df(reshuffled_df, parameters=parameters)

    history_2 = model.fit(tf_train_dataset, validation_data=tf_test_dataset, epochs=second_epochs, class_weight=class_weight, callbacks=[time_callback])
    time_callback.times = first_half_times + time_callback.times

    # join both halves per metric (list.extend returns None, so the
    # lists are concatenated instead)
    hist_2 = history_2.history
    hist = {key: list(values) + list(hist_2.get(key, []))
            for key, values in history_1.history.items()}

  model.save_pretrained("my_model")

  return model, hist, time_callback

### Evaluate

In [None]:
# for the evaluation of the model, we certainly need numpy

import numpy

In [None]:
def invert_labels(y_true:list, y_pred:list):
  """
  Helper that swaps the two labels (0 becomes 1 and 1 becomes 0)
  in both the true and the predicted labels.
  This allows evaluating from the services and from the FAQ perspective.

  Returns the tuple (inverted y_true, inverted y_pred) as new lists.
  """

  swapped = {0: 1, 1: 0}

  def flip(labels):
    # a label other than 0 or 1 raises a KeyError
    return list(map(swapped.__getitem__, labels))

  return flip(y_true), flip(y_pred)

In [None]:
def score(cm, true_label:str, y_true, y_pred):
  """
  Computes all scores that are relevant for the evaluation.

  cm: confusion matrix whose flattened form is (tn, fp, fn, tp).
  true_label: prefix that is put in front of every score name.
  y_true, y_pred: the labels, used for the F1 score and the
  Matthews correlation coefficient.

  Returns a pandas Series with one named entry per score.
  """

  tn, fp, fn, tp = cm.ravel()

  # the denominators that several rates have in common
  actual_negatives = fp + tn
  actual_positives = tp + fn
  predicted_negatives = tn + fn
  predicted_positives = tp + fp

  named_values = [
      ("tn", tn),
      ("fp", fp),
      ("fn", fn),
      ("tp", tp),
      ("false_positive_rate", fp / actual_negatives),
      ("false_negative_rate", fn / actual_positives),
      ("true_negative_rate", tn / actual_negatives),
      ("negative_predictive_value", tn / predicted_negatives),
      ("false_discovery_rate", fp / predicted_positives),
      ("recall", tp / actual_positives),
      ("precision", tp / predicted_positives),
      ("accuracy", (tp + tn) / (tp + fp + fn + tn)),
      ("f1", f1_score(y_true, y_pred)),
      ("matthews_corr", matthews_corrcoef(y_true, y_pred)),
  ]

  return pd.Series({f"{true_label}_{name}": value for name, value in named_values})

In [None]:
def evaluate_model(predictions, sample_val):
  """
  Attaches the predictions to the validation data.

  predictions: the model output; its 'logits' entry holds one row of
  scores per text.
  sample_val: the validation DataFrame with the columns "text" and "label".

  Three columns are added to sample_val itself:
  "pred_label" (position of the highest logit), "text_length" (number of
  characters of the text) and "same" (1 if the prediction equals the
  true label, else 0).

  The same, extended DataFrame is returned.
  """

  sample_val["pred_label"] = list(map(numpy.argmax, predictions['logits']))
  sample_val["text_length"] = list(map(len, sample_val["text"].to_list()))

  is_correct = sample_val["label"] == sample_val["pred_label"]
  sample_val["same"] = list(map(int, is_correct))

  return sample_val

In [None]:
def get_y(sample_val):
  """
  Extracts the labels from the validation data.
  --> returns the tuple (y_true, y_pred): the "label" column and the
  "pred_label" column, each as a list
  """

  return tuple(sample_val[column].to_list() for column in ("label", "pred_label"))

In [None]:
# for the final evaluation, we also use sklearn and math

from sklearn.metrics import confusion_matrix, f1_score,matthews_corrcoef
import math

In [None]:
def eval_model(predictions, sample_val, time_callback):
  """
  Here, we do the final evaluation of the model.
  We first score from the FAQ perspective, then we invert the labels and
  score again from the Service perspective.

  predictions: the model output for the validation data.
  sample_val: the validation DataFrame ("text" and "label" columns);
  it is extended by evaluate_model().
  time_callback: object whose "times" list holds the epoch durations.

  This method returns the full score series: all "faq_..." scores, all
  "services_..." scores and the summed training time as "time[sec]".
  """

  sample_val = evaluate_model(predictions, sample_val)
  y_true, y_pred = get_y(sample_val)

  # naming both labels keeps the confusion matrix 2x2 even if only one
  # of them occurs in the data; score() needs exactly four cells
  both_labels = [0, 1]

  cm_faq = confusion_matrix(y_true, y_pred, labels=both_labels)
  faq_scores = score(cm_faq, "faq", y_true, y_pred)

  inverted_true, inverted_pred = invert_labels(y_true, y_pred)
  cm_services = confusion_matrix(inverted_true, inverted_pred, labels=both_labels)
  service_scores = score(cm_services, "services", inverted_true, inverted_pred)

  timing = pd.Series({"time[sec]":math.fsum(time_callback.times)})

  # pd.concat replaces Series.append, which pandas 2.0 removed
  return pd.concat([faq_scores, service_scores, timing])

In [None]:
def write_results_to_csv(hist, result_path="/content/drive/MyDrive/Masterarbeit/Colab_Data/results.csv"):
  """
  This method
  (1) gets the result_csv
  (2) appends a new row and
  (3) writes the results back to the csv file
  we can define in the result_path parameter.

  hist: dictionary that maps each metric name to its list of values per
  epoch; every value gets its own column named "<metric>_<epoch index>".

  The new row also contains the module-level "parameters" and "scores",
  which therefore have to exist before this method is called.
  """

  result_frame = pd.read_csv(result_path,index_col=False)

  # one column per metric and epoch
  new_hist = {f"{key}_{idx}": val
              for key, values in hist.items()
              for idx, val in enumerate(values)}

  # pd.concat replaces Series.append / DataFrame.append (removed in pandas 2.0)
  new_row = pd.concat([pd.Series(parameters), scores, pd.Series(new_hist)])

  # turn the row into a one-line frame; infer_objects restores plain
  # column types where that is possible
  new_row_frame = new_row.to_frame().T.infer_objects()
  result_frame = pd.concat([result_frame, new_row_frame], ignore_index=True)

  result_frame.to_csv(result_path, index=False)

### Interpret

In [None]:
# restart the runtime for it to work properly

%pip install bertviz
%pip install transformers-interpret

In [None]:
# for the interpretation, we need torch and bertviz

import torch
from bertviz import head_view, model_view

In [None]:
# these two functions adapt the output of our TensorFlow model
# to the torch tensors that bertviz requires

def _bertviz_inputs(model, tokenizer, sentence_a, sentence_b=None):
    """
    Prepares what both bertviz views need for one or two sentences.

    If the module-level parameter "remove_special_chars" is set, the
    sentences are cleaned with nospecial() first.

    Returns the tuple (attention, tokens, sentence_b_start):
    attention: the last element of the model output, converted to a list
    of torch tensors (assumes the model was loaded with
    output_attentions enabled, so that this element holds the attentions).
    tokens: the tokens of the encoded input.
    sentence_b_start: position of the first token of sentence_b,
    or None if there is only one sentence.
    """

    if parameters["remove_special_chars"]:
      sentence_a = nospecial(sentence_a)
      if sentence_b:
        sentence_b = nospecial(sentence_b)

    inputs = tokenizer.encode_plus(sentence_a, sentence_b, return_tensors='tf', add_special_tokens=True)
    input_ids = inputs['input_ids']
    if sentence_b:
        token_type_ids = inputs['token_type_ids']
        attention = model(input_ids, token_type_ids=token_type_ids)[-1]
        # the first token of type 1 marks the beginning of sentence_b
        sentence_b_start = token_type_ids[0].numpy().tolist().index(1)
    else:
        attention = model(input_ids)[-1]
        sentence_b_start = None

    # Convert attention from TF tensors to torch tensors
    attention = [torch.from_numpy(layer_attn.numpy()) for layer_attn in attention]

    input_id_list = input_ids[0].numpy().tolist() # Batch index 0
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)

    return attention, tokens, sentence_b_start


def show_head_view_tf(model, tokenizer, sentence_a, sentence_b=None):
    """
    Shows the bertviz head view of a TensorFlow model for one sentence
    (or for a pair of sentences).
    """

    attention, tokens, sentence_b_start = _bertviz_inputs(model, tokenizer, sentence_a, sentence_b)
    head_view(attention, tokens, sentence_b_start)

def show_model_view_tf(model, tokenizer, sentence_a, sentence_b=None):
    """
    Shows the bertviz model view (light display mode) of a TensorFlow model
    for one sentence (or for a pair of sentences).
    """

    attention, tokens, sentence_b_start = _bertviz_inputs(model, tokenizer, sentence_a, sentence_b)
    model_view(attention, tokens, sentence_b_start, display_mode="light")

In [None]:
def visualize_model_on_example_sentence(parameters:dict, example_sentence="Ich brauche einen neuen Personalausweis."):
  """
  We load our previously saved model and show the Attention Heads and
  Layers for an example sentence.
  Per default, this sentence is set to
  "Ich brauche einen neuen Personalausweis."

  For the visualization, the model is loaded with output_attentions=True,
  num_hidden_layers=12 and override=False. These settings are applied to
  a copy, the given parameters dictionary stays unchanged.
  """

  sentence_a = example_sentence

  # a private copy: the settings of later training runs must not be
  # changed by the visualization
  viz_parameters = dict(parameters)
  viz_parameters["output_attentions"] = True
  viz_parameters["num_hidden_layers"] = 12
  viz_parameters["override"] = False

  model = load_model(parameters=viz_parameters)

  tokenizer = AutoTokenizer.from_pretrained(models[viz_parameters["current_model"]],
                              output_attentions=viz_parameters["output_attentions"])
  show_head_view_tf(model, tokenizer, sentence_a)
  show_model_view_tf(model, tokenizer, sentence_a)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers_interpret import SequenceClassificationExplainer


def show_heads_for_many_real_examples(sample_val, parameters, how_many:int=10):
  """
  Iterates over as many real examples as you like and shows the
  "weights" / importance of the words for each of them.
  Will be used later in the Evaluation Part of the Script.

  sample_val: DataFrame with a "text" and a "label" column.
  parameters: "random_state" seeds the shuffling of the examples,
  "current_model" selects the tokenizer.
  how_many: number of examples that are explained.
  """

  # the stored TensorFlow model is loaded as a torch model; together with
  # the tokenizer it is handed to the explainer
  model = BertForSequenceClassification.from_pretrained("./my_model/",from_tf=True)

  tokenizer = AutoTokenizer.from_pretrained(models[parameters["current_model"]],
                              output_attentions=parameters["output_attentions"])

  cls_explainer = SequenceClassificationExplainer(model,tokenizer)

  shuffled = sample_val.sample(frac=1,random_state=parameters["random_state"])
  texts = shuffled["text"].to_list()[:how_many]
  true_labels = shuffled["label"].to_list()

  # texts and labels are in the same (shuffled) order
  for text, true_label in zip(texts, true_labels):
    word_attributions = cls_explainer(text)
    print(word_attributions)
    cls_explainer.visualize(true_class=true_label)

In [None]:
def get_correlation(sample_val):
  """
  Compute the correlation between the length of a text and whether the
  model was correct on it.

  Reads the columns "text_length" and "same" of `sample_val`, casts both
  to float and returns their correlation rounded to four decimals.
  """

  lengths = sample_val["text_length"].astype(float)
  correctness = sample_val["same"].astype(float)

  return round(lengths.corr(correctness), 4)

# Initialize the Classifier

Then, we prepare the data and load the model. 

## Prepare the Data

In [None]:
# mount the Google Drive, then read the services and the FAQ data
# from their .csv files into two dataframes
mount_drive()
df_services, df_faq = load_data()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# get the parameters for this run, based on the options from get_possibilities()
# NOTE(review): presumably one setting is picked per parameter - confirm in get_parameters
parameters = get_parameters(possibilities=get_possibilities())

In [None]:
# generate the dataframe with the train data according to the parameters
df = generate_data(parameters=parameters)

In [None]:
# show some info: first the dataframe itself, then the number of rows per label
print("The data is as follows:")
print(df, "\n\n")
print("The label distribution is as follows:")
print(df["label"].value_counts())

The data is as follows:
                                                  text  label
661  Jugendschutz Erzieherischer Kinder und Jugends...      0
216  Wo kann ich aktuell einen Antrag auf Ersatzfüh...      1
188  Kartenverkauf Landeskartenwerke Sonderkarten B...      0
172           Wie soll ich die Projektlaufzeit planen       1
320            Aufenthaltstitel für ehemalige Deutsche      0
..                                                 ...    ...
802  Glücksspiel Buchmachergehilfenerlaubnis beantr...      0
53               Fahrerlaubnis Neuerteilung beantragen      0
350  Sozialversicherung Auskunft Kontenklärung für ...      0
79   Handwerk Eintragung in das Verzeichnis handwer...      0
792  Feuerwerk Verkauf von Kleinfeuerwerk und Klein...      0

[1138 rows x 2 columns] 


The label distribution is as follows:
0    881
1    257
Name: label, dtype: int64


In [None]:
# now, we turn the df into tensors:
# one dataset for training and one for testing during training

tf_train_dataset, tf_test_dataset = tensor_df(df, parameters=parameters)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## Load the Model

In [None]:
# get the model according to the parameter settings

model = load_model(parameters=parameters)

Some layers from the model checkpoint at ./my_model/ were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ./my_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [None]:
# Compile the model:
# - the optimizer is looked up by the name stored in parameters["optimizer"]
# - the loss is sparse categorical cross-entropy computed on logits
# - the reported metric is sparse categorical accuracy

model.compile(
    optimizer=optimizers[parameters["optimizer"]],
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

# Train the Model

We train the model. 

In [None]:
# Train the model and keep the model itself, its training history and the time callback

model, hist, time_callback = train_model(model=model, parameters=parameters, tf_train_dataset=tf_train_dataset, tf_test_dataset=tf_test_dataset)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


# Evaluate the Model
Now we run all evaluations on the trained model.

In [None]:
# get the evaluation / validation / test dataset as a dataframe

sample_val = get_test_data(parameters=parameters)

# and then turn it into tensors (val=True yields a single dataset here)

tf_val_test = tensor_df(sample_val, parameters=parameters, val=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# let the trained model make predictions on the test tensors

predictions = model.predict(tf_val_test)



In [None]:
# and now, we get all scores from the predictions, the test dataframe
# and the time callback of the training

scores = eval_model(predictions=predictions, sample_val=sample_val, time_callback=time_callback)

In [None]:
# pass the training history on to be written to a .csv file
write_results_to_csv(hist)

# Interpret the Model
Finally, we interpret the results.

In [None]:
# only run if the "explainable" parameter is switched on
if parameters["explainable"]:

  # visualize the attention for the default example sentence
  # (this call also changes some entries of `parameters` in place)
  visualize_model_on_example_sentence(parameters=parameters)

In [None]:
# only run if the "explainable" parameter is switched on
if parameters["explainable"]:

  # show the word attributions for real examples from the test data
  show_heads_for_many_real_examples(sample_val,parameters)

All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


[('[CLS]', 0.0), ('Um', -0.20596806129412581), ('##melde', 0.7289676552613763), ('##n', 0.33480172249281304), ('nach', 0.3097405202487212), ('umzu', 0.056972134532900884), ('##g', 0.46357966729369454), ('[SEP]', 0.0)]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,LABEL_0 (1.00),LABEL_0,1.69,[CLS] Um ##melde ##n nach umzu ##g [SEP]
,,,,


[('[CLS]', 0.0), ('Ich', 0.32505710854648273), ('bin', 0.24693045068081243), ('Reise', 0.006131195031992667), ('##rück', 0.057567252326286844), ('##kehr', 0.03020300877907522), ('##er', 0.11844569940244655), ('aus', 0.13001866914098742), ('einem', 0.08971251205046502), ('Risiko', 0.008756776682311618), ('##gebiet', -0.0597804387837352), ('Welche', 0.7435317931274418), ('##n', 0.03763353430872337), ('Text', 0.03456028950594967), ('benöt', 0.04913714804860758), ('##ige', 0.023163943591796805), ('ich', 0.43515312523237604), ('zum', 0.059877354168393536), ('Frei', 0.029180214913119805), ('##testen', -0.031192203360573583), ('PC', 0.04969083318095675), ('##R', 0.023652760798993887), ('oder', 0.08509036136984209), ('reicht', 0.039093500657054865), ('ein', 0.13885620680524957), ('Schnell', 0.005489780170894013), ('##test', 0.0467841302573735), ('[SEP]', 0.0)]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,LABEL_1 (1.00),LABEL_1,2.72,[CLS] Ich bin Reise ##rück ##kehr ##er aus einem Risiko ##gebiet Welche ##n Text benöt ##ige ich zum Frei ##testen PC ##R oder reicht ein Schnell ##test [SEP]
,,,,


[('[CLS]', 0.0), ('brauche', -0.0035058332594901436), ('ich', 0.8869762565605234), ('einen', 0.13894182910800695), ('Termin', 0.043395767675220846), ('für', 0.18443942154113968), ('ein', 0.29268340951389143), ('erweiterte', 0.14170572516100718), ('##s', 0.17489259050609826), ('Führungs', -0.01769516641132262), ('##zeug', 0.05836681693642998), ('##nis', 0.13417746267154845), ('[SEP]', 0.0)]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,LABEL_1 (1.00),LABEL_1,2.03,[CLS] brauche ich einen Termin für ein erweiterte ##s Führungs ##zeug ##nis [SEP]
,,,,


[('[CLS]', 0.0), ('Melde', 0.8355074561782707), ('##register', -0.09328034247698269), ('##aus', 0.25561196444564527), ('##kunft', 0.4773767820182817), ('[SEP]', 0.0)]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,LABEL_0 (1.00),LABEL_0,1.48,[CLS] Melde ##register ##aus ##kunft [SEP]
,,,,


[('[CLS]', 0.0), ('Gib', 0.18014286492289824), ('##t', 0.16368987396398002), ('es', 0.5745664082158662), ('eine', 0.2008409957956396), ('trage', -0.0337043700403531), ('##verpflichtung', -0.02081628568668618), ('für', 0.21364190663533486), ('Mas', 0.012740509436692036), ('##ken', 0.018347343395351094), ('auf', 0.17783919262060457), ('öffentlichen', 0.05269812670553654), ('Straßen', -0.0095097744800326), ('und', 0.1316693538333293), ('wenn', 0.19880198167727126), ('ja', 0.13284701738806423), ('welche', 0.6431000813858728), ('[SEP]', 0.0)]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,LABEL_1 (1.00),LABEL_1,2.64,[CLS] Gib ##t es eine trage ##verpflichtung für Mas ##ken auf öffentlichen Straßen und wenn ja welche [SEP]
,,,,


[('[CLS]', 0.0), ('Dar', 0.3955166338182706), ('##f', 0.5166313468866274), ('ich', 0.4791194346043311), ('derzeit', 0.16647324965304944), ('meinen', 0.4267155536702288), ('Sohn', 0.09763151409710155), ('in', 0.35146240206550017), ('Rostock', -0.06113781441859121), ('besuchen', 0.02254195233419652), ('[SEP]', 0.0)]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,LABEL_1 (1.00),LABEL_1,2.39,[CLS] Dar ##f ich derzeit meinen Sohn in Rostock besuchen [SEP]
,,,,


[('[CLS]', 0.0), ('ich', 0.5688709759641511), ('brä', 0.31305380211457906), ('##uchte', 0.019168420696645632), ('unbedingt', 0.17871359841048567), ('einen', 0.5789637601056098), ('ter', -0.034919207234508746), ('##min', -0.004332980802455864), ('bei', 0.352193942933482), ('einem', 0.2884311552912477), ('Bürger', 0.045801544440011686), ('##amt', -0.017581341002843035), ('[SEP]', 0.0)]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,LABEL_1 (1.00),LABEL_1,2.29,[CLS] ich brä ##uchte unbedingt einen ter ##min bei einem Bürger ##amt [SEP]
,,,,


[('[CLS]', 0.0), ('es', 0.08567029481450668), ('gibt', 0.43927562618018334), ('keinen', 0.09567582262581696), ('ter', 0.03829483943007786), ('##min', 0.02348931776961513), ('innerhalb', 0.07248270125397847), ('der', 0.2575868307545052), ('nächsten', 0.20893134856087495), ('zwei', 0.21698154128516914), ('wo', 0.11613937486840643), ('##chen', 0.15057491665948108), ('zur', 0.47441989064938067), ('anmel', -0.16943107365069132), ('##dung', -0.04471257607822439), ('in', 0.27827652571679545), ('ber', -0.021528620226342603), ('##lin', 0.09036999682172267), ('ist', 0.34468268434579963), ('das', 0.185945672025263), ('ein', 0.286662699699384), ('problem', 0.1123406268237385), ('[SEP]', 0.0)]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,LABEL_1 (1.00),LABEL_1,3.24,[CLS] es gibt keinen ter ##min innerhalb der nächsten zwei wo ##chen zur anmel ##dung in ber ##lin ist das ein problem [SEP]
,,,,


[('[CLS]', 0.0), ('Grunds', 0.49298855209648224), ('##teuer', 0.8700357966783975), ('[SEP]', 0.0)]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,LABEL_0 (1.00),LABEL_0,1.36,[CLS] Grunds ##teuer [SEP]
,,,,


[('[CLS]', 0.0), ('muss', 0.4664261034688644), ('der', 0.6599455605579276), ('ehe', -0.11871079563074319), ('##partner', 0.061402304251567175), ('bei', 0.1620957364053492), ('der', 0.35740666018698947), ('anmel', -0.1147165653161756), ('##dung', -0.07487205038868691), ('bei', 0.15516689198144404), ('der', 0.2356132572413218), ('wohn', -0.06403441468801677), ('##ung', 0.14882269325716904), ('mit', 0.18140479428456216), ('##kommen', 0.1323975728805735), ('[SEP]', 0.0)]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,LABEL_1 (1.00),LABEL_1,2.19,[CLS] muss der ehe ##partner bei der anmel ##dung bei der wohn ##ung mit ##kommen [SEP]
,,,,


In [None]:
# only run if the "explainable" parameter is switched on
if parameters["explainable"]:

  # show correlation between text length and performance
  correlation = get_correlation(sample_val)
  print(f"The Correlation between the 'Correctness' of the BC and the length of the user query is {correlation}")

The correlation between the correctness of the SCC and the length of the user query is -0.0309
