<a href="https://colab.research.google.com/github/hemnemne/master_thesis_submission/blob/main/SSC_v1.1/Sentence_Similarity_Classifier_%5Bv1_1%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Functions

## Parameters

In [None]:
def get_parameters():
  """
  Build the experiment configuration.

  All settings are defined in this one place and handed back as a plain
  dictionary, so the rest of the notebook reads its configuration from
  that dictionary. A fresh dictionary is created on every call.
  """

  # selectable options; the index below picks the active one
  available_classifiers = ("services", "faqs", "topics", "topics and services")
  available_pooling_modes = ("cls", "mean", "max", "msl")
  available_models = (
      'Sahajtomar/German-semantic',  # sota
      'clips/mfaq',
      'symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli',
      'deepset/gelectra-large-germanquad',
      'deepset/gbert-large',
  )

  return {
      # random seed
      "random_state": 100,
      # number of user queries the model is tested on
      "how_many": 1000,
      # whether the actual output should be shown
      "wanna_print": True,
      # whether to show the GPU
      "get_hardware_info": False,
      # services, faqs, topics, or topics followed by services
      "classifier": available_classifiers[0],
      # whether stop words are removed from the queries
      "remove_stopwords": False,
      # "out of k" results
      "k": 25,
      # separate k for the topic --> service classifier
      # (applies to the SERVICES; topics still use the regular k above)
      "k_topics": 10,
      # whether everything is lower-cased
      "do_lower_case": False,
      "current_model": available_models[0],
      # 0 is the default, so nothing is filtered; set to > 1.0 to filter out
      # services that are not relevant according to d115
      "boost_value_threshold": 0.0,
      "pooling_mode": available_pooling_modes[1],
      # embedding dimension of the representation vectors
      "dimension": 1024,
  }

## Data

In [None]:
import pandas as pd

def get_data(base_path =
              "/content/drive/MyDrive/Masterarbeit/Colab_Data/LeiKa/",
             parameters = None):
  """
  Mount Google Drive and load the services, FAQs and topics.

  Parameters:
    base_path:  folder (with trailing slash) that contains "services.csv",
                "faq.csv" and "topics.csv".
    parameters: configuration dictionary; "boost_value_threshold" and
                "random_state" are read from it. When omitted, the
                notebook-level `parameters` variable is used, which is
                what this function relied on before the argument existed.

  Returns:
    (df_services, df_faqs, df_topics), each as a DataFrame.

  Raises:
    NameError: if no `parameters` argument is given and no notebook-level
               `parameters` variable exists.
  """

  if parameters is None:
    # fall back to the notebook-level configuration
    parameters = globals().get("parameters")
    if parameters is None:
      raise NameError("name 'parameters' is not defined")

  # sync Google Drive (only available inside Colab)
  from google.colab import drive
  drive.mount("/content/drive")

  # get all services
  df_services = pd.read_csv(base_path + "services.csv")

  threshold = parameters["boost_value_threshold"]
  if threshold > 1.0:
    # keep only the services whose boost value exceeds the threshold and
    # shuffle them reproducibly
    boosted = df_services["d115DocumentBoostValue"] > threshold
    df_services = df_services[boosted].sample(
        frac=1., random_state=parameters["random_state"])

  # get the faqs
  df_faqs = pd.read_csv(base_path + "faq.csv")

  # get the topics
  df_topics = pd.read_csv(base_path + "topics.csv")

  return df_services, df_faqs, df_topics

In [None]:
# this is all for the topics. It maps them to the corresponding services

def get_fibel(df_topics):
  """
  Build the topic -> service-ID lookup.

  For every row of `df_topics` the raw "d115Services" string is split at
  double quotes and each fragment that parses as an integer is kept as a
  service ID. Rows whose "d115Services" value is a float are skipped
  (presumably the NaN pandas stores for an empty cell).

  Returns a dictionary mapping each "d115Name" to its list of service IDs;
  a row without any numeric fragment maps to an empty list.
  """

  def parse_ids(raw):
    # collect every quote-delimited fragment that is a valid integer
    ids = []
    for fragment in raw.split('"'):
      try:
        ids.append(int(fragment))
      except ValueError:
        pass
    return ids

  fibel = {}
  for _, topic_row in df_topics.iterrows():
    topic_name = topic_row["d115Name"]
    raw_services = topic_row["d115Services"]
    if type(raw_services) is not float:
      fibel[topic_name] = parse_ids(raw_services)

  return fibel

def get_topics_with_services_as_df(df_topics) -> pd.DataFrame:
  """
  Flatten the topic -> service-ID dictionary from `get_fibel` into a
  two-column DataFrame ("service_id", "topic") with one row per
  (service, topic) pair, which is convenient for later joins.
  """

  pairs = [
      [service_id, topic]
      for topic, service_ids in get_fibel(df_topics).items()
      for service_id in service_ids
  ]

  return pd.DataFrame(pairs, columns=["service_id", "topic"])

In [None]:
def get_new_faqs(parameters,
                 faq_path = 
                 "/content/drive/MyDrive/Masterarbeit/Colab_Data/faqClicks.csv"):
  """
  Load the NEW FAQ data that contains the user clicks, i.e. for every
  original question the FAQ entry the user actually clicked.

  Parameters:
    parameters: configuration dictionary; "random_state" seeds the shuffle.
    faq_path:   location of the click CSV.

  Returns:
    A DataFrame with one row per distinct "ursprüngliche Frage" (the first
    occurrence is kept), in reproducibly shuffled order.
  """

  new_faq_test_data = pd.read_csv(faq_path).drop_duplicates(subset=["ursprüngliche Frage"])
  # DataFrame.sample returns a new frame, so the result has to be kept;
  # without the assignment the rows stay in file order
  new_faq_test_data = new_faq_test_data.sample(frac=1., random_state=parameters["random_state"])
  
  return new_faq_test_data

def get_test_data(parameters, df_services, df_faqs, df_topics,
                  path=
                  "/content/drive/MyDrive/Masterarbeit/Colab_Data/log.csv",
                  ):
  """
  Assemble the shuffled evaluation set for the selected classifier.

  The log at `path` is always read and its "selectedID" column is coerced
  to int, with 0 standing in for values such as "-" or NaN.

  * classifier "faqs": the log is not used any further; the click data
    from `get_new_faqs` supplies the questions and the clicked answers.
  * any other classifier: only "SERVICE_SELECTION_REQUEST" rows with a
    positive ID are kept and joined with the services and with the
    topic/service pairs, so every row carries the selected service's name
    and topic. The true answer is the topic for "topics" and the service
    name otherwise.

  Returns the resulting DataFrame, extended by the columns "questions" and
  "true_answers" and shuffled with parameters["random_state"].
  `df_faqs` is accepted but not used.
  """

  def to_service_id(raw):
    # anything int() rejects with ValueError counts as "no service selected"
    try:
      return int(raw)
    except ValueError:
      return 0

  log = pd.read_csv(path)
  log["selectedID"] = [to_service_id(raw) for raw in log["selectedID"].to_list()]

  if parameters["classifier"] == "faqs":
    evaluation = get_new_faqs(parameters)
    evaluation["questions"] = evaluation["ursprüngliche Frage"]
    evaluation["true_answers"] = evaluation["angeklickte FAQ-Frage"]
  else:
    is_selection = log["userQuestion"] == "SERVICE_SELECTION_REQUEST"
    has_service = log["selectedID"] > 0
    evaluation = log[is_selection & has_service]

    # attach the service -name- and -topic- to the selected service -ID-
    evaluation = evaluation.merge(df_services, left_on="selectedID", right_on="id")
    evaluation = evaluation.merge(get_topics_with_services_as_df(df_topics),
                                  left_on="selectedID", right_on="service_id")

    # NOTE(review): the query text is taken from the "sessionID" column;
    # presumably the log stores it there for selection requests - confirm
    evaluation["questions"] = evaluation["sessionID"]

    answer_column = "topic" if parameters["classifier"] == "topics" else "d115Name"
    evaluation["true_answers"] = evaluation[answer_column]

  # shuffle the test data
  return evaluation.sample(frac=1.0, random_state=parameters["random_state"])

In [None]:
# for a topic (as str) this gets the list of str of all services
def public_get_service_names_from_topic_name(topic:str, df_services, df_topics, fibel):
  """
  Return the service names belonging to a topic.

  Parameters:
    topic:       topic name; must be a key of `fibel`.
    df_services: DataFrame with the columns "id" and "d115Name".
    df_topics:   not used; kept so existing callers keep working.
    fibel:       dictionary mapping topic names to lists of service IDs.

  Returns:
    One entry per ID in fibel[topic], in the same order: the "d115Name" of
    the first service row with that "id", or None when no row matches.

  Raises:
    KeyError: if `topic` is not a key of `fibel`.
  """

  # build the ID -> name table once instead of scanning the whole frame
  # for every single ID; setdefault keeps the FIRST name seen per ID
  name_by_id = {}
  for service_id, service_name in zip(df_services["id"], df_services["d115Name"]):
    name_by_id.setdefault(service_id, service_name)

  return [name_by_id.get(service_id) for service_id in fibel[topic]]

# embed the labels

def embed_labels(labels, parameters, df_services, df_topics):
  """
  Embed the labels with the notebook-level `model`.

  Parameters:
    labels:      list of label texts; only used when the classifier is not
                 "topics and services".
    parameters:  configuration dictionary; "classifier" selects the mode.
    df_services: DataFrame with the columns "id" and "d115Name".
    df_topics:   topics DataFrame, passed on to `get_fibel`.

  Returns:
    * classifier "topics and services": a dictionary mapping every topic
      to the embeddings of its services' names (empty if there are no
      topics).
    * otherwise: the embeddings of `labels`.
  """

  if parameters["classifier"] != "topics and services":
    return model.encode(labels, convert_to_tensor=True)

  # topic first, then service: embed the service names of each topic

  fibel = get_fibel(df_topics)

  # ID -> name table, built once; setdefault keeps the first name per ID
  name_by_id = {}
  for service_id, service_name in zip(df_services["id"], df_services["d115Name"]):
    name_by_id.setdefault(service_id, service_name)

  topic_labels_embedded = {}
  for topic, service_ids in fibel.items():
    # IDs without a matching service row yield None, so the list stays
    # aligned with the ID list of the topic
    service_names = [name_by_id.get(service_id) for service_id in service_ids]
    topic_labels_embedded[topic] = model.encode(service_names, convert_to_tensor=True)

  return topic_labels_embedded

## Dependencies

In [None]:
# install dependencies

%pip install -U sentence-transformers
%pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Classifier

In [None]:
# Stop-word setup: fetch the NLTK stop-word corpus and keep the German list
# in `german_stop_words`, which `stop_word_removal` reads.

import nltk
from nltk.corpus import stopwords
# download the corpus into the local NLTK data directory
nltk.download('stopwords')
german_stop_words = stopwords.words('german')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def stop_word_removal(x):
  """
  Strip the German stop words out of a string.

  The text is split on whitespace, every token that appears in
  `german_stop_words` (exact, case-sensitive match) is dropped, and the
  remaining tokens are joined with single spaces.
  """

  kept_words = []
  for word in x.split():
    if word not in german_stop_words:
      kept_words.append(word)

  return ' '.join(kept_words)

In [None]:
import torch
from torch import Tensor

def pytorch_cos_sim(a: Tensor, b: Tensor):
    """
    Alias of `cos_sim`: pairwise cosine similarity between the rows of a and b.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    """
    similarity_matrix = cos_sim(a, b)
    return similarity_matrix

def cos_sim(a: Tensor, b: Tensor):
    """
    Pairwise cosine similarity between the rows of a and the rows of b.

    Inputs that are not tensors are converted with torch.tensor, and a
    1-D input is treated as a single row.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    """
    rows = []
    for operand in (a, b):
        if not isinstance(operand, torch.Tensor):
            operand = torch.tensor(operand)
        rows.append(operand)

    # promote plain vectors to one-row matrices
    left, right = (r.unsqueeze(0) if r.dim() == 1 else r for r in rows)

    # after L2-normalising each row, the dot products are the cosines
    left_unit = torch.nn.functional.normalize(left, p=2, dim=1)
    right_unit = torch.nn.functional.normalize(right, p=2, dim=1)
    return torch.mm(left_unit, torch.transpose(right_unit, 0, 1))

In [None]:
from sentence_transformers import SentenceTransformer, util

def get_model(parameters:dict):
  """
  Load the SentenceTransformer selected in `parameters` and configure it.

  Parameters:
    parameters: configuration dictionary; the keys "current_model",
                "do_lower_case", "dimension" and "pooling_mode" are read.

  Returns:
    The configured model.

  Raises:
    ValueError: if "pooling_mode" is not one of "cls", "mean", "max" or
                "msl". This is checked BEFORE the model is loaded, so a
                typo does not cost a model download first.
  """

  # pooling mode -> name of the flag on the pooling module that enables it
  pooling_flags = {
      "cls": "pooling_mode_cls_token",
      "mean": "pooling_mode_mean_tokens",
      "max": "pooling_mode_max_tokens",
      "msl": "pooling_mode_mean_sqrt_len_tokens",
  }

  pooling_mode = parameters["pooling_mode"]
  if pooling_mode not in pooling_flags:
    raise ValueError(
        f"The pooling mode is not defined correctly: got {pooling_mode!r}, "
        f"expected one of {sorted(pooling_flags)}.")

  model = SentenceTransformer(parameters["current_model"])

  # NOTE(review): assumes model[0] is the transformer module and model[1]
  # the pooling module, as the attribute names suggest - confirm for every
  # model in the list
  model[0].do_lower_case = parameters["do_lower_case"]

  model[1].word_embedding_dimension = parameters["dimension"]

  # switch the selected pooling flag on and all the others off
  for mode, flag in pooling_flags.items():
    setattr(model[1], flag, mode == pooling_mode)

  return model

In [None]:
def get_labels(parameters, df_services, df_faqs, df_topics):
  """
  Collect the label texts the selected classifier chooses from.

  * "services": every "d115Name" of `df_services`.
  * "topics" / "topics and services": the topic names that are keys of
    `get_fibel`; both are treated the same here because the first
    classification step of "topics and services" is a topic.
  * anything else (FAQs): the distinct clicked FAQ questions from
    `get_new_faqs`.

  Returns a list of labels (which can then be embedded).
  `df_faqs` is accepted but not used.
  """

  classifier = parameters["classifier"]

  if classifier == "services":
    return df_services["d115Name"].to_list()

  if classifier in ("topics", "topics and services"):
    return list(get_fibel(df_topics))

  clicked_questions = get_new_faqs(parameters)["angeklickte FAQ-Frage"]
  return list(clicked_questions.unique())

In [None]:
def get_new_fibel(df_topics):
  """
  Build the topic -> {service ID: service name} lookup.

  For every row whose "d115Services" value is a string, the string is
  split at double quotes. Each fragment that parses as an integer is taken
  as a service ID, and the fragment four positions later as its name.
  NOTE(review): the offset of four fits entries shaped like
  {"id":"12","name":"Foo"} - confirm against the real topics file.

  Returns a dictionary keyed by "d115Name". Rows without a string in
  "d115Services" are left out. An ID with no fragment four positions after
  it (truncated data) is skipped instead of raising IndexError.
  """

  new_fibel = {}

  for _, row in df_topics.iterrows():
    services = row["d115Services"]
    if not isinstance(services, str):
      continue

    fragments = services.split('"')
    names_by_id = {}
    for idx, fragment in enumerate(fragments):
      try:
        service_number = int(fragment)
      except ValueError:
        continue
      name_idx = idx + 4
      if name_idx >= len(fragments):
        # the entry is cut off before its name
        continue
      names_by_id[service_number] = fragments[name_idx]

    new_fibel[row["d115Name"]] = names_by_id

  return new_fibel

# Classify

In [None]:
# Set-up cell: creates the notebook-level objects used by the cells below.

# get the current parameter settings
# (must come first: get_data() reads the notebook-level `parameters`)
parameters = get_parameters()

# get the dataframes
df_services, df_faqs, df_topics = get_data()

# get the log
test_data = get_test_data(parameters, df_services, df_faqs, df_topics)

# load the model
# (the pooling mode is applied here, from the parameters as they are now)
model = get_model(parameters=parameters)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Top-k evaluation: for every classifier, embed the labels and the test
# queries, rank the labels by cosine similarity and count a hit when the
# true answer is among the k best labels.
for classifier in ["topics and services"]: #  ["services", "faqs", "topics", "topics and services"]: todo change back

  # classifier-specific settings
  if classifier == "faqs":
    parameters["k"] = 50
    parameters["remove_stopwords"] = True
    parameters["pooling_mode"] = "mean"
  else:
    parameters["k"] = 25
    parameters["remove_stopwords"] = False
    if classifier == "topics and services":
      parameters["pooling_mode"] = "cls"
    else:
      parameters["pooling_mode"] = "msl"
  # NOTE(review): `model` is not rebuilt in this cell, so the pooling mode
  # set above is only recorded in `parameters`; the already loaded model
  # keeps the mode it was created with unless get_model is called again.

  sample_size = 1000 # parameters["how_many"] todo change back

  hits = 0

  parameters["classifier"] = classifier # todo change back as soon as function

  labels = get_labels(parameters, df_services, df_faqs, df_topics)
  labels_embedded = model.encode(labels, convert_to_tensor=True)

  if classifier == "topics and services":
    # we need to get the translator (topic -> {service ID: service name})
    new_fibel = get_new_fibel(df_topics)

  # topic -> (service names, their embeddings); filled on first use so the
  # service names of a topic are embedded once instead of once per query
  service_embeddings = {}

  test_data = get_test_data(parameters, df_services, df_faqs, df_topics)

  # only get relevant test data
  test_data = test_data[["questions", "true_answers"]].iloc[:sample_size]
  # the log may hold fewer rows than requested; score against what was run
  n_evaluated = len(test_data)

  # embed the questions
  embedded_qs = []
  for q in test_data["questions"].to_list():
    # remove the stopwords
    if parameters["remove_stopwords"]:
      q = stop_word_removal(q)
    embedded_qs.append(model.encode(q, convert_to_tensor=True))

  for (_, row), embedded_question in zip(test_data.iterrows(), embedded_qs):
    true_answer = row["true_answers"]

    cosine_scores = util.pytorch_cos_sim(embedded_question, labels_embedded)
    mvps = pd.DataFrame({"label": labels, "score": cosine_scores[0].tolist()})
    mvps = mvps.sort_values(by="score", ascending=False).iloc[:parameters["k"]]

    # the topics and services classifier goes even deeper: the services of
    # the best k_topics topics are ranked against the query

    if parameters["classifier"] == "topics and services":
      per_topic_scores = []
      for label in mvps.iloc[:parameters["k_topics"]]["label"].to_list():
        if label not in service_embeddings:
          names = list(new_fibel[label].values())
          embedded_names = model.encode(names, convert_to_tensor=True) if names else None
          service_embeddings[label] = (names, embedded_names)
        possible_labels, possible_labels_embedded = service_embeddings[label]
        if not possible_labels:
          # a topic without services has nothing to rank
          continue
        cosine_scores = util.pytorch_cos_sim(embedded_question, possible_labels_embedded)
        per_topic_scores.append(
            pd.DataFrame({"label": possible_labels, "score": cosine_scores[0].tolist()}))

      # DataFrame.append no longer exists in pandas 2.x; concatenate once
      if per_topic_scores:
        new_mvps = pd.concat(per_topic_scores, ignore_index=True)
      else:
        new_mvps = pd.DataFrame(columns=["label", "score"])

      new_mvps = new_mvps.sort_values(by="score", ascending=False)
      new_mvps = new_mvps.drop_duplicates()
      mvps = new_mvps.iloc[:parameters["k"]]

    if true_answer in mvps["label"].to_list():
      hits += 1

  score = (hits / n_evaluated) * 100 if n_evaluated else 0.0

  k = parameters["k"]
  k_topics = parameters["k_topics"]

  print(f"The '{classifier}' Classifier has a Score of {int(score)}% if k equals {k} and k_topics equals {k_topics} with a sample size of {n_evaluated} \n\n")

The 'topics and services' Classifier has a Score of 71% if k equals 25 and k_topics equals 10 with a sample size of 1000 




In [None]:
# Evaluation with per-query bookkeeping: besides counting hits, every
# evaluated query is recorded with its length and whether it was classified
# correctly, so the correlation cell further down has data to work on.

# one record per evaluated query; turned into `invest_len_df` after the loop
invest_len_records = []

hits = 0

labels = get_labels(parameters, df_services, df_faqs, df_topics)
print(f"The labels are of size {len(labels)} and are as follows:")
print(labels)
test_data = get_test_data(parameters,df_services,df_faqs,df_topics)
print("The test data is of shape")
print(test_data.shape)
labels_embedded = embed_labels(labels,parameters,df_services,df_topics)

# also, embed the topics separately
if parameters["classifier"] == "topics and services":
  # we first need to get the topics from the dictionary keys
  topics_list = list(labels_embedded.keys())
  # then embed them
  embedded_topics = model.encode(topics_list, convert_to_tensor=True)
  # and then we also need the dictionary to look up the services for each topic
  fibel = get_fibel(df_topics)
  # topic -> service names, filled on first use (the names do not depend
  # on the query, so they are looked up once per topic)
  service_names_by_topic = {}

evaluated = test_data.iloc[:parameters["how_many"]]

for index,row in evaluated.iterrows():
  # get the user query
  question = row["questions"]
  # remove the stopwords
  if parameters["remove_stopwords"]:
    question = stop_word_removal(question)
  # embed the question
  query_embedded = model.encode(question, convert_to_tensor=True)
  # get the true answer
  true_answer = row["true_answers"]

  if parameters["classifier"] != "topics and services":
    cosine_scores = util.pytorch_cos_sim(query_embedded, labels_embedded)
    score_df = pd.DataFrame({"service": labels, "score": cosine_scores[0].tolist()})
    ranking = score_df.sort_values(by="score",ascending=False)
    mvps = ranking.iloc[:parameters["k"]]
  else:
    # first step: rank the topics and keep the best k_topics
    cosine_scores = util.pytorch_cos_sim(query_embedded, embedded_topics)
    score_df = pd.DataFrame({"service": topics_list, "score": cosine_scores[0].tolist()})
    ranking = score_df.sort_values(by="score",ascending=False)
    mvps = ranking.iloc[:parameters["k_topics"]]

    # second step: rank the services of those topics

    per_topic_rankings = []
    for topic in mvps["service"].to_list():
      if topic not in service_names_by_topic:
        service_names_by_topic[topic] = public_get_service_names_from_topic_name(
            topic, df_services, df_topics, fibel)
      cosine_scores = util.pytorch_cos_sim(query_embedded, labels_embedded[topic])
      score_df = pd.DataFrame({"service": service_names_by_topic[topic],
                               "score": cosine_scores[0].tolist()})
      ranking = score_df.sort_values(by="score",ascending=False)
      per_topic_rankings.append(ranking.iloc[:parameters["k"]])

    # DataFrame.append no longer exists in pandas 2.x; concatenate once
    if per_topic_rankings:
      servs = pd.concat(per_topic_rankings)
    else:
      servs = pd.DataFrame(columns=["service","score"])
    servs = servs.sort_values(by="score",ascending=False)

    mvps = servs.iloc[:parameters["k"]]

  print(question)
  print(true_answer)
  print(mvps)

  is_hit = true_answer in mvps['service'].to_list()
  if is_hit:
    hits += 1

  # record EVERY query (inside the loop), not only the last one
  invest_len_records.append({
      "query": question,
      "length": len(question),
      "correct": 1 if is_hit else 0,
  })

invest_len_df = pd.DataFrame(invest_len_records, columns=["query", "length", "correct"])

# the log may hold fewer rows than "how_many"; score against what was run
n_evaluated = len(evaluated)
score = (hits/n_evaluated)*100 if n_evaluated else 0.0

print(score)

In [None]:
# Append the current parameters together with their score to the results file.
import os

# add the score
parameters["score"] = score
# cast to series
results_row = pd.Series(parameters)
# one-row frame with the same content, used for the concatenation below
new_results = pd.DataFrame([parameters])

# save to frame
results_path = "/content/drive/MyDrive/Masterarbeit/Colab_Data/results_SCC.csv"
if os.path.exists(results_path):
  frame = pd.read_csv(results_path,index_col=False)
  # DataFrame.append no longer exists in pandas 2.x
  frame = pd.concat([frame, new_results], ignore_index=True)
else:
  # first run: there is no results file to extend yet
  frame = new_results
frame.to_csv(results_path, index=False)

In [None]:
# Service distribution for Section 5.4.4: print the share of every selected
# service in the test data, count the services selected fewer than
# `threshold` times and draw the distribution as a pie chart.
investigate = True

if investigate and parameters["classifier"] == "services":
  threshold = 10
  distr = test_data["d115Name"].value_counts()
  n = sum(distr.to_list())
  distr_dict = dict(distr)
  counter_irrelevant = 0
  for key in distr_dict:
    value = distr_dict[key]
    share = round((value/n)*100,2)
    counter_irrelevant += 1 if value < threshold else 0
    print(f"{key}:{share}")
  print("\n\n")
  n_selected = len(distr.index.to_list())
  print(f"{counter_irrelevant} services out of {n_selected} selected services have been selected less than {threshold} times\n\n")
  distr.plot(kind="pie")

In [None]:
# Correlation between query length and correctness: uses the per-query
# records in `invest_len_df` ("length" and "correct" columns).

invest_len = True

if invest_len:

  print(invest_len_df)
  correct_flags = invest_len_df["correct"].astype(float)
  query_lengths = invest_len_df["length"].astype(float)
  correlation = correct_flags.corr(query_lengths)

  print(f"The correlation between the correctness of the SCC and the length of the user query is {round(correlation,4)}")