In [None]:
%load_ext autoreload
%autoreload 2

from time import time
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pickle
import csv

# Load data files

In [None]:
# Test set. Later cells read column 0 as the utterance id, column 1 as the
# utterance text and column 2 as the ground-truth label.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
test_df = pd.read_pickle("../../data/datasets/test_df.pkl")

In [None]:
# Step-1 feature table: it is loaded only for its "doc" field, NOT for prediction
# (no model inference is run in this notebook).
test_features_s1 = pd.read_pickle("../../data/gbdt_features/test_features_step1.pkl")

# Every column after the first two, exported as a NumPy array.
cols = test_features_s1.columns[2:]
np_test_features_s1 = test_features_s1.loc[:, cols].to_numpy()

In [None]:
# Step-2 feature table, computed on the results of STEP1 (BERT).
# It is loaded only for its "doc" field, NOT for prediction.
test_features_s2 = pd.read_pickle(
    "../../data/gbdt_features/test_features_step2_all_feat_lightGBM_S1_BERT.pkl"
)

# Every column after the first two, exported as a NumPy array.
cols = test_features_s2.columns[2:]
np_test_features_s2 = test_features_s2.loc[:, cols].to_numpy()

In [None]:
# Quick look at the first rows of the test set.
test_df.head()

In [None]:
# Step-1 labels are read from a precomputed predictions file; the model itself
# is not run in inference here.
predFile_Step1 = "../../data/bert_models/pred_BERT_MSMARCO_step1.tsv"
pred_df_step1 = pd.read_csv(predFile_Step1, sep="\t", header=None)

pred_df_step1.head()

In [None]:
# Step-2 labels are read from a precomputed predictions file; the model itself
# is not run in inference here.
predFile_Step2 = "../../data/bert_models/pred_BERT_MSMARCO_step2.tsv"
pred_df_step2 = pd.read_csv(predFile_Step2, sep="\t", header=None)

pred_df_step2.head()

# Assemble results and simulate prediction

In [None]:
# Row labels of the test set, in order, as a plain Python list.
test_index = test_df.index.tolist()

In [None]:
# Result dict is a dictionary:
# - key: utterance id (column 0 of test_df, "<conversation>_<turn>")
# - value: a tuple of (predicted_label, groundtruth_label, original_utterance)
#
# Cascade used to simulate the prediction:
# - turn 1 of a conversation is always labelled "SE"
# - otherwise a Step-1 prediction of 1 gives "SE"
# - otherwise a Step-2 prediction of 1 gives "FT", anything else gives "PT"

result_dict = {}

for i in test_index:
    utt_id = test_df[0][i]
    truth = test_df[2][i]
    utterance = test_df[1][i]

    if utt_id.split("_")[1] == "1":
        predicted = "SE"
    elif pred_df_step1[2][i] == 1:
        # predictions for STEP 1 are read by row label, so pred_df_step1 has to
        # be row-aligned with test_df
        predicted = "SE"
    else:
        # predictions for STEP 2 are looked up by utterance id
        aux = pred_df_step2.loc[pred_df_step2[0] == utt_id]
        # Extract the scalar explicitly: calling int() on a Series is deprecated.
        result_step2 = int(aux[4].iloc[0])
        predicted = "FT" if result_step2 == 1 else "PT"

    result_dict[utt_id] = (predicted, truth, utterance)

# Visualize results and evaluate

In [None]:
# Values are (predicted label, ground-truth label, original utterance) tuples.
result_dict # predicted vs ground truth

In [None]:
# Number of utterances whose predicted label matches the ground truth.
# Only matches on one of the three known labels are counted.
true_pos = sum(
    1
    for predicted, truth, _utterance in result_dict.values()
    if predicted == truth and truth in ("SE", "FT", "PT")
)

In [None]:
# Divide by the number of utterances actually evaluated instead of a
# hard-coded test-set size, so the figure stays correct if the data changes.
n_total = len(result_dict)
accuracy = true_pos / n_total if n_total else float("nan")
print("Accuracy {}/{} is : {}".format(true_pos, n_total, accuracy))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Ground truth comes from the test set, predictions from result_dict; the two
# are compared position by position, so they must be in the same order.
y_test = test_df[2].values
y_pred = [predicted for predicted, _truth, _utterance in result_dict.values()]

# Confusion matrix first, then the per-class report, both with a fixed label order.
for build_report in (confusion_matrix, classification_report):
    print(build_report(y_test, y_pred, labels=["SE", "FT", "PT"]))

# Utterance rewriting strategies for post-classification

In [None]:
from topic_utils import create_doc, _find_topic, _rewrite_utt, _find_cue_topic, _find_topic_all

## Strategy 1: Standard - Enrich with first or previous topic
- extract first and previous topic and rewrite utterance 
- if the utterance has no third-person pronoun, we trail (append) either the first or the previous topic

In [None]:
def strategy_Standard(test_df, pred_df_step1, pred_df_step2, test_features_s2):
    """
    Enrich with first or previous topic
    - extract first and previous topic and rewrite utterance
    - if missing third person pronoun we trail either first or previous

    Decision per utterance:
    - turn 1 of a conversation (second "_"-separated field of the id is "1")
      and every utterance with a Step-1 prediction of 1 are kept unchanged
    - otherwise a Step-2 prediction of 1 rewrites with the topic of the
      conversation's first utterance; any other value rewrites with the topic
      of the utterance in the preceding row

    :param test_df: the test dataset (column 0: utterance id
    "<conversation>_<turn>", column 1: utterance text)
    :param pred_df_step1: predictions dataframe for Step1 (we don't use the model,
    we just assemble the results); column 2 is read by row label, so it has to
    be row-aligned with test_df
    :param pred_df_step2: predictions dataframe for Step2; column 0 is the
    utterance id and column 4 the predicted label
    :param test_features_s2: features dataframe for which we use the doc object
    (with the nlp by spacy) for rewriting; column 0 is the utterance id
    :return: dict mapping utterance id to the original utterance or to the
    output of ``_rewrite_utt``
    """

    def _doc_of(other_utt_id):
        # "doc" of the first feature row whose id column matches other_utt_id
        row_label = test_features_s2.index[
            test_features_s2[0] == other_utt_id].tolist()[0]
        return test_features_s2.at[row_label, "doc"]

    result_dict = {}

    for i in test_df.index:
        utt_id = test_df[0][i]

        # First turn, or STEP 1 says the utterance needs no rewriting
        if utt_id.split("_")[1] == "1" or pred_df_step1[2][i] == 1:
            result_dict[utt_id] = test_df[1][i]
            continue

        # STEP 2 (looked up by utterance id)
        aux = pred_df_step2.loc[pred_df_step2[0] == utt_id]
        # Extract the scalar explicitly: calling int() on a Series is deprecated.
        result_step2 = int(aux[4].iloc[0])

        current_doc = test_features_s2.at[i, "doc"]

        if result_step2 == 1:
            # get the first topic
            first_utt_id = utt_id.split("_")[0] + "_1"
            first_topic = _find_topic(_doc_of(first_utt_id))

            new_utt = _rewrite_utt(current_doc, first_topic=first_topic,
                                   previous_topic="", context_list=None,
                                   trailing=True)
        else:
            # get the previous topic; `i - 1` relies on test_df having
            # consecutive integer row labels
            prev_utt_id = test_df[0][i - 1]
            prev_topic = _find_topic(_doc_of(prev_utt_id))

            new_utt = _rewrite_utt(current_doc, first_topic="",
                                   previous_topic=prev_topic,
                                   context_list=None, trailing=True)

        result_dict[utt_id] = new_utt

    return result_dict

In [None]:
# Apply Strategy 1 to the whole test set and preview the first 50 entries.
result_dict = strategy_Standard(
    test_df=test_df,
    pred_df_step1=pred_df_step1,
    pred_df_step2=pred_df_step2,
    test_features_s2=test_features_s2,
)
list(result_dict.items())[:50]

## Strategy 2: Extract PT on enriched utterance

- same as Strategy 1, but for PT the previous topic is extracted from the already enriched previous utterance

In [None]:
def strategy_Enriched(test_df, pred_df_step1, pred_df_step2, test_features_s2):
    """
    Similar to Strategy Standard but for PT we extract on enriched utterance

    Decision per utterance:
    - turn 1 of a conversation (second "_"-separated field of the id is "1")
      and every utterance with a Step-1 prediction of 1 are kept unchanged ("SE")
    - otherwise a Step-2 prediction of 1 rewrites with the topic of the
      conversation's first utterance ("FT"); any other value rewrites with the
      topic found in the enriched utterance of the preceding row ("PT")

    :param test_df: the test dataset (column 0: utterance id
    "<conversation>_<turn>", column 1: utterance text)
    :param pred_df_step1: predictions dataframe for Step1 (we don't use the model,
    we just assemble the results); column 2 is read by row label, so it has to
    be row-aligned with test_df
    :param pred_df_step2: predictions dataframe for Step2; column 0 is the
    utterance id and column 4 the predicted label
    :param test_features_s2: features dataframe for which we use the doc object
    (with the nlp by spacy) for rewriting; column 0 is the utterance id
    :return: dict mapping utterance id to a tuple (label, utterance), where
    label is "SE", "FT" or "PT" and utterance is the original text or the
    output of ``_rewrite_utt``
    """
    result_dict = {}
    # row label -> utterance after enrichment (unchanged text for "SE")
    enriched_utt_dict = {}

    for i in test_df.index:
        utt_id = test_df[0][i]

        if utt_id.split("_")[1] == "1" or pred_df_step1[2][i] == 1:
            # First turn, or STEP 1 says the utterance needs no rewriting
            label = "SE"
            new_utt = test_df[1][i]
        else:
            # STEP 2 (looked up by utterance id)
            aux = pred_df_step2.loc[pred_df_step2[0] == utt_id]
            # Extract the scalar explicitly: calling int() on a Series is
            # deprecated.
            result_step2 = int(aux[4].iloc[0])

            current_doc = test_features_s2.at[i, "doc"]

            if result_step2 == 1:
                # get the first topic
                first_utt_id = utt_id.split("_")[0] + "_1"
                row_index_first = test_features_s2.index[
                    test_features_s2[0] == first_utt_id].tolist()[0]
                first_utt_doc = test_features_s2.at[row_index_first, "doc"]
                first_topic = _find_topic(first_utt_doc)

                label = "FT"
                new_utt = _rewrite_utt(current_doc, first_topic=first_topic,
                                       previous_topic="", context_list=None,
                                       trailing=True)
            else:
                # get the previous topic
                # this changes respect to Strategy 1: the topic is extracted
                # from the enriched text of the preceding row (`i - 1` relies
                # on consecutive integer row labels)
                prev_utt_doc = create_doc(enriched_utt_dict[i - 1])
                prev_topic = _find_topic(prev_utt_doc)

                label = "PT"
                new_utt = _rewrite_utt(current_doc, first_topic="",
                                       previous_topic=prev_topic,
                                       context_list=None, trailing=True)

        result_dict[utt_id] = (label, new_utt)
        enriched_utt_dict[i] = new_utt

    return result_dict

In [None]:
# Apply Strategy 2 to the whole test set and preview the first 50 entries
# (each value is a (label, utterance) tuple).
result_dict = strategy_Enriched(
    test_df=test_df,
    pred_df_step1=pred_df_step1,
    pred_df_step2=pred_df_step2,
    test_features_s2=test_features_s2,
)
list(result_dict.items())[:50]

## Strategy 3:

- propagate everything from the last SE

In [None]:
def strategy_Last_SE(test_df, pred_df_step1, test_features_s2):
    """
    Propagate everything from the last SE.

    Utterances that open a conversation (turn "1") or that Step 1 predicts as 1
    are kept as they are and their topic becomes the current "last SE" topic.
    Every other utterance is rewritten with that topic as previous topic.

    :param test_df: the test dataset (column 0: utterance id, column 1: text)
    :param pred_df_step1: predictions dataframe for Step1 (the model is not run,
    the stored results in column 2 are used)
    :param test_features_s2: features dataframe whose "doc" object
    (with the nlp by spacy) is used for rewriting
    :return: dict mapping utterance id to the original utterance or to the
    output of ``_rewrite_utt``
    """
    rewritten = {}
    topic_of_last_se = ""

    for row in test_df.index:
        uid = test_df[0][row]
        opens_conversation = uid.split("_")[1] == str(1)

        # `or` short-circuits, so Step 1 is only consulted for later turns.
        if opens_conversation or pred_df_step1[2][row] == 1:
            rewritten[uid] = test_df[1][row]
            topic_of_last_se = _find_topic(test_features_s2["doc"][row])
            continue

        rewritten[uid] = _rewrite_utt(
            test_features_s2.at[row, "doc"],
            first_topic="",
            previous_topic=topic_of_last_se,
            context_list=None,
            trailing=True,
        )

    return rewritten

In [None]:
# Apply Strategy 3 to the whole test set and preview the first 50 entries.
result_dict = strategy_Last_SE(
    test_df=test_df,
    pred_df_step1=pred_df_step1,
    test_features_s2=test_features_s2,
)
list(result_dict.items())[:50]

## Strategy 4: 

- propagate everything from the last SE and keep FT for context (every utterance after the first is also expanded with the first topic, similar to trailing)

In [None]:
def strategy_First_and_Last_SE(test_df, pred_df_step1, test_features_s2):
    """
    Propagate everything from the last SE and keep FT for context
    (expand for all previous also with first!, similar to trailing).

    The opening utterance of a conversation (turn "1") is kept unchanged and
    sets both the first and the last SE topic. Later utterances always get the
    first topic appended after a space: those Step 1 predicts as 1 keep their
    text and update the last SE topic, the others are first rewritten with the
    last SE topic as previous topic.

    :param test_df: the test dataset (column 0: utterance id, column 1: text)
    :param pred_df_step1: predictions dataframe for Step1 (the model is not run,
    the stored results in column 2 are used)
    :param test_features_s2: features dataframe whose "doc" object
    (with the nlp by spacy) is used for rewriting
    :return: dict mapping utterance id to the resulting utterance string
    """
    rewritten = {}
    topic_of_last_se = ""
    topic_of_first_se = ""

    for row in test_df.index:
        uid = test_df[0][row]

        if uid.split("_")[1] == str(1):
            # Conversation opener: verbatim, and it defines both topics.
            rewritten[uid] = test_df[1][row]
            topic_of_last_se = _find_topic(test_features_s2["doc"][row])
            topic_of_first_se = topic_of_last_se
            continue

        if pred_df_step1[2][row] == 1:
            # Later SE: original text plus the first topic.
            rewritten[uid] = test_df[1][row] + " " + topic_of_first_se
            topic_of_last_se = _find_topic(test_features_s2["doc"][row])
            continue

        # Anything else: rewrite with the last SE topic, then add the first one.
        enriched = _rewrite_utt(
            test_features_s2.at[row, "doc"],
            first_topic="",
            previous_topic=topic_of_last_se,
            context_list=None,
            trailing=True,
        )
        rewritten[uid] = enriched + " " + topic_of_first_se

    return rewritten

In [None]:
# Apply Strategy 4 to the whole test set and preview the first 50 entries.
result_dict = strategy_First_and_Last_SE(
    test_df=test_df,
    pred_df_step1=pred_df_step1,
    test_features_s2=test_features_s2,
)
list(result_dict.items())[:50]

## Strategy 5:

If FT enrich with first SE. If PT enrich with last SE.

In [None]:
def strategy_First_or_Last_SE(test_df, pred_df_step1, pred_df_step2, test_features_s2):
    """
    If FT enrich with first SE. If PT enrich with last SE.

    Decision per utterance:
    - turn 1 of a conversation (second "_"-separated field of the id is "1")
      is kept unchanged and sets both the first and the last SE topic
    - an utterance with a Step-1 prediction of 1 is kept unchanged and updates
      the last SE topic
    - otherwise a Step-2 prediction of 1 rewrites with the first SE topic; any
      other value rewrites with the last SE topic

    Topics are extracted with ``_find_topic_all``.

    :param test_df: the test dataset (column 0: utterance id
    "<conversation>_<turn>", column 1: utterance text)
    :param pred_df_step1: predictions dataframe for Step1 (we don't use the model,
    we just assemble the results); column 2 is read by row label, so it has to
    be row-aligned with test_df
    :param pred_df_step2: predictions dataframe for Step2; column 0 is the
    utterance id and column 4 the predicted label
    :param test_features_s2: features dataframe for which we use the doc object
    (with the nlp by spacy) for rewriting
    :return: dict mapping utterance id to the original utterance or to the
    output of ``_rewrite_utt``
    """
    result_dict = {}
    last_SE_topic = ""
    first_SE_topic = ""

    for i in test_df.index:
        utt_id = test_df[0][i]
        is_first_utt = utt_id.split("_")[1] == "1"

        # First turn, or STEP 1 says the utterance needs no rewriting
        if is_first_utt or pred_df_step1[2][i] == 1:
            result_dict[utt_id] = test_df[1][i]
            last_SE_topic = _find_topic_all(test_features_s2["doc"][i])
            if is_first_utt:
                # only the conversation opener defines the first SE topic
                first_SE_topic = last_SE_topic
            continue

        # STEP 2 (looked up by utterance id)
        aux = pred_df_step2.loc[pred_df_step2[0] == utt_id]
        # Extract the scalar explicitly: calling int() on a Series is deprecated.
        result_step2 = int(aux[4].iloc[0])

        current_doc = test_features_s2.at[i, "doc"]

        if result_step2 == 1:
            new_utt = _rewrite_utt(current_doc,
                                   first_topic=first_SE_topic,
                                   previous_topic="", context_list=None,
                                   trailing=True)
        else:
            new_utt = _rewrite_utt(current_doc, first_topic="",
                                   previous_topic=last_SE_topic,
                                   context_list=None, trailing=True)

        result_dict[utt_id] = new_utt

    return result_dict

In [None]:
# Apply Strategy 5 to the whole test set and preview the first 50 entries.
result_dict = strategy_First_or_Last_SE(
    test_df=test_df,
    pred_df_step1=pred_df_step1,
    pred_df_step2=pred_df_step2,
    test_features_s2=test_features_s2,
)
list(result_dict.items())[:50]