In [49]:
import os
from configparser import ConfigParser
from loader.dataset_woz3 import DatasetWoz3
from loader.task import Task
from loader.task import Examplars
from loader.task import generate_task
import pandas as pd
from collections import Counter

# Settings
config_file = "./config/config.cfg"
config = ConfigParser()
# ConfigParser.read() does not raise for a missing/unreadable file; it returns the list
# of files it actually parsed. Fail fast here instead of hitting a KeyError on
# config["DATA"] below when the notebook is started from the wrong directory.
if not config.read(config_file):
    raise FileNotFoundError(f"Could not read configuration file: {config_file}")

# Values read from the configuration file
datasplit_file = config["DATA"]["data_split"]
experiment_prefix = config["EXPERIMENT"]["experiment_prefix"]
# Name of the experiment whose examplars are analysed; it selects the examplar folder
experiment_type = 'examplar_herding_64'
# Folder holding the examplar files of this experiment (prefix is concatenated as-is)
examplar_prefix = f"{experiment_prefix}{experiment_type}/examplars/"
template_file = config["DATA"]["template_file"]
# Target of the (currently commented-out) CSV export of the metric table
output_file = experiment_prefix+experiment_type+".csv"

def sv_content_label(sv_entry):
    """Return the coarse label of a ``d-a-s-v:`` template entry.

    The text after the first ``:`` is split on ``-`` and the 2nd and 3rd fields are
    re-joined with ``-`` (e.g. ``"d-a-s-v:Attraction-Inform-Name-1"`` gives
    ``"Inform-Name"``). Entries with fewer fields yield a shorter, possibly empty, label.
    """
    return "-".join(sv_entry.split(":")[1].split("-")[1:3])


def safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero.

    Keeps the per-task report running (and makes the problem visible in the table)
    when a task has no ground-truth entries or an examplar file is empty.
    """
    return numerator / denominator if denominator else float("nan")


def is_train_examplar_file(file_name):
    """Tell whether ``file_name`` is a training examplar JSON file.

    Only the base name is inspected, so directory names that happen to contain
    "train" or "json" cannot make unrelated files pass the filter.
    """
    base_name = os.path.basename(file_name)
    return "train" in base_name and base_name.endswith(".json")


if __name__ == "__main__":

    # Read template and split it into its dialogue-act and slot-value entries
    with open(template_file, "r") as f:
        template = f.read().split("\n")
    da_data = [term for term in template if term.startswith("d-a:")]
    sv_data = [term for term in template if term.startswith("d-a-s-v:")]

    # Get all training examplars of given experiment.
    # Sorted because os.listdir() returns entries in arbitrary order.
    examplar_files = sorted(
        os.path.join(examplar_prefix, name)
        for name in os.listdir(examplar_prefix)
        if is_train_examplar_file(name)
    )
    # The task name is the part of the file name before the first underscore
    task_names = [os.path.basename(path).split("_")[0] for path in examplar_files]

    # Initialize dataframe holder (one row per task, filled in the loop below)
    data_df = pd.DataFrame(index=task_names,
                           columns=["DA recall", "SV recall (Fine grained)", "SV recall (Coarse grained)", "SV length",
                                    "Full task coverage (Fine-grained)", "Full task coverage (Coarse grained)"])

    # Initialize global dataset
    dataset = DatasetWoz3(config, datasplit_file, percentage=1.0)

    # Loop through all tasks to check quality of examplars
    for task_name, file in zip(task_names, examplar_files):

        # Construct current task
        examplars = Examplars.load(file)
        data = {"train": examplars.data, "valid": [], "test": []}

        # Get examplar task only containing examplar data
        task = Task(config, percentage=1.0, data=data, task_name=task_name,
                    examplars={"train": None, "valid": None})

        # Get full task containing all data for current task type
        full_task, _ = generate_task(dataset, [template.index(f"d:{task_name}")], old_examplars=None)

        # Ground truth: positions (within da_data / sv_data) of the entries of this task
        gt_da_indices = [idx for idx, entry in enumerate(da_data)
                         if entry.startswith(f"d-a:{task_name}")]
        gt_sv_indices = [idx for idx, entry in enumerate(sv_data)
                         if entry.startswith(f"d-a-s-v:{task_name}")]
        gt_sv_contents = {sv_content_label(sv_data[idx]) for idx in gt_sv_indices}

        # Collect da_idx, sv_idx, sv_content and sv_len over the examplars
        da_indices = set()
        sv_indices = set()
        sv_contents = set()
        sv_len = []

        for _, _, _, meta in task.data["train"]:
            # The third value is used below to index sv_data
            _, da_idx, sv_idx, _ = task.getFeatIdx(meta)

            da_indices.update(da_idx)
            sv_indices.update(sv_idx)
            sv_len.append(len(sv_idx))
            sv_contents.update(sv_content_label(sv_data[idx]) for idx in sv_idx)

        # Entries too short to carry an act/slot pair produce an empty label
        sv_contents.discard("")

        # Calculate recall of da and sv.
        # NOTE(review): the numerators count everything seen in the examplars; if an
        # examplar carries indices outside this task's ground truth the value can
        # exceed 1 -- confirm whether an intersection with the ground truth is wanted.
        da_recall = safe_ratio(len(da_indices), len(gt_da_indices))
        sv_recall_fine_grained = safe_ratio(len(sv_indices), len(gt_sv_indices))
        sv_recall_coarse_grained = safe_ratio(len(sv_contents), len(gt_sv_contents))
        mean_sv_len = safe_ratio(sum(sv_len), len(sv_len))

        # Calculate the percentage of training samples in full_task that are covered by examplar sv-pairs
        covered_sv_num = 0
        covered_content_num = 0
        all_sv = []
        for _, _, _, meta in full_task.data["train"]:
            sv_idx = full_task.getFeatIdx(meta)[2]
            sample_labels = [sv_content_label(sv_data[idx]) for idx in sv_idx]
            all_sv.extend(sample_labels)
            # A sample is covered when every one of its indices / labels was seen in the examplars
            covered_sv_num += int(set(sv_idx).issubset(sv_indices))
            covered_content_num += int(set(sample_labels).issubset(sv_contents))

        full_train_size = len(full_task.data["train"])
        covered_sv_percentage = safe_ratio(covered_sv_num, full_train_size)
        covered_content_percentage = safe_ratio(covered_content_num, full_train_size)

        print(f"Current task is {task_name}")
        print(f"SV counter is {Counter(all_sv)}")
        print(f"SV contents is {sorted(sv_contents)}")
        data_df.loc[task_name] = [da_recall, sv_recall_fine_grained, sv_recall_coarse_grained, mean_sv_len,
                                  covered_sv_percentage, covered_content_percentage]

    #data_df.to_csv(output_file, index=True, columns=data_df.columns)
    print(data_df)

/Users/feimi/Documents/GitHub/Continual_Learning_for_NLG/nlg-sclstm-multiwoz-incremental/nlg-sclstm-multiwoz-master


Whole Data Set INFO
***** DATA INFO *****
# of turns
Train: 43812
Valid: 5905
Test: 5916
# of batches: Train 171 Valid 23 Test 23
The number of examplar batch is 0, 0
The number of batch is 0, 0


***** DATA INFO *****
Using 100.0% of training data
BATCH SIZE: 256
Train: 43812 turns
Valid: 5905 turns
Test: 5916 turns
# of batches: Train 171 Valid 23 Test 23
*************************

/Users/feimi/Documents/GitHub/Continual_Learning_for_NLG/nlg-sclstm-multiwoz-incremental/nlg-sclstm-multiwoz-master
/Users/feimi/Documents/GitHub/Continual_Learning_for_NLG/nlg-sclstm-multiwoz-incremental/nlg-sclstm-multiwoz-master
Current task is Attraction
SV counter is Counter({'Inform-Name': 365, 'Inform-Addr': 331, 'Inform-Type': 297, 'Inform-Area': 290, 'Inform-Choice': 240, 'Inform-Fee': 221, 'Inform-Phone': 203, 'Inform-Post': 181, 'Recommend-Name': 148, 'Request-Area': 75, 'Request-Type': 71, 'Recommend-Addr': 43, 'Select-Type': 39, 'NoOffer-Type': 38, 'NoOffer-Area': 37, 'Recommend-Fee': 36, 'Recommend-Area': 27, 'Select-none': 25, 'Recommend-Type': 21, 'Recommend-Post': 15, 'Recommend-Phone': 14, 'NoOffer-none': 12, 'Request-Price': 11, 'Inform-Price': 9, 'Request-Name': 9, 'Select-Area':

The number of examplar batch is 0, 0
The number of batch is 26, 3
The number of examplar batch is 0, 0
The number of batch is 0, 0


/Users/feimi/Documents/GitHub/Continual_Learning_for_NLG/nlg-sclstm-multiwoz-incremental/nlg-sclstm-multiwoz-master
Current task is Taxi
SV counter is Counter({'Inform-Car': 201, 'Inform-Phone': 200, 'Request-Leave': 74, 'Request-Depart': 60, 'Request-Dest': 51, 'Request-Arrive': 41, 'Inform-Depart': 28, 'Inform-Dest': 22, 'Inform-Leave': 15, 'Inform-Arrive': 10, 'Inform-none': 4})
SV contents is ['Inform-Arrive', 'Inform-Car', 'Inform-Depart', 'Inform-Dest', 'Inform-Leave', 'Inform-Phone', 'Request-Arrive', 'Request-Depart', 'Request-Dest', 'Request-Leave']
/Users/feimi/Documents/GitHub/Continual_Learning_for_NLG/nlg-sclstm-multiwoz-incremental/nlg-sclstm-multiwoz-master


The number of examplar batch is 0, 0
The number of batch is 10, 1
The number of examplar batch is 0, 0
The number of batch is 0, 0


/Users/feimi/Documents/GitHub/Continual_Learning_for_NLG/nlg-sclstm-multiwoz-incremental/nlg-sclstm-multiwoz-master
Current task is Restaurant
SV counter is Counter({'Inform-Name': 343, 'Inform-Choice': 343, 'Inform-Food': 302, 'Inform-Area': 264, 'Inform-Price': 218, 'Request-Food': 163, 'Recommend-Name': 145, 'Inform-Addr': 143, 'Inform-Phone': 94, 'Inform-Post': 93, 'Request-Area': 91, 'Request-Price': 76, 'NoOffer-Food': 68, 'Select-none': 39, 'NoOffer-Area': 37, 'Recommend-Food': 32, 'Recommend-Addr': 27, 'Recommend-Area': 26, 'Select-Price': 26, 'NoOffer-none': 26, 'Select-Name': 23, 'Recommend-Price': 21, 'Select-Food': 20, 'NoOffer-Price': 18, 'Select-Area': 6, 'Recommend-Post': 6, 'Inform-Ref': 5, 'Recommend-Phone': 4, 'Request-Name': 4, 'Select-Choice': 3, 'NoOffer-Name': 1, 'Recommend-none': 1})
SV contents is ['Inform-Addr', 'Inform-Area', 'Inform-Choice', 'Inform-Food', 'Inform-Name', 'Inform-Phone', 'Inform-Post', 'Inform-Price', 'NoOffer-Area', 'NoOffer-Food', 'NoOffer-P

The number of examplar batch is 0, 0
The number of batch is 35, 3
The number of examplar batch is 0, 0
The number of batch is 0, 0


/Users/feimi/Documents/GitHub/Continual_Learning_for_NLG/nlg-sclstm-multiwoz-incremental/nlg-sclstm-multiwoz-master
Current task is Train
SV counter is Counter({'Inform-Leave': 467, 'Inform-Arrive': 381, 'Inform-Id': 357, 'OfferBook-none': 333, 'Inform-Choice': 272, 'OfferBooked-Ref': 271, 'Request-Leave': 215, 'Inform-Dest': 209, 'Request-Day': 192, 'Inform-Depart': 182, 'Request-Depart': 181, 'OfferBooked-Ticket': 179, 'Request-Dest': 152, 'Inform-Ticket': 149, 'Inform-Time': 117, 'Inform-Day': 115, 'Request-Arrive': 100, 'Request-People': 76, 'OfferBooked-Id': 63, 'OfferBooked-People': 46, 'OfferBooked-Leave': 30, 'Select-none': 27, 'OfferBook-Id': 20, 'OfferBooked-Arrive': 19, 'Inform-Ref': 16, 'OfferBook-Leave': 15, 'OfferBooked-Depart': 14, 'OfferBook-Arrive': 12, 'OfferBooked-Dest': 9, 'OfferBooked-Day': 8, 'Select-Leave': 7, 'OfferBook-Dest': 4, 'OfferBooked-Time': 4, 'Select-Day': 4, 'NoOffer-Depart': 3, 'OfferBooked-none': 3, 'Select-Arrive': 3, 'OfferBook-People': 3, 'OfferB

The number of examplar batch is 0, 0
The number of batch is 39, 6
The number of examplar batch is 0, 0
The number of batch is 0, 0


/Users/feimi/Documents/GitHub/Continual_Learning_for_NLG/nlg-sclstm-multiwoz-incremental/nlg-sclstm-multiwoz-master
Current task is Booking
SV counter is Counter({'Book-Ref': 482, 'Request-Day': 164, 'Request-Stay': 97, 'Request-Time': 93, 'Request-People': 92, 'Book-Name': 84, 'NoBook-none': 84, 'Book-Day': 81, 'Book-People': 70, 'Book-Time': 59, 'Inform-none': 43, 'Book-Stay': 36, 'Inform-Name': 21, 'NoBook-Day': 13, 'Inform-Day': 11, 'NoBook-Time': 10, 'NoBook-Stay': 6, 'Inform-People': 6, 'NoBook-People': 5, 'Inform-Stay': 4, 'Inform-Ref': 3, 'Inform-Time': 2, 'NoBook-Name': 1, 'Book-none': 1, 'NoBook-Ref': 1})
SV contents is ['Book-Day', 'Book-Name', 'Book-People', 'Book-Ref', 'Book-Stay', 'Book-Time', 'Inform-People', 'Inform-Stay', 'NoBook-Day', 'NoBook-Name', 'NoBook-People', 'NoBook-Ref', 'NoBook-Stay', 'NoBook-Time', 'NoBook-none', 'Request-Day', 'Request-People', 'Request-Stay', 'Request-Time']
/Users/feimi/Documents/GitHub/Continual_Learning_for_NLG/nlg-sclstm-multiwoz-incr

The number of examplar batch is 0, 0
The number of batch is 24, 3
The number of examplar batch is 0, 0
The number of batch is 0, 0


/Users/feimi/Documents/GitHub/Continual_Learning_for_NLG/nlg-sclstm-multiwoz-incremental/nlg-sclstm-multiwoz-master
Current task is Hotel
SV counter is Counter({'Inform-Choice': 377, 'Inform-Name': 366, 'Inform-Type': 302, 'Inform-Price': 225, 'Inform-Area': 221, 'Inform-Stars': 170, 'Request-Area': 163, 'Recommend-Name': 135, 'Inform-Parking': 125, 'Inform-Internet': 122, 'Inform-Addr': 115, 'Request-Price': 110, 'Inform-Phone': 88, 'Inform-Post': 72, 'Select-none': 42, 'NoOffer-Type': 37, 'Recommend-Area': 35, 'Request-Stars': 34, 'Recommend-Price': 30, 'Recommend-Stars': 29, 'Request-Type': 20, 'Request-Parking': 19, 'NoOffer-Price': 19, 'Select-Area': 19, 'Recommend-Internet': 18, 'Recommend-Parking': 17, 'Select-Price': 17, 'Select-Type': 17, 'Request-Internet': 16, 'NoOffer-Area': 16, 'Recommend-Type': 16, 'NoOffer-Stars': 15, 'Recommend-Addr': 10, 'Select-Stars': 9, 'NoOffer-none': 9, 'Select-Name': 6, 'Request-Name': 5, 'Recommend-Post': 5, 'Recommend-Choice': 4, 'NoOffer-Inter

The number of examplar batch is 0, 0
The number of batch is 34, 4


In [45]:
import json

# Inspect the delexicalised sentences of the randomly selected Attraction examplars
# and report their mean length in whitespace-separated tokens.
experiment_type = 'examplar_random_64'
examplar_prefix = f"{experiment_prefix}{experiment_type}/examplars/"

# Context manager so the file handle is closed once the JSON has been parsed
with open(examplar_prefix+"Attraction_train_data_file.json", 'r') as examplar_fh:
    a = json.load(examplar_fh)

total_len = 0
# count is the 1-based position of the sentence and ends at the number of sentences
count = 0
for count, item in enumerate(a, start=1):
    sent = item[2]['delex']
    # split() without a separator ignores trailing/repeated spaces, which split(' ')
    # would count as extra empty tokens
    total_len += len(sent.split())
    print(count, sent)

# Mean over the number of sentences (0.0 when the file holds no examplars)
print(total_len / len(a) if a else 0.0)

1 the postcode is slot-attraction-inform-post .
2 yes slot-attraction-inform-name is located in the slot-attraction-inform-area . 
3 there are no slot-attraction-nooffer-type in the same area but there is slot-attraction-inform-choice in the slot-attraction-inform-area .
4 are you looking for one with slot-attraction-select-fee ?
5 unfortunately there is n't anything in the slot-attraction-nooffer-area that 's slot-attraction-nooffer-type , do you have an alternate preference or area you 'd like ? 
6 i would recommend slot-attraction-inform-name . the entrance fee is slot-attraction-inform-fee . 
7 slot-attraction-recommend-name is an slot-attraction-recommend-type in the slot-attraction-recommend-area , phone is slot-attraction-recommend-phone , post code is , slot-attraction-recommend-post and address is slot-attraction-recommend-addr , it is slot-attraction-recommend-fee to enter . 
8 of course - the address is slot-attraction-inform-addr slot-attraction-inform-post and the phone nu