In [28]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Notes

Natural language inference in dialogs

- Consistency:
    - Agent changes its answer despite the context remains pointing to one of them / other. Eg: dial 100_00000, turn 6, slot category
        
        See Jason Weston team's paper on Dialog NLI
  

- OR selection:
    - Consider the utterance formation: "(X1) (object A) or (object B) (X2) "
        the model needs to pick object A or B conditioned on X1, X2 and context. Eg: dial 100_00001, turn 12, slot movie_name
        
        A change in as subtle as one word piece might change from Object A to B.. or viceversa.
        
        One object needs to be selected from many of same type. many-1 mapping.
        
        see Winograd, Entailment recognition tasks in GLUE benchmark


- AND conjunction
    - Consider the utterance formation: "...(object A subquery) (object A) and (object B subquery) object B ... "
        eg: "I want to go from JKF to SFO tomorrow evening"
        
        multiple objects of same type need to be selected. 1-1 mapping.. 
        

- Temporal selection
    - Eg: dial 37_00035
        Airlines are categorical values initialized in memory. Agent offers Alaska Airlines at turn 4, American Airlines at turn 6,
        Delta Airlines at turn 8. User aceepts Delta at turn 10, but prediction points to American.
        
        Note that, categorical fields are not temporally ordered in memory. Memory is initialized with them as appeared in schema.


- Value disambiguation
    - Eg: dial 44_00043 slot buses_2/fare_type
    
    - Values of a slot are given a description. The semantic representations of these values although numerous, are practically
        limited. But atypical representation needs to be gracefully handled.
          
    - Selection of non categorical and categorical values are different pattern recognition challenges.
        - In case of noncat values, the semantic representations in memory and dialog match exactly
        - In case of cat values, the semantic representations in memory and dialog DO NOT match. The model in addition needs to 
            disambiguate the mapping from utterances / context. This involves coreference and ellipsis and common sense resolutions.
        
    - Intutively, Rastogi el al. approach to remove separate slot tagger, equalizes the complexity of selection of cat and noncat values.

      
Model limitations

- Lack of dialog history
    - Each turn is modeled independently and all slots/turn; as such the model predicts despite lack of context for that specific slot. 
        
        It surprisingly does well hmmm. interesting.
        
        Ideally predict change of value or condition on proper context. Eg: dial 37_00002, turn 6, slot seating_class
        
        
- Categorical values are not tagged
    - All cat values are initialized in memory at the beginning (thus have no temporal order unlike noncat values). is that a good idea?

In [1]:
import glob
import json
import pandas as pd
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from tabulate import tabulate
from ipdb import set_trace

In [2]:
target_dial = {}
for fname in tqdm(glob.glob("../data/train/dialogues*.json")):
    with open(fname) as f:
        ds = json.load(f)
        for dial in ds:
            dial_id = dial["dialogue_id"]
            target_dial[dial_id] = dial       

HBox(children=(IntProgress(value=0, max=127), HTML(value='')))




In [3]:
pred_dial = {}
with open("../out/out1-train/dialogues.json") as f:
    ds = json.load(f)
    for dial in ds:
        dial_id = dial["dialogue_id"]
        pred_dial[dial_id] = dial

HBox(children=(IntProgress(value=0, max=16142), HTML(value='')))




In [4]:
assert set(target_dial.keys()) == set(pred_dial.keys())

In [5]:
def num_errors(pred, target):
    errors = 0
    assert pred["dialogue_id"] == target["dialogue_id"]
    for tid, (p, t) in enumerate(zip(pred["turns"], target["turns"])):
        if t["speaker"] == "USER":
            turn_results = defaultdict(dict) # dial,turn,serv,slot->(t_val, p_val)
            # fill targets
            for frame in t["frames"]:
                serv = frame["service"]
                for slot, vals in frame["state"]["slot_values"].items():
                    turn_results[tid,serv,slot] = vals[0]
            # fill predictions
            for frame in p["frames"]:
                serv = frame["service"]
                for slot, vals in frame["state"]["slot_values"].items():
                    errors += 1-int(turn_results[tid,serv,slot] == vals[0])
    return errors

In [6]:
errors_freq = {}
for k in tqdm(target_dial):
    errors_freq[k] = num_errors(pred_dial[k], target_dial[k])

HBox(children=(IntProgress(value=0, max=16142), HTML(value='')))




In [7]:
dial_ids = sorted(target_dial.keys(), key=lambda k: errors_freq[k], reverse=True)

In [30]:
dial_ids[10:100]

['75_00002',
 '74_00093',
 '37_00052',
 '37_00123',
 '37_00127',
 '74_00080',
 '56_00071',
 '38_00023',
 '60_00031',
 '72_00113',
 '75_00039',
 '37_00011',
 '37_00041',
 '60_00048',
 '74_00070',
 '73_00054',
 '60_00027',
 '72_00124',
 '37_00063',
 '37_00064',
 '21_00086',
 '72_00069',
 '72_00105',
 '75_00044',
 '37_00029',
 '60_00041',
 '116_00042',
 '38_00022',
 '38_00038',
 '21_00101',
 '18_00026',
 '37_00047',
 '60_00095',
 '38_00045',
 '74_00118',
 '74_00122',
 '58_00041',
 '58_00121',
 '58_00127',
 '72_00080',
 '72_00082',
 '75_00025',
 '37_00036',
 '60_00071',
 '36_00118',
 '55_00088',
 '74_00058',
 '75_00024',
 '73_00025',
 '19_00007',
 '57_00101',
 '58_00096',
 '59_00003',
 '56_00006',
 '56_00015',
 '73_00000',
 '54_00050',
 '37_00006',
 '37_00062',
 '80_00092',
 '21_00114',
 '18_00043',
 '55_00028',
 '105_00023',
 '58_00120',
 '53_00102',
 '56_00000',
 '56_00051',
 '75_00003',
 '75_00032',
 '75_00040',
 '73_00019',
 '73_00043',
 '37_00040',
 '37_00100',
 '18_00036',
 '74_00074

In [31]:
def wrap(s, limit=100, delimiter="<br>"):
    x = []
    for i,j in enumerate(s):
        if i % limit == 0:
            x.append(delimiter)
        x.append(j)
    return "".join(x)

def print_dial(pred, target):
    results = []
    exists = set()
    assert pred["dialogue_id"] == target["dialogue_id"]
    dial = pred["dialogue_id"]
    for tid, (p, t) in enumerate(zip(pred["turns"], target["turns"])):
        if t["speaker"] == "USER":
            # utter
            u_utter = t["utterance"]
            if tid > 0: s_utter = target["turns"][tid-1]["utterance"]
            else: s_utter = "NONE"
            utter = "SYS: {} <br><br> USR: {}".format(wrap(s_utter), wrap(u_utter))
            
            turn_results = defaultdict(dict) # dial,turn,serv,slot->(t_val, p_val)
            
            # fill targets
            for frame in t["frames"]:
                serv = frame["service"]
                for slot, vals in frame["state"]["slot_values"].items():
                    if (dial,tid) in exists: utter_field = ""
                    else: utter_field = utter
                    turn_results[dial,tid,serv,slot] = [vals[0], "UNK", False, utter_field]
                    exists.add((dial,tid))
            
            # fill predictions
            for frame in p["frames"]:
                serv = frame["service"]
                for slot, vals in frame["state"]["slot_values"].items():
                    turn_results[dial,tid,serv,slot][1] = vals[0]
                    turn_results[dial,tid,serv,slot][2] = (
                        turn_results[dial,tid,serv,slot][0] == turn_results[dial,tid,serv,slot][1])
            
            # when slot-values are empty
            if not turn_results:
                turn_results[dial,tid,None,None] = (None, None, None, utter)

            # fill results
            for k, v in turn_results.items():
                results.append(k + tuple(v))

    header = ("dial", "turn", "serv", "slot", "target", "pred", "correct", "utter")
    df = pd.DataFrame(results, columns=header)
    
    # rearrange cols
    ordered_header = ["dial", "turn", "utter", "serv", "slot", "target", "pred", "correct"]
    df = df[ordered_header]

    def highlight(r):
        row = [""] * len(header)
        if not r.correct:
            row[-1] = "background-color: pink"
            row[-2] = "background-color: pink"
            row[-3] = "background-color: pink"
        return row
    
    df = df.style.apply(highlight, axis=1)
    display(df)

    
p = pred_dial["37_00040"]
t = target_dial["37_00040"]
print_dial(p, t)

Unnamed: 0,dial,turn,utter,serv,slot,target,pred,correct
0,37_00040,0,SYS: NONE USR: Hi. Can you help me book a flight traveling in premium economy?,Flights_1,seating_class,Premium Economy,Premium Economy,True
1,37_00040,2,"SYS: Absolutely! Where will you be departing from? USR: Seattle, WA.",Flights_1,origin_city,"Seattle, WA","Seattle, WA",True
2,37_00040,2,,Flights_1,seating_class,Premium Economy,Economy,False
3,37_00040,4,"SYS: Great. Where are you traveling to and when do you plan to leave? USR: I want to leave Friday next week. I'm flying to Vancouver, BC.",Flights_1,departure_date,Friday next week,Friday next week,True
4,37_00040,4,,Flights_1,destination_city,"Vancouver, BC","Seattle, WA",False
5,37_00040,4,,Flights_1,origin_city,"Seattle, WA","Vancouver, BC",False
6,37_00040,4,,Flights_1,seating_class,Premium Economy,Economy,False
7,37_00040,6,SYS: There's an Alaska Airlines flight that leaves at 3:45 pm. It has 1 stop and costs $428. Does that in terest you at all? USR: Are there any other flights? I have three people in my group.,Flights_1,departure_date,Friday next week,Friday next week,True
8,37_00040,6,,Flights_1,destination_city,"Vancouver, BC","Vancouver, BC",True
9,37_00040,6,,Flights_1,origin_city,"Seattle, WA","Seattle, WA",True
