In [1]:
# Relation-type name -> integer class id. The ids follow the order of the list
# below; the last entry ('Null', id 16) is the label used for "no relation".
map_relations = {
    name: idx
    for idx, name in enumerate([
        'Comment',
        'Contrast',
        'Correction',
        'Question-answer_pair',
        'Acknowledgement',
        'Elaboration',
        'Clarification_question',
        'Conditional',
        'Continuation',
        'Result',
        'Explanation',
        'Q-Elab',
        'Alternation',
        'Narration',
        'Confirmation_question',
        'Sequence',
        'Null',
    ])
}

In [2]:
# Integer class id -> relation-type name (the inverse of the name -> id table).
reverse_relations = dict(zip(
    range(17),
    (
        'Comment',
        'Contrast',
        'Correction',
        'Question-answer_pair',
        'Acknowledgement',
        'Elaboration',
        'Clarification_question',
        'Conditional',
        'Continuation',
        'Result',
        'Explanation',
        'Q-Elab',
        'Alternation',
        'Narration',
        'Confirmation_question',
        'Sequence',
        'Null',
    ),
))

In [3]:
import pickle
import json
from collections import Counter, defaultdict 
from sklearn.metrics import precision_recall_fscore_support,classification_report, ConfusionMatrixDisplay, confusion_matrix

In [4]:
# Directory the notebook runs from (IPython `%pwd` magic); prediction pickles
# and output files below are resolved relative to it.
home = %pwd
# The commented-out pair switches the evaluation to the TEST files.
# gold_test_path = '/home/kate/minecraft_utils/llm_annotator/annotated_data/TEST_101_bert.json'
# llama_test_path = home + '/msdc_llama/test-output-generate-2p-format.pkl'
# NOTE(review): the gold path is absolute and machine-specific -- adjust before running elsewhere.
gold_val_path = '/home/kate/minecraft_utils/llm_annotator/annotated_data/VAL_100_bert.json'
llama_val_path = home + '/msdc_llama/val-output-generate-2p-format.pkl'

In [5]:
with open(gold_val_path, 'r') as jf:
    test_gold = json.load(jf)

In [6]:
# Load the predictions into `test_pred`.
# NOTE: pickle.load can execute arbitrary code stored in the file -- only load
# pickles that were produced by a trusted source.
with open(llama_val_path, 'rb') as f:
    test_pred = pickle.load(f)

In [7]:
len(test_pred), len(test_gold)

(6466, 100)

combine test pred and test gold info: <br>
get speaker, global turn info from gold, and incoming result and continuation info from preds, <br>
add them to the edu objects in the test gold 

In [8]:
test_pred[100:104]

[[1, 28, 32, 1, 9], [1, 26, 32, 1, 2], [1, 32, 33, 1, 14], [1, 33, 34, 1, 3]]

##### step 1: add predicted relations to the gold data json <br>
NB: test gold is a list of games, whereas test pred is a single list.

In [9]:
# Attach to every gold game the predictions that carry its game index.
# Each row of test_pred is read as [game index, x, y, <unused>, relation id].
for i, game in enumerate(test_gold):
    game['predicted_relations'] = [
        {'x': row[1], 'y': row[2], 'type': reverse_relations[row[4]]}
        for row in test_pred
        if row[0] == i
    ]

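##### step 2 <br> add to each EDU the information needed for the second-pass (Narration) processing.
<br> info: global turn, Architect EDU index within the turn, 1/0 incoming Result, 1/0 Continuation from EDU 0 (`nar_start`), EDU type. <br> NB: the filter that would keep only Narration relations is currently commented out, so `game['relations']` is left unchanged.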

In [10]:
def contains_number(string):
    """Return True if at least one character of `string` is a digit."""
    for char in string:
        if char.isdigit():
            return True
    return False


def is_nl(edu):
    """
    Return 1 if every non-empty, space-separated token of `edu` contains a
    digit and is at least 5 characters long; return 0 otherwise.

    A text without any tokens (empty or only spaces) returns 1.
    """
    tokens = [tok for tok in edu.split(' ') if tok != '']
    return int(all(contains_number(tok) and len(tok) >= 5 for tok in tokens))

In [11]:
# Annotate every EDU of every game, in place, with the fields the second pass
# reads: 'global_index', 'turn', 'turn_ind' (Architect only), 'type', 'res'
# and 'nar_start'.
for game in test_gold:
    rels = game['predicted_relations']
    # rels = game['relations']
    edus = game['edus']
    # NOTE(review): new_rels is only referenced by the commented-out filter at
    # the bottom of this cell.
    new_rels = []

    # add turn index for the Architect and global turn info to all EDUs
    ind_cnt = 0         # position of the current EDU within a run of Architect EDUs
    global_cnt = 0      # turn counter, incremented whenever the speaker changes
    last_speaker = None
    global_index = 0    # running position of the EDU in the game's EDU list
    for edu in edus:
        edu['global_index'] = global_index
        global_index += 1
        speaker = edu['speaker']
        if speaker == last_speaker:
            # same speaker as the previous EDU: still the same turn
            edu['turn'] = global_cnt
        else:
            # speaker change: a new turn starts here
            last_speaker = speaker
            global_cnt += 1
            edu['turn'] = global_cnt
        if speaker == 'Architect':
            edu['turn_ind'] = ind_cnt
            ind_cnt += 1
            edu['type'] = 0
        elif speaker == 'Builder':
            # a Builder EDU resets the Architect counter
            ind_cnt = 0
            # also add type info: 1 when is_nl() accepts the text, else 0
            if is_nl(edu['text']):
                edu['type'] = 1 
            else:
                edu['type'] = 0
        # NOTE(review): an EDU whose speaker is neither 'Architect' nor
        # 'Builder' gets no 'type' assigned in this loop -- confirm that no
        # such speakers exist in the data.
        # initialise the incoming-Result and narration-start flags
        edu['res'] = 0
        edu['nar_start'] = 0

    # add incoming Result information (flag is set on the relation's target EDU)
    for rel in rels:
        ind = rel['y']
        if rel['type'] == 'Result':
            edus[ind]['res'] = 1
        # if rel['type'] == 'Acknowledgement':
        #     edus[ind]['ack'] = 1
        # mark the target of a Continuation whose source is EDU 0
        if rel['x'] == 0 and rel['type'] == 'Continuation':
            edus[ind]['nar_start'] = 1

        #keep only specified type of relations
#         if rel['type'] == 'Narration':
#             new_rels.append(rel)         
#     game['relations'] = new_rels

##### step 4: run second pass
<br> Overall, the idea is to go through each game extending a 'backbone' of Narrative arcs from Instruction 1 to Instruction 2, from Instruction 2 to Instruction 3, etc., with the last Narration going from Instruction n-1 to Instruction n. 
<br>For each game, find the first EDU that is connected to EDU 0 by a Continuation relation. This EDU will be the source EDU for the first Narrative arc. 
<br>For each subsequent EDU, update the state to reflect whether actions have taken place. The next EDU that occurs after actions have taken place, has the Architect as speaker, and is the target of a Result, becomes the target of the current Narrative arc, as well as the source of the next arc. 
<br> If no start is found, the game id is recorded in `no_start_games` and the source of the first arc defaults to EDU 0. 
<br> The output of this step is a list of games; each game is a list of narrative arcs in the format `[game index, source, target, 1, 13]`. 

In [12]:
# Second pass: build a chain of Narration arcs for every game.
# Output: `predictions[i]` is a list of [game index, source, target, 1, 13]
# entries (13 is the Narration class id); `no_start_games` collects the ids of
# games in which no EDU is flagged as narration start.
predictions = []
no_start_games = []
for i, game in enumerate(test_gold):
    narration_guesses = []
    actions_happening = 0
    actions_happened = 0
    narr = [0, 0]
    starts = [edu['global_index'] for edu in game['edus'] if edu['nar_start'] == 1]
    if starts:
        start = starts[0]
    else:
        print('no start')
        no_start_games.append(game['id'])
        # If no start can be found, the source of the first narration is 0.
        # `start` must be set explicitly here: it used to be left unbound
        # (NameError on the first game) or to keep the previous game's value.
        start = 0
    narr[0] = start
    for edu in game['edus'][start+1:]:
        if edu['type'] == 1 and actions_happening == 0:  # builder is moving
            actions_happening = 1
        if edu['type'] == 0 and actions_happening == 1:  # someone is talking now
            actions_happened = 1
        if edu['speaker'] == 'Architect' and edu['res'] == 1 and actions_happened == 1:
            narr[1] = edu['global_index']
            # store the arc together with the game index
            narration_guesses.append([i, narr[0], narr[1], 1, 13])
            # the target of this arc is the source of the next one
            narr = [edu['global_index'], 0]
            actions_happening = 0
            # NOTE(review): the flag stays at 1 after the first arc, so every
            # later Architect EDU with an incoming Result closes an arc even if
            # no new action was seen -- confirm this is intended (vs. 0).
            actions_happened = 1
    predictions.append(narration_guesses)

no start


In [13]:
len(predictions)

100

In [14]:
with open(home + '/pickles/' + 'llamipa_second_pass.pkl', 'wb') as f:
    pickle.dump(predictions, f)

##### step 5: get F1 for second pass on Narration arcs only
<br> This ignores intra-turn Narrations

In [15]:
# Gold Narration arcs per game, restricted to arcs whose endpoints lie in
# different turns (intra-turn Narrations are ignored) and whose length
# y - x is below 16 (D == 15 or less; use 11 for D == 10 or less).
gold_narrs = []
for game in test_gold:
    turn_of = {edu['global_index']: edu['turn'] for edu in game['edus']}
    gold_narrs.append([
        (rel['x'], rel['y'])
        for rel in game['relations']
        if rel['type'] == 'Narration'
        and turn_of[rel['x']] != turn_of[rel['y']]
        and rel['y'] - rel['x'] < 16
    ])

In [16]:
# Flatten the per-game arcs into two parallel 0/1 lists for sklearn:
# gold arc found in preds -> (1, 1), gold arc missed -> (1, 0),
# predicted arc absent from gold -> (0, 1).
gold_narrations = []
pred_narrations = []

for idx, gold_game in enumerate(gold_narrs):
    # D == 15 or less (use 11 for D == 10 or less)
    preds = [(p[1], p[2]) for p in predictions[idx] if p[2] - p[1] < 16]
    for arc in gold_game:  # TP or FN
        gold_narrations.append(1)
        pred_narrations.append(1 if arc in preds else 0)
    for arc in preds:  # FP
        if arc in gold_game:
            continue
        gold_narrations.append(0)
        pred_narrations.append(1)

In [17]:
assert len(gold_narrations) == len(pred_narrations)

In [18]:
scores = precision_recall_fscore_support(gold_narrations, pred_narrations, average='binary')

In [19]:
scores

(0.8075, 0.8187579214195184, 0.8130899937067338, None)

##### step 6: get F1 for second pass on all narrations
<br> This includes intra-turn narrations

In [20]:
# Gold Narration arcs per game, this time including intra-turn Narrations;
# only the distance limit applies (D == 15 or less; use 11 for D == 10 or less).
gold_narrs = [
    [
        (rel['x'], rel['y'])
        for rel in game['relations']
        if rel['type'] == 'Narration' and rel['y'] - rel['x'] < 16
    ]
    for game in test_gold
]

In [21]:
#add back in the intra narrations to the predictions 
all_narr_predictions = [p for p in predictions]
for i, game in enumerate(test_gold):
    narrs = []
    rels = game['predicted_relations']
    edus = game['edus']
    turn_dict = {edu['global_index']:edu['turn'] for edu in edus}
    for rel in rels:
        if rel['type'] == 'Narration':
            if turn_dict[rel['x']] == turn_dict[rel['y']]:
                # if rel['y'] - rel['x'] < 11: ### D == 10 or less
                if rel['y'] - rel['x'] < 16: ### D == 15 or less
                    # print('heres one')
                    narrs.append((i, rel['x'], rel['y'], 1, 13))
    all_narr_predictions[i].extend(narrs)


In [22]:
len(predictions), len(all_narr_predictions)

(100, 100)

In [23]:
#now rearrange
gold_narrations = []
pred_narrations = []

for i, game in enumerate(gold_narrs):
    #preds = predictions[i]
    # preds = [(p[1], p[2]) for p in predictions[i] if p[2]-p[1] < 11] ### D == 10 or less
    preds = [(p[1], p[2]) for p in predictions[i] if p[2]-p[1] < 16] ### D == 15 or less
    for rel in game:
        if rel in preds: #TP
            gold_narrations.append(1)
            pred_narrations.append(1)
        elif rel not in preds: #FN
            gold_narrations.append(1)
            pred_narrations.append(0)
    for rel in preds: #FP
        if rel not in game:
            gold_narrations.append(0)
            pred_narrations.append(1)

In [24]:
assert len(gold_narrations) == len(pred_narrations)

In [25]:
scores = precision_recall_fscore_support(gold_narrations, pred_narrations, average='binary')

In [26]:
scores

(0.793398533007335, 0.8092269326683291, 0.8012345679012346, None)

##### step 6: add the new Narration arcs to the rest of the predicted relations and re-compute the attachment and F1 scores.


combine predictions and new test pred, <br> 
**but make sure that any relations that have the same endpoints as a new narrative arc are removed!!**

In [27]:
# Add the non-Narration predictions of Llamipa to the narration arcs.
# A predicted relation whose endpoints coincide with one of the new narrative
# arcs is not counted; it is collected in `sidelined_rels` instead.
# Each per-game list is copied so that extending it does not also grow
# `all_narr_predictions` (shared inner lists), which made this cell append
# duplicates when re-run.
all_rel_predictions = [list(p) for p in all_narr_predictions]
sidelined_rels = []
for i, game in enumerate(test_gold):
    narrs = []
    # endpoints of the narrative arcs of this game; D == 15 or less (use 11 for D == 10 or less)
    new_arcs = {(p[1], p[2]) for p in all_narr_predictions[i] if p[2] - p[1] < 16}
    for rel in game['predicted_relations']:
        if rel['type'] == 'Narration':
            continue
        # forward relations only; D == 15 or less (use 11 for D == 10 or less)
        if rel['y'] > rel['x'] and rel['y'] - rel['x'] < 16:
            entry = (i, rel['x'], rel['y'], 1, map_relations[rel['type']])
            if (rel['x'], rel['y']) in new_arcs:
                # endpoints already taken by a new narrative arc: don't count the relation
                sidelined_rels.append(entry)
            else:
                narrs.append(entry)
    all_rel_predictions[i].extend(narrs)

In [28]:
# Gold relations per game as (game index, x, y, 1, relation id) tuples,
# keeping forward relations with y - x below 16 (D == 15 or less; use 11 for
# D == 10 or less).
all_gold = [
    [
        (i, rel['x'], rel['y'], 1, map_relations[rel['type']])
        for rel in game['relations']
        if rel['y'] > rel['x'] and rel['y'] - rel['x'] < 16
    ]
    for i, game in enumerate(test_gold)
]

In [29]:
len(all_gold), len(all_rel_predictions)

(100, 100)

In [30]:
# Build the (gold label, predicted label) pairs for the classification report.
# Exact matches contribute [label, label]. Every remaining endpoint pair
# contributes one [gold, pred] entry in which a missing side keeps 16.
total_TP = []
matrix_list = []
for game_idx, gold_game in enumerate(all_gold):
    # the distance limit on gold was already applied when all_gold was built
    gold_triples = [[r[1], r[2], r[4]] for r in gold_game]
    # D == 15 or less (use 11 for D == 10 or less)
    pred_triples = [[r[1], r[2], r[4]] for r in all_rel_predictions[game_idx] if r[2] - r[1] < 16]

    # true positives: same endpoints and same relation type
    matched = [t for t in pred_triples if t in gold_triples]
    total_TP.extend(matched)
    matrix_list.extend([t[2], t[2]] for t in matched)

    # unmatched relations, keyed by endpoints -> [gold label, pred label];
    # when several unmatched relations share endpoints, the last one wins
    unmatched = {}
    for x, y, label in pred_triples:
        if [x, y, label] not in matched:
            unmatched.setdefault((x, y), [16, 16])[1] = label
    for x, y, label in gold_triples:
        if [x, y, label] not in matched:
            unmatched.setdefault((x, y), [16, 16])[0] = label
    matrix_list.extend(unmatched.values())
    

In [31]:
len(matrix_list)

7337

In [32]:
# Split the [gold, pred] pairs into the two label sequences passed to sklearn.
gold = [m[0] for m in matrix_list]
pred = [m[1] for m in matrix_list]
all_gold_labels = gold + pred
# The label list is turned into `target_names` for classification_report,
# which (when `labels` is not passed) orders the classes by sorted label
# value. Sort explicitly: list(set(...)) gives no ordering guarantee, and a
# different order would attach the wrong names to the rows of the report.
labels = sorted(set(all_gold_labels))
print(labels)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


In [33]:
new_labels = [reverse_relations[l] for l in labels]

In [34]:
print(classification_report(gold,pred,target_names=new_labels))

                        precision    recall  f1-score   support

               Comment       0.62      0.57      0.59       323
              Contrast       0.83      0.71      0.76        75
            Correction       0.73      0.69      0.71       406
  Question-answer_pair       0.88      0.86      0.87       409
       Acknowledgement       0.88      0.84      0.86       895
           Elaboration       0.78      0.74      0.76       806
Clarification_question       0.77      0.82      0.79       190
           Conditional       0.00      0.00      0.00         9
          Continuation       0.44      0.59      0.50       324
                Result       0.89      0.89      0.89      2003
           Explanation       0.00      0.00      0.00        55
                Q-Elab       0.53      0.37      0.43        49
           Alternation       0.77      0.75      0.76        36
             Narration       0.79      0.81      0.80       800
 Confirmation_question       0.95      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [35]:
# Support-weighted average of precision / recall / F1 over every class except
# "Null".
d = classification_report(gold,pred,target_names=new_labels,output_dict=True)
weighted = {"precision": 0, "recall": 0, "f1-score": 0}
count = 0

for label in new_labels:
    if label == "Null":
        continue
    row = d[label]
    for metric in weighted:
        weighted[metric] += row[metric] * row["support"]
    count += row["support"]

prec = weighted["precision"]
rec = weighted["recall"]
f1 = weighted["f1-score"]

print("Weighted Average Precision:", prec/count)
print("Weighted Average Recall:", rec/count)
print("Weighted Average F1 score:", f1/count)

Weighted Average Precision: 0.802879210508278
Weighted Average Recall: 0.7950075642965204
Weighted Average F1 score: 0.7976879694976644


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:
# attachment scores: 1 when the label is anything but 16 ('Null'), else 0
attach_pred = [int(label != 16) for label in pred]
attach_gold = [int(label != 16) for label in gold]

In [37]:
scores = precision_recall_fscore_support(attach_gold, attach_pred, average='binary')

In [38]:
scores

(0.8885994483604045, 0.8773071104387292, 0.8829171741778319, None)

##### STEP 6. Calculate the scores from the regular Llamipa outputs (no second pass)

In [39]:
# Same pairing as for the second pass, but using Llamipa's own predicted
# relations. Exact matches give [label, label]; every other endpoint pair
# gives [gold, pred], with 16 on the side that has no relation.
total_TP = []
matrix_list = []
for i, game in enumerate(test_gold):
    # forward relations only; D == 15 or less (use 11 for D == 10 or less)
    game_rels = [
        (i, rel['x'], rel['y'], 1, map_relations[rel['type']])
        for rel in game['relations']
        if rel['y'] > rel['x'] and rel['y'] - rel['x'] < 16
    ]
    pred_rels = [
        (i, prel['x'], prel['y'], 1, map_relations[prel['type']])
        for prel in game['predicted_relations']
        if prel['y'] > prel['x'] and prel['y'] - prel['x'] < 16
    ]

    # true positives: identical endpoints and relation type
    TP = [e for e in pred_rels if e in game_rels]
    total_TP.extend(TP)
    for hit in TP:
        matrix_list.append([hit[4], hit[4]])

    # leftovers keyed by endpoints -> [gold label, pred label]; the last
    # leftover relation seen for a given endpoint pair wins
    pairs = {}
    for entry in pred_rels:
        if entry not in TP:
            pairs.setdefault((entry[1], entry[2]), [16, 16])[1] = entry[4]
    for entry in game_rels:
        if entry not in TP:
            pairs.setdefault((entry[1], entry[2]), [16, 16])[0] = entry[4]
    for pair in pairs.values():
        matrix_list.append(pair)

In [40]:
# Split the [gold, pred] pairs into the two label sequences passed to sklearn.
gold = [m[0] for m in matrix_list]
pred = [m[1] for m in matrix_list]
all_gold_labels = gold + pred
# Sorted on purpose: these labels become `target_names` for
# classification_report, which orders classes by sorted label value when
# `labels` is not passed; list(set(...)) does not guarantee that order.
labels = sorted(set(all_gold_labels))
print(labels)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


In [41]:
new_labels = [reverse_relations[l] for l in labels]

In [42]:
print(classification_report(gold,pred,target_names=new_labels))

                        precision    recall  f1-score   support

               Comment       0.62      0.57      0.59       323
              Contrast       0.83      0.71      0.76        75
            Correction       0.73      0.69      0.71       406
  Question-answer_pair       0.88      0.86      0.87       409
       Acknowledgement       0.88      0.84      0.86       895
           Elaboration       0.78      0.74      0.76       806
Clarification_question       0.77      0.82      0.79       190
           Conditional       0.00      0.00      0.00         9
          Continuation       0.44      0.59      0.50       324
                Result       0.89      0.89      0.89      2003
           Explanation       0.00      0.00      0.00        55
                Q-Elab       0.53      0.37      0.43        49
           Alternation       0.77      0.75      0.76        36
             Narration       0.80      0.76      0.78       800
 Confirmation_question       0.95      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [43]:
# Support-weighted precision / recall / F1 over all classes but "Null".
d = classification_report(gold,pred,target_names=new_labels,output_dict=True)
prec, rec, f1, count = 0, 0, 0, 0

for label in (name for name in new_labels if name != "Null"):
    support = d[label]["support"]
    prec += d[label]["precision"]*support
    rec += d[label]["recall"]*support
    f1 += d[label]["f1-score"]*support
    count += support

print("Weighted Average Precision:", prec/count)
print("Weighted Average Recall:", rec/count)
print("Weighted Average F1 score:", f1/count)

Weighted Average Precision: 0.8034073733878314
Weighted Average Recall: 0.7881996974281392
Weighted Average F1 score: 0.7944759859918737


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [44]:
# attachment scores: a pair counts as attached when its label is not 16 ('Null')
attach_pred = [0 if p == 16 else 1 for p in pred]
attach_gold = [0 if g == 16 else 1 for g in gold]

In [45]:
scores = precision_recall_fscore_support(attach_gold, attach_pred, average='binary')

In [46]:
scores

(0.8900232018561485, 0.8704992435703479, 0.880152963671128, None)

##### STEP 7. Calculate the Llamipa score on just the Narrative arcs

In [47]:
# Inter-turn Narration arcs per game, once from the gold relations and once
# from Llamipa's predicted relations; D == 15 or less (use 11 for D == 10 or less).
gold_arcs = []
pred_arcs = []
for game in test_gold:
    turn_of = {edu['global_index']: edu['turn'] for edu in game['edus']}
    for key, bucket in (('relations', gold_arcs), ('predicted_relations', pred_arcs)):
        bucket.append([
            (rel['x'], rel['y'])
            for rel in game[key]
            if rel['type'] == 'Narration'
            and turn_of[rel['x']] != turn_of[rel['y']]
            and rel['y'] - rel['x'] < 16
        ])
  

In [48]:
# Flatten gold vs. Llamipa arcs into parallel 0/1 lists:
# (1, 1) = TP, (1, 0) = FN, (0, 1) = FP.
gold_narrations = []
pred_narrations = []

for gold_game, preds in zip(gold_arcs, pred_arcs):
    for arc in gold_game:
        gold_narrations.append(1)
        pred_narrations.append(int(arc in preds))
    false_pos = [arc for arc in preds if arc not in gold_game]
    gold_narrations.extend(0 for _ in false_pos)
    pred_narrations.extend(1 for _ in false_pos)

In [49]:
assert len(gold_narrations) == len(pred_narrations)

In [50]:
scores = precision_recall_fscore_support(gold_narrations, pred_narrations, average='binary')

In [51]:
scores

(0.8179347826086957, 0.7629911280101395, 0.7895081967213115, None)

#### So what is the second pass score on all narrative arcs

In [52]:
with open(home + '/pickles/' + 'llamipa_second_pass.pkl', 'rb') as f:
    secondpass = pickle.load(f)

In [53]:
# Gold inter-turn Narration arcs per game, with NO distance limit
# (the D == 10 / D == 15 filters are deliberately not applied here).
gold_arcs = []
for game in test_gold:
    turn_of = {edu['global_index']: edu['turn'] for edu in game['edus']}
    gold_arcs.append([
        (rel['x'], rel['y'])
        for rel in game['relations']
        if rel['type'] == 'Narration' and turn_of[rel['x']] != turn_of[rel['y']]
    ])

In [68]:
# Score the pickled second-pass arcs against gold (no distance limit) and
# keep the misses and the spurious arcs as [game index, x, y] for inspection.
gold_narrations = []
pred_narrations = []
secondpass_falsepos = []
secondpass_falseneg = []

for i, game in enumerate(gold_arcs):
    preds = [(p[1], p[2]) for p in secondpass[i]]  # no distance filter
    for rel in game:
        hit = rel in preds
        gold_narrations.append(1)
        pred_narrations.append(1 if hit else 0)
        if not hit:  # FN
            secondpass_falseneg.append([i, rel[0], rel[1]])
    for rel in preds:
        if rel in game:
            continue
        # FP
        gold_narrations.append(0)
        pred_narrations.append(1)
        secondpass_falsepos.append([i, rel[0], rel[1]])

In [69]:
assert len(gold_narrations) == len(pred_narrations)

In [70]:
scores = precision_recall_fscore_support(gold_narrations, pred_narrations, average='binary')

In [71]:
scores

(0.7942238267148014, 0.8088235294117647, 0.8014571948998178, None)

#### comparison of Narrations in Gold and 2p.

In [79]:
Counter([s[2] - s[1] for s in secondpass_falsepos])

Counter({4: 29,
         3: 26,
         2: 22,
         8: 13,
         5: 12,
         6: 11,
         11: 8,
         9: 8,
         15: 5,
         12: 5,
         7: 5,
         10: 5,
         18: 4,
         13: 4,
         16: 3,
         19: 2,
         17: 2,
         22: 2,
         39: 2,
         14: 1,
         44: 1,
         21: 1})

In [80]:
Counter([s[2] - s[1] for s in secondpass_falseneg])

Counter({3: 19,
         6: 19,
         7: 16,
         4: 15,
         8: 12,
         5: 12,
         2: 11,
         9: 10,
         10: 8,
         13: 7,
         14: 5,
         11: 5,
         15: 3,
         20: 3,
         17: 2,
         16: 2,
         18: 2,
         24: 1,
         21: 1,
         12: 1,
         27: 1,
         29: 1})

In [87]:
len(secondpass_falseneg), len(secondpass_falsepos)

(156, 171)

In [86]:
# Write the gold Narration arcs that the second pass missed (false negatives)
# to a text file, grouped by game: source EDU, arrow, target EDU.
# The file is opened in a `with` block so that it is flushed and closed when
# the cell finishes (it used to be opened and never closed).
with open(home + "/secondpass_llamipa_falsenegative.txt","w") as f:
    for i, game in enumerate(test_gold):
        missed = [fn for fn in secondpass_falseneg if fn[0] == i]
        if len(missed) > 0:
            edus = game['edus']
            gameid = game['id']
            print(gameid, file=f)
            for rel in missed:
                print(str(rel[1]) + '. ' + edus[rel[1]]['speaker'] + ': ' + edus[rel[1]]['text'], file=f)
                print('------->>', file=f)
                print(str(rel[2]) + '. ' + edus[rel[2]]['speaker'] + ': ' + edus[rel[2]]['text'], file=f)
                print('===', file=f)
            print('------------------------------\n', file=f)

#### So what exactly is the difference between llamipa's arcs and the narrative pass arcs?

In [None]:
pred_arcs[0]

In [None]:
len(pred_arcs), len(secondpass)

In [None]:
secondpass[0]

In [None]:
pred_arcs[0]

In [None]:
# Arcs predicted by only one of the two systems, as (game index, x, y).
just_in_llamipa = []
just_in_pass = []
for i, x in enumerate(pred_arcs):
    # `pred_arcs` was built with D == 15 or less (y - x < 16), so the
    # second-pass arcs must use the same limit. With the former `< 11`, every
    # Llamipa arc of distance 11-15 was reported as "Llamipa only" even when
    # the second pass had predicted it too.
    check = [(c[1], c[2]) for c in secondpass[i] if c[2] - c[1] < 16]
    for rel in check:
        if rel not in x:
            just_in_pass.append((i, rel[0], rel[1]))
    for z in x:
        if z not in check:
            just_in_llamipa.append((i, z[0], z[1]))


In [None]:
just_in_pass

In [None]:
just_in_llamipa

### now figure out which are correct

In [None]:
# One row per game: [game index, Llamipa-only arcs, how many of them are in
# gold, second-pass-only arcs, how many of them are in gold].
# The string 'None' marks "no arcs" as well as "none of them correct".
big_list = []
table_lables = ['index', 'llamipa only', 'llamipa_correct', 'pass only', 'pass_correct']
for i, game in enumerate(gold_arcs):
    g = [i]
    for exclusive in (just_in_llamipa, just_in_pass):
        arcs = [(m[1], m[2]) for m in exclusive if m[0] == i]
        if not arcs:
            g.extend(['None', 'None'])
            continue
        g.append(arcs)
        # overlap between these arcs and the gold arcs of the game
        n_correct = len([arc for arc in arcs if arc in game])
        g.append(n_correct if n_correct > 0 else 'None')
    big_list.append(g)



In [None]:
big_list_edit = [b for b in big_list if b[1]!='None' or b[3]!='None']

In [None]:
import pandas

In [None]:
print('                                         ')
print(pandas.DataFrame(big_list_edit, columns=table_lables))
print('                                         ')

In [None]:
#total_arcs  = 
sum([len(l) for l in gold_arcs])