# Modelling Conceptual Pacts over time in PentoCV

In [1]:
#!pip install google_trans_new
!pip install -U deep-translator
!pip install scikit-learn
!pip install sciPy
!pip install nltk



In [3]:
import sys

# You need PentoRef python on your path
sys.path.append("../../pentoref/code/python/")

In [5]:
import pentoref.IO as IO
import sqlite3 as sqlite
#from google_trans_new import google_translator
from deep_translator import GoogleTranslator
import datetime

import sklearn
from sklearn.feature_extraction import DictVectorizer
#from sklearn.preprocessing import LabelEncoder
#l = LabelEncoder()

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, classification_report

from nltk.classify import SklearnClassifier
#nltk.download('punkt')  # if using stemming in German

from random import shuffle
import numpy as np
import scipy.stats as stats
from scipy import optimize
import pandas as pd
from math import log
import random
import copy
import time

from collections import Counter
import matplotlib.pyplot as plt

from pentoref.IOutils import clean_utt
from machine_learning_utils import calculate_mcnemar_test

In [6]:
def remove_reparanda(utt):
    """Drop the reparandum text of disfluency-annotated repairs from `utt`.

    A "(" opens a repair and a "+" closes its reparandum. Both marker
    characters are always kept; every other character is discarded while at
    least one reparandum is still open. ")" is not treated specially.

    Raises AssertionError if the "(" and "+" markers do not balance.
    """
    open_reparanda = 0
    kept_chars = []
    for ch in utt:
        if ch == "(":
            open_reparanda += 1
        elif ch == "+":
            open_reparanda -= 1
        elif open_reparanda > 0:
            # inside a reparandum: skip ordinary characters
            continue
        kept_chars.append(ch)
    assert open_reparanda==0, "repair depth not 0:" + utt
    return "".join(kept_chars)
clean_utt(remove_reparanda("(( hello + hello) + {F um } hello)"))
          

'hello'

In [7]:
# Optionally (re)build the SQLite database(s) from the derived PentoRef data.
if False:   # make True if you need to create the databases from the derived data
    for corpus_name in ["PENTOCV"]: # "TAKE", "TAKECV", 
        data_dir = "../../pentoref/" + corpus_name + "_PENTOREF"
        db_path = corpus_name + ".db"
        # one dataframe per table that ends up in the database
        (dfwords, dfutts, dfrefs,
         dfscenes, dfactions) = IO.convert_subcorpus_raw_data_to_dataframes(data_dir)
        IO.write_corpus_to_database(db_path, corpus_name,
                                    dfwords, dfutts, dfrefs, dfscenes, dfactions)

In [8]:
# Open the corpus database and print the column names of every table
CORPUS = "PENTOCV"
db = sqlite.connect(CORPUS + ".db")
cursor = db.cursor()
for table_name in ("utts", "words", "refs", "scenes", "actions"):
    # element 1 of each PRAGMA table_info row is the column name
    column_names = [info[1] for info in cursor.execute("PRAGMA table_info({0})".format(table_name))]
    print(table_name, column_names)

utts ['gameID', 'uttID', 'starttime', 'endtime', 'utt', 'utt_clean', 'role', 'speaker']
words ['gameID', 'uttID', 'position', 'word', 'lemma', 'tag']
refs ['refID', 'gameID', 'uttID', 'text', 'id', 'piece', 'location']
scenes ['timestampID', 'gameID', 'pieceID', 'position_global', 'position_x', 'position_y', 'shape', 'shape_distribution', 'shape_orientation', 'shape_skewness_horizontal', 'shape_skewness_vertical', 'shape_edges', 'colour', 'colour_distribution', 'colour_hsv', 'colour_rgb']
actions ['gameID', 'starttime', 'endtime', 'hand', 'action', 'piece']


## Get utterances from certain time periods in each experiment or for certain episodes

In [9]:
if False:  # make True to print each utterance of one game with an English machine translation
    # Source/target languages are fixed here; translate() below needs only the text.
    translator = GoogleTranslator(source='de', target='en')

    # Alternative filter kept for reference:
    #   " WHERE starttime >= 200 AND starttime <= 300"
    for row in db.execute("SELECT gameID, starttime, speaker, utt_clean, utt FROM utts"
                          ' WHERE gameID = "r1_1_1_b"'
                          " ORDER BY gameID, starttime"):
        print(row)
        line = row[3]  # utt_clean
        print(line)
        if not line:
            # nothing to translate for empty cleaned utterances
            continue
        # The old lang_src/lang_tgt keywords belonged to google_trans_new and
        # are not part of the deep_translator call, so they are dropped.
        translate_text = translator.translate(line)
        print(translate_text)

## Build dataset

In [10]:
# just focus on single pieces, not references to multiple pieces
good_pieces = ["X", "Y", "P", "N", "U", "F", "Z", "L", "T", "I", "W", "V"]

In [11]:
references_per_pair = {}  # all data will be stored here with keys =pairnum

# Manual corrections of disfluency annotations in the raw reference text.
# Each entry is (substring searched for in the lower-cased text, replacement
# for the whole reference text). Entries are tried in order, first match wins.
manual_ref_corrections = [
    ("(der + (das + das + das) grüne $m)",
     "(der + (das + (das + das ))) grüne $m)"),
    ("(der {f äh:m:} + der) + der) winkel",
     "((der {f äh:m:} + der) + der) winkel"),
    ("das {br- + blaue} lange",
     "das (br- + blaue) Lange"),
    ("das: ({f äh} ja .) andere blaue $z",
     "das: ({f äh} ja . + ) andere blaue $z"),
    ("den (ist das rosa oben) block",
     "den (ist das rosa oben + ) block"),
    ("""(<p="dieses">die-</p> (genau) + das) element""",
     """(<p="dieses">die-</p> (genau  +) + das) element"""),
    ("""das $t (dieser blaue <p="senkrecht">senk-</p>""",
     """das $t (dieser blaue <p="senkrecht">senk-</p>+)"""),
    ("""es">wie's</v> jetzt <v="ist">is'</v> und steckst <ref id="f" piece="target"> es""",
     """wie's jetzt <v="ist">is'</v> und steckst <ref id="f" piece="target"> es"""),
    ("""es">wie's</v>""",
     """wie's"""),
]

# TAKE variant of the query, kept for reference:
#for row in db.execute("SELECT shape, colour, orientation, gridPosition, gameID, pieceID FROM scenes" + \
for row in db.execute("SELECT id, gameID, text, uttID FROM refs"
                      " ORDER by gameID"):
    if False: 
        # TAKE (disabled; indexes columns that only exist in the TAKE query above)
        isTarget = db.execute('SELECT refID FROM refs WHERE gameID ="' + row[4] + '" AND pieceID ="' + row[5] + '"')
        target = False 
        for r1 in isTarget:
            target = True
        if not target:
            continue

    #TAKE
    #shape, colour, orientation, gridPosition, gameID, pieceID = row
    #piece = colour  #+ "_" + shape #shape + "_" + colour
    
    #PENTOCV
    piece, gameID, text, uttID = row
    
    # some manual corrections of disfluencies
    error_found = False
    lowered_text = text.lower()
    for needle, replacement in manual_ref_corrections:
        if needle in lowered_text:
            error_found = True
            text = replacement
            break
    
    clean_text = clean_utt(remove_reparanda(text.lower()))
    # The message previously referenced an undefined name, which turned a
    # failing check into a NameError; report the offending reference instead.
    assert clean_text!="", "empty reference text after cleaning: {0} utt {1}: {2!r}".format(gameID, uttID, text)
    
    pair_num = gameID.split("_")[0]
    # get speaker and utterance end time in one parameterized lookup
    utt_row = db.execute("SELECT speaker, endtime FROM utts WHERE uttID = ?", (uttID,)).fetchone()
    assert utt_row is not None, "no utterance found for uttID {0} ({1})".format(uttID, gameID)
    speaker = pair_num + "_" + utt_row[0]
    end_time = float(utt_row[1])
    
    if error_found:
        print("replacing with", text, "at", gameID, uttID, end_time)
    #if not pair_num == PAIR_NUM:
    #    continue
    
    if piece not in good_pieces:
        continue
        
    # make sure the pair has an entry even if all its remaining refs are skipped below
    references_per_pair.setdefault(pair_num, [])
    
    if "_s" in gameID:
        continue # just get the build phases for now due to inconsistent labelling

    references_per_pair[pair_num].append((clean_text, speaker, uttID, end_time, piece))


# sort each pair's references by utterance end time (stable, so ties keep query order)
for pair_num in references_per_pair.keys():
    references_per_pair[pair_num] = sorted(references_per_pair[pair_num], key=lambda x:x[3])

print(references_per_pair.keys())
print(references_per_pair['r3'][0:20])

dict_keys(['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8'])
[('gelben stein', 'r3_B', 8886, 129.257, 'U'), ('das rote kreuz', 'r3_B', 8893, 141.68, 'X'), ('daran', 'r3_B', 8897, 148.215, 'U'), ('der orange stein', 'r3_B', 8897, 148.215, 'L'), ('das l', 'r3_A', 7900, 151.582, 'L'), ('daran', 'r3_B', 8902, 157.211, 'X'), ('das grüne t', 'r3_B', 8902, 157.211, 'T'), ('der blaue winkel', 'r3_B', 8931, 219.17, 'V'), ('das', 'r3_B', 8936, 226.213, 'V'), ('daran', 'r3_B', 8940, 235.547, 'V'), ('der gelbe stein', 'r3_B', 8940, 235.547, 'U'), ('er', 'r3_B', 8943, 244.98, 'U'), ('der graue stein', 'r3_B', 8946, 248.207, 'F'), ('den gelben', 'r3_B', 8953, 265.246, 'U'), ('der grüne stein', 'r3_B', 8959, 272.863, 'W'), ('den gelben', 'r3_B', 8967, 284.31, 'U'), ('den grauen', 'r3_B', 8967, 284.31, 'F'), ('das grüne t', 'r3_B', 8970, 287.759, 'T'), ('das grüne t', 'r3_B', 8977, 300.292, 'T'), ('der lila stein', 'r3_B', 8977, 300.292, 'N')]


In [12]:
# null hyp 1: the referring expression is no less predictable given the other mentions for a given piece at a given time point,
# regardless of their order (so like a language model, all mentions are equally likely and there's no correlation with time),
# i.e. p(ref|previous) <= p(ref|all)


In [13]:
# ASSUMPTION: only dealing with full names, not anaphors # TODO for this paper may actually leave them in
if True: # False TODO for this paper may actually leave them in
    anaphors = ["es", "das", 'er', "da", "der", "ihn", "den", "sie", "die", "damit", "daran", "dem"]
    # drop every reference whose whole cleaned text is one of the anaphors
    for pair_num, refs in references_per_pair.items():
        references_per_pair[pair_num] = [ref for ref in refs if ref[0] not in anaphors]


In [14]:
# Count references per piece over the whole corpus (all pairs pooled)
piece_counter = Counter(
    ref[4]
    for pair_refs in references_per_pair.values()
    for ref in pair_refs
)
print("TOTAL REFERENCES", piece_counter.total())
sorted(piece_counter.items())

TOTAL REFERENCES 1899


[('F', 134),
 ('I', 139),
 ('L', 161),
 ('N', 197),
 ('P', 147),
 ('T', 191),
 ('U', 144),
 ('V', 162),
 ('W', 170),
 ('X', 177),
 ('Y', 162),
 ('Z', 115)]

In [15]:
# Per pair: number of references and how they are distributed over pieces
all_piece_counts = []
for pair_num, pair_refs in references_per_pair.items():
    print(pair_num, len(pair_refs), "refs")
    pair_piece_counts = Counter(ref[4] for ref in pair_refs)
    print(pair_piece_counts)
    # collect every (pair, piece) count for the summary statistics
    all_piece_counts.extend(pair_piece_counts.values())
mean_length = np.mean([len(pair_refs) for pair_refs in references_per_pair.values()])
print(mean_length)
# select test as closest to mean length, heldout as second closest
for pair_num, pair_refs in references_per_pair.items():
    print(pair_num, abs(len(pair_refs)-mean_length))

r1 100 refs
Counter({'P': 15, 'T': 14, 'L': 12, 'Y': 11, 'X': 9, 'U': 8, 'W': 8, 'I': 7, 'N': 6, 'F': 5, 'V': 3, 'Z': 2})
r2 170 refs
Counter({'W': 21, 'Y': 19, 'X': 17, 'T': 16, 'F': 16, 'I': 14, 'N': 14, 'Z': 13, 'L': 13, 'U': 11, 'P': 9, 'V': 7})
r3 265 refs
Counter({'T': 34, 'X': 26, 'V': 26, 'N': 24, 'L': 22, 'Z': 21, 'W': 20, 'I': 20, 'Y': 19, 'U': 18, 'P': 18, 'F': 17})
r4 285 refs
Counter({'U': 33, 'V': 31, 'X': 30, 'N': 29, 'T': 26, 'Y': 25, 'W': 23, 'I': 21, 'P': 20, 'F': 18, 'Z': 17, 'L': 12})
r5 143 refs
Counter({'Y': 17, 'X': 16, 'U': 15, 'N': 15, 'W': 14, 'P': 12, 'V': 11, 'L': 11, 'T': 10, 'Z': 10, 'F': 7, 'I': 5})
r6 258 refs
Counter({'N': 33, 'I': 26, 'P': 26, 'T': 26, 'L': 25, 'Y': 23, 'W': 23, 'V': 22, 'U': 17, 'F': 17, 'X': 12, 'Z': 8})
r7 352 refs
Counter({'N': 41, 'X': 38, 'L': 35, 'V': 33, 'I': 30, 'F': 29, 'W': 28, 'T': 25, 'P': 25, 'U': 23, 'Y': 23, 'Z': 22})
r8 326 refs
Counter({'T': 40, 'N': 35, 'W': 33, 'L': 31, 'V': 29, 'X': 29, 'F': 25, 'Y': 25, 'P': 22, '

In [16]:
# Summary of the per-(pair, piece) reference counts
for label, summarise in (("median", np.median), ("lower", np.min), ("upper", np.max)):
    print(label, summarise(all_piece_counts))

median 19.5
lower 2
upper 41


# Create language models and language model features

In [17]:
TEST = 'r6'  # take this out of X-val and save for final testing
HELDOUT = 'r3'  # used mainly for holding out folds in getting the LM features
EXTRA_HELDOUT = 'r7' # note this should probably be r4, in principle shouldn't affect results

In [20]:
# let's have a look at how one pair (r1) talks about one piece (Y) over time
for ref in references_per_pair['r1']:
    if ref[4] == 'Y':
        print(ref)

('das braune klötzchen', 'r1_B', 6003, 407.971, 'Y')
('das braune', 'r1_B', 6011, 423.967, 'Y')
('braune', 'r1_B', 6013, 425.93, 'Y')
('das blaue', 'r1_B', 6041, 477.407, 'Y')
('das braune', 'r1_B', 6055, 493.203, 'Y')
('das braune', 'r1_B', 6056, 494.18, 'Y')
('das braune klötzchen', 'r1_B', 6119, 608.51, 'Y')
('das braune gebilde', 'r1_B', 6179, 690.209, 'Y')
('das braune gebilde', 'r1_B', 6185, 701.471, 'Y')
('braun', 'r1_B', 6193, 716.532, 'Y')
('das braune', 'r1_B', 6290, 834.766, 'Y')


In [21]:
class mini_language_model():
    """A small count-based n-gram language model with add-k (Lidstone) smoothing.

    Counts are kept in ``self.ngrams[n]`` for every n from 1 to ``order``,
    keyed by the n-gram's tokens joined with "_". Sentences are lists of
    token strings; they are padded with ``order - 1`` "<s>" tokens at the
    start and one "</s>" at the end before counting.

    NOTE(review): ``train`` counts each context both as an n-gram of the
    lower order and again as a context, so lower-order counts are roughly
    doubled relative to the order above (two sentences starting with "I"
    give a unigram count of 4). That is left unchanged here because all
    existing results depend on it; confirm whether it is intended.
    """

    # Fallback denominators used by prob_lidstone when the model (or the
    # context) has no counts. The values are kept exactly as they were
    # hard-coded; their derivation is not visible in this file.
    EMPTY_UNIGRAM_DENOM = 150
    EMPTY_UNIGRAM_TYPES = 15
    UNSEEN_CONTEXT_DENOM = 40
    UNSEEN_CONTEXT_TYPES = 10

    def __init__(self, order, smoothing_k=0.0001):
        """Create an empty model of the given n-gram `order` with add-k constant `smoothing_k`."""
        self.order = order
        self.ngrams = {}
        self.num_training_sents = 0
        self.smoothing_k = smoothing_k
        for o in range(1, order+1):
            self.ngrams[o] = {}
        self.unigram_denom = 0
        self.vocab_size = 0

    def _refresh_unigram_stats(self):
        """Recompute the cached unigram total and number of unigram keys."""
        self.unigram_denom = sum(self.ngrams[1].values())
        self.vocab_size = len(self.ngrams[1])

    def _counted_ngrams(self, sent):
        """Yield (ngram, order) for every count that training on `sent` touches.

        Shared by train and de_train so that the two are exact inverses.
        """
        padded = ["<s>"] * (self.order -1) + sent + ["</s>"]
        for i in range(self.order-1, len(padded)):
            for n in range(1, self.order+1):
                context = padded[i-(n-1):i]
                yield context + [padded[i]], n
                if n > 1:
                    # the context is also counted as an n-gram of the order below
                    yield context, n-1

    def add_ngram_to_model(self, ngram, order):
        """Increment the count of `ngram` (a list of tokens) at the given `order`."""
        counts = self.ngrams[order]
        ngram_text = "_".join(ngram)
        counts[ngram_text] = counts.get(ngram_text, 0) + 1

    def minus_ngram_from_model(self, ngram, order):
        """Decrement the count of `ngram` at the given `order`.

        An n-gram that was never counted ends up with a negative count.
        """
        counts = self.ngrams[order]
        ngram_text = "_".join(ngram)
        counts[ngram_text] = counts.get(ngram_text, 0) - 1

    def add_counts_from_other_model(self, other_lm):
        """Add every n-gram count of `other_lm` (same order) to this model.

        Only the counts and the cached unigram statistics are updated;
        ``num_training_sents`` is left as it is. New keys are appended in the
        other model's order, so the resulting dictionaries are deterministic.
        """
        assert(other_lm.order == self.order)
        for n in range(1, self.order+1):
            own_counts = self.ngrams[n]
            for key, count_other in other_lm.ngrams[n].items():
                own_counts[key] = own_counts.get(key, 0) + count_other
        self._refresh_unigram_stats()

    def train(self, sents):
        """Add the counts of every sentence in `sents` (lists of tokens) to the model."""
        for sent in sents:
            for ngram, n in self._counted_ngrams(sent):
                self.add_ngram_to_model(ngram, n)
            self.num_training_sents += 1
        self._refresh_unigram_stats()

    def de_train(self, sents):
        """Remove the counts that ``train(sents)`` added.

        This subtracts the same n-gram and context counts that train adds
        (previously the context counts were left behind, so training and then
        de-training the same sentences did not restore the model). Keys whose
        count drops to zero stay in the dictionaries, and
        ``num_training_sents`` is not changed.
        """
        for sent in sents:
            for ngram, n in self._counted_ngrams(sent):
                self.minus_ngram_from_model(ngram, n)
        self._refresh_unigram_stats()

    def prob_lidstone(self, ngram, order):
        """Add-k smoothed probability of the last token of `ngram` given the rest.

        Uses the smoothing constant ``self.smoothing_k``. For order 1 the
        denominator is the unigram total plus k times the number of unigram
        keys. For higher orders it is the context count plus k times the
        number of distinct keys of the order below. When the model has no
        unigram counts, or the context is unseen, fixed fallback denominators
        are used instead.

        Raises AssertionError if the result is not in (0, 1].
        """
        ngram_count = self.ngrams[order].get("_".join(ngram), 0)
        num = ngram_count + self.smoothing_k
        if order == 1:
            denom = self.unigram_denom + (self.smoothing_k * self.vocab_size)
            if self.unigram_denom == 0:
                # untrained model: fixed fallback denominator
                denom = self.EMPTY_UNIGRAM_DENOM + (self.smoothing_k * self.EMPTY_UNIGRAM_TYPES)
        else:
            context_count = self.ngrams[order-1].get("_".join(ngram[:-1]), 0)
            all_contexts = len(self.ngrams[order-1].keys())
            denom = context_count + (self.smoothing_k * all_contexts)
            if self.unigram_denom == 0 or context_count == 0:
                # untrained model or unseen context: fixed fallback denominator
                denom = self.UNSEEN_CONTEXT_DENOM + (self.smoothing_k * self.UNSEEN_CONTEXT_TYPES)
        assert num/denom <=1 and num/denom >0, str(num) + " " + str(denom) + " " + str(num/denom) + " " + str(ngram) + " n=" + str(order) + " k=" + str(self.smoothing_k)
        return num/denom
    

In [22]:
# Smoke test: train a trigram model on two sentences, show its raw counts
# and the smoothed probability of "mary" after "I like".
m = mini_language_model(3, smoothing_k=0.1)
m.train([["I", "like", "bill"], ["I", "like", "mary"]])
print(m.ngrams)
print(m.prob_lidstone(["I", "like", "mary"], 3))

# Train a second trigram model, then merge its counts into the first
# (this modifies m in place).
m2 = mini_language_model(3)
m2.train([["I", "like", "bill", 'today'], ["I", "like", "mary"]])
print(m2.ngrams)
m.add_counts_from_other_model(m2)
print(m.ngrams)

{1: {'I': 4, '<s>': 2, 'like': 4, 'bill': 2, '</s>': 2, 'mary': 2}, 2: {'<s>_I': 4, '<s>_<s>': 2, 'I_like': 4, 'like_bill': 2, 'bill_</s>': 1, 'like_mary': 2, 'mary_</s>': 1}, 3: {'<s>_<s>_I': 2, '<s>_I_like': 2, 'I_like_bill': 1, 'like_bill_</s>': 1, 'I_like_mary': 1, 'like_mary_</s>': 1}}
0.23404255319148937
{1: {'I': 4, '<s>': 2, 'like': 4, 'bill': 2, 'today': 2, '</s>': 2, 'mary': 2}, 2: {'<s>_I': 4, '<s>_<s>': 2, 'I_like': 4, 'like_bill': 2, 'bill_today': 2, 'today_</s>': 1, 'like_mary': 2, 'mary_</s>': 1}, 3: {'<s>_<s>_I': 2, '<s>_I_like': 2, 'I_like_bill': 1, 'like_bill_today': 1, 'bill_today_</s>': 1, 'I_like_mary': 1, 'like_mary_</s>': 1}}
{1: {'I': 8, '<s>': 4, 'like': 8, 'bill': 4, '</s>': 4, 'mary': 4, 'today': 2}, 2: {'<s>_I': 8, '<s>_<s>': 4, 'I_like': 8, 'like_bill': 4, 'bill_</s>': 1, 'like_mary': 4, 'mary_</s>': 2, 'today_</s>': 1, 'bill_today': 2}, 3: {'<s>_<s>_I': 4, '<s>_I_like': 4, 'I_like_bill': 2, 'like_bill_</s>': 1, 'I_like_mary': 2, 'like_mary_</s>': 2, 'bill_

In [23]:
def get_refs_in_timerange_for_piece(pair_num, piece, time_uttend_start, time_uttend_end, references_per_pair):
    """Return the pair's references to `piece` whose utterance end time lies in
    [time_uttend_start, time_uttend_end), in the order they are stored.

    An unknown pair (or a pair with no references) gives an empty list.
    NOTE FOR NOW refs in the same utterance share an end time, so they are
    either all included or all excluded.
    """
    pair_refs = references_per_pair.get(pair_num)
    if not pair_refs:
        return []
    return [ref for ref in pair_refs
            if time_uttend_start <= ref[3] < time_uttend_end and ref[4] == piece]
    

In [24]:
get_refs_in_timerange_for_piece('r3', 'L', 0, 687.481, references_per_pair)

[('der orange stein', 'r3_B', 8897, 148.215, 'L'),
 ('das l', 'r3_A', 7900, 151.582, 'L'),
 ('der orange winkel', 'r3_B', 9019, 374.137, 'L'),
 ('der orange winkel', 'r3_B', 9042, 477.238, 'L')]

In [25]:
# Test 1: Is there an increasing similarity to the previous references as the dialogue progresses?
# Test 2: does a reference have higher average similarity to other references in the same dialogue compared to those in different dialogues?

# Experiment 1: Create reference resolution training + test data with features from the language models and Optimize Params
* TODO setting - exclude and re-compute based on pieces 'in play', or assume the robot doesn't know - try both
* Try 1) global model only without active learning, 2) global + local standard 3) global and local incrementally/dynamically updating, see the difference in ref res. Measures on accuracy + surprisal.
* Naive bayes with individual language models for each piece + marginalization? OR model can work independently with any probabilistic classifier as a joint probability?
* Get an optimal weighting between the local, incremental model and global models

In [26]:
def get_rank_dict_for_prob_dict(prob_dict):
    """Map each key of `prob_dict` to its 1-based rank in ascending order of value.

    Rank 1 is the key with the lowest value; ties keep the dictionary's own order.
    """
    ascending_keys = sorted(prob_dict, key=prob_dict.get)
    return {key: rank for rank, key in enumerate(ascending_keys, start=1)}
get_rank_dict_for_prob_dict({"K": 0.34, "Y": 0.35, "X": 0.33})

{'X': 1, 'K': 2, 'Y': 3}

In [27]:
def moving_average(a_list):
    """Return the running mean of `a_list`: element i is the mean of items 0..i."""
    running_total = 0
    means = []
    for n_seen, value in enumerate(a_list, start=1):
        running_total += value
        means.append(running_total / n_seen)
    return means

In [28]:
def mean_cross_entropy_on_test_folds(k, pair_piece_lms_orig, folds, prob_model):
    """Mean over folds of the average negative log-probability per reference.

    k -- sequence whose first element is used as the smoothing constant of every model
    pair_piece_lms_orig -- {pair: {piece: language model}}; deep-copied, never modified
    folds -- {pair: list of reference tuples}, with text at index 0 and piece at index 4;
             assumed to be in the correct order w.r.t. time of reference
    prob_model -- when 'self', each piece's model is trained on a reference
                  right after that reference has been scored

    The piece of each reference is assumed to be known. A fold with no
    references raises ZeroDivisionError.
    """
    # work on copies so neither smoothing nor incremental training leaks out
    lms = copy.deepcopy(pair_piece_lms_orig)
    for piece_lms in lms.values():
        for piece_lm in piece_lms.values():
            piece_lm.smoothing_k = k[0]

    fold_entropies = []
    for pair_num, test_set in folds.items():
        log_prob_total = 0
        n_refs = 0
        for ref_info in test_set:
            words = ref_info[0].split()
            lm = lms[pair_num][ref_info[4]]
            width = lm.order
            padded = ["<s>"] * (width - 1) + words + ["</s>"]
            ref_log_prob = 0
            # slide a window of `width` tokens over the padded reference
            for end in range(width, len(padded) + 1):
                ref_log_prob += log(lm.prob_lidstone(padded[end - width:end], width))
            log_prob_total += ref_log_prob
            n_refs += 1
            if prob_model == 'self':
                # the self model starts empty and learns each reference after scoring it
                lm.train([words])

        fold_entropies.append(-log_prob_total / n_refs)
    return np.mean(fold_entropies)

In [29]:
def get_params_for_lowest_entropy_on_fold_test(folds, pieces, n_range, k_val_range, prob_model='self'):
    """Find the n-gram order n and smoothing constant k with the lowest mean cross-entropy.

    For every order in `n_range` one language model per (pair, piece) is set
    up, then Nelder-Mead searches for the k within `k_val_range` that
    minimises mean_cross_entropy_on_test_folds.

    folds -- {pair: list of reference tuples}, with text at index 0 and piece at index 4
    pieces -- piece labels to build models for
    n_range -- n-gram orders to try
    k_val_range -- [lower, upper] bounds for k; the lower bound is also the start value
    prob_model -- 'other': each pair's models are trained on all other pairs' references;
                  'self': models start empty and are trained incrementally during evaluation

    Returns (best n, best k for that n, {n: (best k, cross-entropy)}).
    """
    results = {}  # store results for each model
    for test_n in n_range:
        pair_piece_lms = {}  # all lms for each pair for each piece with order test_n
        for test_pair in folds.keys():
            pair_piece_lms[test_pair] = {}  # key piece, value language model for that piece
            for piece in pieces:
                if prob_model == 'other':
                    # train on this piece's references from every other pair
                    # (smoothing can be changed later as it doesn't affect training)
                    training_fold = [ref[0]
                                     for other_pair_num, other_refs in folds.items()
                                     if other_pair_num != test_pair
                                     for ref in other_refs
                                     if ref[4] == piece]
                    if len(training_fold) <1:
                        # Report the pair being set up. This used to print an
                        # unrelated name from outside the function. No model is
                        # stored for this piece.
                        print("not enough data", piece, test_pair)
                        continue
                    piece_lm = mini_language_model(test_n, smoothing_k=0.0001)
                    piece_lm.train([sent.split() for sent in training_fold])
                    pair_piece_lms[test_pair][piece] = piece_lm

                elif prob_model == 'self':
                    # just initialize lm for piece, no need to train
                    pair_piece_lms[test_pair][piece] = mini_language_model(test_n, smoothing_k=0.0001)

        # run nelder mead to get best k for this n
        best = optimize.minimize(
                mean_cross_entropy_on_test_folds,
                k_val_range[0],    # first argument that to be optimized (k)
                args=(pair_piece_lms, folds, prob_model),  # other arguments to the function
                method='Nelder-Mead', # use nelder mead to find minima
                tol=0.0001, # to this degree of error
                options={'disp': False},
                bounds=[k_val_range]
        )
        best_k, best_ce = best.x[0], best.fun
        print("best k", best_k, best_ce)
        results[test_n] = (best_k, best_ce)

    print(results)
    # pick the order with the lowest cross-entropy (first one wins on ties)
    n = min(results, key=lambda order: results[order][1])
    k = results[n][0]

    return n, k, results

In [31]:
# Firstly, in a self-supervised way, optimize the global + local language models in terms of n and smoothing param to get the lowest cross-entropy
# We only go for the global best average across all pairs and pieces for the rest of the experiments
# NB we just use this as a comparative method - this is not necessarily the best params for the best result
print(TEST, HELDOUT)
opt_params = {}
n_range = [1,2,3]
k_val_range = [0.00000001, 1]

# every pair except the final test pair takes a turn as the held-out pair
l = [pair for pair in references_per_pair.keys() if pair != TEST]

all_n_self, all_k_self = [], []
all_n_other, all_k_other = [], []

for heldout in l:
    training_folds = {pair: refs for pair, refs in references_per_pair.items()
                      if pair not in (TEST, heldout)}

    n_self, k_self, lm_opt_results_self = get_params_for_lowest_entropy_on_fold_test(
        training_folds, good_pieces, n_range, k_val_range, prob_model='self')
    all_n_self.append(n_self)
    all_k_self.append(k_self)

    n_other, k_other, lm_opt_results_other = get_params_for_lowest_entropy_on_fold_test(
        training_folds, good_pieces, n_range, k_val_range, prob_model='other')
    all_n_other.append(n_other)
    all_k_other.append(k_other)

# average the per-heldout optima; NB round() sends an exact .5 to the nearest even integer, but shouldn't matter
opt_params["n_self"] = round(np.mean(all_n_self))
opt_params["k_self"] = np.mean(all_k_self)

opt_params["n_other"] = round(np.mean(all_n_other))
opt_params["k_other"] = np.mean(all_k_other)
print(opt_params)

r6 r3
best k 1.0 6.86497954274358
best k 1.0 6.494462605070569
best k 1.0 7.703398867971607
{1: (1.0, 6.86497954274358), 2: (1.0, 6.494462605070569), 3: (1.0, 7.703398867971607)}
best k 1.0 8.3721941211424
best k 0.556662793500001 7.461818951993145
best k 0.2273443935000005 8.545814773734284
{1: (1.0, 8.3721941211424), 2: (0.556662793500001, 7.461818951993145), 3: (0.2273443935000005, 8.545814773734284)}
best k 1.0 6.915151572870145
best k 1.0 6.45897208116136
best k 1.0 7.63492546812265
{1: (1.0, 6.915151572870145), 2: (1.0, 6.45897208116136), 3: (1.0, 7.63492546812265)}
best k 1.0 8.323678483317178
best k 0.594935817500001 7.391706636840973
best k 0.24091034550000048 8.447313934773915
{1: (1.0, 8.323678483317178), 2: (0.594935817500001, 7.391706636840973), 3: (0.24091034550000048, 8.447313934773915)}
best k 1.0 6.889135580897849
best k 1.0 6.614003785214069
best k 1.0 7.7960055050801
{1: (1.0, 6.889135580897849), 2: (1.0, 6.614003785214069), 3: (1.0, 7.7960055050801)}
best k 1.0 8.45

In [32]:
def mean_cross_entropy_on_test_folds_joint_model(lambda_lm1, pair_piece_lms_self_orig, pair_piece_lms_other_orig,
                                                 cutoff_word_global_lm, folds):
    """Mean over folds of the average negative log-probability per reference
    under an interpolation of a 'self' and an 'other' language model.

    lambda_lm1 -- final weight of the self model (the other model gets 1 - lambda_lm1)
    pair_piece_lms_self_orig, pair_piece_lms_other_orig -- {pair: {piece: language model}};
        both are deep-copied, so the arguments are never modified
    cutoff_word_global_lm -- while the self model has been trained on fewer than
        cutoff - 1 references, its weight is scaled down linearly (0 when untrained)
    folds -- {pair: list of reference tuples}, with text at index 0 and piece at index 4

    The piece of each reference is assumed to be known, and the self model is
    trained on each reference after that reference has been scored.
    """
    def sequence_prob(lm, words):
        # product of the smoothed n-gram probabilities over the padded reference
        padded = ["<s>"] * (lm.order - 1) + words + ["</s>"]
        prob = 1
        for end in range(lm.order, len(padded) + 1):
            prob = prob * lm.prob_lidstone(padded[end - lm.order:end], lm.order)
        return prob

    self_lms = copy.deepcopy(pair_piece_lms_self_orig)
    other_lms = copy.deepcopy(pair_piece_lms_other_orig)

    # the weighting schedule depends only on the arguments
    lambda_global = 1- lambda_lm1
    if cutoff_word_global_lm <= 1:
        factor = 0
    else:
        factor = (1-lambda_global)/(cutoff_word_global_lm-1)

    fold_entropies = []
    for pair_num, test_set in folds.items():
        log_prob_total = 0
        n_refs = 0
        for ref_info in test_set:
            piece = ref_info[4]
            words = ref_info[0].split()

            lm_self = self_lms[pair_num][piece]
            self_prob = sequence_prob(lm_self, words)

            lm_other = other_lms[pair_num][piece]
            other_prob = sequence_prob(lm_other, words)

            # weight of the self model given how many references it has seen so far
            num_refs_f = lm_self.num_training_sents
            current_lambda_lm1 = 1 - max([lambda_global + (factor * ((((cutoff_word_global_lm-1)-num_refs_f)))), lambda_global])

            prob = (current_lambda_lm1 * self_prob) + ((1-current_lambda_lm1) * other_prob)

            log_prob_total += log(prob)
            n_refs += 1

            # the self model learns the reference only after it has been scored
            lm_self.train([words])

        fold_entropies.append(-log_prob_total / n_refs)
    return np.mean(fold_entropies)

In [33]:
# NB MAY NOT USE THIS METHOD FOR OPTIMIZING THESE TWO
def get_best_params_cutoff_lambda_xval(n_self, n_other, k_self, k_other, pieces, folds,
                                       cutoff_range = [1,10],
                                       lambda_range=[0,1]):
    """Search for the best cut-off and local-model lambda by cross-validated cross-entropy.

    For every integer cut-off in cutoff_range (inclusive) a fresh set of per-pair,
    per-piece language models is built, and Nelder-Mead is run over lambda on
    mean_cross_entropy_on_test_folds_joint_model.

    Args:
        n_self: order of the 'self' (local, incrementally trained) language models.
        n_other: order of the 'other' language models trained on all other folds.
        k_self: smoothing_k for the 'self' models.
        k_other: smoothing_k for the 'other' models.
        pieces: iterable of piece labels; one model pair is built per piece.
        folds: dict mapping pair id -> list of references, where ref[0] is the
            utterance text and ref[4] is the piece label.
        cutoff_range: [low, high] inclusive range of cut-off values to try.
        lambda_range: [low, high] bounds for lambda; the search starts at low.

    Returns:
        tuple: (best cutoff, lambda found for that cutoff,
                dict cutoff -> (best lambda, mean cross-entropy)).
    """
    results = {}  # cutoff -> (best lambda, mean cross-entropy at that lambda)

    for test_cutoff in range(cutoff_range[0], cutoff_range[1]+1):
        # A fresh set of models is built for every cut-off value tried
        pair_piece_lms_self = {}   # pair -> {piece -> untrained 'self' model}
        pair_piece_lms_other = {}  # pair -> {piece -> 'other' model trained on the other folds}
        for test_pair in folds.keys():
            # test_pair is the fold the cross-entropy is measured on:
            # - the 'other' model is trained on the refs of all the other folds
            # - the 'self' model starts empty here (its refs come from test_pair later)
            pair_piece_lms_other[test_pair] = {}
            pair_piece_lms_self[test_pair] = {}
            for piece in pieces:
                # utterance texts for this piece from every fold except the test one
                training_fold = [ref[0]
                                 for other_pair_num in folds.keys()
                                 if other_pair_num != test_pair
                                 for ref in folds[other_pair_num]
                                 if ref[4] == piece]
                #print("length of training fold for lm2", piece, test_pair, len(training_fold))

                # Always register a model for the piece, as get_new_piece_lms_from_folds
                # does, so a piece with no training data still has an (untrained)
                # smoothed model instead of a missing key.
                lm_other = mini_language_model(n_other, smoothing_k=k_other)
                if len(training_fold) < 1:
                    # was `pair_num`, which is not defined in this function
                    print("not enough data", piece, test_pair)
                else:
                    lm_other.train([sent.split() for sent in training_fold])
                pair_piece_lms_other[test_pair][piece] = lm_other

                # just initialize the self lm for the piece, no need to train
                pair_piece_lms_self[test_pair][piece] = mini_language_model(n_self, smoothing_k=k_self)

        # run nelder mead to get best lambda for this cut-off
        best = optimize.minimize(
                mean_cross_entropy_on_test_folds_joint_model,
                lambda_range[0],    # first argument to be optimized (lambda)
                args=(pair_piece_lms_self, pair_piece_lms_other, test_cutoff, folds),  # other arguments to the function
                method='Nelder-Mead', # use nelder mead to find minima
                tol=0.0001, # to this degree of error
                options={'disp': False},
                bounds=[lambda_range]
        )
        best_lambda, best_ce = best.x[0], best.fun
        #print("best lambda", best_lambda, best_ce)
        results[test_cutoff] = (best_lambda, best_ce)

    print(results)
    # pick the cut-off whose optimised cross-entropy is lowest
    cutoff = min(results.items(), key=lambda x:x[1][1])[0]
    lambda_lm1 = results[cutoff][0]

    return cutoff, lambda_lm1, results
    

In [34]:
def get_new_piece_lms_from_folds(n, k, pieces, folds, target_folds=None, train=False, heldout_extra_fold=None):
    """Build one language model per piece (order n, smoothing k) for each requested fold.

    Args:
        n: order passed to mini_language_model.
        k: value passed to mini_language_model as smoothing_k.
        pieces: iterable of piece labels; one model is created per label.
        folds: dict mapping pair id -> list of references, where ref[0] is the
            utterance text and ref[4] is the piece label.
        target_folds: container of pair ids to build models for. None (the
            default) means every key of folds.
        train: if False the models are returned untrained. If True each model is
            trained on the refs for its piece from every fold that is not excluded.
        heldout_extra_fold: optional pair id excluded from training in addition
            to the target pair. When the target pair is itself
            heldout_extra_fold, a substitute is taken from a seed-0 shuffle of
            the remaining pair ids.

    Returns:
        dict: pair id -> {piece -> language model}.
    """
    #print('num folds', len(folds.keys()))
    if target_folds is None:
        # the None default used to crash on the membership test below
        target_folds = folds.keys()

    pair_piece_lms_other = {}
    for test_pair in folds.keys():
        if test_pair not in target_folds:
            continue
        pair_piece_lms_other[test_pair] = {}  # key piece, value language model for that piece
        for piece in pieces:
            # the model is registered even if it ends up untrained
            piece_lm = mini_language_model(n, smoothing_k=k)
            pair_piece_lms_other[test_pair][piece] = piece_lm
            if not train:
                continue

            non_training_pairs = [test_pair]
            if heldout_extra_fold is not None:
                # hold out the extra fold, or another one if the extra fold is the test pair
                random.seed(0)  # NB re-seeds the global RNG so the substitute is deterministic
                l = list(filter(lambda x:x!=test_pair,(folds.keys())))
                shuffle(l)  # NB this may not have been shuffled before
                heldout = l[0] if test_pair == heldout_extra_fold else heldout_extra_fold
                non_training_pairs.append(heldout)

            # utterance texts for this piece from every fold that is not excluded
            training_fold = [ref[0]
                             for other_pair_num in folds.keys()
                             if other_pair_num not in non_training_pairs
                             for ref in folds[other_pair_num]
                             if ref[4] == piece]
            #print("length of training fold for lm", piece, test_pair, len(training_fold))
            if len(training_fold) <1:
                print("not enough data", piece, test_pair)
                continue
            piece_lm.train([sent.split() for sent in training_fold])
    return pair_piece_lms_other

In [35]:
def generate_classifier_data_from_lms_and_params(pair_piece_lms_self_orig, pair_piece_lms_other_orig, pieces, lambda_lm1, cutoff_word_global_lm, folds, lexical=True):
    """Score every reference in every fold with the interpolated piece language models.

    For each reference, every piece gets a probability that mixes the pair's
    own ('self') model with the 'other' model. After a reference is scored,
    the 'self' model of its target piece is trained on it, so later references
    of the same pair see it.

    Args:
        pair_piece_lms_self_orig: dict pair id -> {piece -> 'self' language model}.
            Deep-copied, so the caller's models are not trained by this call.
        pair_piece_lms_other_orig: dict pair id -> {piece -> 'other' language model}.
            Deep-copied as well.
        pieces: iterable of piece labels to score each reference against.
        lambda_lm1: weight of the 'self' model once the cut-off is reached.
        cutoff_word_global_lm: cut-off used for the gradual weighting; the
            'self' weight grows with the model's num_training_sents up to it.
        folds: dict pair id -> list of references, where ref_info[0] is the
            utterance text and ref_info[4] is the target piece.
        lexical: unused; kept so existing callers keep working.

    Returns:
        dict: pair id -> list of tuples, each the original reference fields
        followed by
          - piece_prob_dict: piece -> (-log(prob) / number of words,
            num_training_sents of that piece's 'self' model when scored),
          - moving averages: assumed piece -> {piece lm -> mean score},
          - the number of words in the utterance.
    """
    def _ref_prob(lm, words):
        """Product of lm.prob_lidstone over each n-gram of the padded utterance."""
        padded = ["<s>"] * (lm.order - 1) + words + ["</s>"]
        prob = 1
        for j in range(lm.order - 1, len(padded)):
            # the n-gram is the (order - 1) context words followed by the target word
            ngram = padded[j - (lm.order - 1):j + 1]
            prob = prob * lm.prob_lidstone(ngram, lm.order)
        return prob

    # work on copies: the 'self' models are trained as the refs are scored
    pair_piece_lms_self = copy.deepcopy(pair_piece_lms_self_orig)
    pair_piece_lms_other = copy.deepcopy(pair_piece_lms_other_orig)

    # these only depend on the arguments, so compute them once
    lambda_global = 1 - lambda_lm1
    factor = 0 if cutoff_word_global_lm <= 1 else (1 - lambda_global) / (cutoff_word_global_lm - 1)

    # NOTE: a per-fold cross-entropy used to be accumulated here and discarded;
    # its `-s / count` raised ZeroDivisionError for a fold with no references.
    piece_references_and_probabilities_per_pair = {}
    for pair_num in folds.keys():
        scored_refs = []  # scored references of this pair, in order
        piece_references_and_probabilities_per_pair[pair_num] = scored_refs

        for ref_info in folds[pair_num]:
            words = ref_info[0].split()
            target_piece = ref_info[4]

            piece_prob_dict = {}  # raw scores assigned to this ref by every piece model
            for piece in pieces:
                lm_self = pair_piece_lms_self[pair_num][piece]
                self_prob = _ref_prob(lm_self, words)
                other_prob = _ref_prob(pair_piece_lms_other[pair_num][piece], words)

                # weight of the self model given how many refs it has been trained on
                num_refs_f = lm_self.num_training_sents  # + 1 # to include the current one?
                current_lambda_lm1 = 1 - max([lambda_global + (factor * ((cutoff_word_global_lm - 1) - num_refs_f)),
                                              lambda_global])

                prob = (current_lambda_lm1 * self_prob) + ((1 - current_lambda_lm1) * other_prob)
                piece_prob_dict[piece] = (-log(prob) / len(words), num_refs_f)

            # update the self language model of the target piece only after scoring,
            # so the current ref never contributes to its own score
            pair_piece_lms_self[pair_num][target_piece].train([list(words)])

            # Moving averages: for this instance the correct referent is treated as
            # unknown, so the current scores are assumed for every piece in turn
            # (num pieces * num pieces values) ...
            current_scores = {lm_piece: [entry[0]] for lm_piece, entry in piece_prob_dict.items()}
            probs_assigned_to_pieces_so_far = {assumed_piece: copy.deepcopy(current_scores)
                                               for assumed_piece in piece_prob_dict}
            # ... while for the previous refs of this pair the referent is known,
            # so their scores are added under their real target piece
            for back_ref_info in scored_refs:
                back_ref_target_piece = back_ref_info[4]
                for lm_piece, entry in back_ref_info[5].items():
                    probs_assigned_to_pieces_so_far[back_ref_target_piece].setdefault(lm_piece, []).append(entry[0])

            moving_average_prob_dicts_all_lms = {
                assumed_piece: {lm_piece: np.mean(scores) for lm_piece, scores in per_lm.items()}
                for assumed_piece, per_lm in probs_assigned_to_pieces_so_far.items()
            }

            final_ref_info = tuple(list(ref_info) + [piece_prob_dict, moving_average_prob_dicts_all_lms, len(words)])
            scored_refs.append(final_ref_info)

    return piece_references_and_probabilities_per_pair
    

In [36]:
def get_zscore_dict_for_prob_dict(prob_dict):
    """Return a dict with prob_dict's keys (sorted) mapped to the z-scores of its values.

    A z-score that comes back as NaN is replaced by -100.
    """
    ordered_keys = sorted(prob_dict.keys())  # alphabetical, so values and keys line up
    zscores = stats.zscore(np.array([prob_dict[key] for key in ordered_keys]))
    return {key: (- 100 if np.isnan(score) else score)
            for key, score in zip(ordered_keys, zscores)}

get_zscore_dict_for_prob_dict({"K": 0.33, "Y": 0.35, "X": 0.33})

{'K': -0.7071067811865476, 'X': -0.7071067811865476, 'Y': 1.4142135623730951}

In [37]:
def generate_classifier_data_from_raw(folds, lexical=True, lm_features=True, moving_average=False):
    """Turn scored references into (feature dict, target piece) classifier instances.

    Each reference is an 8-tuple such as
    ('das orange l', 'r1_B', 5887, 204.521, 'L', prob_dict, moving averages, ref_length).

    lexical: add the relative frequency of each lower-cased word of the text.
    lm_features: add the z-score of every piece model's score for the reference.
    moving_average: (only with lm_features) for each piece, add its own model's
        moving-average score and the z-score of that average.
    """
    instances = []
    for scored_refs in folds.values():
        for ref_info in scored_refs:
            (text, _speaker, _utt_id, _end_time, target_piece,
             prob_dict, piece_lm_moving_average, _ref_length) = ref_info

            features = {}

            if lexical:
                tokens = text.lower().split()
                n_tokens = len(tokens)
                # relative frequency of each word within this utterance
                for word, freq in Counter(tokens).items():
                    features[word] = freq / n_tokens

            if lm_features:
                # keep only the length-weighted score from each (score, num refs) entry
                piece_scores = {lm_piece: entry[0] for lm_piece, entry in prob_dict.items()}
                for lm_piece, zscore in get_zscore_dict_for_prob_dict(piece_scores).items():
                    features['local_zscore_prob_' + lm_piece] = zscore

                if moving_average:
                    # A smoother measure of the pacts so far: assuming the current
                    # ref is of a given piece, how does that piece's own model score
                    # it on average. Done for every piece, not just the target,
                    # since using only the target would be cheating.
                    for assumed_piece, averages in piece_lm_moving_average.items():
                        z_averages = get_zscore_dict_for_prob_dict(averages)
                        if assumed_piece in z_averages:
                            tag = assumed_piece + ":" + assumed_piece
                            features['local_prob_moving_prob' + tag] = averages[assumed_piece]
                            features['local_prob_moving_zcore' + tag] = z_averages[assumed_piece]

            instances.append((features, target_piece))
    return instances


In [38]:
# TRAINING AND VALIDATING OUR CLASSIFIER

# Silence sklearn's ConvergenceWarning for the rest of the session.
# NOTE(review): UndefinedMetricWarning is imported here but no filter is installed for it.
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning
simplefilter("ignore", category=ConvergenceWarning)

def train_classifier(train_data):
    """Train an NLTK SklearnClassifier wrapping a squared-hinge LinearSVC on train_data.

    Returns whatever SklearnClassifier.train returns.
    """
    # LogisticRegression() can be swapped in here as the wrapped estimator
    estimator = LinearSVC(loss='squared_hinge')
    wrapper = SklearnClassifier(estimator)
    return wrapper.train(train_data)

def predict_labels(samples, classifier):
    """Pass the first element of every sample to classifier.classify_many and return its result."""
    feature_sets = (sample[0] for sample in samples)
    return classifier.classify_many(feature_sets)


In [39]:
# Disabled cell (guarded by `if False`): searches lambda_local and the cut-off with
# get_best_params_cutoff_lambda_xval and stores both in opt_params.
# It reads the globals opt_params, good_pieces and training_folds.
if False:
    # Alternative method for finding best lambda local and cut-off - may not work as LMs of different size, so will always favour
    # high lambda for the local model which will always give higher probabilities
    # experiment with and optimize the best lambda weight on the local model and best cut-off point
    # in process get the cross validation results
    print(opt_params)
    # language model orders and smoothing values to keep fixed during the search
    n_self = opt_params["n_self"]
    n_other = opt_params["n_other"]
    k_self = opt_params["k_self"]
    k_other = opt_params["k_other"]
    # cut-offs 1..5 are tried, with lambda bounded to [0, 1]
    cutoff_word_global_lm, lambda_local, x_val_results = get_best_params_cutoff_lambda_xval(n_self, n_other,
                                                                                            k_self, k_other,
                                                                                            good_pieces, training_folds,
                                                                                            cutoff_range=[1,5],
                                                                                            lambda_range=[0,1]) 
    # keep the winning values for the later cells
    opt_params["lambda_local"] = lambda_local
    opt_params["cutoff_word_global_lm"] = cutoff_word_global_lm

In [40]:
# finally get the test results with best language models (will have done this 7 times above in each xval)

def get_fscore_and_preds(references_per_pair, test, heldout, n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm, 
               lexical=True, 
               lm_features=True,
               moving_average=False,
               exhaustive_retrain=False,
               no_train=False,
               heldout_extra_fold=None):
    """Train the piece classifier on every pair except `test` and evaluate it on `test`.

    Language-model features are produced with get_new_piece_lms_from_folds and
    generate_classifier_data_from_lms_and_params, then turned into classifier
    instances with generate_classifier_data_from_raw. The piece inventory is
    read from the global good_pieces.

    Args:
        references_per_pair: dict pair id -> list of references.
        test: pair id to evaluate on; excluded from the training folds.
        heldout: pair id left out of the folds used to build the test-time
            'other' models.
        n_other, n_self: orders of the 'other' and 'self' language models.
        k_other, k_self: smoothing values of the 'other' and 'self' models.
        lambda_local: weight of the 'self' model (passed on as lambda_lm1).
        cutoff_word_global_lm: cut-off for the gradual weighting.
        lexical, lm_features, moving_average: feature switches passed to
            generate_classifier_data_from_raw.
        exhaustive_retrain: if True, predict the test instances one at a time,
            adding each to the training data (with its label) after predicting
            it and retraining before the next one.
        no_train: if True, start with no training data at all.
        heldout_extra_fold: passed to get_new_piece_lms_from_folds when
            training the 'other' models.

    Returns:
        tuple: (weighted F-score, y_pred, y_true, accuracy).
    """
    # First generate training data by getting the other/global probs from the other pair folds (k-fold style)
    # So the LM sizes are num_folds - 1
    # One problem is in the case where there is one pair to learn from, this means there is no 'other' model
    #, only one, so it has to use its own probabilities, which aren't representative
    tic = time.time()
    training_folds = {k: refs for k, refs in references_per_pair.items() if k != test}
    global_training_folds = dict(training_folds)
    extra_fold_penalty = 1 if heldout_extra_fold is not None else 0
    if (len(global_training_folds)-extra_fold_penalty) == 1:
        # this won't work at all as it needs to learn from other pairs, so just copy it to learn from its final model
        # not ideal but at least gives values
        global_training_folds['extra'] = global_training_folds[list(global_training_folds.keys())[0]]

    lms_global = get_new_piece_lms_from_folds(n_other, k_other, good_pieces,
                                              global_training_folds, target_folds=global_training_folds.keys(),
                                              train=True,
                                              heldout_extra_fold=heldout_extra_fold)
    #print("global model trained for training")
    lms_self = get_new_piece_lms_from_folds(n_self, k_self, good_pieces,
                                            training_folds,
                                            target_folds=training_folds.keys(),
                                            train=False)
    #print("local model trained for training")
    raw_train_data = generate_classifier_data_from_lms_and_params(lms_self,
                                                                  lms_global,
                                                                  good_pieces,
                                                                  lambda_local,
                                                                  cutoff_word_global_lm,
                                                                  training_folds)

    print(time.time()-tic, "all lm features extracted")
    train_data = generate_classifier_data_from_raw(raw_train_data, lexical=lexical, lm_features=lm_features, moving_average=moving_average)
    #print("length training data", len(train_data))

    # now generate the test data using LMs trained on all the other folds except heldout, to ensure the same sized LMs
    all_global_training_folds = {k: refs for k, refs in references_per_pair.items() if k != heldout}
    if (len(all_global_training_folds)-extra_fold_penalty) == 1:
        # this won't work at all as it needs to learn from other pairs, so just copy it to learn from its final model
        # not ideal but at least gives values
        # NOTE(review): relies on 'extra' having been added to global_training_folds above
        all_global_training_folds['extra'] = global_training_folds['extra']

    test_folds = {k: refs for k, refs in references_per_pair.items() if k == test}

    # if using all other folds, needs to take one other fold out for the LM training to ensure same size as training
    lms_global = get_new_piece_lms_from_folds(n_other, k_other, good_pieces,
                                                      all_global_training_folds, target_folds=[test], train=True,
                                                      heldout_extra_fold=heldout_extra_fold)
    #print("global model trained for testing")
    lms_self = get_new_piece_lms_from_folds(n_self, k_self, good_pieces,
                                                      test_folds, target_folds=[test], train=False)

    #print("local model trained for testing")
    raw_test_data = generate_classifier_data_from_lms_and_params(lms_self,
                                                                 lms_global,
                                                                 good_pieces,
                                                                 lambda_local,
                                                                 cutoff_word_global_lm,
                                                                 test_folds)

    test_data = generate_classifier_data_from_raw(raw_test_data, lexical=lexical, lm_features=lm_features, moving_average=moving_average)

    if no_train:
        train_data = []  # just start with no data

    y_true = [x[1] for x in test_data]
    if exhaustive_retrain:
        y_pred = []
        # do exhaustive incremental testing on current example then retraining given last example
        for t in range(0, len(test_data)):
            if len(set([x[1] for x in train_data])) < 2:  # fewer than two classes seen so far, can't train yet
                random.seed(t)  # guess a piece using index t as random seed
                # shuffle a copy: shuffling good_pieces itself reordered the shared global list
                candidates = list(good_pieces)
                shuffle(candidates)
                # append, not extend: extend would split a multi-character label into characters
                y_pred.append(candidates[0])
            else:
                # predict and train model with current data
                cl = train_classifier(train_data)
                y_pred.extend(predict_labels([test_data[t]], cl))

            feats, label = test_data[t]
            train_data.append((dict(feats), label))  # add current example just attempted to training data
    else:
        cl = train_classifier(train_data)
        y_pred = predict_labels(test_data, cl)

    prf = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
    #print(prf)
    #print(classification_report(y_true, y_pred))
    return prf[2], y_pred, y_true, accuracy

In [41]:
# Do optimization to find best cut-off and lambda local param values (rather than just cross-entropy as not straightforward)
# find best lambda local and cut-off word through cross-val testing, pick one with highest accuracy

# NB got to ['n_other', 'n_self', 'cutoff_word_global_lm', 'k_other', 'k_self', 'lambda_local'] = [2, 2, 10, 0.45, 0.2, 0.1] 0.8469793454633859
# # actual order:
# ['n_other', 'n_self', 'k_other', 'k_self', 'cutoff_word_global_lm',  'lambda_local'] = [2, 2, 0.45, 0.2, 10,  0.1] 0.8469793454633859


# Grid search over the language-model and weighting parameters, scored by the mean
# cross-validated F-score of get_fscore_and_preds. Very slow, so disabled by default.
OPTIMIZING = False
if OPTIMIZING:
    # NOTE(review): these four values are rebound straight away by the loops below
    n_other = opt_params['n_other']
    n_self = opt_params['n_self']
    k_other = opt_params['k_other']
    k_self = opt_params['k_self']

    print(TEST, HELDOUT)
    # cross-validate on every pair except the final test pair
    cross_val_data = {k:references_per_pair[k] for k in filter(lambda x:x !=TEST, references_per_pair.keys())}
    results = {}  # "n_other,n_self,cutoff,k_other,k_self,lambda" -> mean F-score

    best_f = 0
    best_params = []

    # The file is opened before the search so a bad path fails immediately, and the
    # context manager closes it even if the search raises or is interrupted.
    # For a fresh run use a new file and write the header row first:
    #   open(datetime.datetime.now().isoformat() + ".csv", "w")
    #   ",".join(['n_other', 'n_self', 'cutoff_word_global_lm', 'k_other', 'k_self', 'lambda_local', 'fscore'])
    with open("2023-06-14T18:49:05.057567.csv", "a") as results_file:
        for n_other in range(1,3): #2
            for n_self in range(1,3): #4
                for k_other_raw in range(5,105,5): #80
                    k_other = k_other_raw/100
                    for k_self_raw in range(5,105,5): #1_600
                        k_self = k_self_raw/100
                        for cutoff_word_global_lm in range(1,11): #16_000
                            # assume convex - not quite right but nearly 
                            prev = 0
                            for lambda_local_raw in range(0,105,5): #320_000 params max, at least 32_000!
                                lambda_local = lambda_local_raw/100
                                # resume point: skip everything up to and including the last
                                # combination reached before (listed in loop-nesting order)
                                if [n_other, n_self, k_other, k_self, cutoff_word_global_lm, lambda_local] <= [2, 2, 0.45, 0.2, 10,  0.1]:
                                    continue

                                all_scores = []
                                for test in cross_val_data.keys():
                                    # pick a substitute held-out pair when the fold under test is HELDOUT
                                    random.seed(0)
                                    l = list(filter(lambda x:x!=test,(cross_val_data.keys())))
                                    shuffle(l)
                                    heldout = l[0] if test == HELDOUT else HELDOUT
                                    fscore, preds, labels, accuracy = get_fscore_and_preds(cross_val_data, test, heldout, n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm)
                                    all_scores.append(fscore)
                                mean_f = np.mean(all_scores)
                                # column order of the results file
                                test_params = [n_other, n_self, cutoff_word_global_lm, k_other, k_self, lambda_local]
                                print(test_params, mean_f)
                                if mean_f < prev: # assume convex - not quite right but nearly 
                                    break  # the score dropped, so stop increasing lambda (this result is not recorded)
                                prev = mean_f
                                results[",".join([str(f) for f in test_params])] = mean_f
                                if mean_f == best_f:
                                    # NOTE(review): on repeated ties this nests the lists ever deeper
                                    best_params = [best_params, test_params]
                                elif mean_f > best_f:
                                    best_f = mean_f
                                    best_params = test_params
                                    print("best", best_f, best_params)

        # write this run's results, best first
        for params, result in sorted(results.items(), key=lambda x:x[1], reverse=True):
            results_file.write(params + "," + str(result) + "\n")


In [42]:
# if already optimized look at results
# Load the saved grid-search results; the handle is closed as soon as the table is read
with open("2023-06-14T18:49:05.057567.csv") as results_csv:
    df = pd.read_csv(results_csv)
# Copy the parameters of the first row (row label 0) into opt_params.
# NOTE(review): this assumes the file's first row is the best-scoring one.
for _param_name in ['n_other', "n_self", 'k_other', "k_self", 'lambda_local', "cutoff_word_global_lm"]:
    opt_params[_param_name] = df.head(1)[_param_name][0]
print(opt_params)

{'n_self': 1, 'k_self': 0.1, 'n_other': 1, 'k_other': 0.9, 'lambda_local': 0.1, 'cutoff_word_global_lm': 6}


In [43]:
# Display the first rows (at most 20) of the loaded grid-search results
df.head(20)

Unnamed: 0,n_other,n_self,cutoff_word_global_lm,k_other,k_self,lambda_local,fscore
0,1,1,6,0.9,0.1,0.1,0.86553
1,1,1,6,0.95,0.1,0.1,0.865376
2,1,1,5,1.0,0.05,0.2,0.865096
3,1,1,5,0.95,0.05,0.2,0.865075
4,1,1,8,1.0,0.1,0.15,0.865066
5,1,1,6,1.0,0.1,0.1,0.865005
6,1,1,7,1.0,0.1,0.1,0.864977
7,1,1,7,1.0,0.05,0.15,0.864896
8,1,1,6,0.8,0.1,0.1,0.864542
9,1,1,6,0.85,0.1,0.1,0.864479


# Experiment 1: some plots on x-val data with optimized params

In [44]:
def plot_piece_lm_values_for_pair_piece(data, piece, moving_average=False, fig_file=None):
    """Plot the score each piece language model gives to references to one target piece.

    Every reference in ``data`` whose target piece equals ``piece`` contributes one
    x-position; one line is drawn per piece language model. The line belonging to the
    target piece's own model is solid with markers, all others are dashed.
    As a side effect one LaTeX table row (speaker, end time, text) is printed per
    plotted reference.

    Args:
        data: non-empty sequence of 8-tuples
            ``(text, speaker, utt_id, end_time, target_piece, prob_dict,
            piece_lm_moving_average, ref_length)``. The keys of the first item's
            ``prob_dict`` define which piece language models get a line.
        piece: the target piece to plot references for.
        moving_average: if exactly ``True``, plot the value for ``piece`` found in each
            model's ``piece_lm_moving_average`` entry after passing it through
            ``get_zscore_dict_for_prob_dict``; otherwise plot ``prob_dict[lm][0]``.
        fig_file: optional path; when given the figure is saved there.

    The current matplotlib figure is cleared before returning.
    """
    # One (initially empty) series per piece language model.
    piece_plots = {p: [] for p in data[0][5].keys()}
    for ref_info in data:
        text, speaker, utt_id, end_time, target_piece, prob_dict, piece_lm_moving_average, ref_length = ref_info

        if not target_piece == piece:  # only focus on target piece
            continue
        # LaTeX table row for this reference
        print(speaker.split("_")[-1], "&", round(end_time,1),  "&", text, "\\\\")
        if moving_average is True:
            # how does each piece lm give a probability to THIS target piece over time
            for lm_piece, lm_prob_dict in piece_lm_moving_average.items():
                zscores = get_zscore_dict_for_prob_dict(lm_prob_dict)
                # direct lookup instead of scanning every entry for the single matching key
                if piece in zscores:
                    piece_plots[lm_piece].append(zscores[piece])
        else:
            # if just raw probs
            for lm_piece, prob in prob_dict.items():
                piece_plots[lm_piece].append(prob[0])

    # RGB reference for the custom colours:
    #   orange (255, 128, 0), grey (160, 160, 160), pink (255, 153, 204),
    #   brown (153, 76, 0), purple (102, 0, 204)
    colour_map = {'F': (160/255, 160/255, 160/255), 'I': "blue" , 'L': (255/255, 128/255, 0/255),
                  'N': (102/255, 0/255, 204/255),
                  'P': (255/255, 153/255, 204/255), 'T': "green", 'U': "yellow", 'V': "blue",
                  'W' :"green", 'X': "red", 'Y': (153/255, 76/255, 0/255), 'Z': "blue"}

    for lm_piece in sorted(piece_plots.keys()):
        is_target = lm_piece == piece
        # The label keeps its trailing space so legends are unchanged from earlier figures.
        plt.plot(piece_plots[lm_piece], label=lm_piece + " ", color=colour_map[lm_piece],
                 linestyle="solid" if is_target else "dashed",
                 marker="o" if is_target else "none")

    # Number the x ticks 1..N by the longest series rather than by a hard-coded 'X' entry,
    # so the function also works when 'X' is not among the piece models.
    num_refs = max((len(series) for series in piece_plots.values()), default=0)
    plt.xticks(np.arange(num_refs), np.arange(1, num_refs + 1), fontsize=8, rotation=25)

    plt.legend(loc='upper left', bbox_to_anchor=(1, 0.85),
          fancybox=True, shadow=True)

    plt.ylabel('Negative log prob per word (Z-score)')
    plt.xlabel('Referring expression number to target piece in interaction')

    #plt.show()
    if fig_file is not None:
        plt.savefig(fig_file)
    plt.clf()


In [45]:
# print(TEST, HELDOUT, EXTRA_HELDOUT)
# Every pair except the final test pair is available here.
cross_val_data = {pair: refs for pair, refs in references_per_pair.items() if pair != TEST}

print(opt_params)
n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm = (
    opt_params[name]
    for name in ('n_other', 'n_self', 'k_other', 'k_self', 'lambda_local', 'cutoff_word_global_lm')
)
exp_params = [n_other, n_self, cutoff_word_global_lm, k_other, k_self, lambda_local, True, True, False]
print(exp_params)

cross_val_results, all_preds, all_y = [], [], []

# Single configuration: treat 'r2' as the left-out fold and build the models on the rest.
test = 'r2'
if test == HELDOUT:
    heldout = EXTRA_HELDOUT
else:
    heldout = HELDOUT
training_folds = {pair: refs for pair, refs in cross_val_data.items() if pair != test}

# "Global" piece language models (train=True) ...
lms_global = get_new_piece_lms_from_folds(
    n_other, k_other, good_pieces, training_folds,
    target_folds=training_folds.keys(), train=True)
# ... and the "self"/local ones (train=False).
lms_self = get_new_piece_lms_from_folds(
    n_self, k_self, good_pieces, training_folds,
    target_folds=training_folds.keys(), train=False)
raw_train_data = generate_classifier_data_from_lms_and_params(
    lms_self, lms_global, good_pieces, lambda_local, cutoff_word_global_lm, training_folds)

# One PDF per piece, all for the same pair.
_pair = 'r3'
for _piece in sorted(good_pieces):
    print(_piece)
    plot_piece_lm_values_for_pair_piece(raw_train_data[_pair], _piece,
                                        moving_average=True, fig_file=_pair + _piece + '.pdf')



{'n_self': 1, 'k_self': 0.1, 'n_other': 1, 'k_other': 0.9, 'lambda_local': 0.1, 'cutoff_word_global_lm': 6}
[1, 1, 6, 0.9, 0.1, 0.1, True, True, False]
F
B & 248.2 & der graue stein \\
B & 284.3 & den grauen \\
B & 485.1 & der graue stein \\
B & 1020.5 & der graue stein \\
B & 1121.2 & der graue stein \\
B & 1150.4 & dem grauen \\
A & 1622.3 & den grauen \\
A & 1649.5 & den grauen \\
A & 1810.5 & den grauen \\
A & 1940.9 & den grauen \\
A & 1951.9 & der graue stein \\
A & 1978.7 & den grauen \\
A & 2301.9 & den grauen stein \\
A & 2339.7 & von dem grauen \\
A & 2419.0 & den grauen \\
A & 2434.2 & dieses \\
A & 2453.4 & dem grauen \\
I
A & 340.4 & der lange \\
B & 378.3 & der blaue balken \\
B & 846.8 & der blaue balken \\
B & 862.6 & der blaue balken \\
B & 883.7 & den blauen \\
B & 899.6 & den blauen balken \\
B & 908.8 & der balken \\
B & 1117.8 & den winkel \\
B & 1270.5 & der blaue balken \\
A & 1441.8 & den blauen balken \\
A & 1657.7 & den blauen balken \\
A & 1704.7 & dem balken

<Figure size 640x480 with 0 Axes>

# Experiment 1 final runs/results (on xval then test data)

In [46]:
# Cross-validation results, keyed by the comma-joined experiment parameters of each system.
# Re-running this cell discards all results stored so far.
xval_results = {}

In [47]:
# 1) Re-run the best cross-validation configuration.
# The extra heldout fold keeps the number of training folds at 5 when HELDOUT itself is tested.
cross_val_data = {pair: refs for pair, refs in references_per_pair.items() if pair != TEST}

print(opt_params)
n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm = (
    opt_params[name]
    for name in ('n_other', 'n_self', 'k_other', 'k_self', 'lambda_local', 'cutoff_word_global_lm')
)

cross_val_results, all_preds, all_y = [], [], []
tic = time.time()
for test in cross_val_data:
    # The normal heldout fold is swapped out only when it is the fold under test.
    if test == HELDOUT:
        heldout = EXTRA_HELDOUT
    else:
        heldout = HELDOUT
    f, preds, y, accuracy = get_fscore_and_preds(
        cross_val_data, test, heldout,
        n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm)
    all_preds.extend(preds)
    all_y.extend(y)
    cross_val_results.append(accuracy)
toc = time.time()
print(toc-tic)

# Trailing flags mirror the other experiment cells: (True, lm features on, exhaustive retrain off).
exp_params = [n_other, n_self, cutoff_word_global_lm, k_other, k_self, lambda_local, True, True, False]
mean_accuracy = np.mean(cross_val_results)
xval_results[','.join(map(str, exp_params))] = (cross_val_results, mean_accuracy, all_preds)
print(cross_val_results, mean_accuracy)

{'n_self': 1, 'k_self': 0.1, 'n_other': 1, 'k_other': 0.9, 'lambda_local': 0.1, 'cutoff_word_global_lm': 6}
1.420598030090332 all lm features extracted
1.4563159942626953 all lm features extracted
1.3999531269073486 all lm features extracted
1.3520081043243408 all lm features extracted
1.3699538707733154 all lm features extracted
1.0786170959472656 all lm features extracted
1.1116728782653809 all lm features extracted
11.885232210159302
[0.87, 0.8176470588235294, 0.8943396226415095, 0.8842105263157894, 0.8951048951048951, 0.8892045454545454, 0.7944785276073619] 0.8635693108496615


In [48]:
# 2) Baseline: lexical information only -- no language-model features and no retraining.
cross_val_results, all_preds, all_y = [], [], []
tic = time.time()
for test in cross_val_data:
    # The normal heldout fold is swapped out only when it is the fold under test.
    if test == HELDOUT:
        heldout = EXTRA_HELDOUT
    else:
        heldout = HELDOUT
    f, preds, y, accuracy = get_fscore_and_preds(
        cross_val_data, test, heldout,
        n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm,
        lm_features = False)
    all_preds.extend(preds)
    all_y.extend(y)
    cross_val_results.append(accuracy)
toc = time.time()
print(toc-tic)

exp_params = [n_other, n_self, cutoff_word_global_lm, k_other, k_self, lambda_local, True, False, False]
mean_accuracy = np.mean(cross_val_results)
xval_results[','.join(map(str, exp_params))] = (cross_val_results, mean_accuracy, all_preds)
print(cross_val_results, mean_accuracy)

1.368596076965332 all lm features extracted
1.350687026977539 all lm features extracted
1.1674659252166748 all lm features extracted
1.1486430168151855 all lm features extracted
1.3383722305297852 all lm features extracted
1.1068978309631348 all lm features extracted
1.1096389293670654 all lm features extracted
10.140897989273071
[0.71, 0.8, 0.8943396226415095, 0.8912280701754386, 0.8531468531468531, 0.8579545454545454, 0.7699386503067485] 0.8252296773892994


In [49]:
# Hand-entered per-fold timings in seconds.
# NOTE(review): presumably copied from the "... all lm features extracted" lines of an
# earlier run -- they do not match the outputs currently shown in this notebook; confirm.
lm_train_time = sum([
    1.3798320293426514,
    1.296260118484497,
    1.1890807151794434,
    1.2357509136199951,
    1.289025068283081,
    1.067748785018921,
    1.1144702434539795,
])
# Elapsed time of the most recent timed run with the LM time taken out.
toc - tic - lm_train_time

1.568730115890503

In [50]:
# 3) Effect of exhaustive retraining: lexical features only, no language-model features.
cross_val_results, all_preds, all_y = [], [], []
tic = time.time()
for test in cross_val_data:
    # The normal heldout fold is swapped out only when it is the fold under test.
    if test == HELDOUT:
        heldout = EXTRA_HELDOUT
    else:
        heldout = HELDOUT
    f, preds, y, accuracy = get_fscore_and_preds(
        cross_val_data, test, heldout,
        n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm,
        lm_features=False, exhaustive_retrain=True)
    all_preds.extend(preds)
    all_y.extend(y)
    cross_val_results.append(accuracy)
toc = time.time()
print(toc-tic)

exp_params = [n_other, n_self, cutoff_word_global_lm, k_other, k_self, lambda_local, True, False, True]
mean_accuracy = np.mean(cross_val_results)
xval_results[','.join(map(str, exp_params))] = (cross_val_results, mean_accuracy, all_preds)
print(cross_val_results, mean_accuracy)

1.3765959739685059 all lm features extracted
1.3137869834899902 all lm features extracted
1.201883316040039 all lm features extracted
1.2006549835205078 all lm features extracted
1.3009788990020752 all lm features extracted
1.092986822128296 all lm features extracted
1.115159034729004 all lm features extracted
31.92237901687622
[0.86, 0.8352941176470589, 0.909433962264151, 0.8982456140350877, 0.8741258741258742, 0.8920454545454546, 0.7822085889570553] 0.8644790873678118


In [51]:
# Wall-clock time of the most recently timed run (toc - tic) minus lm_train_time.
toc-tic-lm_train_time

23.350211143493652

In [52]:
# Compare every unordered pair of cross-validated systems once with McNemar's test.
# `all_y` holds the labels collected by the most recently run experiment cell.
comparison_results = {}
system_names = list(xval_results.keys())
for position, sys1 in enumerate(system_names):
    # Only systems after sys1, so each pair is compared once and never with itself.
    for sys2 in system_names[position + 1:]:
        cl1_preds = xval_results[sys1][2]
        cl2_preds = xval_results[sys2][2]
        sig = calculate_mcnemar_test(cl1_preds, cl2_preds, all_y, alpha=0.05, exact=True)
        mean_scores = [round(xval_results[sys1][1],3), round(xval_results[sys2][1],3)]
        comparison_results[sys1 + " V " + sys2] = mean_scores + list(sig)

print(len(comparison_results))
# Numbered listing: "<n> ) <sys1> V <sys2> [mean1, mean2, *test outcome]"
count = 1
for k, v in comparison_results.items():
    print(count, ")", k, v)
    count += 1

statistic=21.000, p-value=0.000
Different proportions of errors (reject H0)
statistic=23.000, p-value=0.672
Same proportions of errors (fail to reject H0)
statistic=9.000, p-value=0.000
Different proportions of errors (reject H0)
3
1 ) 1,1,6,0.9,0.1,0.1,True,True,False V 1,1,6,0.9,0.1,0.1,True,False,False [0.864, 0.825, 21.0, 4.968015141912969e-06, True, True, True]
2 ) 1,1,6,0.9,0.1,0.1,True,True,False V 1,1,6,0.9,0.1,0.1,True,False,True [0.864, 0.864, 23.0, 0.6718110337653656, False, False, False]
3 ) 1,1,6,0.9,0.1,0.1,True,False,False V 1,1,6,0.9,0.1,0.1,True,False,True [0.825, 0.864, 9.0, 3.542223380175076e-09, True, True, True]


In [53]:
# Results on the final test pair, keyed by the comma-joined experiment parameters of each system.
# Re-running this cell discards all test results stored so far.
test_results = {}

In [54]:
print(TEST, HELDOUT, EXTRA_HELDOUT)

print(opt_params)
n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm = (
    opt_params[name]
    for name in ('n_other', 'n_self', 'k_other', 'k_self', 'lambda_local', 'cutoff_word_global_lm')
)

cross_val_results, all_preds, all_y = [], [], []

# Single evaluation on the final test pair, with both heldout folds passed to the scorer.
test = TEST
#print("heldout", heldout)
f, preds, y, accuracy = get_fscore_and_preds(
    references_per_pair, test, HELDOUT,
    n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm,
    heldout_extra_fold=EXTRA_HELDOUT)
all_preds.extend(preds)
all_y.extend(y)
cross_val_results.append(accuracy)

exp_params = [n_other, n_self, cutoff_word_global_lm, k_other, k_self, lambda_local, True, True, False]
print(exp_params)
mean_accuracy = np.mean(cross_val_results)
test_results[','.join(map(str, exp_params))] = (cross_val_results, mean_accuracy, all_preds)
print(cross_val_results, mean_accuracy)

r6 r3 r7
{'n_self': 1, 'k_self': 0.1, 'n_other': 1, 'k_other': 0.9, 'lambda_local': 0.1, 'cutoff_word_global_lm': 6}
1.4918701648712158 all lm features extracted
[1, 1, 6, 0.9, 0.1, 0.1, True, True, False]
[0.8798449612403101] 0.8798449612403101


In [55]:
# 2) Get the baseline of lexical info only, no updating/retraining or lms
cross_val_results = []
all_preds = []
all_y = []
for test in [TEST]:
    # Use the HELDOUT constant rather than the `heldout` variable left over from whichever
    # cross-validation loop ran last, and hold out EXTRA_HELDOUT as well, so this baseline
    # is set up the same way as the other test-set runs it is compared against.
    f, preds, y, accuracy = get_fscore_and_preds(references_per_pair, test, HELDOUT, n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm,
                                      lm_features=False, heldout_extra_fold=EXTRA_HELDOUT)
    all_preds.extend(preds)
    all_y.extend(y)
    cross_val_results.append(accuracy)
# Trailing flags: (True, lm features off, exhaustive retrain off)
exp_params = [n_other, n_self, cutoff_word_global_lm, k_other,k_self, lambda_local, True, False, False]
test_results[','.join([str(p) for p in exp_params])] = (cross_val_results, np.mean(cross_val_results),all_preds)
print(cross_val_results, np.mean(cross_val_results))

1.4520947933197021 all lm features extracted
[0.8333333333333334] 0.8333333333333334


In [56]:
# 3) see what difference exhaustive retraining makes, firstly lexical only, no lm features
cross_val_results = []
all_preds = []
all_y = []
for test in [TEST]:
    # Use the HELDOUT constant (as the first test-set run does) instead of the `heldout`
    # variable leaked from an earlier cross-validation loop, whose value depends on which
    # cell happened to run last.
    f, preds, y, accuracy = get_fscore_and_preds(references_per_pair, test, HELDOUT, n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm,
                                      lm_features=False, exhaustive_retrain=True, heldout_extra_fold=EXTRA_HELDOUT)
    all_preds.extend(preds)
    all_y.extend(y)
    cross_val_results.append(accuracy)
# Trailing flags: (True, lm features off, exhaustive retrain on)
exp_params = [n_other, n_self, cutoff_word_global_lm, k_other,k_self, lambda_local, True, False, True]
test_results[','.join([str(p) for p in exp_params])] = (cross_val_results, np.mean(cross_val_results),all_preds)
print(cross_val_results, np.mean(cross_val_results))

1.4181408882141113 all lm features extracted
[0.8992248062015504] 0.8992248062015504


In [57]:
# Compare every unordered pair of test-set systems once with McNemar's test.
# `all_y` holds the labels collected by the most recently run experiment cell.
comparison_results_test = {}
system_names = list(test_results.keys())
for position, sys1 in enumerate(system_names):
    # Only systems after sys1, so each pair is compared once and never with itself.
    for sys2 in system_names[position + 1:]:
        cl1_preds = test_results[sys1][2]
        cl2_preds = test_results[sys2][2]
        sig = calculate_mcnemar_test(cl1_preds, cl2_preds, all_y, alpha=0.05, exact=True)
        mean_scores = [round(test_results[sys1][1],3), round(test_results[sys2][1],3)]
        comparison_results_test[sys1 + " V " + sys2] = mean_scores + list(sig)

# Numbered listing: "<n> ) <sys1> V <sys2> [mean1, mean2, *test outcome]"
count = 1
for k, v in comparison_results_test.items():
    print(count, ")", k, v)
    count += 1

statistic=0.000, p-value=0.000
Different proportions of errors (reject H0)
statistic=0.000, p-value=0.062
Same proportions of errors (fail to reject H0)
statistic=0.000, p-value=0.000
Different proportions of errors (reject H0)
1 ) 1,1,6,0.9,0.1,0.1,True,True,False V 1,1,6,0.9,0.1,0.1,True,False,False [0.88, 0.833, 0.0, 0.00048828125, True, True, True]
2 ) 1,1,6,0.9,0.1,0.1,True,True,False V 1,1,6,0.9,0.1,0.1,True,False,True [0.88, 0.899, 0.0, 0.0625, False, False, False]
3 ) 1,1,6,0.9,0.1,0.1,True,False,False V 1,1,6,0.9,0.1,0.1,True,False,True [0.833, 0.899, 0.0, 1.52587890625e-05, True, True, True]


# Experiment 2: Removing the training data by one pair each time, down to 1 pair

In [58]:
# Do optimization to find best cut-off and lambda local param values (rather than just cross-entropy as not straightforward)
# find best lambda local and cut-off word through cross-val testing, pick one with highest score.
# Grid search repeated for 1..5 training pairs; every evaluated setting is written, best first,
# to "<start timestamp>_reduced.csv".
OPTIMIZING = False
if OPTIMIZING:

    # assume these are the same (could be incorrect)
    n_self = opt_params['n_self']
    k_self = opt_params['k_self']

    print(TEST, HELDOUT, EXTRA_HELDOUT)

    results = {}
    # Fix the file name now so its timestamp marks the start of the search; the file itself is
    # only opened once there is something to write, and `with` guarantees it is closed.
    results_path = datetime.datetime.now().isoformat() + "_reduced.csv"

    for num_pairs in range(1,6):  # Full model is using 6 folds for training in cross val

        best_f = 0
        best_params = []
        for n_other in range(1,2): #1
            for k_other_raw in range(5,105,5): #20
                k_other = k_other_raw/100
                for cutoff_word_global_lm in range(1,11): #200
                    # assume convex - not quite right but nearly
                    prev = 0
                    for lambda_local_raw in range(0,105,5): #4_000 params max, at least 4_00!
                        lambda_local = lambda_local_raw/100
                        all_scores = []
                        for test in list(filter(lambda x:x!=TEST, references_per_pair.keys())):
                            heldout = HELDOUT if test!=HELDOUT else EXTRA_HELDOUT # use the heldout when training on full dataset to simulate same lm sizes as in development
                            # Candidate extra training pairs in a fixed order: the two heldout
                            # pairs first, then the rest; the current heldout and test are removed.
                            other_pairs = list(filter(lambda x:x not in [heldout, test],
                                                      [HELDOUT, EXTRA_HELDOUT] + list(filter(lambda x:x not in [HELDOUT, EXTRA_HELDOUT], ['r1', 'r2', 'r3', 'r4', 'r5', 'r7', 'r8']))))
                            cross_val_pairs = [heldout, test] + other_pairs[:num_pairs-1]
                            cross_val_data = {k:references_per_pair[k] for k in filter(lambda x:x in cross_val_pairs, references_per_pair.keys())}
                            # get_fscore_and_preds returns four values (every other call in this
                            # notebook unpacks four); unpacking only three raised ValueError here.
                            fscore, preds, labels, accuracy = get_fscore_and_preds(cross_val_data, test, heldout, n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm,
                                                                        )
                            all_scores.append(fscore)
                        mean_f = np.mean(all_scores)
                        test_params = [num_pairs, n_other, n_self, cutoff_word_global_lm, k_other, k_self, lambda_local]
                        print(test_params, mean_f)
                        # Early stop: once the score drops for increasing lambda_local, skip the
                        # remaining lambda values (assume convex - not quite right but nearly).
                        if mean_f < prev:
                            break
                        prev = mean_f
                        results[",".join([str(f) for f in test_params])] = mean_f
                        if mean_f == best_f:
                            # NOTE(review): on ties this nests the previous value inside a new
                            # list; best_params is only kept for the progress print below.
                            best_params = [best_params, test_params]
                        elif mean_f > best_f:
                            best_f = mean_f
                            best_params = test_params
                            print("best", best_f, best_params)

    # Header plus one row per evaluated setting, sorted by descending score.
    with open(results_path, "w") as results_file:
        results_file.write(",".join(['num_pairs', 'n_other', 'n_self', 'cutoff_word_global_lm', 'k_other', 'k_self', 'lambda_local', 'fscore']) + "\n")
        for params, result in sorted(results.items(), key=lambda x:x[1], reverse=True):
            results_file.write(params + "," + str(result) + "\n")

In [59]:
# If the reduced-training-data search has already been run, load its saved results.
# The path is handed straight to pandas so that pandas opens *and closes* the file itself
# (wrapping it in open() left the file handle unclosed).
df = pd.read_csv("2023-06-21T08:17:12.999981_reduced.csv")
# Rows for the smallest setting (one training pair), shown as the cell output.
local_df = df[df['num_pairs']==1]
local_df

Unnamed: 0,num_pairs,n_other,n_self,cutoff_word_global_lm,k_other,k_self,lambda_local,fscore
0,1,1,1,7,0.10,0.1,0.10,0.783648
1,1,1,1,8,0.20,0.1,0.25,0.783520
2,1,1,1,8,0.10,0.1,0.10,0.782771
3,1,1,1,6,0.10,0.1,0.10,0.781791
4,1,1,1,4,0.15,0.1,0.05,0.781598
...,...,...,...,...,...,...,...,...
754,1,1,1,6,1.00,0.1,0.00,0.603963
755,1,1,1,7,1.00,0.1,0.00,0.603963
756,1,1,1,8,1.00,0.1,0.00,0.603963
757,1,1,1,9,1.00,0.1,0.00,0.603963


In [60]:
# For each training-set size (1..5 pairs) store the parameters from the first row of that
# size's subset of `df`, under keys of the form "<param>_<num pairs>".
opt_params_reduced = {}
integer_params = ('n_self', 'n_other', 'cutoff_word_global_lm')
for num_training_pairs in range(1,6):
    local_df = df[df['num_pairs']==num_training_pairs]
    best_row = local_df.iloc[0]
    for param_name in ('n_self', 'k_self', 'n_other', 'k_other', 'lambda_local', 'cutoff_word_global_lm'):
        param_value = best_row[param_name]
        if param_name in integer_params:
            # a row taken across mixed int/float columns is not integer-typed, so cast back
            param_value = int(param_value)
        opt_params_reduced['{}_{}'.format(param_name, num_training_pairs)] = param_value

In [61]:
# With 6 training pairs the model is trained on everything, so the parameters from the
# full search (opt_params) are assumed to apply unchanged.
for param_name in ('n_self', 'k_self', 'n_other', 'k_other', 'lambda_local', 'cutoff_word_global_lm'):
    opt_params_reduced['{}_{}'.format(param_name, 6)] = opt_params[param_name]

In [62]:
# List every stored "<param>_<num pairs>" setting with its value, in insertion order.
for param, param_value in opt_params_reduced.items():
    print(param, param_value)

n_self_1 1
k_self_1 0.1
n_other_1 1
k_other_1 0.1
lambda_local_1 0.1
cutoff_word_global_lm_1 7
n_self_2 1
k_self_2 0.1
n_other_2 1
k_other_2 1.0
lambda_local_2 0.8
cutoff_word_global_lm_2 6
n_self_3 1
k_self_3 0.1
n_other_3 1
k_other_3 0.95
lambda_local_3 0.65
cutoff_word_global_lm_3 8
n_self_4 1
k_self_4 0.1
n_other_4 1
k_other_4 0.75
lambda_local_4 0.4
cutoff_word_global_lm_4 5
n_self_5 1
k_self_5 0.1
n_other_5 1
k_other_5 0.7
lambda_local_5 0.3
cutoff_word_global_lm_5 7
n_self_6 1
k_self_6 0.1
n_other_6 1
k_other_6 0.9
lambda_local_6 0.1
cutoff_word_global_lm_6 6


In [63]:
# Experiment 2 cross-validation results, keyed by the comma-joined experiment parameters
# (which start with the number of training pairs). Re-running this cell discards them.
reduced_xval_results = {}

In [64]:
# 1) Re-run the best cross-validation configuration for each training-set size (1..6 pairs).
print(TEST, HELDOUT, EXTRA_HELDOUT)

local_results = {}
for num_pairs in range(1,7):
    print("num pairs", num_pairs)
    #print(opt_params_reduced)

    # Stored parameters are keyed "<param>_<num pairs>", with the size capped at 6.
    size_suffix = min([6,num_pairs])
    n_self, k_self, n_other, k_other, lambda_local, cutoff_word_global_lm = (
        opt_params_reduced['{}_{}'.format(param_name, size_suffix)]
        for param_name in ('n_self', 'k_self', 'n_other', 'k_other', 'lambda_local', 'cutoff_word_global_lm')
    )

    # Full system: lexical + language-model features, no exhaustive retraining.
    lexical, lm_features, retrain = True, True, False

    exp_params = [num_pairs, n_other, n_self, cutoff_word_global_lm, k_other, k_self, lambda_local,
                  lexical, lm_features, retrain]
    print(exp_params)
    cross_val_results, all_preds, all_y = [], [], []
    for i, test in enumerate(['r1', 'r2', 'r3', 'r4', 'r5', 'r7', 'r8']):
        #random.seed(0)  # note in training this was always 0
        #shuffle(training_folds)

        # use the heldout when training on full dataset to simulate same lm sizes as in development
        if test == HELDOUT:
            heldout = EXTRA_HELDOUT
        else:
            heldout = HELDOUT
        # Candidate extra training pairs in a fixed order: the two heldout pairs first, then the
        # remaining pairs; the current heldout and test pairs are then dropped.
        ordered_pairs = [HELDOUT, EXTRA_HELDOUT] + [
            pair for pair in ['r1', 'r2', 'r3', 'r4', 'r5', 'r7', 'r8']
            if pair not in [HELDOUT, EXTRA_HELDOUT]
        ]
        other_pairs = [pair for pair in ordered_pairs if pair not in [heldout, test]]
        cross_val_pairs = [heldout, test] + other_pairs[:num_pairs-1]
        print(cross_val_pairs, test, heldout)
        cross_val_data = {pair: refs for pair, refs in references_per_pair.items()
                          if pair in cross_val_pairs}
        #print("heldout", heldout)
        #print(cross_val_data.keys(), test)
        fscore, preds, labels, accuracy = get_fscore_and_preds(
            cross_val_data, test, heldout,
            n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm,
            lexical=lexical, lm_features=lm_features, exhaustive_retrain=retrain,
            no_train=False)

        all_preds.extend(preds)
        all_y.extend(labels)
        cross_val_results.append(accuracy)

    mean_accuracy = np.mean(cross_val_results)
    reduced_xval_results[','.join(map(str, exp_params))] = (cross_val_results, mean_accuracy, all_preds)
    print(cross_val_results, mean_accuracy)
    local_results[num_pairs] = mean_accuracy
print(local_results)

r6 r3 r7
num pairs 1
[1, 1, 1, 7, 0.1, 0.1, 0.1, True, True, False]
['r3', 'r1'] r1 r3
0.2286689281463623 all lm features extracted
['r3', 'r2'] r2 r3
0.22969603538513184 all lm features extracted
['r7', 'r3'] r3 r7
0.32456493377685547 all lm features extracted
['r3', 'r4'] r4 r3
0.22815895080566406 all lm features extracted
['r3', 'r5'] r5 r3
0.22738289833068848 all lm features extracted
['r3', 'r7'] r7 r3
0.23005104064941406 all lm features extracted
['r3', 'r8'] r8 r3
0.22387003898620605 all lm features extracted
[0.77, 0.711764705882353, 0.8377358490566038, 0.8456140350877193, 0.7692307692307693, 0.8039772727272727, 0.7392638036809815] 0.7825123479522428
num pairs 2
[2, 1, 1, 6, 1.0, 0.1, 0.8, True, True, False]
['r3', 'r1', 'r7'] r1 r3
0.5509071350097656 all lm features extracted
['r3', 'r2', 'r7'] r2 r3
0.5464608669281006 all lm features extracted
['r7', 'r3', 'r1'] r3 r7
0.3910789489746094 all lm features extracted
['r3', 'r4', 'r7'] r4 r3
0.5445067882537842 all lm features extr

In [65]:
# 2) Lexical-only baseline for each training-set size (1..6 pairs).
print(TEST, HELDOUT, EXTRA_HELDOUT)

local_results = {}
for num_pairs in range(1,7):
    print("num pairs", num_pairs)
    #print(opt_params_reduced)

    # Stored parameters are keyed "<param>_<num pairs>", with the size capped at 6.
    size_suffix = min([6,num_pairs])
    n_self, k_self, n_other, k_other, lambda_local, cutoff_word_global_lm = (
        opt_params_reduced['{}_{}'.format(param_name, size_suffix)]
        for param_name in ('n_self', 'k_self', 'n_other', 'k_other', 'lambda_local', 'cutoff_word_global_lm')
    )

    # Baseline: lexical features only, no language-model features, no exhaustive retraining.
    lexical, lm_features, retrain = True, False, False

    exp_params = [num_pairs, n_other, n_self, cutoff_word_global_lm, k_other, k_self, lambda_local,
                  lexical, lm_features, retrain]
    print(exp_params)
    cross_val_results, all_preds, all_y = [], [], []
    for i, test in enumerate(['r1', 'r2', 'r3', 'r4', 'r5', 'r7', 'r8']):
        #random.seed(0)  # note in training this was always 0
        #shuffle(training_folds)

        # use the heldout when training on full dataset to simulate same lm sizes as in development
        if test == HELDOUT:
            heldout = EXTRA_HELDOUT
        else:
            heldout = HELDOUT
        # Candidate extra training pairs in a fixed order: the two heldout pairs first, then the
        # remaining pairs; the current heldout and test pairs are then dropped.
        ordered_pairs = [HELDOUT, EXTRA_HELDOUT] + [
            pair for pair in ['r1', 'r2', 'r3', 'r4', 'r5', 'r7', 'r8']
            if pair not in [HELDOUT, EXTRA_HELDOUT]
        ]
        other_pairs = [pair for pair in ordered_pairs if pair not in [heldout, test]]
        cross_val_pairs = [heldout, test] + other_pairs[:num_pairs-1]
        print(cross_val_pairs, test, heldout)
        cross_val_data = {pair: refs for pair, refs in references_per_pair.items()
                          if pair in cross_val_pairs}
        #print("heldout", heldout)
        #print(cross_val_data.keys(), test)
        fscore, preds, labels, accuracy = get_fscore_and_preds(
            cross_val_data, test, heldout,
            n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm,
            lexical=lexical, lm_features=lm_features, exhaustive_retrain=retrain,
            no_train=False)

        all_preds.extend(preds)
        all_y.extend(labels)
        cross_val_results.append(accuracy)

    mean_accuracy = np.mean(cross_val_results)
    reduced_xval_results[','.join(map(str, exp_params))] = (cross_val_results, mean_accuracy, all_preds)
    print(cross_val_results, mean_accuracy)
    local_results[num_pairs] = mean_accuracy
print(local_results)

r6 r3 r7
num pairs 1
[1, 1, 1, 7, 0.1, 0.1, 0.1, True, False, False]
['r3', 'r1'] r1 r3
0.2290349006652832 all lm features extracted
['r3', 'r2'] r2 r3
0.22480511665344238 all lm features extracted
['r7', 'r3'] r3 r7
0.31954193115234375 all lm features extracted
['r3', 'r4'] r4 r3
0.22333598136901855 all lm features extracted
['r3', 'r5'] r5 r3
0.22367405891418457 all lm features extracted
['r3', 'r7'] r7 r3


  _warn_prf(average, modifier, msg_start, len(result))


0.22140789031982422 all lm features extracted
['r3', 'r8'] r8 r3
0.2231738567352295 all lm features extracted


  _warn_prf(average, modifier, msg_start, len(result))


[0.67, 0.5411764705882353, 0.6867924528301886, 0.7578947368421053, 0.6713286713286714, 0.7272727272727273, 0.5644171779141104] 0.6598403195394341
num pairs 2
[2, 1, 1, 6, 1.0, 0.1, 0.8, True, False, False]
['r3', 'r1', 'r7'] r1 r3
0.5361559391021729 all lm features extracted
['r3', 'r2', 'r7'] r2 r3
0.537459135055542 all lm features extracted
['r7', 'r3', 'r1'] r3 r7
0.38897705078125 all lm features extracted
['r3', 'r4', 'r7'] r4 r3
0.5315558910369873 all lm features extracted
['r3', 'r5', 'r7'] r5 r3
0.5318489074707031 all lm features extracted
['r3', 'r7', 'r1'] r7 r3


  _warn_prf(average, modifier, msg_start, len(result))


0.28885889053344727 all lm features extracted
['r3', 'r8', 'r7'] r8 r3
0.5316002368927002 all lm features extracted
[0.72, 0.5352941176470588, 0.8150943396226416, 0.8350877192982457, 0.7832167832167832, 0.7698863636363636, 0.5858895705521472] 0.7206384134247487
num pairs 3
[3, 1, 1, 8, 0.95, 0.1, 0.65, True, False, False]
['r3', 'r1', 'r7', 'r2'] r1 r3
0.6618103981018066 all lm features extracted
['r3', 'r2', 'r7', 'r1'] r2 r3
0.605445146560669 all lm features extracted
['r7', 'r3', 'r1', 'r2'] r3 r7
0.5120658874511719 all lm features extracted
['r3', 'r4', 'r7', 'r1'] r4 r3
0.6473898887634277 all lm features extracted
['r3', 'r5', 'r7', 'r1'] r5 r3
0.6048290729522705 all lm features extracted
['r3', 'r7', 'r1', 'r2'] r7 r3
0.4182169437408447 all lm features extracted
['r3', 'r8', 'r7', 'r1'] r8 r3
0.6028010845184326 all lm features extracted
[0.72, 0.5352941176470588, 0.8528301886792453, 0.8350877192982457, 0.7762237762237763, 0.7897727272727273, 0.6012269938650306] 0.7300622175694406

In [66]:
# 3) Exhaustive retraining competitor
#    (lexical=True, lm_features=False, exhaustive_retrain=True)
print(TEST, HELDOUT, EXTRA_HELDOUT)

local_results = {}
xval_folds = ['r1', 'r2', 'r3', 'r4', 'r5', 'r7', 'r8']
for num_pairs in range(1, 7):
    print("num pairs", num_pairs)

    # Hyperparameters are looked up in opt_params_reduced under keys suffixed
    # with the number of pairs; the suffix is capped at 6.
    param_suffix = min([6, num_pairs])
    n_self = opt_params_reduced['n_self_{}'.format(param_suffix)]
    k_self = opt_params_reduced['k_self_{}'.format(param_suffix)]
    n_other = opt_params_reduced['n_other_{}'.format(param_suffix)]
    k_other = opt_params_reduced['k_other_{}'.format(param_suffix)]
    lambda_local = opt_params_reduced['lambda_local_{}'.format(param_suffix)]
    cutoff_word_global_lm = opt_params_reduced['cutoff_word_global_lm_{}'.format(param_suffix)]

    lexical, lm_features, retrain = True, False, True

    exp_params = [
        num_pairs, n_other, n_self, cutoff_word_global_lm,
        k_other, k_self, lambda_local, lexical, lm_features, retrain,
    ]
    print(exp_params)

    cross_val_results, all_preds, all_y = [], [], []
    for i, test in enumerate(xval_folds):
        # use the heldout when training on full dataset to simulate same lm sizes
        # as in development; fall back to the extra heldout pair when the regular
        # heldout pair is itself the test fold
        heldout = EXTRA_HELDOUT if test == HELDOUT else HELDOUT

        # Candidate additional pairs: the two heldout pairs first, then the
        # remaining folds in their listed order, minus this fold's heldout/test.
        candidate_order = [HELDOUT, EXTRA_HELDOUT] + [
            pair for pair in xval_folds if pair not in [HELDOUT, EXTRA_HELDOUT]
        ]
        other_pairs = [pair for pair in candidate_order if pair not in [heldout, test]]
        cross_val_pairs = [heldout, test] + other_pairs[:num_pairs - 1]
        print(cross_val_pairs, test, heldout)

        # Restrict the data to the selected pairs, keeping references_per_pair's key order.
        cross_val_data = {
            pair: refs
            for pair, refs in references_per_pair.items()
            if pair in cross_val_pairs
        }

        fscore, preds, labels, accuracy = get_fscore_and_preds(
            cross_val_data, test, heldout,
            n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm,
            lexical=lexical, lm_features=lm_features, exhaustive_retrain=retrain,
            no_train=False,
        )

        all_preds.extend(preds)
        all_y.extend(labels)
        cross_val_results.append(accuracy)

    # Store per-fold accuracies, their mean, and the pooled predictions under a
    # key built from the comma-joined experiment parameters.
    mean_accuracy = np.mean(cross_val_results)
    results_key = ','.join(map(str, exp_params))
    reduced_xval_results[results_key] = (cross_val_results, mean_accuracy, all_preds)
    print(cross_val_results, mean_accuracy)
    local_results[num_pairs] = mean_accuracy
print(local_results)

r6 r3 r7
num pairs 1
[1, 1, 1, 7, 0.1, 0.1, 0.1, True, False, True]
['r3', 'r1'] r1 r3
0.23499011993408203 all lm features extracted
['r3', 'r2'] r2 r3
0.2220621109008789 all lm features extracted
['r7', 'r3'] r3 r7
0.3152937889099121 all lm features extracted
['r3', 'r4'] r4 r3
0.21932411193847656 all lm features extracted
['r3', 'r5'] r5 r3
0.2200171947479248 all lm features extracted
['r3', 'r7'] r7 r3
0.2262861728668213 all lm features extracted
['r3', 'r8'] r8 r3
0.2205650806427002 all lm features extracted
[0.82, 0.7705882352941177, 0.879245283018868, 0.8526315789473684, 0.8181818181818182, 0.8323863636363636, 0.7361963190184049] 0.8156042282995629
num pairs 2
[2, 1, 1, 6, 1.0, 0.1, 0.8, True, False, True]
['r3', 'r1', 'r7'] r1 r3
0.5421440601348877 all lm features extracted
['r3', 'r2', 'r7'] r2 r3
0.5365328788757324 all lm features extracted
['r7', 'r3', 'r1'] r3 r7
0.3889789581298828 all lm features extracted
['r3', 'r4', 'r7'] r4 r3
0.5377721786499023 all lm features extracte

In [67]:
# Pairwise McNemar tests between all cross-validation systems that were run with
# the same number of pairs.
#
# NOTE: `all_y` is not produced in this cell; it is whatever label list the most
# recently executed experiment cell left in the kernel. The length check below
# fails loudly if it does not line up with the stored predictions (for example
# after running cells out of order) instead of silently testing misaligned data.
for num_pairs in range(1, 7):
    print("num pairs", num_pairs)
    reduced_comparison_results = {}

    # Result keys are comma-joined experiment parameters whose first field is
    # the number of pairs; keep only the systems for the current setting.
    systems = [
        name for name in reduced_xval_results.keys()
        if int(name.split(",")[0]) == num_pairs
    ]

    # Visit every unordered pair of systems exactly once, in dictionary order.
    for idx, sys1 in enumerate(systems):
        for sys2 in systems[idx + 1:]:
            cl1_preds = reduced_xval_results[sys1][2]
            cl2_preds = reduced_xval_results[sys2][2]
            if not (len(cl1_preds) == len(cl2_preds) == len(all_y)):
                raise ValueError(
                    "prediction/label length mismatch for {} V {}: {} vs {} predictions, "
                    "{} labels (is all_y stale?)".format(
                        sys1, sys2, len(cl1_preds), len(cl2_preds), len(all_y)))
            sig = calculate_mcnemar_test(cl1_preds, cl2_preds, all_y, alpha=0.05, exact=True)
            # Row layout: rounded mean accuracy of each system, then the test's outputs.
            reduced_comparison_results[sys1 + " V " + sys2] = [
                round(reduced_xval_results[sys1][1], 3),
                round(reduced_xval_results[sys2][1], 3),
            ] + list(sig)

    for count, (k, v) in enumerate(reduced_comparison_results.items(), start=1):
        print(count, ")", k, v)
    print("*" * 30)

num pairs 1
statistic=71.000, p-value=0.000
Different proportions of errors (reject H0)
statistic=53.000, p-value=0.000
Different proportions of errors (reject H0)
statistic=21.000, p-value=0.000
Different proportions of errors (reject H0)
1 ) 1,1,1,7,0.1,0.1,0.1,True,True,False V 1,1,1,7,0.1,0.1,0.1,True,False,False [0.783, 0.66, 71.0, 4.6274975325333254e-29, True, True, True]
2 ) 1,1,1,7,0.1,0.1,0.1,True,True,False V 1,1,1,7,0.1,0.1,0.1,True,False,True [0.783, 0.816, 53.0, 0.0004097727655174, True, True, True]
3 ) 1,1,1,7,0.1,0.1,0.1,True,False,False V 1,1,1,7,0.1,0.1,0.1,True,False,True [0.66, 0.816, 21.0, 1.7951571978789752e-55, True, True, True]
******************************
num pairs 2
statistic=53.000, p-value=0.000
Different proportions of errors (reject H0)
statistic=40.000, p-value=0.012
Different proportions of errors (reject H0)
statistic=19.000, p-value=0.000
Different proportions of errors (reject H0)
1 ) 2,1,1,6,1.0,0.1,0.8,True,True,False V 2,1,1,6,1.0,0.1,0.8,True,Fal

In [68]:
# Now get results on the test data.
# Starts empty; entries are stored under the comma-joined experiment parameters.
reduced_results_test = dict()

In [69]:
# (1) optimized with lm features
#     (lexical=True, lm_features=True, exhaustive_retrain=False), evaluated on TEST
local_results = {}

for num_pairs in range(1, 8):
    print("num pairs", num_pairs)

    # Hyperparameters are looked up in opt_params_reduced under keys suffixed
    # with the number of pairs; the suffix is capped at 6, so 7 reuses the
    # values stored for 6.
    param_suffix = min([6, num_pairs])
    n_self = opt_params_reduced['n_self_{}'.format(param_suffix)]
    k_self = opt_params_reduced['k_self_{}'.format(param_suffix)]
    n_other = opt_params_reduced['n_other_{}'.format(param_suffix)]
    k_other = opt_params_reduced['k_other_{}'.format(param_suffix)]
    lambda_local = opt_params_reduced['lambda_local_{}'.format(param_suffix)]
    cutoff_word_global_lm = opt_params_reduced['cutoff_word_global_lm_{}'.format(param_suffix)]

    lexical, lm_features, retrain = True, True, False

    exp_params = [
        num_pairs, n_other, n_self, cutoff_word_global_lm,
        k_other, k_self, lambda_local, lexical, lm_features, retrain,
    ]
    print(exp_params)

    cross_val_results, all_preds, all_y = [], [], []
    for i, test in enumerate([TEST]):
        # Pair order: test, heldout, extra heldout, then every remaining pair in
        # its listed order; the first num_pairs + 1 entries are used.
        reserved_pairs = [HELDOUT, TEST, EXTRA_HELDOUT]
        other_pairs = [
            pair for pair in ['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8']
            if pair not in reserved_pairs
        ]
        cross_val_pairs = ([TEST, HELDOUT, EXTRA_HELDOUT] + other_pairs)[:num_pairs + 1]

        # The extra heldout fold is only passed on when more than 6 pairs are used.
        if num_pairs > 6:
            extra_heldout = EXTRA_HELDOUT
        else:
            extra_heldout = None
        print(cross_val_pairs, test, HELDOUT, extra_heldout)

        # Restrict the data to the selected pairs, keeping references_per_pair's key order.
        cross_val_data = {
            pair: refs
            for pair, refs in references_per_pair.items()
            if pair in cross_val_pairs
        }

        fscore, preds, labels, accuracy = get_fscore_and_preds(
            cross_val_data, test, HELDOUT,
            n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm,
            heldout_extra_fold=extra_heldout,
            lexical=lexical, lm_features=lm_features, exhaustive_retrain=retrain,
            no_train=False,
        )

        all_preds.extend(preds)
        all_y.extend(labels)
        cross_val_results.append(accuracy)

    # Store accuracies, their mean, and the predictions under a key built from
    # the comma-joined experiment parameters.
    mean_accuracy = np.mean(cross_val_results)
    results_key = ','.join(map(str, exp_params))
    reduced_results_test[results_key] = (cross_val_results, mean_accuracy, all_preds)
    print(cross_val_results, mean_accuracy)
    local_results[num_pairs] = mean_accuracy
print(local_results)

num pairs 1
[1, 1, 1, 7, 0.1, 0.1, 0.1, True, True, False]
['r6', 'r3'] r6 r3 None
0.2276449203491211 all lm features extracted
[0.813953488372093] 0.813953488372093
num pairs 2
[2, 1, 1, 6, 1.0, 0.1, 0.8, True, True, False]
['r6', 'r3', 'r7'] r6 r3 None
0.5350580215454102 all lm features extracted
[0.8488372093023255] 0.8488372093023255
num pairs 3
[3, 1, 1, 8, 0.95, 0.1, 0.65, True, True, False]
['r6', 'r3', 'r7', 'r1'] r6 r3 None
0.6088743209838867 all lm features extracted
[0.8527131782945736] 0.8527131782945736
num pairs 4
[4, 1, 1, 5, 0.75, 0.1, 0.4, True, True, False]
['r6', 'r3', 'r7', 'r1', 'r2'] r6 r3 None
0.7388148307800293 all lm features extracted
[0.8255813953488372] 0.8255813953488372
num pairs 5
[5, 1, 1, 7, 0.7, 0.1, 0.3, True, True, False]
['r6', 'r3', 'r7', 'r1', 'r2', 'r4'] r6 r3 None
0.9938099384307861 all lm features extracted
[0.8410852713178295] 0.8410852713178295
num pairs 6
[6, 1, 1, 6, 0.9, 0.1, 0.1, True, True, False]
['r6', 'r3', 'r7', 'r1', 'r2', 'r4', 'r5

In [70]:
# (2) lexical only baseline
#     (lexical=True, lm_features=False, exhaustive_retrain=False), evaluated on TEST

local_results = {}

for num_pairs in range(1, 8):
    print("num pairs", num_pairs)

    # Hyperparameters are looked up in opt_params_reduced under keys suffixed
    # with the number of pairs; the suffix is capped at 6, so 7 reuses the
    # values stored for 6.
    param_suffix = min([6, num_pairs])
    n_self = opt_params_reduced['n_self_{}'.format(param_suffix)]
    k_self = opt_params_reduced['k_self_{}'.format(param_suffix)]
    n_other = opt_params_reduced['n_other_{}'.format(param_suffix)]
    k_other = opt_params_reduced['k_other_{}'.format(param_suffix)]
    lambda_local = opt_params_reduced['lambda_local_{}'.format(param_suffix)]
    cutoff_word_global_lm = opt_params_reduced['cutoff_word_global_lm_{}'.format(param_suffix)]

    lexical, lm_features, retrain = True, False, False

    exp_params = [
        num_pairs, n_other, n_self, cutoff_word_global_lm,
        k_other, k_self, lambda_local, lexical, lm_features, retrain,
    ]
    print(exp_params)

    cross_val_results, all_preds, all_y = [], [], []
    for i, test in enumerate([TEST]):
        # Pair order: test, heldout, extra heldout, then every remaining pair in
        # its listed order; the first num_pairs + 1 entries are used.
        reserved_pairs = [HELDOUT, TEST, EXTRA_HELDOUT]
        other_pairs = [
            pair for pair in ['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8']
            if pair not in reserved_pairs
        ]
        cross_val_pairs = ([TEST, HELDOUT, EXTRA_HELDOUT] + other_pairs)[:num_pairs + 1]

        # The extra heldout fold is only passed on when more than 6 pairs are used.
        if num_pairs > 6:
            extra_heldout = EXTRA_HELDOUT
        else:
            extra_heldout = None
        print(cross_val_pairs, test, HELDOUT, extra_heldout)

        # Restrict the data to the selected pairs, keeping references_per_pair's key order.
        cross_val_data = {
            pair: refs
            for pair, refs in references_per_pair.items()
            if pair in cross_val_pairs
        }

        fscore, preds, labels, accuracy = get_fscore_and_preds(
            cross_val_data, test, HELDOUT,
            n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm,
            heldout_extra_fold=extra_heldout,
            lexical=lexical, lm_features=lm_features, exhaustive_retrain=retrain,
            no_train=False,
        )

        all_preds.extend(preds)
        all_y.extend(labels)
        cross_val_results.append(accuracy)

    # Store accuracies, their mean, and the predictions under a key built from
    # the comma-joined experiment parameters.
    mean_accuracy = np.mean(cross_val_results)
    results_key = ','.join(map(str, exp_params))
    reduced_results_test[results_key] = (cross_val_results, mean_accuracy, all_preds)
    print(cross_val_results, mean_accuracy)
    local_results[num_pairs] = mean_accuracy
print(local_results)

num pairs 1
[1, 1, 1, 7, 0.1, 0.1, 0.1, True, False, False]
['r6', 'r3'] r6 r3 None
0.22547507286071777 all lm features extracted
[0.5968992248062015] 0.5968992248062015
num pairs 2
[2, 1, 1, 6, 1.0, 0.1, 0.8, True, False, False]
['r6', 'r3', 'r7'] r6 r3 None


  _warn_prf(average, modifier, msg_start, len(result))


0.5361320972442627 all lm features extracted
[0.7170542635658915] 0.7170542635658915
num pairs 3
[3, 1, 1, 8, 0.95, 0.1, 0.65, True, False, False]
['r6', 'r3', 'r7', 'r1'] r6 r3 None
0.6036379337310791 all lm features extracted
[0.7286821705426356] 0.7286821705426356
num pairs 4
[4, 1, 1, 5, 0.75, 0.1, 0.4, True, False, False]
['r6', 'r3', 'r7', 'r1', 'r2'] r6 r3 None
0.7428689002990723 all lm features extracted
[0.8294573643410853] 0.8294573643410853
num pairs 5
[5, 1, 1, 7, 0.7, 0.1, 0.3, True, False, False]
['r6', 'r3', 'r7', 'r1', 'r2', 'r4'] r6 r3 None
0.9828410148620605 all lm features extracted
[0.8294573643410853] 0.8294573643410853
num pairs 6
[6, 1, 1, 6, 0.9, 0.1, 0.1, True, False, False]
['r6', 'r3', 'r7', 'r1', 'r2', 'r4', 'r5'] r6 r3 None
1.1102240085601807 all lm features extracted
[0.8333333333333334] 0.8333333333333334
num pairs 7
[7, 1, 1, 6, 0.9, 0.1, 0.1, True, False, False]
['r6', 'r3', 'r7', 'r1', 'r2', 'r4', 'r5', 'r8'] r6 r3 r7
1.4151499271392822 all lm features

In [71]:
# (3) Exhaustive retrain lexical only
#     (lexical=True, lm_features=False, exhaustive_retrain=True), evaluated on TEST

local_results = {}

for num_pairs in range(1, 8):
    print("num pairs", num_pairs)

    # Hyperparameters are looked up in opt_params_reduced under keys suffixed
    # with the number of pairs; the suffix is capped at 6, so 7 reuses the
    # values stored for 6.
    param_suffix = min([6, num_pairs])
    n_self = opt_params_reduced['n_self_{}'.format(param_suffix)]
    k_self = opt_params_reduced['k_self_{}'.format(param_suffix)]
    n_other = opt_params_reduced['n_other_{}'.format(param_suffix)]
    k_other = opt_params_reduced['k_other_{}'.format(param_suffix)]
    lambda_local = opt_params_reduced['lambda_local_{}'.format(param_suffix)]
    cutoff_word_global_lm = opt_params_reduced['cutoff_word_global_lm_{}'.format(param_suffix)]

    lexical, lm_features, retrain = True, False, True

    exp_params = [
        num_pairs, n_other, n_self, cutoff_word_global_lm,
        k_other, k_self, lambda_local, lexical, lm_features, retrain,
    ]
    print(exp_params)

    cross_val_results, all_preds, all_y = [], [], []
    for i, test in enumerate([TEST]):
        # Pair order: test, heldout, extra heldout, then every remaining pair in
        # its listed order; the first num_pairs + 1 entries are used.
        reserved_pairs = [HELDOUT, TEST, EXTRA_HELDOUT]
        other_pairs = [
            pair for pair in ['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8']
            if pair not in reserved_pairs
        ]
        cross_val_pairs = ([TEST, HELDOUT, EXTRA_HELDOUT] + other_pairs)[:num_pairs + 1]

        # The extra heldout fold is only passed on when more than 6 pairs are used.
        if num_pairs > 6:
            extra_heldout = EXTRA_HELDOUT
        else:
            extra_heldout = None
        print(cross_val_pairs, test, HELDOUT, extra_heldout)

        # Restrict the data to the selected pairs, keeping references_per_pair's key order.
        cross_val_data = {
            pair: refs
            for pair, refs in references_per_pair.items()
            if pair in cross_val_pairs
        }

        fscore, preds, labels, accuracy = get_fscore_and_preds(
            cross_val_data, test, HELDOUT,
            n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm,
            heldout_extra_fold=extra_heldout,
            lexical=lexical, lm_features=lm_features, exhaustive_retrain=retrain,
            no_train=False,
        )

        all_preds.extend(preds)
        all_y.extend(labels)
        cross_val_results.append(accuracy)

    # Store accuracies, their mean, and the predictions under a key built from
    # the comma-joined experiment parameters.
    mean_accuracy = np.mean(cross_val_results)
    results_key = ','.join(map(str, exp_params))
    reduced_results_test[results_key] = (cross_val_results, mean_accuracy, all_preds)
    print(cross_val_results, mean_accuracy)
    local_results[num_pairs] = mean_accuracy
print(local_results)

num pairs 1
[1, 1, 1, 7, 0.1, 0.1, 0.1, True, False, True]
['r6', 'r3'] r6 r3 None
0.22508001327514648 all lm features extracted
[0.8565891472868217] 0.8565891472868217
num pairs 2
[2, 1, 1, 6, 1.0, 0.1, 0.8, True, False, True]
['r6', 'r3', 'r7'] r6 r3 None
0.5474739074707031 all lm features extracted
[0.8837209302325582] 0.8837209302325582
num pairs 3
[3, 1, 1, 8, 0.95, 0.1, 0.65, True, False, True]
['r6', 'r3', 'r7', 'r1'] r6 r3 None
0.6108858585357666 all lm features extracted
[0.8875968992248062] 0.8875968992248062
num pairs 4
[4, 1, 1, 5, 0.75, 0.1, 0.4, True, False, True]
['r6', 'r3', 'r7', 'r1', 'r2'] r6 r3 None
0.7434990406036377 all lm features extracted
[0.8992248062015504] 0.8992248062015504
num pairs 5
[5, 1, 1, 7, 0.7, 0.1, 0.3, True, False, True]
['r6', 'r3', 'r7', 'r1', 'r2', 'r4'] r6 r3 None
1.0084960460662842 all lm features extracted
[0.9031007751937985] 0.9031007751937985
num pairs 6
[6, 1, 1, 6, 0.9, 0.1, 0.1, True, False, True]
['r6', 'r3', 'r7', 'r1', 'r2', 'r4', 

In [72]:
# Pairwise McNemar tests between all test-set systems that were run with the
# same number of training pairs.
#
# NOTE: `all_y` is not produced in this cell; it is whatever label list the most
# recently executed experiment cell left in the kernel. The length check below
# fails loudly if it does not line up with the stored predictions (for example
# after running cells out of order) instead of silently testing misaligned data.
for num_training_pairs in range(1, 8):
    print("num training pairs", num_training_pairs)
    reduced_comparison_results_test = {}

    # Result keys are comma-joined experiment parameters whose first field is
    # the number of pairs; keep only the systems for the current setting.
    systems = [
        name for name in reduced_results_test.keys()
        if int(name.split(",")[0]) == num_training_pairs
    ]

    # Visit every unordered pair of systems exactly once, in dictionary order.
    for idx, sys1 in enumerate(systems):
        for sys2 in systems[idx + 1:]:
            cl1_preds = reduced_results_test[sys1][2]
            cl2_preds = reduced_results_test[sys2][2]
            if not (len(cl1_preds) == len(cl2_preds) == len(all_y)):
                raise ValueError(
                    "prediction/label length mismatch for {} V {}: {} vs {} predictions, "
                    "{} labels (is all_y stale?)".format(
                        sys1, sys2, len(cl1_preds), len(cl2_preds), len(all_y)))
            sig = calculate_mcnemar_test(cl1_preds, cl2_preds, all_y, alpha=0.05, exact=True)
            # Row layout: rounded mean accuracy of each system, then the test's outputs.
            reduced_comparison_results_test[sys1 + " V " + sys2] = [
                round(reduced_results_test[sys1][1], 3),
                round(reduced_results_test[sys2][1], 3),
            ] + list(sig)

    for count, (k, v) in enumerate(reduced_comparison_results_test.items(), start=1):
        print(count, ")", k, v)
    print("*" * 30)

num training pairs 1
statistic=8.000, p-value=0.000
Different proportions of errors (reject H0)
statistic=6.000, p-value=0.035
Different proportions of errors (reject H0)
statistic=6.000, p-value=0.000
Different proportions of errors (reject H0)
1 ) 1,1,1,7,0.1,0.1,0.1,True,True,False V 1,1,1,7,0.1,0.1,0.1,True,False,False [0.814, 0.597, 8.0, 5.765519304519332e-12, True, True, True]
2 ) 1,1,1,7,0.1,0.1,0.1,True,True,False V 1,1,1,7,0.1,0.1,0.1,True,False,True [0.814, 0.857, 6.0, 0.03468966484069824, True, False, False]
3 ) 1,1,1,7,0.1,0.1,0.1,True,False,False V 1,1,1,7,0.1,0.1,0.1,True,False,True [0.597, 0.857, 6.0, 9.995152261577009e-16, True, True, True]
******************************
num training pairs 2
statistic=4.000, p-value=0.000
Different proportions of errors (reject H0)
statistic=1.000, p-value=0.012
Different proportions of errors (reject H0)
statistic=1.000, p-value=0.000
Different proportions of errors (reject H0)
1 ) 2,1,1,6,1.0,0.1,0.8,True,True,False V 2,1,1,6,1.0,0.1,