# Extra/initial work for the paper

In [1]:
#!pip install google_trans_new


In [2]:
import sys
sys.path.append("../python/")
import pentoref.IO as IO
import sqlite3 as sqlite
#from google_trans_new import google_translator
from deep_translator import GoogleTranslator
import datetime

import sklearn
from sklearn.feature_extraction import DictVectorizer
#from sklearn.preprocessing import LabelEncoder
#l = LabelEncoder()

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, classification_report

from nltk.classify import SklearnClassifier
#nltk.download('punkt')  # if using stemming in German

from random import shuffle
import numpy as np
import scipy.stats as stats
from scipy import optimize
from math import log
import random
import copy

from collections import Counter
import matplotlib.pyplot as plt

from pentoref.IOutils import clean_utt
from machine_learning_utils import calculate_mcnemar_test

In [3]:
def remove_reparanda(utt):
    """Drop the reparandum text of disfluency repairs from an utterance.

    A "(" opens a repair and a "+" closes its reparandum. Both marker
    characters are always copied to the output; any other character is
    discarded while at least one reparandum is still open.

    Raises AssertionError when the "(" and "+" markers do not balance.
    """
    open_repairs = 0
    kept_chars = []
    for char in utt:
        if char == "(":
            open_repairs += 1
        elif char == "+":
            open_repairs -= 1
        elif open_repairs > 0:
            # inside a reparandum: discard this character
            continue
        kept_chars.append(char)
    assert open_repairs == 0, "repair depth not 0:" + utt
    return "".join(kept_chars)
clean_utt(remove_reparanda("(( hello + hello) + {F um } hello)"))
          

'hello'

In [4]:
# Optionally (re)build the SQLite databases from the derived corpus data.
if False:   # make True if you need to create the databases from the derived data
    for corpus_name in ("TAKE", "TAKECV", "PENTOCV"):
        data_dir = "../../../pentoref/{0}_PENTOREF".format(corpus_name)
        database_path = "{0}.db".format(corpus_name)
        # one dataframe per database table, in the order the writer expects
        frames = IO.convert_subcorpus_raw_data_to_dataframes(data_dir)
        dfwords, dfutts, dfrefs, dfscenes, dfactions = frames
        IO.write_corpus_to_database(database_path, corpus_name,
                                    dfwords, dfutts, dfrefs, dfscenes, dfactions)

In [5]:
# Open the corpus database and print the column names of each table
CORPUS = "PENTOCV"
db = sqlite.connect("{0}.db".format(CORPUS))
cursor = db.cursor()
for table_name in ("utts", "words", "refs", "scenes", "actions"):
    column_info = cursor.execute("PRAGMA table_info({0})".format(table_name))
    # field 1 of every PRAGMA table_info row is the column name
    print(table_name, [column[1] for column in column_info])

utts ['gameID', 'uttID', 'starttime', 'endtime', 'utt', 'utt_clean', 'role', 'speaker']
words ['gameID', 'uttID', 'position', 'word', 'lemma', 'tag']
refs ['refID', 'gameID', 'uttID', 'text', 'id', 'piece', 'location']
scenes ['timestampID', 'gameID', 'pieceID', 'position_global', 'position_x', 'position_y', 'shape', 'shape_distribution', 'shape_orientation', 'shape_skewness_horizontal', 'shape_skewness_vertical', 'shape_edges', 'colour', 'colour_distribution', 'colour_hsv', 'colour_rgb']
actions ['gameID', 'starttime', 'endtime', 'hand', 'action', 'piece']


## Get utterances from certain time periods in each experiment or for certain episodes

In [6]:
if False:   # make True to print English translations of one game's utterances (needs network access)
    translator = GoogleTranslator(source='de', target='en')

    # The game is bound as a query parameter rather than pasted into the SQL.
    # To select by time instead, use: WHERE starttime >= 200 AND starttime <= 300
    utt_query = ("SELECT gameID, starttime, speaker, utt_clean, utt FROM utts"
                 " WHERE gameID = ?"
                 " ORDER BY gameID, starttime")
    for row in db.execute(utt_query, ("r1_1_1_b",)):
        print(row)
        line = row[3]  # utt_clean
        print(line)
        if not line:
            continue
        # The source/target languages were fixed when the translator was built;
        # the old lang_src/lang_tgt keywords belonged to google_trans_new.
        translate_text = translator.translate(line)
        print(translate_text)

## Build dataset

In [7]:
# just focus on single pieces, not references to multiple pieces
good_pieces = ["X", "Y", "P", "N", "U", "F", "Z", "L", "T", "I", "W", "V"]

In [8]:
# Known errors in the disfluency mark-up, as (lower-cased fragment to look for,
# corrected reference text) pairs. They are checked in this order and only the
# first fragment found in a reference is applied.
manual_corrections = [
    ("(der + (das + das + das) grüne $m)",
     "(der + (das + (das + das ))) grüne $m)"),
    ("(der {f äh:m:} + der) + der) winkel",
     "((der {f äh:m:} + der) + der) winkel"),
    ("das {br- + blaue} lange",
     "das (br- + blaue) Lange"),
    ("das: ({f äh} ja .) andere blaue $z",
     "das: ({f äh} ja . + ) andere blaue $z"),
    ("den (ist das rosa oben) block",
     "den (ist das rosa oben + ) block"),
    ('(<p="dieses">die-</p> (genau) + das) element',
     '(<p="dieses">die-</p> (genau  +) + das) element'),
    ('das $t (dieser blaue <p="senkrecht">senk-</p>',
     'das $t (dieser blaue <p="senkrecht">senk-</p>+)'),
]

# All data is stored here: pair number -> list of
# (clean_text, speaker, uttID, end_time, piece) tuples.
references_per_pair = {}
# NOTE: this cell reads the PENTOCV layout of the refs table only. The former
# TAKE branch (reading from the scenes table) was disabled and indexed columns
# this query does not return, so it has been removed.
for row in db.execute("SELECT id, gameID, text, uttID FROM refs ORDER by gameID"):
    piece, gameID, text, uttID = row

    # some manual corrections of disfluencies
    lowered_text = text.lower()
    for fragment, corrected_text in manual_corrections:
        if fragment in lowered_text:
            text = corrected_text
            print("replacing with", text, "at", gameID, uttID)
            break

    clean_text = clean_utt(remove_reparanda(text.lower()))
    assert clean_text != "", \
        "empty reference after cleaning: {0!r} at {1} {2}".format(text, gameID, uttID)

    # just focus on single pieces
    if piece not in good_pieces:
        continue

    pair_num = gameID.split("_")[0]
    # the pair gets an entry even if all of its references are skipped below
    references_per_pair.setdefault(pair_num, [])

    if "_s" in gameID:
        continue  # just get the build phases for now due to inconsistent labelling

    # one parameterized lookup gives both the speaker and the utterance end time
    speaker_name, raw_end_time = db.execute(
        "SELECT speaker, endtime FROM utts WHERE uttID = ?", (uttID,)
    ).fetchone()
    speaker = pair_num + "_" + speaker_name
    end_time = float(raw_end_time)

    references_per_pair[pair_num].append((clean_text, speaker, uttID, end_time, piece))

# sort each pair's references by utterance end time
for pair_num in references_per_pair:
    references_per_pair[pair_num].sort(key=lambda ref: ref[3])

print(references_per_pair.keys())
print(references_per_pair['r3'][0:20])

replacing with (der + (das + (das + das ))) grüne $m) at r4_1_3_b 10823
replacing with ((der {f äh:m:} + der) + der) winkel at r4_1_3_b 10880
replacing with das (br- + blaue) Lange at r7_1_1_b 1215
replacing with das: ({f äh} ja . + ) andere blaue $z at r8_1_3_s 2454
replacing with den (ist das rosa oben + ) block at r8_1_5_b 4428
replacing with (<p="dieses">die-</p> (genau  +) + das) element at r8_1_8_b 4919
replacing with das $t (dieser blaue <p="senkrecht">senk-</p>+) at r8_2_17_b 3696
dict_keys(['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8'])
[('gelben stein', 'r3_B', 8886, 129.257, 'U'), ('das rote kreuz', 'r3_B', 8893, 141.68, 'X'), ('daran', 'r3_B', 8897, 148.215, 'U'), ('der orange stein', 'r3_B', 8897, 148.215, 'L'), ('das l', 'r3_A', 7900, 151.582, 'L'), ('daran', 'r3_B', 8902, 157.211, 'X'), ('das grüne t', 'r3_B', 8902, 157.211, 'T'), ('der blaue winkel', 'r3_B', 8931, 219.17, 'V'), ('das', 'r3_B', 8936, 226.213, 'V'), ('daran', 'r3_B', 8940, 235.547, 'V'), ('der gelbe ste

# Create language models and language model features

In [9]:
# Null hypothesis 1: at a given time point, a referring expression is no more predictable from the
# previous mentions of its piece than from all mentions of that piece regardless of order (i.e. as in a
# plain language model, all mentions are equally likely and there is no correlation with time):
# p(ref|previous) <= p(ref|all)


In [10]:
# ASSUMPTION: only dealing with full names, not anaphors # TODO for this paper may actually leave them in
if True: # False TODO for this paper may actually leave them in
    anaphors = ["es", "das", 'er', "da", "der", "ihn", "den", "sie", "die", "damit", "daran", "dem"]
    # drop every reference whose whole cleaned text is one of the anaphors
    for pair_num, refs in references_per_pair.items():
        references_per_pair[pair_num] = [ref for ref in refs if ref[0] not in anaphors]


In [11]:
# Count the references to each piece over all pairs
piece_counter = Counter(
    ref[4]
    for pair_refs in references_per_pair.values()
    for ref in pair_refs
)
print("TOTAL REFERENCES", piece_counter.total())
sorted(piece_counter.items())

TOTAL REFERENCES 1899


[('F', 134),
 ('I', 139),
 ('L', 161),
 ('N', 197),
 ('P', 147),
 ('T', 191),
 ('U', 144),
 ('V', 162),
 ('W', 170),
 ('X', 177),
 ('Y', 162),
 ('Z', 115)]

In [12]:
# Print, in time order, every reference pair r3 makes to piece Z
for ref in (r for r in references_per_pair['r3'] if r[4] == 'Z'):
    print(ref)

('der blaue stein', 'r3_B', 8998, 338.218, 'Z')
('der hellblaue stein', 'r3_B', 9101, 597.635, 'Z')
('der andere', 'r3_B', 9103, 599.63, 'Z')
('der blaue stein', 'r3_B', 9295, 976.864, 'Z')
('der hellblaue stein', 'r3_B', 9373, 1136.494, 'Z')
('der andere blaue', 'r3_A', 8155, 1431.262, 'Z')
('s', 'r3_A', 8155, 1431.262, 'Z')
('z', 'r3_A', 8155, 1431.262, 'Z')
('das z', 'r3_A', 8165, 1450.016, 'Z')
('der z-stein', 'r3_A', 8228, 1561.559, 'Z')
('dieses s', 'r3_A', 8357, 1745.013, 'Z')
('z-steinchen', 'r3_A', 8357, 1745.013, 'Z')
('blauen', 'r3_A', 8395, 1792.021, 'Z')
('das z', 'r3_A', 8636, 2157.882, 'Z')
('der z-stein', 'r3_A', 8700, 2288.859, 'Z')
('diesen z-stein', 'r3_A', 8790, 2443.581, 'Z')
('s-stein', 'r3_A', 8790, 2443.581, 'Z')
('ein z', 'r3_A', 8793, 2445.882, 'Z')
('dem z', 'r3_A', 8804, 2463.788, 'Z')
('dieses z', 'r3_A', 8839, 2519.642, 'Z')
('dem z', 'r3_A', 8841, 2522.952, 'Z')


In [13]:
class mini_language_model():
    """Count-based n-gram language model with add-k (Lidstone) smoothing.

    Counts live in ``self.ngrams[n]`` for every order ``n`` from 1 to
    ``order``; each n-gram is keyed by its tokens joined with "_" (so tokens
    that themselves contain "_" can collide with longer n-grams).

    NOTE: ``train`` counts every n-gram and, for n > 1, additionally counts its
    (n-1)-token context at order n-1. Lower-order counts are therefore larger
    than plain corpus frequencies; ``prob_lidstone`` uses the counts as stored.
    """

    def __init__(self, order, smoothing_k=0.0001):
        """Create an empty model of the given n-gram order and smoothing constant."""
        self.order = order
        self.ngrams = {}
        self.num_training_sents = 0
        self.smoothing_k = smoothing_k
        for o in range(1, order+1):
            self.ngrams[o] = {}
        self.unigram_denom = 0  # sum of all unigram counts
        self.vocab_size = 0     # number of distinct unigram keys

    def _counted_ngrams(self, sent):
        """Yield (tokens, n) for every count that training on sent records."""
        padded = ["<s>"] * (self.order - 1) + sent + ["</s>"]
        for i in range(self.order - 1, len(padded)):
            for n in range(1, self.order + 1):
                context = padded[i - (n - 1):i]
                yield context + [padded[i]], n
                if n > 1:
                    # the context is counted too, one order down
                    yield context, n - 1

    def _refresh_unigram_stats(self):
        """Recompute the cached unigram total and vocabulary size."""
        self.unigram_denom = sum(self.ngrams[1].values())
        self.vocab_size = len(self.ngrams[1])

    def add_ngram_to_model(self, ngram, order):
        """Increase the count of ngram (a token list) at the given order by one."""
        ngram_text = "_".join(ngram)
        counts = self.ngrams[order]
        counts[ngram_text] = (counts.get(ngram_text) or 0) + 1

    def minus_ngram_from_model(self, ngram, order):
        """Decrease the count of ngram at the given order by one.

        The key is kept even when its count reaches zero, and the count is not
        clamped, so removing an unseen n-gram leaves a negative count.
        """
        ngram_text = "_".join(ngram)
        counts = self.ngrams[order]
        counts[ngram_text] = (counts.get(ngram_text) or 0) - 1

    def add_counts_from_other_model(self, other_lm):
        """Add every count of other_lm (same order required) to this model.

        ``num_training_sents`` is left untouched.
        """
        assert(other_lm.order == self.order)
        for n in range(1, self.order+1):
            own_counts = self.ngrams[n]
            for key, other_count in other_lm.ngrams[n].items():
                own_counts[key] = (own_counts.get(key) or 0) + (other_count or 0)
        self._refresh_unigram_stats()

    def train(self, sents):
        """Add the counts of each sentence (a list of tokens) to the model."""
        for sent in sents:
            for tokens, n in self._counted_ngrams(sent):
                self.add_ngram_to_model(tokens, n)
            self.num_training_sents += 1
        self._refresh_unigram_stats()

    def de_train(self, sents):
        """Take away exactly the counts that ``train`` adds for these sentences.

        This mirrors ``train`` completely, including the context counts and
        ``num_training_sents``, so training and then de-training on the same
        sentences returns every count to its previous value. Keys are kept, so
        ``vocab_size`` does not shrink.
        """
        for sent in sents:
            for tokens, n in self._counted_ngrams(sent):
                self.minus_ngram_from_model(tokens, n)
            self.num_training_sents -= 1
        self._refresh_unigram_stats()

    def prob_lidstone(self, ngram, order):
        """Add-k smoothing using the discount parameter for this model.

        Returns (count(ngram) + k) / (count(context) + k * vocab_size); for
        order 1 the context count is the total unigram count. Fixed fallback
        denominators are used when there is nothing to condition on: an
        untrained model, or (for order > 1) an unseen context.

        Raises AssertionError if the result is not in (0, 1].
        """
        ngram_count = self.ngrams[order].get("_".join(ngram)) or 0
        num = ngram_count + self.smoothing_k
        if order == 1:
            if self.unigram_denom == 0:
                # no training data: fixed pseudo-denominator
                denom = 150 + (self.smoothing_k * 15)
            else:
                denom = self.unigram_denom + (self.smoothing_k * self.vocab_size)
        else:
            context_count = self.ngrams[order-1].get("_".join(ngram[:-1])) or 0
            if self.unigram_denom == 0 or context_count == 0:
                # no training data or unseen context: fixed pseudo-denominator
                denom = 40 + (self.smoothing_k * 10)
            else:
                denom = context_count + (self.smoothing_k * self.vocab_size)
        assert num/denom <=1 and num/denom >0, str(num) + " " + str(denom) + " " + str(num/denom) + " " + str(ngram) + " n=" + str(order) + " k=" + str(self.smoothing_k)
        return num/denom
    

In [14]:
# Sanity check: train a small trigram model, then show its counts and one smoothed probability
m = mini_language_model(3, smoothing_k=0.1)
m.train([["I", "like", "bill"], ["I", "like", "mary"]])
print(m.ngrams)
print(m.prob_lidstone(["I", "like", "mary"], 3))

# Train a second model separately, then merge its counts into the first one
m2 = mini_language_model(3)
m2.train([["I", "like", "bill", 'today'], ["I", "like", "mary"]])
print(m2.ngrams)
m.add_counts_from_other_model(m2)
print(m.ngrams)

{1: {'I': 4, '<s>': 2, 'like': 4, 'bill': 2, '</s>': 2, 'mary': 2}, 2: {'<s>_I': 4, '<s>_<s>': 2, 'I_like': 4, 'like_bill': 2, 'bill_</s>': 1, 'like_mary': 2, 'mary_</s>': 1}, 3: {'<s>_<s>_I': 2, '<s>_I_like': 2, 'I_like_bill': 1, 'like_bill_</s>': 1, 'I_like_mary': 1, 'like_mary_</s>': 1}}
0.23913043478260873
{1: {'I': 4, '<s>': 2, 'like': 4, 'bill': 2, 'today': 2, '</s>': 2, 'mary': 2}, 2: {'<s>_I': 4, '<s>_<s>': 2, 'I_like': 4, 'like_bill': 2, 'bill_today': 2, 'today_</s>': 1, 'like_mary': 2, 'mary_</s>': 1}, 3: {'<s>_<s>_I': 2, '<s>_I_like': 2, 'I_like_bill': 1, 'like_bill_today': 1, 'bill_today_</s>': 1, 'I_like_mary': 1, 'like_mary_</s>': 1}}
{1: {'I': 8, '<s>': 4, 'like': 8, 'bill': 4, '</s>': 4, 'mary': 4, 'today': 2}, 2: {'<s>_I': 8, '<s>_<s>': 4, 'I_like': 8, 'like_bill': 4, 'bill_</s>': 1, 'like_mary': 4, 'mary_</s>': 2, 'bill_today': 2, 'today_</s>': 1}, 3: {'<s>_<s>_I': 4, '<s>_I_like': 4, 'I_like_bill': 2, 'like_bill_</s>': 1, 'I_like_mary': 2, 'like_mary_</s>': 2, 'bill_

In [15]:
def get_refs_in_timerange_for_piece(pair_num, piece, time_uttend_start, time_uttend_end, references_per_pair):
    """Returns a list of the refs from a certain time start to a time end in order for a given piece.

    A reference is selected when its utterance end time (index 3) satisfies
    ``time_uttend_start <= end_time < time_uttend_end``.
    NOTE FOR NOW refs in the same utterance are either all included or all excluded from this method.

    Two layouts of ``references_per_pair`` are understood:

    * pair -> list of reference tuples, with the piece label at index 4;
    * piece -> pair -> list of reference tuples (the layout used by the
      cross-entropy code), where the piece is given by the outer key.

    Returns an empty list when the pair (or piece) has no references.
    """
    def ends_in_range(ref):
        return time_uttend_start <= ref[3] < time_uttend_end

    refs_of_pair = references_per_pair.get(pair_num)
    if refs_of_pair:
        # pair-keyed layout: pick out this piece's references
        return [ref for ref in refs_of_pair if ends_in_range(ref) and ref[4] == piece]

    refs_by_pair = references_per_pair.get(piece)
    if isinstance(refs_by_pair, dict):
        # piece-keyed layout: the list already belongs to this piece
        return [ref for ref in (refs_by_pair.get(pair_num) or []) if ends_in_range(ref)]

    return []
    

In [16]:
get_refs_in_timerange_for_piece('r2', 'L', 0, 687.481, references_per_pair)

[('die eins', 'r2_B', 13319, 282.291, 'L'),
 ('orange', 'r2_A', 12552, 294.243, 'L'),
 ('dem orangen', 'r2_B', 13396, 410.399, 'L'),
 ('der eins', 'r2_B', 13396, 410.399, 'L'),
 ('diesem l', 'r2_B', 13417, 441.653, 'L'),
 ('die eins die orange', 'r2_B', 13570, 678.237, 'L')]

In [17]:
def combined_probs_of_two_lms(ngram, lm1, lm2, weight_lm1=0.5):
    """Linearly interpolate the probabilities two language models give to ngram.

    Args:
        ngram: list of tokens to score.
        lm1, lm2: models exposing ``order`` and ``prob_lidstone(ngram, order)``;
            each one is queried at its own order.
        weight_lm1: interpolation weight of ``lm1``; ``lm2`` gets the remainder.

    Returns:
        ``weight_lm1 * p_lm1(ngram) + (1 - weight_lm1) * p_lm2(ngram)``.

    The score is not renormalized over the joint vocabulary of the two models:
    they may have different vocabulary sizes, which affects their smoothing.
    (A disabled attempt at such a normalization used to live here.)
    """
    weight_lm2 = 1 - weight_lm1
    prob_lm1 = lm1.prob_lidstone(ngram, lm1.order)
    prob_lm2 = lm2.prob_lidstone(ngram, lm2.order)
    joint_score = (weight_lm1 * prob_lm1) + (weight_lm2 * prob_lm2)
    return joint_score
                            
    

In [18]:
def lms_for_all_pieces_up_to_current_time_point(order, smoothing_k, pair_num, time_uttend_end, references_per_pair, lm_other=None):
    """Build one language model per piece from a pair's references before a time point.

    Args:
        order: n-gram order of every model.
        smoothing_k: smoothing constant of every model.
        pair_num: the pair whose references are used for training.
        time_uttend_end: only references whose utterance ended before this
            time are used (selected by ``get_refs_in_timerange_for_piece``).
        references_per_pair: either piece -> pair -> list of references, or
            pair -> list of references with the piece label at index 4.
        lm_other: unused; kept so existing callers keep working.

    Returns:
        dict mapping each piece to its ``mini_language_model``. Pieces without
        any selected reference get an untrained model.
    """
    if any(isinstance(value, dict) for value in references_per_pair.values()):
        # piece-keyed layout: the keys are the pieces
        pieces = list(references_per_pair.keys())
    else:
        # pair-keyed layout: the keys are pair numbers, so collect the pieces
        # from the references themselves
        pieces = sorted({ref[4] for refs in references_per_pair.values() for ref in refs})

    piece_lms = {}
    for piece in pieces:
        data = get_refs_in_timerange_for_piece(pair_num, piece, 0, time_uttend_end, references_per_pair)
        piece_lm = mini_language_model(order, smoothing_k=smoothing_k)
        piece_lm.train([ref[0].split() for ref in data])
        piece_lms[piece] = piece_lm
    return piece_lms
    

In [19]:
piece_lms = lms_for_all_pieces_up_to_current_time_point(1, 0.0001, 'r1', 823.29, references_per_pair)

In [20]:
# Test 1: Is there an increasing similarity to the previous references as the dialogue progresses?
# Test 2: Does a reference have higher average similarity to other references in the same dialogue compared to those in different dialogues?

In [21]:
def mean_cross_entropy_of_each_reference_with_lms_trained_on_interaction_so_far(piece, pair_num, piece_references_per_pair, order=1, smoothing_k=0.0001, lm_other=None, lambda_lm1=None, cutoff_word_i=5):
    """Score every reference of one pair to one piece against per-piece language models.

    For each reference in ``piece_references_per_pair[piece][pair_num]``, one
    language model per piece is built with
    ``lms_for_all_pieces_up_to_current_time_point`` using that reference's end
    time (index 3) as the cut-off, and the reference is scored under every one
    of those models.

    Args:
        piece: key of the target piece in ``piece_references_per_pair``.
        pair_num: key of the pair under ``piece_references_per_pair[piece]``.
        piece_references_per_pair: mapping piece -> pair -> list of reference
            tuples; index 0 is read as the text and index 3 as the end time.
        order: n-gram order used to pad and score each reference.
        smoothing_k: smoothing constant handed to the per-piece models.
        lm_other: optional mapping pair -> piece -> language model. When given,
            that model is interpolated with the per-piece model through
            ``combined_probs_of_two_lms``; when None only the per-piece model
            is used.
        lambda_lm1: smallest interpolation weight given to the ``lm_other``
            model; must be a number whenever ``lm_other`` is given.
        cutoff_word_i: controls how the weight of the ``lm_other`` model
            shrinks as the per-piece model's ``num_training_sents`` grows.

    Returns:
        A list with one tuple per reference,
        ``(scores, running_averages, f1, f2, f3, f4)``: ``scores`` maps each
        piece to the negative summed log probability of the reference divided
        by its padded token count, ``running_averages`` maps each piece to the
        mean of ``scores`` over the references so far, and ``f1``..``f4`` are
        the first four fields of the original reference tuple.
    """
    
    list_of_probs = []
    list_of_ref_lengths = []

    list_of_refs = piece_references_per_pair[piece][pair_num]
    
    for i in range(0, len(list_of_refs)):
        target = list_of_refs[i][0]
        #training = list_of_refs[:i] # + list_of_refs[i+1:]
        #training = [ref[0] for ref in training]
        time_uttend_end = list_of_refs[i][3]
        
        #print(len(training), "training instances")
        #lm = mini_language_model(1, smoothing_k=0.0001)
        #print("length of training data", len(training))
        #lm.train([ref.split() for ref in training])
        
        # always retrain entire model (not incremental currently but easy to make so as it's counts)
        lms_all_pieces = lms_for_all_pieces_up_to_current_time_point(order, smoothing_k, pair_num, time_uttend_end, piece_references_per_pair)
        
        # the target piece's model must be empty when scoring the first reference
        if i ==0:
            assert len(lms_all_pieces[piece].ngrams[order])==0, str(lms_all_pieces[piece].ngrams.items()) +  piece +  pair_num
        
        
        #lm.de_train([ref.split() for ref in list_of_refs[i+1:]])  # take away the future counts, though leave the vocab there
        
        # NOTE(review): this disabled block uses `lm` before it is assigned in
        # this iteration and calls .split() on a reference tuple, so it would
        # fail if TRAIN_BY_RECENCY were switched on.
        TRAIN_BY_RECENCY = False
        if TRAIN_BY_RECENCY:
            # ADD EXTRA COUNTS IN PROPORTION TO RECENCY
            top_count = 12
            for j in range(i-1,-1,-1):
                ref = list_of_refs[j]
                for k in range(0,top_count):
                    lm.train([ref.split()])
                if top_count > 1:
                    top_count-=4
        # 
        
        # option 1 add counts
        #if lm_other is not None:
        #    lm.add_counts_from_other_model(lm_other)
        
        # padded token sequence of the reference being scored
        ref = ["<s>"] * (order-1)  + target.split() + ['</s>']
        
        
        # get probs for all classifiers
        
        # running log probability of the reference under each piece's model (starts at log(1) = 0)
        probs_all_pieces = {piece:log(1) for piece in lms_all_pieces.keys()}
        
        
        # go through the ref and get the ngrams
        for j in range(order-1, len(ref)):
            # `target` is reused here for the current token (the reference text is no longer needed)
            target = ref[j]
            context = ref[j-(order-1):j]
            ngram = context + [target]
            for a_piece in lms_all_pieces.keys():
                lm = lms_all_pieces[a_piece]
                if lm_other is None:
                    
                    probs_all_pieces[a_piece] = probs_all_pieces[a_piece] + log(lm.prob_lidstone(ngram, order))
                else:
                    # option 1 add counts
                    #prob_uni = prob_uni * lm.prob_lidstone([word], 1)
                    #option 2: combination
                    # NB FOR TRUST NEED TO KEEP TRACK OF HOW MANY INSTANCES OF EVERY PIECE, NOT JUST THIS ONE
                    # The weight of the lm_other model is 1 while the per-piece model has seen no
                    # references (for cutoff_word_i > 1), drops linearly with each reference it has
                    # seen, and never goes below lambda_lm1.
                    # NOTE(review): the caller passes params["local_lambda"] here although the weight
                    # is applied to the lm_other model, not the per-piece one - confirm the intent.
                    factor = 0 if cutoff_word_i <=1 else (1-lambda_lm1)/(cutoff_word_i-1)
                    num_refs_f = lm.num_training_sents
                    current_lambda_lm1 = max([lambda_lm1 + (factor * ((((cutoff_word_i-1)-num_refs_f)))), lambda_lm1])
                    #current_lambda_lm1 = lambda_lm1  # switch to this for no influence of large language model
                    
                    probs_all_pieces[a_piece] = probs_all_pieces[a_piece] + log(combined_probs_of_two_lms(ngram, lm_other[pair_num][a_piece], lm, weight_lm1=current_lambda_lm1))
                    #probs_all_pieces[a_piece] = probs_all_pieces[a_piece] + log(lm_other[pair_num][a_piece].prob_lidstone(ngram, order))
                # update for this word and this piece
               
        #print(target, prob_ref_text)
        list_of_probs.append(probs_all_pieces)
        # padded length first, then the fields of the original reference tuple
        list_of_ref_lengths.append(tuple([len(ref)] +  list(list_of_refs[i])))
    # turn each summed log probability into a per-token negative log probability
    list_of_refs_with_probs = [({piece: -prob/l[0] for piece, prob in p_dict.items()}, l[1], l[2], l[3], l[4]) for p_dict, l in zip(list_of_probs, list_of_ref_lengths)]
    piece_running_totals = {piece:0 for piece in piece_references_per_pair.keys()}
    # add running average in too
    for i in range(0,len(list_of_refs_with_probs)):
        ref = list(list_of_refs_with_probs[i])
        piece_running_average = {}
        # this loop rebinds the `piece` parameter; it is not read again afterwards
        for piece, prob in ref[0].items():
            new_total = prob + piece_running_totals[piece]
            piece_running_average[piece] = new_total/(i+1)
            piece_running_totals[piece] = new_total
        list_of_refs_with_probs[i] = tuple([ref[0], piece_running_average, ref[1], ref[2], ref[3], ref[4]])
            
        
    #print(list_of_refs_with_probs)
    return list_of_refs_with_probs
        
        

In [22]:
# Using one of the pairs as a test, rest is for LOSO-CV development
test_pair_num = ["r2"]

# Create reference resolution training + test data with features from the model
* TODO setting - exclude and re-compute based on pieces 'in play', or assume the robot doesn't know - try both
* Try 1) global model only without active learning, 2) global + local standard 3) global and local incrementally/dynamically updating, see the difference in ref res. Measures on accuracy + surprisal.
* Naive bayes with individual language models for each piece + marginalization? OR model can work independently with any probabilistic classifier as a joint probability?
* Get an optimal weighting between the local, incremental model and global models

In [23]:
def train_and_get_predictions_for_test_pair(training_pair_folds, test_pair_data, params):
    """Build the language-model features for a test pair (classifier step not written yet).

    Args:
        training_pair_folds: mapping piece -> pair -> list of reference tuples
            (index 0 is the reference text).
        test_pair_data: mapping whose keys are the pieces to process.
        params: dict with the keys "order", "smoothing_k", "local_lambda" and
            "cut_off_local_i".

    Returns:
        The list of predictions. TODO: step 3 (training and testing the
        classifier) is not implemented, so the list is currently always empty
        and the features computed in step 2 are not used yet.
    """
    # STEP 1. CREATE THE SPEAKER LMS
    # For every pair and piece, train a global language model on the references
    # to that piece made by all the OTHER pairs. The same order and smoothing
    # as the local models in step 2 are used, because the two are interpolated
    # n-gram by n-gram.
    speaker_fold_lms = {}
    for piece, refs_by_pair in training_pair_folds.items():
        for pair_num in refs_by_pair:
            pair_lms = speaker_fold_lms.setdefault(pair_num, {})
            if pair_lms.get(piece) is None:
                # NOTE(review): this placeholder stays in place when there is
                # not enough data; it is not a language model, so step 2 cannot
                # score with it.
                pair_lms[piece] = {}
            training_fold = [
                ref[0]
                for other_pair_num, other_refs in refs_by_pair.items()
                if other_pair_num != pair_num
                for ref in other_refs
            ]
            if len(training_fold) <2:
                print("not enough data", piece, pair_num)
                continue
            fold_lm = mini_language_model(params["order"], smoothing_k=params["smoothing_k"])
            fold_lm.train([sent.split() for sent in training_fold])
            pair_lms[piece] = fold_lm

    # STEP 2. Create the data for classification using features from the speaker fold lms
    piece_references_and_probabilities_per_pair = {}
    total_p = 0  # TODO never updated yet

    for piece in test_pair_data.keys():
        print("PIECE", piece)
        total_perplexity = 0  # TODO never updated yet
        for pair_num in training_pair_folds[piece]:
            print("PAIR_NUM", pair_num)

            refs = training_pair_folds[piece][pair_num]
            if len(refs) < 2 or (piece not in good_pieces):
                print("insufficient data for piece for pair", pair_num, "for piece", piece)
                continue
            print("length of training data", len(refs)-1)
            list_of_refs_with_probs = mean_cross_entropy_of_each_reference_with_lms_trained_on_interaction_so_far(
                piece,
                pair_num,
                training_pair_folds,
                order=params["order"],
                smoothing_k=params["smoothing_k"],
                lm_other=speaker_fold_lms,
                lambda_lm1=params["local_lambda"],
                cutoff_word_i=params["cut_off_local_i"])

            pair_results = piece_references_and_probabilities_per_pair.setdefault(pair_num, {})
            if not pair_results.get(piece):
                pair_results[piece] = list_of_refs_with_probs

        print("total", total_perplexity, "*" * 3)
    print("cross-entropy ALL PAIRS", total_p)

    # STEP 3: CREATE THE CLASSIFIER BASED ON THE TRAINING DATA AND TEST ON TEST DATA
    preds = []
    return preds
    

    

In [24]:
def get_zscore_dict_for_prob_dict(prob_dict):
    """Map each key of prob_dict to the z-score of its value among all values.

    Keys are processed in sorted order. A z-score that comes out as NaN is
    replaced by -100.
    """
    ordered_keys = sorted(prob_dict.keys())
    values = np.array([prob_dict[key] for key in ordered_keys])
    zscore_dict = {}
    for key, zscore in zip(ordered_keys, list(stats.zscore(values))):
        zscore_dict[key] = - 100 if np.isnan(zscore) else zscore
    return zscore_dict

get_zscore_dict_for_prob_dict({"K": 0.33, "Y": 0.35, "X": 0.33})

{'K': -0.7071067811865476, 'X': -0.7071067811865476, 'Y': 1.4142135623730951}

In [25]:
def get_rank_dict_for_prob_dict(prob_dict):
    """Map each key of prob_dict to its 1-based rank by ascending value.

    Keys with equal values keep their order in prob_dict.
    """
    by_value = sorted(prob_dict.items(), key=lambda pair: pair[1])
    return {key: position for position, (key, _) in enumerate(by_value, start=1)}
get_rank_dict_for_prob_dict({"K": 0.34, "Y": 0.35, "X": 0.33})

{'X': 1, 'K': 2, 'Y': 3}

In [26]:
def moving_average(a_list):
    """Return the cumulative (running) mean of ``a_list``.

    Element i of the result is the mean of ``a_list[0..i]``; an empty input
    gives an empty list.
    """
    running_sum = 0
    means = []
    for n_seen, value in enumerate(a_list, start=1):
        running_sum += value
        means.append(running_sum / n_seen)
    return means


In [27]:
# Disabled experiment (guarded by `if False:`), kept for reference only: a
# leave-one-pair-out classifier over bag-of-words (+ language model z-score) features.
# NOTE(review): this block cannot run as written even with the guard switched on:
#   * the inner pair loop header is commented out and replaced by `if False:`,
#     so PAIR_NUM is never bound;
#   * it calls `trainClassifier` / `predictLabels`, whereas the helpers defined
#     further down in this file are named `train_classifier` / `predict_labels`;
#   * `exclude`, `CORPUS`, `db` and `gameID` are not assigned anywhere in this
#     block -- presumably notebook globals from other cells; verify before reviving.
if False:
    # Do a cross-fold model by speaker (and get word MI info for referent type)
    results = []  # one (precision, recall, f-score, support) tuple per test pair
    piece_counter = Counter()
    word_counter = Counter()
    word_piece_counter = Counter()  # keyed by piece + "__" + word

    all_preds = []
    all_y = []

    # stand-in for the commented-out loop over test pairs on the next line
    if False:
    #for PAIR_NUM in piece_references_and_probabilities_per_pair['X'].keys():
        print(PAIR_NUM)
        # no-op: an excluded PAIR_NUM is not actually skipped here
        if PAIR_NUM in exclude:
            pass
        train_data = []
        test_data = []

        for piece in piece_references_and_probabilities_per_pair.keys():
            #print(piece)
            if not piece_references_and_probabilities_per_pair.get(piece):
                print("not enough data", piece)
                continue
            for pair_num in piece_references_and_probabilities_per_pair[piece]:
                if pair_num in exclude:
                    continue
                #print(pair_num)
                count_ref_so_far = 0
                for ref_info in piece_references_and_probabilities_per_pair[piece][pair_num]:
                    piece_lm_prob_local, piece_lm_moving_average, text, speaker, utt_id, end_time = ref_info
                    piece_counter[piece] += 1
                    if CORPUS in ["TAKECV", "TAKE"]:
                        local_word_count = Counter()
                        feature_vector = {}
                        # NOTE(review): SQL is built by string concatenation and `gameID`
                        # is not set in this block
                        for f in db.execute('SELECT word from words WHERE gameID ="' + str(gameID) + '"'):
                            #print(f)
                            for word in clean_utt(f[0].lower()).split():
                                local_word_count[word] += 1
                        # here the denominator is the number of distinct words,
                        # unlike the PENTOCV branch which divides by the token count
                        word_dem = len(local_word_count.values())
                        for k, v in local_word_count.items():
                            feature_vector[k] = local_word_count[k]/word_dem
                            word_counter[k] += (v/word_dem)
                            word_piece_counter[piece+"__"+k] += (v/word_dem)
                    elif CORPUS == "PENTOCV":
                        feature_vector = {}

                        # NOTE(review): this rebinds the name of the imported `clean_utt`
                        # helper to a list of tokens
                        clean_utt = text.lower().split()
                        word_dem = len(clean_utt)

                        local_word_count = Counter()

                        for word in clean_utt:
                            word_counter[word] += (1/word_dem)   # for global
                            word_piece_counter[piece+"__"+word] += (1/word_dem)  #for global
                            local_word_count[word] += 1

                        # relative frequency of each word within this reference
                        for k, v in local_word_count.items():
                            feature_vector[k] = local_word_count[k]/word_dem

                        LOCAL = True
                        if LOCAL:
                            #for k, v in piece_lm_prob_local.items():
                            #    feature_vector['local_prob_' + k] = v

                            # computed but only used by the commented-out feature below
                            zscore_dict = get_zscore_dict_for_prob_dict(piece_lm_prob_local)

                            #for k, v in zscore_dict.items():
                            #    feature_vector['local_zscore_prob_' + k] = v 


                            #for k, v in piece_lm_moving_average.items():
                            #    feature_vector['local_prob_moving_' + k] = v


                            zscore_dict_average = get_zscore_dict_for_prob_dict(piece_lm_moving_average)

                            for k, v in zscore_dict_average.items():
                                feature_vector['local_zscore_prob_moving_' + k] = v 


                            #rank_dict = get_rank_dict_for_prob_dict(piece_lm_prob_local)

                            #for k, v in rank_dict.items():
                            #    feature_vector['local_prob_rank' + k] = v 

                        count_ref_so_far+=1
                        # no-op placeholder
                        if count_ref_so_far > 7:
                            pass
                        #feature_vector["count_ref_so_far"] = count_ref_so_far

                        # examples are only collected inside the PENTOCV branch; the
                        # TAKE/TAKECV branch above never appends to train/test data
                        if pair_num == PAIR_NUM:
                            test_data.append((feature_vector, piece))
                        else:
                            train_data.append((feature_vector, piece))
        cl = trainClassifier(train_data)
        print(train_data[0])
        y_true = [x[1] for x in test_data]
        all_y.extend(y_true)
        y_pred = predictLabels(test_data, cl)
        all_preds.extend(y_pred)
        prf = precision_recall_fscore_support(y_true, y_pred, average='weighted')
        results.append(prf)
        print(prf)
        print(classification_report(y_true, y_pred))
    # mean weighted precision, recall and f-score over all test pairs
    print(np.mean([x[0] for x in results]))
    print(np.mean([x[1] for x in results]))
    print(np.mean([x[2] for x in results]))

In [28]:
def mean_cross_entropy_on_test_folds(k, pair_piece_lms_orig, folds, prob_model):
    """Return the mean over folds of the per-reference negative log-likelihood.

    Every reference of a fold is scored with the language model of its (known)
    target piece, ``pair_piece_lms_orig[pair_num][piece]``: the natural logs of
    the Lidstone probabilities of all its n-grams (padded with ``<s>``/``</s>``)
    are summed. A fold's score is minus the sum of those log-probabilities
    divided by the number of references in the fold. Assumes the references of
    each fold are in the correct order w.r.t. time of reference.

    Args:
        k: smoothing constant assigned to every language model. Either a scalar
            or a sequence/array whose first element is used (the form in which
            ``scipy.optimize.minimize`` passes the optimised variable).
        pair_piece_lms_orig: dict pair id -> dict piece -> language model object
            exposing ``order``, ``smoothing_k``, ``prob_lidstone(ngram, order)``
            and ``train(sentences)``. It is deep-copied and never modified.
        folds: dict pair id -> list of reference tuples, with the text at
            index 0 and the target piece at index 4.
        prob_model: if ``'self'``, the target piece's model is trained on each
            reference right after that reference has been scored; any other
            value leaves the models unchanged while scoring.

    Returns:
        The mean of the fold scores. Folds without any reference are skipped
        (they previously caused a ZeroDivisionError).

    Raises:
        ValueError: if no fold contains a reference.
    """
    # accept both a plain scalar and the 1-element array handed over by the optimiser
    smoothing_k = np.ravel(k)[0]

    # work on a copy so neither the new k nor the incremental training leaks out
    pair_piece_lms = copy.deepcopy(pair_piece_lms_orig)
    for piece_lms in pair_piece_lms.values():
        for lm in piece_lms.values():
            lm.smoothing_k = smoothing_k

    cross_entropies = []  # one score per non-empty fold
    for pair_num, test_set in folds.items():
        if len(test_set) == 0:
            continue
        log_prob_sum = 0
        for ref_info in test_set:
            # assume we know the correct ref piece
            piece = ref_info[4]
            lm = pair_piece_lms[pair_num][piece]
            words = ref_info[0].split()
            ref = ["<s>"] * (lm.order - 1) + words + ["</s>"]
            ref_log_prob = 0
            for j in range(lm.order - 1, len(ref)):
                # the n-gram ending at position j: (order - 1) context tokens + target
                ngram = ref[j - (lm.order - 1):j + 1]
                ref_log_prob += log(lm.prob_lidstone(ngram, lm.order))
            log_prob_sum += ref_log_prob
            if prob_model == 'self':
                # self language model starts empty for a piece and is trained
                # on each reference only after that reference has been scored
                lm.train([words])
        cross_entropies.append(-log_prob_sum / len(test_set))

    if not cross_entropies:
        raise ValueError("no references to score: every fold is empty")
    return np.mean(cross_entropies)

In [29]:
# Hold out one pair (TEST) and keep the references of all other pairs as folds
TEST = 'r3'
training_folds = {pair: refs for pair, refs in references_per_pair.items() if pair != TEST}
print(training_folds.keys())
print(len(training_folds['r1']), training_folds['r1'][:10])

dict_keys(['r1', 'r2', 'r4', 'r5', 'r6', 'r7', 'r8'])
100 [('das orange l', 'r1_B', 5887, 204.521, 'L'), ('das kreuz', 'r1_B', 5893, 219.022, 'X'), ('das grüne t', 'r1_B', 5901, 228.117, 'T'), ('das t', 'r1_B', 5903, 231.474, 'T'), ('das t', 'r1_B', 5907, 241.893, 'T'), ('das t', 'r1_B', 5907, 241.893, 'T'), ('das gelbe', 'r1_B', 5919, 258.434, 'U'), ('die brücke', 'r1_B', 5921, 260.36, 'U'), ('das pinkfarbene teil', 'r1_B', 5924, 266.23, 'P'), ('den blauen balken', 'r1_B', 5935, 304.366, 'I')]


In [30]:
def get_params_for_lowest_entropy_on_fold_test(folds, pieces, n_range, k_val_range, prob_model='self'):
    """Find the n-gram order n and smoothing constant k with the lowest cross-entropy.

    For every order in ``n_range`` one language model per (pair, piece) is
    created and Nelder-Mead is run over k -- started at ``k_val_range[0]`` and
    bounded by ``k_val_range`` -- to minimise
    ``mean_cross_entropy_on_test_folds`` on ``folds``.

    Args:
        folds: dict pair id -> list of reference tuples (text at index 0,
            target piece at index 4).
        pieces: iterable of piece labels to create language models for.
        n_range: iterable of n-gram orders to try.
        k_val_range: ``[lower, upper]`` bounds for k.
        prob_model: ``'other'`` trains each pair's models on the references of
            all other pairs; ``'self'`` leaves them untrained so that they are
            trained incrementally on the pair's own references while scoring.
            Any other value creates no models at all.

    Returns:
        ``(n, k, results)`` where ``results`` maps every tried n to
        ``(best_k, best_cross_entropy)`` and ``n``/``k`` belong to the entry
        with the lowest cross-entropy.
    """
    results = {}  # store results for each model
    for test_n in n_range:
        pair_piece_lms = {}  # pair id -> piece -> language model of order test_n
        for test_pair in folds.keys():
            # test_pair is the fold the cross-entropy is measured on
            pair_piece_lms[test_pair] = {}
            for piece in pieces:
                if prob_model == 'other':
                    # texts referring to this piece from every pair except test_pair
                    training_fold = [ref[0]
                                     for other_pair_num, other_refs in folds.items()
                                     if other_pair_num != test_pair
                                     for ref in other_refs
                                     if ref[4] == piece]
                    if len(training_fold) < 1:
                        # report the pair actually being processed (this used to
                        # reference `pair_num`, which is not defined in this function)
                        print("not enough data", piece, test_pair)
                        # NOTE(review): no model is stored for this piece, so scoring a
                        # reference to it for this pair will raise a KeyError downstream
                        continue
                    # smoothing_k is only a placeholder; the optimiser overrides it
                    lm = mini_language_model(test_n, smoothing_k=0.0001)
                    lm.train([sent.split() for sent in training_fold])
                    pair_piece_lms[test_pair][piece] = lm
                elif prob_model == 'self':
                    # just initialize lm for piece, no need to train
                    pair_piece_lms[test_pair][piece] = mini_language_model(test_n, smoothing_k=0.0001)

        # run nelder mead to get best k for this n
        best = optimize.minimize(
                mean_cross_entropy_on_test_folds,
                k_val_range[0],    # initial value of the argument to be optimized (k)
                args=(pair_piece_lms, folds, prob_model),  # other arguments to the function
                method='Nelder-Mead', # use nelder mead to find minima
                tol=0.0001, # to this degree of error
                options={'disp': False},
                bounds=[k_val_range]
        )
        best_k, best_ce = best.x[0], best.fun
        print("best k", best_k, best_ce)
        results[test_n] = (best_k, best_ce)

    print(results)
    # pick the order whose optimised cross-entropy is lowest
    n = min(results.items(), key=lambda x: x[1][1])[0]
    k = results[n][0]

    return n, k, results

In [31]:
# First, in a self-supervised way, optimise n and the smoothing constant of both the
# local ('self') and the global ('other') language models for the lowest cross-entropy.
# Only the globally best average over all pairs and pieces is used in the later experiments.
TEST = 'r2'
training_folds = {pair: refs for pair, refs in references_per_pair.items() if pair != TEST}

opt_params = {}
n_range = [1, 2, 3]
k_val_range = [1e-8, 1]

n_self, k_self, lm_opt_results_self = get_params_for_lowest_entropy_on_fold_test(
    training_folds, good_pieces, n_range, k_val_range, prob_model='self')
opt_params["n_self"] = n_self
opt_params["k_self"] = k_self

n_other, k_other, lm_opt_results_other = get_params_for_lowest_entropy_on_fold_test(
    training_folds, good_pieces, n_range, k_val_range, prob_model='other')
opt_params["n_other"] = n_other
opt_params["k_other"] = k_other
print(opt_params)

best k 1.0 6.73712157905053
best k 1.0 6.253234714310423
best k 1.0 6.821026665546368
{1: (1.0, 6.73712157905053), 2: (1.0, 6.253234714310423), 3: (1.0, 6.821026665546368)}
best k 1.0 8.189708894975334
best k 0.618332169500001 7.2548672865266335
best k 0.6607339615000012 7.620654164968658
{1: (1.0, 8.189708894975334), 2: (0.618332169500001, 7.2548672865266335), 3: (0.6607339615000012, 7.620654164968658)}
{'n_self': 2, 'k_self': 1.0, 'n_other': 2, 'k_other': 0.618332169500001}


In [32]:
def mean_cross_entropy_on_test_folds_joint_model(lambda_lm1, pair_piece_lms_self_orig, pair_piece_lms_other_orig,
                                                 cutoff_word_global_lm, folds):
    """Return the mean over folds of the per-reference negative log-likelihood
    under the interpolated 'self' + 'other' language models.

    Every reference is scored by the 'self' and the 'other' model of its
    (known) target piece; each score is the product of the Lidstone
    probabilities of the reference's n-grams (padded with ``<s>``/``</s>``).
    The two are mixed as ``w * self_prob + (1 - w) * other_prob``, where the
    weight ``w`` of the self model grows linearly with the number of
    references that model has been trained on: from 0 when it has seen none,
    up to ``lambda_lm1`` once it has seen ``cutoff_word_global_lm - 1`` or more.
    With ``cutoff_word_global_lm <= 1`` the weight is always ``lambda_lm1``.
    After a reference has been scored the self model is trained on it.

    Args:
        lambda_lm1: maximum weight of the self model. Either a scalar or a
            sequence/array whose first element is used (the form in which
            ``scipy.optimize.minimize`` passes the optimised variable).
        pair_piece_lms_self_orig: dict pair id -> dict piece -> self language
            model; needs ``order``, ``num_training_sents``,
            ``prob_lidstone(ngram, order)`` and ``train(sentences)``.
        pair_piece_lms_other_orig: same layout for the other models; needs
            ``order`` and ``prob_lidstone(ngram, order)``.
        cutoff_word_global_lm: number controlling how fast the self model
            reaches its full weight (see above).
        folds: dict pair id -> list of reference tuples (text at index 0,
            target piece at index 4).

    Returns:
        The mean of the fold scores. Folds without any reference are skipped
        (they previously caused a ZeroDivisionError).

    Raises:
        ValueError: if no fold contains a reference.
    """
    def _sequence_prob(lm, words):
        # product of the Lidstone probabilities of every n-gram of the padded reference
        ref = ["<s>"] * (lm.order - 1) + words + ["</s>"]
        prob = 1
        for j in range(lm.order - 1, len(ref)):
            prob = prob * lm.prob_lidstone(ref[j - (lm.order - 1):j + 1], lm.order)
        return prob

    # the optimiser passes a 1-element array; use a plain float so that math.log
    # below never has to convert an array to a scalar implicitly
    lambda_lm1 = float(np.ravel(lambda_lm1)[0])

    # both sets of models are copied, so the incremental training never leaks out
    pair_piece_lms_self = copy.deepcopy(pair_piece_lms_self_orig)
    pair_piece_lms_other = copy.deepcopy(pair_piece_lms_other_orig)

    # constants of the linear weight schedule
    lambda_global = 1 - lambda_lm1
    factor = 0 if cutoff_word_global_lm <= 1 else (1 - lambda_global) / (cutoff_word_global_lm - 1)

    cross_entropies = []  # one score per non-empty fold
    for pair_num, test_set in folds.items():
        if len(test_set) == 0:
            continue
        log_prob_sum = 0
        for ref_info in test_set:
            # assume we know the correct ref piece
            piece = ref_info[4]
            words = ref_info[0].split()
            lm_self = pair_piece_lms_self[pair_num][piece]
            lm_other = pair_piece_lms_other[pair_num][piece]

            self_prob = _sequence_prob(lm_self, words)
            other_prob = _sequence_prob(lm_other, words)

            # weight of the self model given how many references it has seen so far
            num_refs_f = lm_self.num_training_sents
            current_lambda_lm1 = 1 - max([lambda_global + (factor * ((cutoff_word_global_lm - 1) - num_refs_f)),
                                          lambda_global])

            prob = (current_lambda_lm1 * self_prob) + ((1 - current_lambda_lm1) * other_prob)
            log_prob_sum += log(prob)

            # the self model only sees a reference after it has been scored
            lm_self.train([words])

        cross_entropies.append(-log_prob_sum / len(test_set))

    if not cross_entropies:
        raise ValueError("no references to score: every fold is empty")
    return np.mean(cross_entropies)

In [33]:
def get_best_params_cutoff_lambda_xval(n_self, n_other, k_self, k_other, pieces, folds,
                                       cutoff_range = [1,10],
                                       lambda_range=[0,1]):
    """Find the cut-off and lambda (weight of the local model) with the lowest cross-entropy.

    For every integer cut-off in ``cutoff_range`` (both ends included)
    Nelder-Mead is run over lambda -- started at ``lambda_range[0]`` and bounded
    by ``lambda_range`` -- to minimise
    ``mean_cross_entropy_on_test_folds_joint_model`` on ``folds``.

    Args:
        n_self, k_self: order and smoothing constant of the 'self' models,
            which are created untrained for every (pair, piece).
        n_other, k_other: order and smoothing constant of the 'other' models,
            which are trained, per pair, on the references of all other pairs.
        pieces: iterable of piece labels to create language models for.
        folds: dict pair id -> list of reference tuples (text at index 0,
            target piece at index 4).
        cutoff_range: ``[first, last]`` cut-off values to try.
        lambda_range: ``[lower, upper]`` bounds for lambda.

    Returns:
        ``(cutoff, lambda_lm1, results)`` where ``results`` maps every tried
        cut-off to ``(best_lambda, best_cross_entropy)`` and ``cutoff`` /
        ``lambda_lm1`` belong to the entry with the lowest cross-entropy.
    """
    # The models do not depend on the cut-off, and the objective as defined in this
    # file deep-copies them before use, so they are built once rather than per cut-off.
    pair_piece_lms_self = {}   # pair id -> piece -> untrained self model
    pair_piece_lms_other = {}  # pair id -> piece -> model trained on the other pairs
    for test_pair in folds.keys():
        pair_piece_lms_self[test_pair] = {}
        pair_piece_lms_other[test_pair] = {}
        for piece in pieces:
            # texts referring to this piece from every pair except test_pair
            training_fold = [ref[0]
                             for other_pair_num, other_refs in folds.items()
                             if other_pair_num != test_pair
                             for ref in other_refs
                             if ref[4] == piece]
            if len(training_fold) < 1:
                # report the pair actually being processed (this used to reference
                # `pair_num`, which is not defined in this function)
                print("not enough data", piece, test_pair)
                # NOTE(review): no 'other' model is stored for this piece, so scoring a
                # reference to it for this pair will raise a KeyError downstream
            else:
                lm_other = mini_language_model(n_other, smoothing_k=k_other)
                lm_other.train([sent.split() for sent in training_fold])
                pair_piece_lms_other[test_pair][piece] = lm_other

            # just initialize the self lm for the piece, no need to train
            pair_piece_lms_self[test_pair][piece] = mini_language_model(n_self, smoothing_k=k_self)

    results = {}  # cut-off -> (best lambda, best cross-entropy)
    for test_cutoff in range(cutoff_range[0], cutoff_range[1] + 1):
        # run nelder mead to get best lambda
        best = optimize.minimize(
                mean_cross_entropy_on_test_folds_joint_model,
                lambda_range[0],    # initial value of the argument to be optimized (lambda)
                args=(pair_piece_lms_self, pair_piece_lms_other, test_cutoff, folds),  # other arguments to the function
                method='Nelder-Mead', # use nelder mead to find minima
                tol=0.0001, # to this degree of error
                options={'disp': False},
                bounds=[lambda_range]
        )
        best_lambda, best_ce = best.x[0], best.fun
        results[test_cutoff] = (best_lambda, best_ce)

    print(results)
    # pick the cut-off whose optimised cross-entropy is lowest
    cutoff = min(results.items(), key=lambda x: x[1][1])[0]
    lambda_lm1 = results[cutoff][0]

    return cutoff, lambda_lm1, results
    

In [34]:
def get_new_piece_lms_from_folds(n, k, pieces, folds, target_folds=None, train=False):
    """Create one language model of order ``n`` and smoothing ``k`` per (pair, piece).

    Args:
        n: n-gram order of every model.
        k: smoothing constant of every model.
        pieces: iterable of piece labels to create models for.
        folds: dict pair id -> list of reference tuples (text at index 0,
            target piece at index 4).
        target_folds: collection of pair ids to create models for; pairs of
            ``folds`` not contained in it are skipped. ``None`` (the default)
            means every pair in ``folds`` (it previously raised a TypeError).
        train: if True, the models of a pair are trained on the references to
            the same piece from all *other* pairs in ``folds``; if False they
            are returned untrained.

    Returns:
        dict pair id -> dict piece -> language model. When ``train`` is True
        and no other pair has a reference to a piece, a message is printed and
        that model is left untrained.
    """
    if target_folds is None:
        target_folds = folds.keys()

    pair_piece_lms = {}
    for test_pair in folds.keys():
        if test_pair not in target_folds:
            continue
        pair_piece_lms[test_pair] = {}  # key piece, value language model for that piece
        for piece in pieces:
            lm = mini_language_model(n, smoothing_k=k)
            pair_piece_lms[test_pair][piece] = lm
            if not train:
                continue
            # texts referring to this piece from every pair except test_pair
            training_fold = [ref[0]
                             for other_pair_num, other_refs in folds.items()
                             if other_pair_num != test_pair
                             for ref in other_refs
                             if ref[4] == piece]
            if len(training_fold) < 1:
                print("not enough data", piece, test_pair)
                continue
            lm.train([sent.split() for sent in training_fold])
    return pair_piece_lms

In [35]:
def generate_classifier_data_from_lms_and_params(pair_piece_lms_self_orig, pair_piece_lms_other_orig, pieces, lambda_lm1, cutoff_word_global_lm, folds, lexical=True):
    """Score every reference with the language models of all pieces.

    For each reference of each fold, and for each piece in ``pieces``, the
    'self' and 'other' model of that piece score the reference (product of the
    Lidstone probabilities of its n-grams, padded with ``<s>``/``</s>``). The
    two are mixed as ``w * self_prob + (1 - w) * other_prob``, where the weight
    ``w`` of the self model grows linearly with the number of references that
    model has been trained on: from 0 when it has seen none, up to
    ``lambda_lm1`` once it has seen ``cutoff_word_global_lm - 1`` or more (with
    ``cutoff_word_global_lm <= 1`` it is always ``lambda_lm1``). After a
    reference has been scored, the self model of its target piece is trained
    on it.

    Args:
        pair_piece_lms_self_orig: dict pair id -> dict piece -> self language
            model; needs ``order``, ``num_training_sents``,
            ``prob_lidstone(ngram, order)`` and ``train(sentences)``.
            Deep-copied, never modified.
        pair_piece_lms_other_orig: same layout for the other models; needs
            ``order`` and ``prob_lidstone(ngram, order)``. Deep-copied.
        pieces: iterable of piece labels whose models score every reference.
        lambda_lm1: maximum weight of the self model.
        cutoff_word_global_lm: number controlling how fast the self model
            reaches its full weight (see above).
        folds: dict pair id -> list of reference tuples (text at index 0,
            target piece at index 4), in order of occurrence.
        lexical: accepted for interface compatibility; not used.

    Returns:
        dict pair id -> list with one tuple per reference: the original
        reference fields followed by

        * ``piece_prob_dict``: piece -> ``(-log(prob) / number_of_words,
          references seen so far by that piece's self model)``;
        * a dict hypothesis piece -> piece -> mean of the first component above
          over the current reference and all earlier references of the same
          pair whose target was the hypothesis piece;
        * the number of words of the reference.

        A fold without references yields an empty list (it previously raised a
        ZeroDivisionError in a cross-entropy computation whose result was
        never used; that computation has been removed).
    """
    def _sequence_prob(lm, words):
        # product of the Lidstone probabilities of every n-gram of the padded reference
        ref = ["<s>"] * (lm.order - 1) + words + ["</s>"]
        prob = 1
        for j in range(lm.order - 1, len(ref)):
            prob = prob * lm.prob_lidstone(ref[j - (lm.order - 1):j + 1], lm.order)
        return prob

    # both sets of models are copied, so the incremental training never leaks out
    pair_piece_lms_self = copy.deepcopy(pair_piece_lms_self_orig)
    pair_piece_lms_other = copy.deepcopy(pair_piece_lms_other_orig)

    # constants of the linear weight schedule
    lambda_global = 1 - lambda_lm1
    factor = 0 if cutoff_word_global_lm <= 1 else (1 - lambda_global) / (cutoff_word_global_lm - 1)

    piece_references_and_probabilities_per_pair = {}
    for pair_num, test_set in folds.items():
        enriched_refs = []
        piece_references_and_probabilities_per_pair[pair_num] = enriched_refs

        # target piece -> scoring piece -> scores given to the earlier references of
        # this pair that referred to the target piece, in order of occurrence
        history = {}

        for ref_info in test_set:
            target_piece = ref_info[4]
            words = ref_info[0].split()

            piece_prob_dict = {}  # scores assigned to this reference by every piece's models
            for piece in pieces:
                lm_self = pair_piece_lms_self[pair_num][piece]
                lm_other = pair_piece_lms_other[pair_num][piece]

                self_prob = _sequence_prob(lm_self, words)
                other_prob = _sequence_prob(lm_other, words)

                # weight of the self model given how many references it has seen so far
                num_refs_f = lm_self.num_training_sents
                current_lambda_lm1 = 1 - max([lambda_global + (factor * ((cutoff_word_global_lm - 1) - num_refs_f)),
                                              lambda_global])

                prob = (current_lambda_lm1 * self_prob) + ((1 - current_lambda_lm1) * other_prob)

                # negative log-probability normalised by the number of words
                piece_prob_dict[piece] = (-log(prob) / len(words), num_refs_f)

            # the self model of the true piece only sees the reference after scoring
            pair_piece_lms_self[pair_num][target_piece].train([words])

            # For every hypothesis about what the current (unknown) referent is, average
            # the current scores with those of the earlier references whose referent is
            # known to be that piece: num pieces * num pieces values.
            moving_average_prob_dicts_all_lms = {}
            for hypothesis in piece_prob_dict:
                earlier = history.get(hypothesis, {})
                moving_average_prob_dicts_all_lms[hypothesis] = {
                    scoring_piece: np.mean([score[0]] + earlier.get(scoring_piece, []))
                    for scoring_piece, score in piece_prob_dict.items()
                }

            # only now does the current reference become part of the history
            target_history = history.setdefault(target_piece, {})
            for scoring_piece, score in piece_prob_dict.items():
                target_history.setdefault(scoring_piece, []).append(score[0])

            enriched_refs.append(
                tuple(list(ref_info) + [piece_prob_dict, moving_average_prob_dicts_all_lms, len(words)]))

    return piece_references_and_probabilities_per_pair
    

In [36]:
# NOTE(review): this cell re-defines get_zscore_dict_for_prob_dict exactly as an
# earlier cell of this notebook (In [24]) already does; one of the two could go.
def get_zscore_dict_for_prob_dict(prob_dict):
    """Return a dict mapping every key of ``prob_dict`` to the z-score of its value.

    The z-scores are computed with ``scipy.stats.zscore`` over all values of the
    dict; the returned dict is ordered by sorted key. Any NaN z-score is
    replaced by the sentinel value -100.
    """
    keys_in_order = sorted(prob_dict.keys())
    values_in_order = np.array([prob_dict[name] for name in keys_in_order])
    z_values = list(stats.zscore(values_in_order))
    result = {}
    for name, z_value in zip(keys_in_order, z_values):
        # NaN cannot be used as a feature value downstream, so swap it for a sentinel
        result[name] = -100 if np.isnan(z_value) else z_value
    return result

get_zscore_dict_for_prob_dict({"K": 0.33, "Y": 0.35, "X": 0.33})

{'K': -0.7071067811865476, 'X': -0.7071067811865476, 'Y': 1.4142135623730951}

In [37]:
def generate_classifier_data_from_raw(folds, lexical=True, lm_features=True, moving_average=False):
    """Turn scored references into ``(feature_dict, target_piece)`` classifier examples.

    ``folds`` maps a pair id to a list of 8-tuples
    ``(text, speaker, utt_id, end_time, target_piece, prob_dict,
    piece_lm_moving_average, ref_length)``, e.g. starting
    ``('das orange l', 'r1_B', 5887, 204.521, 'L', ...)``.

    Features, depending on the flags:

    * ``lexical``: relative frequency of every lower-cased word of the text;
    * ``lm_features``: ``'local_zscore_prob_' + piece`` -- z-score, across
      pieces, of the first component of ``prob_dict[piece]``;
    * ``moving_average`` (only together with ``lm_features``): for every
      hypothesis piece in ``piece_lm_moving_average``, the z-score of that
      piece's own entry among the moving averages stored for the hypothesis.

    Returns the examples of all folds as one flat list.
    """
    final_data = []
    for pair_refs in folds.values():
        for ref_info in pair_refs:
            (text, speaker, utt_id, end_time, target_piece,
             prob_dict, piece_lm_moving_average, ref_length) = ref_info

            feature_vector = {}

            if lexical:
                tokens = text.lower().split()
                n_tokens = len(tokens)
                # relative frequency of each word within this reference
                for word, freq in Counter(tokens).items():
                    feature_vector[word] = freq / n_tokens

            if lm_features:
                scores = {piece: info[0] for piece, info in prob_dict.items()}
                for piece, z_value in get_zscore_dict_for_prob_dict(scores).items():
                    feature_vector['local_zscore_prob_' + piece] = z_value

                if moving_average:
                    # Not knowing the referent, assume it is of each piece type in turn
                    # and keep, per hypothesis, how that piece's own moving average
                    # stands out among the moving averages of all piece models.
                    for hypothesis, averages in piece_lm_moving_average.items():
                        z_by_piece = get_zscore_dict_for_prob_dict(averages)
                        if hypothesis in z_by_piece:
                            feature_vector['local_prob_moving_zcore' + hypothesis + ":" + hypothesis] = z_by_piece[hypothesis]

            final_data.append((feature_vector, target_piece))
    return final_data


In [38]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def train_classifier(trainData):
    """Train and return an NLTK ``SklearnClassifier`` wrapping a ``LinearSVC``.

    Args:
        trainData: training examples handed unchanged to
            ``SklearnClassifier.train`` -- presumably the list of
            (feature_dict, label) pairs built by
            ``generate_classifier_data_from_raw``; confirm against the caller.

    Returns:
        The trained classifier (the return value of ``train``).
    """
    print("Training Classifier...")
    return SklearnClassifier(LinearSVC(loss='squared_hinge')).train(trainData)
    # alternative model that was tried:
    #return SklearnClassifier(LogisticRegression()).train(trainData)

def predict_labels(samples, classifier):
    """Return ``classifier.classify_many`` applied to the first element of every sample.

    ``samples`` is an iterable of sequences whose element 0 is the feature
    dict; any further elements (e.g. the gold label) are ignored.
    """
    feature_dicts = (sample[0] for sample in samples)
    return classifier.classify_many(feature_dicts)


In [39]:
# Optimise the weight (lambda) of the local model and the cut-off point by
# cross-validation over the training folds, using the n and k found above.
print(opt_params)
n_self, n_other = opt_params["n_self"], opt_params["n_other"]
k_self, k_other = opt_params["k_self"], opt_params["k_other"]
cutoff_word_global_lm, lambda_local, x_val_results = get_best_params_cutoff_lambda_xval(
    n_self, n_other,
    k_self, k_other,
    good_pieces, training_folds,
    cutoff_range=[1, 5],
    lambda_range=[0, 1])
opt_params.update(lambda_local=lambda_local, cutoff_word_global_lm=cutoff_word_global_lm)

{'n_self': 2, 'k_self': 1.0, 'n_other': 2, 'k_other': 0.618332169500001}
{1: (0.7490625000000009, 5.865760074071328), 2: (0.8391875000000009, 5.7896640372730195), 3: (0.8640625000000008, 5.785753739659943), 4: (0.880875000000001, 5.787766806993678), 5: (0.8880000000000009, 5.799222345385503)}


In [40]:
# Finally get the test results with the best language models
# (this will have been done 7 times above, once per cross-validation fold)
TEST = 'r2'
HELDOUT = 'r8'  # take out one of the training folds for the lms?
print(opt_params)
# Values found by the optimisation above, kept for reference:
# {'n_self': 2, 'k_self': 1.0, 'n_other': 2, 'k_other': 0.618332169500001, 'lambda_local': 0.8640625000000008, 'cutoff_word_global_lm': 3}
#opt_params['lambda_local'] = 0.8640625000000008
#opt_params['cutoff_word_global_lm'] = 3

n_other, n_self = opt_params['n_other'], opt_params['n_self']
k_other, k_self = opt_params['k_other'], opt_params['k_self']
lambda_local = opt_params['lambda_local']
cutoff_word_global_lm = opt_params["cutoff_word_global_lm"]

def get_fscore(references_per_pair, test, heldout, n_other, n_self, k_other, k_self, lambda_local, cutoff_word_global_lm):
    """Train a classifier without the ``test`` fold, score it on ``test`` and return the weighted F-score.

    Args:
        references_per_pair: mapping from fold name to that fold's data (indexed by key here).
        test: name of the fold to evaluate on; it is excluded from the classifier's training folds.
        heldout: name of a fold to drop from the folds given to the global LM builder in the
            test phase; a name matching no fold (e.g. "") drops nothing.
        n_other, k_other: forwarded to ``get_new_piece_lms_from_folds`` for the LMs built with ``train=True``.
        n_self, k_self: forwarded to ``get_new_piece_lms_from_folds`` for the LMs built with ``train=False``.
        lambda_local, cutoff_word_global_lm: forwarded to ``generate_classifier_data_from_lms_and_params``.

    Returns:
        Element 2 (the F-score) of ``precision_recall_fscore_support(..., average='weighted')``.

    Side effects: prints the full score tuple. Reads ``good_pieces`` from notebook scope.
    """
    def to_classifier_data(self_lms, global_lms, folds):
        # LM scores -> raw instances -> classifier instances with every feature family switched on.
        raw = generate_classifier_data_from_lms_and_params(
            self_lms, global_lms, good_pieces, lambda_local, cutoff_word_global_lm, folds)
        return generate_classifier_data_from_raw(raw, lexical=True, lm_features=True, moving_average=True)

    fold_names = references_per_pair.keys()

    # ---- training phase: everything except the test fold ----
    # TODO could randomly take one out r7 so it's just 6 for training, else language model is different size
    train_folds = {name: references_per_pair[name] for name in fold_names if name != test}
    global_lms = get_new_piece_lms_from_folds(
        n_other, k_other, good_pieces, train_folds, target_folds=train_folds.keys(), train=True)
    self_lms = get_new_piece_lms_from_folds(
        n_self, k_self, good_pieces, train_folds, target_folds=train_folds.keys(), train=False)
    train_data = to_classifier_data(self_lms, global_lms, train_folds)

    # ---- test phase: global LMs are built from all folds except `heldout`, targeting the test fold ----
    lm_folds = {name: references_per_pair[name] for name in fold_names if name != heldout}
    eval_folds = {name: references_per_pair[name] for name in fold_names if name == test}
    global_lms = get_new_piece_lms_from_folds(
        n_other, k_other, good_pieces, lm_folds, target_folds=[test], train=True)
    self_lms = get_new_piece_lms_from_folds(
        n_self, k_self, good_pieces, eval_folds, target_folds=[test], train=False)
    test_data = to_classifier_data(self_lms, global_lms, eval_folds)

    # ---- fit, predict, score ----
    classifier = train_classifier(train_data)
    gold_labels = [instance[1] for instance in test_data]
    predicted_labels = predict_labels(test_data, classifier)
    scores = precision_recall_fscore_support(gold_labels, predicted_labels, average='weighted')
    print(scores)
    return scores[2]

{'n_self': 2, 'k_self': 1.0, 'n_other': 2, 'k_other': 0.618332169500001, 'lambda_local': 0.8640625000000008, 'cutoff_word_global_lm': 3}


In [41]:
# Score each fold in turn with the tuned parameters; get_fscore prints the score tuple per fold.
# NOTE(review): references_per_pair is passed unfiltered here, whereas the later grid search
# removes 'r2' first -- confirm whether the test fold is meant to be part of the data here.
n_other, n_self = opt_params['n_other'], opt_params['n_self']
k_other, k_self = opt_params['k_other'], opt_params['k_self']
lambda_local, cutoff_word_global_lm = opt_params['lambda_local'], opt_params["cutoff_word_global_lm"]

for test in ('r1', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8'):
    get_fscore(
        references_per_pair, test, "",
        n_other=n_other, n_self=n_self,
        k_other=k_other, k_self=k_self,
        lambda_local=lambda_local,
        cutoff_word_global_lm=cutoff_word_global_lm,
    )

Training Classifier...


  _warn_prf(average, modifier, msg_start, len(result))


(0.8253030303030303, 0.78, 0.779216276477146, None)
Training Classifier...




(0.9166345190654965, 0.9132075471698113, 0.9130725322766144, None)
Training Classifier...




(0.9044023256920704, 0.887719298245614, 0.8914993743133893, None)
Training Classifier...




(0.9032582801813569, 0.8881118881118881, 0.8895411562019954, None)
Training Classifier...




(0.8833943240082681, 0.8682170542635659, 0.8658178141796523, None)
Training Classifier...




(0.8876336425274548, 0.8636363636363636, 0.8575700778096376, None)
Training Classifier...
(0.7771469178497548, 0.7668711656441718, 0.761291587495105, None)




In [50]:
# Grid search over the global-LM word cut-off (1..10) and the local-model weight lambda
# (0.00..1.00 in steps of 0.05). Each grid point is scored by the mean F-score that
# get_fscore returns over the seven folds listed below; fold 'r2' is removed from the data first.
n_other = opt_params['n_other']
n_self = opt_params['n_self']
k_other = opt_params['k_other']
k_self = opt_params['k_self']
lambda_local = opt_params['lambda_local']
cutoff_word_global_lm = opt_params["cutoff_word_global_lm"]

cross_val_data = {k: references_per_pair[k] for k in references_per_pair.keys() if k != 'r2'}
results = {}  # "n_other,n_self,cutoff,k_other,k_self,lambda" -> mean F-score

best_f = 0
best_params = []       # first parameter list that reached best_f (always a flat list)
best_params_tied = []  # every parameter list that reached best_f, in search order
# The loop variables deliberately do NOT reuse the names lambda_local / cutoff_word_global_lm:
# a for-loop target stays bound after the loop, so reusing them would leave those globals at
# the last grid point instead of the tuned values loaded from opt_params above.
for cutoff_candidate in range(1, 11):
    for lambda_percent in range(0, 105, 5):
        lambda_candidate = lambda_percent / 100
        all_scores = []
        for test in ['r1', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8']:
            fscore = get_fscore(cross_val_data, test, "", n_other, n_self, k_other, k_self,
                                lambda_candidate, cutoff_candidate)
            all_scores.append(fscore)
        mean_f = np.mean(all_scores)
        test_params = [n_other, n_self, cutoff_candidate, k_other, k_self, lambda_candidate]
        print(test_params, mean_f)
        results[",".join([str(f) for f in test_params])] = mean_f
        if mean_f > best_f:
            best_f = mean_f
            best_params = test_params
            best_params_tied = [test_params]
            print("best", best_f, best_params)
        elif mean_f == best_f:
            # Ties are collected side by side rather than by wrapping best_params in a new
            # list each time, which nested one level deeper per tie and changed its shape.
            if not best_params:
                best_params = test_params
            best_params_tied.append(test_params)


Training Classifier...




(0.7558866133866133, 0.71, 0.7113646981038286, None)
Training Classifier...




(0.9156466676835816, 0.8981132075471698, 0.8969616973323173, None)
Training Classifier...




(0.8962309837972213, 0.887719298245614, 0.8905707277479048, None)
Training Classifier...




(0.8706085581085581, 0.8531468531468531, 0.8575328025499532, None)
Training Classifier...
(0.7964580964128828, 0.7403100775193798, 0.7166355215128715, None)




Training Classifier...
(0.8627141293050383, 0.8522727272727273, 0.851100426624943, None)




Training Classifier...
(0.7841101654841741, 0.7361963190184049, 0.7341779527716643, None)
[1, 1, 1, 0.05, 0.05, 0.0] 0.8083348323776404
best 0.8083348323776404 [1, 1, 1, 0.05, 0.05, 0.0]




Training Classifier...




(0.7438790376290377, 0.7, 0.7024714174714174, None)
Training Classifier...




(0.9156466676835816, 0.8981132075471698, 0.8969616973323173, None)
Training Classifier...




(0.8962309837972213, 0.887719298245614, 0.8905707277479048, None)
Training Classifier...




(0.8706085581085581, 0.8531468531468531, 0.8575328025499532, None)
Training Classifier...
(0.7964580964128828, 0.7403100775193798, 0.7166355215128715, None)




Training Classifier...




(0.8627141293050383, 0.8522727272727273, 0.851100426624943, None)
Training Classifier...
(0.7841101654841741, 0.7361963190184049, 0.7341779527716643, None)
[1, 1, 2, 0.05, 0.05, 0.0] 0.8070643637158674




In [53]:


# Write the grid-search results to a CSV named after the current timestamp,
# one row per parameter combination, highest mean F-score first.
# NOTE(review): the ISO timestamp contains ':' characters, which some filesystems
# (e.g. Windows) do not accept in file names.
results_path = datetime.datetime.now().isoformat() + ".csv"
results_header = ['n_other', 'n_self', 'cutoff_word_global_lm', 'k_other', 'k_self', 'lambda_local', 'fscore']
# The with-block closes (and flushes) the file even if one of the writes raises.
with open(results_path, "w") as results_file:
    results_file.write(",".join(results_header) + "\n")
    # Each key of `results` is already a comma-joined parameter string, so it is written as-is.
    for params, result in sorted(results.items(), key=lambda x: x[1], reverse=True):
        results_file.write(params + "," + str(result) + "\n")

In [57]:
# Disabled scratch code for reading a previously written results file back with pandas:
#import pandas as pd
#pd.read_csv(open("2023-06-11T23:29:46.235511"))
# Marker printed once execution reaches the end of the notebook.
print("finished")
# NOTE(review): exit() stops execution at this point; in a notebook kernel this presumably
# ends the session, so nothing placed after this cell would run -- confirm this is intended.
exit()

Unnamed: 0,n_other,n_self,cutoff_word_global_lm,k_other,k_self,lambda_local,fscore
0,1,1,1,0.05,0.05,0.0,0.808335
1,1,1,2,0.05,0.05,0.0,0.807064
