# Imports

In [23]:
import heapq
import json
import itertools
import psutil
import sys
import scipy.spatial.distance

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
from scipy import stats
from functools import lru_cache
from numpy.random import randint
from scipy.special import softmax
from sklearn.preprocessing import MinMaxScaler, normalize
from numpy.linalg import matrix_power
from scipy.special import expit


ModuleNotFoundError: No module named 'psutil'

In [2]:
def read_vecs(path_to_df):
    # after reading in, we drop first row & column (these are indices, not vectors)
    vecs_pd = pd.read_csv(path_to_df, sep=',',header=None)
    vecs_pd_new = vecs_pd.iloc[1:]
    vecs_pd_new = vecs_pd_new.drop([vecs_pd_new.columns[0]], axis = 1)
    return vecs_pd_new.values

# Import large dataframe giving the full vocabulary

In [3]:
sample_df = pd.read_csv("../data/connector_final.csv")

# Read in Pretrained Semantic Vectors

For each of 12218 words, we have (1) glove, (2) SWOW, and (3) non-contextual BERT embeddings.

SWOW has 2 versions : PPMI and Random Walk. We use RW.

The BERT context-free embeddings obtained by "CLS [word] SEP": summed across last four layers (768-dim)

In [5]:
representations = {}
representations['glove'] = read_vecs("../data/swow12217_glove2017wiki_vectors.csv")
representations['swow'] = np.array(pd.read_csv("../data/SWOW_R123_rw_k300.csv", header=None))
representations['bert-sum'] = read_vecs("../data/bert_contextfree_sum.csv")

# Read in the game boards used in the experiment

In [7]:
with open('../data/boards.json', 'r') as json_file:
    boards = json.load(json_file)

# Read in the clues produced in the experiment

In [9]:
## load cleaned up data from experiment
expdata = pd.read_csv("../data/final_board_clues_all.csv", encoding= 'unicode_escape')

# RSA functions

### Extract wordpairs
For each board, we need to get all of the possible pairs of words on the board in an easy-to-work-with format

In [10]:
## these combos need to be created at the board level
def compute_board_combos(board_name):
    board = boards[board_name]
    all_possible_combs = list(itertools.combinations(board, 2))
    combs_df = pd.DataFrame(all_possible_combs, columns =['Word1', 'Word2'])
    combs_df["wordpair"] = combs_df["Word1"] + '-'+ combs_df["Word2"]
    return combs_df

allcombinations_df = pd.DataFrame(columns=['Board', 'Word1','Word2', 'wordpair'])
board_combos = {board_name : compute_board_combos(board_name) for board_name in boards.keys()}
for board in board_combos:
    newdf = board_combos[board]
    newdf.insert(loc=0, column='Board', value=board)
    allcombinations_df = pd.concat([allcombinations_df, newdf])

allcombinations_df.head()

Unnamed: 0,Board,Word1,Word2,wordpair
0,e1_board1_words,angry,adore,angry-adore
1,e1_board1_words,angry,burglar,angry-burglar
2,e1_board1_words,angry,rumor,angry-rumor
3,e1_board1_words,angry,depth,angry-depth
4,e1_board1_words,angry,anchor,angry-anchor


now that we have the combos, we can make a little helper function to get the wordpair lists for a given board as needed

In [11]:
def get_wordpair_list(board_name) :
  return list(board_combos[board_name]['wordpair'])

### get matrix of similarities
this serves as our literal semantics

In [12]:
## we need to create a exp(clue-w1 * clue-cw2) matrix of size Nx190 for each board
## where N is the size of the search space
## this is normalized by-row for literal guesser
## normalized by column for pragmatic speaker
## and then normalized by row again for pragmetic guesser
def create_board_matrix(combs_df, context_board, embeddings):
    # grab subset of words in given board and their corresponding glove vectors
    board_df = sample_df[sample_df['Word'].isin(context_board)]
    board_word_indices = list(board_df.index)
    board_words = board_df["Word"]
    board_vectors = embeddings[board_word_indices]

    ## clue_sims is the similarity of ALL clues in full searchspace (size N) to EACH word on board (size 20)
    clue_sims = 1 - scipy.spatial.distance.cdist(board_vectors, embeddings, 'cosine')

    ## once we have the similarities of the clue to the words on the board
    ## we define a multiplicative function that maximizes these similarities
    board_df.reset_index(inplace = True)

    ## next we find the product of similarities between c-w1 and c-w2 for that specific board's 190 word-pairs
    ## this gives us a 190 x N array of product similarities for a given combs_df
    ## specifically, for each possible pair, pull out 
    f_w1_list =  np.array([clue_sims[board_df[board_df["Word"]==row["Word1"]].index.values[0]]
                         for  index, row in combs_df.iterrows()])
    f_w2_list =  np.array([clue_sims[board_df[board_df["Word"]==row["Word2"]].index.values[0]] 
                         for  index, row in combs_df.iterrows()])

    # result is of length 190 for the product of similarities (i.e. how similar each word i is to BOTH in pair)
    return ((f_w1_list + 1) /2) * ((f_w2_list + 1)/2)

In [13]:
board_matrices = {
    key : {board_name : create_board_matrix(board_combos[board_name], boards[board_name], embedding) 
           for board_name in boards.keys()}
    for (key, embedding) in representations.items()
}

### Literal Guesser NP

given a 190xN matrix of clue-w1 * clue-w2 products, the literal guesser computes softmax over pairs for each possible clue

In [14]:
def literal_guesser_np(board_name, representation):
    boardmatrix = board_matrices[representation][board_name]
    return softmax(boardmatrix, axis=0)

### Pragmatic Speaker NP

given a Nx190 matrix of clue products, pragmatic speaker first computes literal guesser softmax for each clue in searchspace, then computes softmax over all clues for a specific word-pair. this yields a Nx190 array with literal guesser softmax values for each possible clue.

need to implement S1 = beta * (ln(G0)- cost).

we use frequency-based cost (higher frequency means lower cost)

In [15]:
def pragmatic_speaker_np(board_name, beta, costweight, representation):
    literal_guesser_prob = np.log(literal_guesser_np(board_name, representation))
    clues_cost = -np.array(list(sample_df["LgSUBTLWF"]))
    utility = (1-costweight) * literal_guesser_prob - costweight * clues_cost
    return softmax(beta * utility, axis = 1)

### Pragmatic Guesser NP

Given a 190xN matrix of clue products, pragmatic guesser computes pragamtic speaker softmax for EACH wordpair given a particular clue

In [16]:
def pragmatic_guesser_np(board_name, beta, costweight, representation):
    return softmax(np.log(pragmatic_speaker_np(board_name, beta, costweight, representation)), axis = 0)

### Test models on example

In [17]:
clue = 'equation'
target = 'exam-algebra'
wordpairlist = get_wordpair_list('e1_board1_words')
target_index = wordpairlist.index(target)
clue_index = list(sample_df["Word"]).index(clue)

a = literal_guesser_np('e1_board1_words', 'glove')[:,clue_index]
y = pragmatic_speaker_np('e1_board1_words', 18.858, 0.004, 'glove')
top10 = y[target_index,:].argsort()[-5:][::-1].tolist()
top10_words = [list(sample_df["Word"])[x] for x in top10]
z = pragmatic_guesser_np('e1_board1_words', 18.858, 0.004, 'glove')[:,clue_index]

print("literal guesser prediction is:", wordpairlist[np.argmax(a)])
print("top10 prag speaker predictions are:", top10_words)
print("pragmatic guesser prediction is:", wordpairlist[np.argmax(z)])

literal guesser prediction is: algebra-better
top10 prag speaker predictions are: ['algebra', 'exam', 'mathematics', 'exams', 'calculus']
pragmatic guesser prediction is: depth-algebra


# Generate predictions

In [18]:
## create boards and merge with expdata
combined_boards_df = pd.DataFrame(columns=['Experiment', 'Board','boardwords'])
combined_boards_df["Experiment"]  = ["E1"] * 10 + ["E2"] * 10
combined_boards_df["Board"] = ["TrialList" + str(i) for i in range(1,11)] * 2
combined_boards_df["boardnames"] = (['e1_board' + str(i) + '_words' for i in range(1,11)] 
                                  + ['e2_board' + str(i) + '_words' for i in range(1,11)])
combined_boards_df["boardwords"] = [boards[n] for n in combined_boards_df["boardnames"]]
combined_boards_df.head()

Unnamed: 0,Experiment,Board,boardwords,boardnames
0,E1,TrialList1,"[angry, adore, burglar, rumor, depth, anchor, ...",e1_board1_words
1,E1,TrialList2,"[alto, faith, beginning, brake, birds, aircraf...",e1_board2_words
2,E1,TrialList3,"[circle, dance, day, dark, famine, calorie, hu...",e1_board3_words
3,E1,TrialList4,"[ceiling, carpet, corpse, fight, extension, fa...",e1_board4_words
4,E1,TrialList5,"[ancestor, child, carriage, done, denial, disr...",e1_board5_words


In [27]:
## need to get similarity matrix of these words in this order to work with
target_df = pd.read_csv("../data/connector_wordpairs_boards.csv")
target_df["wordpair"]= target_df["Word1"]+ "-"+target_df["Word2"]
target_df.head()

Unnamed: 0,Word1,Word2,Experiment,boardnames,wordpair
0,void,couch,E1,e1_board1_words,void-couch
1,giggle,abnormal,E1,e1_board1_words,giggle-abnormal
2,exam,algebra,E1,e1_board1_words,exam-algebra
3,tea,bean,E1,e1_board10_words,tea-bean
4,tourist,comedy,E1,e1_board10_words,tourist-comedy


## Speaker predictions

These are the best-fitting params for each model (see Optimizing parameters section below for code used to find these values)

In [28]:
rsa_optimal_params = {
    'swow' : (25.1522030761838, 0.03863169001849234),
    'glove' : (22.336514544537227, 0.039),
    'bert-sum' : (29.709602301411962, 0.031659060110267576), #-17533
}

board_optimal_params = {
    'swow' : (23.488850322875496, 1), # -13204
    'glove' : (20.952928531665275, 1), # -15774.814774380024)
    'bert-sum' : (19.983835225540847, 0.787924454045298),
}

### Compute scores
Our main DV is to likelihood of the data, so we compute scores & ranks for all possible clues produced by the participant

In [50]:
def get_speaker_scores(group, speaker_word_pairs, y, y_sorted) :
    speaker_prob = []
    speaker_rank = []
    for index, row in group.iterrows():
        clue1 = row["Clue1"]
        wordpair = str(row["wordpair"]).replace(" ", "")
        wordpair_index = speaker_word_pairs.index(wordpair)
        w1_index, w2_index = [list(sample_df["Word"]).index(word) for word in wordpair.split('-')]
        
        # find index of clue
        if clue1 in list(sample_df["Word"]):
            clue_index = list(sample_df["Word"]).index(clue1)
            clue_probs = y[wordpair_index, clue_index]
            clue_rank = np.nonzero(y_sorted==clue_index)[1][wordpair_index]
        else:
            clue_rank = "NA"
            clue_probs = "NA"

        speaker_prob.append(clue_probs)
        speaker_rank.append(clue_rank)
    return speaker_prob, speaker_rank

In [51]:
speakerprobs_df = pd.DataFrame(columns=['representation', 'Experiment','Board', "Word1", "Word2", "Clue1", "clueCount", "wordpair", "prag_speaker_probs"])
for representation in representations.keys() :
    for index, row in combined_boards_df.iterrows():
        board = row["boardwords"]
        boardname = row["boardnames"]
        wordpairlist = get_wordpair_list(boardname)
        speaker_word_pairs = target_df[(target_df["boardnames"] == row["boardnames"]) & 
                                       (target_df["Experiment"] == row["Experiment"])]["wordpair"]
        speaker_word_pairs = list(speaker_word_pairs)
        speaker_df_new = pd.DataFrame({'wordpair': speaker_word_pairs})
        params = rsa_optimal_params[representation]
        speaker_model = pragmatic_speaker_np(boardname, params[0], params[1], representation)

        ## this is created at the BOARD level
        y = np.array([speaker_model[wordpairlist.index(wordpair)] for wordpair in speaker_word_pairs])
        y_sorted = np.argsort(-y)

        ## so y has 3 vectors of clue probabilities (the 3 pairs on this board)
        ## now we need to go into expdata and score the probabilities for those specific clues
        expdata_board = expdata[(expdata["Board"] == row["Board"]) & (expdata["Experiment"] == row["Experiment"])]
        speaker_prob, speaker_rank = get_speaker_scores(expdata_board, speaker_word_pairs, y, y_sorted)
        expdata_board.loc[:,"representation"] = representation
        expdata_board.loc[:,"prag_speaker_probs"] = speaker_prob
        expdata_board.loc[:,"prag_speaker_rank"] = speaker_rank
        speakerprobs_df = pd.concat([speakerprobs_df, expdata_board])

In [None]:
speakerprobs_df.to_csv("../data/speaker_ranks.csv")

### Compute top-n lists

In [54]:
## Creating a mainlist of boards
speaker_df = pd.DataFrame(columns=['representation', 'wordpair','Board', 'prag_speaker_words'])

for representation in representations.keys() :
  for index, row in combined_boards_df.iterrows():
    board_words = row["boardwords"]
    boardname = row["boardnames"]
    wordpairlist = get_wordpair_list(boardname)
    params = rsa_optimal_params[representation]
    speaker_model = pragmatic_speaker_np(boardname, params[0], params[1], representation)
    ## get empirical speaker probs for this board
    speaker_word_pairs = target_df[(target_df["boardnames"] == row['boardnames']) & 
                                          (target_df["Experiment"] == row['Experiment'])]["wordpair"]
    speaker_df_new = pd.DataFrame({'wordpair': speaker_word_pairs})
    speaker_df_new["Board"] = boardname
    speaker_df_new["representation"] = representation
    y = np.array([speaker_model[wordpairlist.index(wordpair)] for wordpair in speaker_word_pairs])

    ## this will yield 3x10191 array [1-d array for each word-pair, 3 word-pairs per board]
    ## from here we sort each row in descending order 
    y_sorted = np.argsort(-y) ## gives sorted indices
    top20_indices = y_sorted[:,:20]

    ## convert to words
    w1 = [list(sample_df["Word"])[i] for i in top20_indices[0]]
    w2 = [list(sample_df["Word"])[i] for i in top20_indices[1]]
    w3 = [list(sample_df["Word"])[i] for i in top20_indices[2]]
    words = [w1, w2, w3]
    speaker_df_new["prag_speaker_words"] = words
    speaker_df = pd.concat([speaker_df, speaker_df_new])

In [None]:
speaker_df.to_csv("../data/speaker_top.csv")

## Listener predictions

In [55]:
guesser_df = pd.DataFrame(columns=['Experiment','Board', "Word1", "Word2", "Clue1", "clueCount", "wordpair"])
for representation in representations.keys() :
    for index, row in combined_boards_df.iterrows():
        guesser_df_board = pd.DataFrame(columns=['Experiment','Board', "Word1", "Word2", "Clue1", "clueCount", "wordpair"])
        board = row["boardwords"]
        boardname = row['boardnames']
        wordpairlist = get_wordpair_list(boardname)
        # calculate the prag guesser for this specific wordpairlist (corresponding to ONE board)
        params = rsa_optimal_params[representation]
        x = literal_guesser_np(boardname, representation)
        z = pragmatic_guesser_np(boardname, params[0], params[1], representation) 

        # then loop through the clues in expdata_board to get predictions
        expdata_board = expdata[(expdata["Board"] == row["Board"]) & (expdata["Experiment"] == row["Experiment"])]
        expdata_board.loc[:, "representation"] = representation

        for index, row in expdata_board.iterrows():
            clue1 = row["Clue1"]
            if clue1 in list(sample_df["Word"]):
                ## literal guesser uses "x", pragmatic guesser uses "z"
                clue_index = list(sample_df["Word"]).index(clue1)
                literal_pred = wordpairlist[np.argmax(x[:,clue_index])]
                pragmatic_pred = wordpairlist[np.argmax(z[:,clue_index])]
            else:
                literal_pred = "NA"
                pragmatic_pred = "NA"

            # we want to track likelihood for ALL responses (i.e. full listener distribution)
            guesser_df_clue = pd.DataFrame({
                'Clue1' : clue1, 
                'possible_wordpair' : wordpairlist,
                'literal_likelihood' : x[:, clue_index],
                'prag_likelihood' : z[:, clue_index],
                'literal_top_prediction' : literal_pred,
                'prag_top_prediction' : pragmatic_pred
            })
            guesser_df_board = pd.concat([guesser_df_board, pd.merge(expdata_board, guesser_df_clue)])
        guesser_df = pd.concat([guesser_df, guesser_df_board])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [None]:
guesser_df.to_csv("../data/guesser_scores.csv")

## Get predictions for 'naive' context model

In [56]:
## define a func that computed similarities of clue to each word on the board
## and then maximizes similarity to the words while minimizing similarity to other words
def speaker_board_func(combs_df, context_board, alpha, beta, representation_model):
    # grab subset of words in given board and their corresponding glove vectors
    board_df = sample_df[sample_df['Word'].isin(context_board)]
    board_word_indices = list(board_df.index)
    board_words = board_df["Word"]
    board_vectors = representation_model[board_word_indices]

    ## clue_sims is the similarity of ALL clues in full searchspace (size N) to EACH word on board (size 20)
    clue_sims = (1-scipy.spatial.distance.cdist(board_vectors, representation_model, 'cosine') + 1 ) / 2
    target_sample = target_df[target_df['Word1'].isin(board_df["Word"]) & target_df['Word2'].isin(board_df["Word"])]
    w1_index = [list(board_df["Word"]).index(row["Word1"]) for index, row in target_sample.iterrows()]
    w2_index = [list(board_df["Word"]).index(row["Word2"]) for index, row in target_sample.iterrows()]
    clue_w1 = clue_sims[w1_index]
    clue_w2 = clue_sims[w2_index]
    clue_prod = np.multiply(clue_w1,clue_w2)

    # deleting the two target words to compute average similarity to other words on the board
    clue_sims_new = np.array([np.delete(clue_sims, [w1_index[i], w2_index[i]], axis=0) for i in range(len(w1_index))])
    avg_sim = np.mean(clue_sims_new, axis=1)

    ## FUNC = alpha(clue_w1*clue_w2) + (1-alpha)*(average of other board words)

    func = np.subtract((alpha)*clue_prod, (1-alpha)*avg_sim)
    return softmax(beta * func, axis=1)


In [63]:
np.sum(speaker_board_func(board_combos['e1_board10_words'], boards['e1_board10_words'], 
                                 0.7, 20, representations['glove']), axis = 1) 

array([1., 1., 1.])

## Top Predictions

In [None]:
speaker_board_df = pd.DataFrame(columns=['Word1', 'Word2', 'boardnames','top10preds', 'alpha', 'Model'])
for representation in ['bert-sum', 'glove', 'swow']: 
    for alpha in np.arange(0,1.1, 0.1):
        ## for a given alpha, compute the clue similarities 
        params = board_optimal_params[representation]
        speaker_board_probs = {
            board_name : speaker_board_func(board_combos[board_name], boards[board_name], alpha, params[0], representations[representation]) 
            for board_name in boards.keys()
        }   
        # we calculate the top5 speaker predictions for each word-pair based on highest value above
        for board in speaker_board_probs.keys():
            ## obtain top10 indices for each word-pair
            idx = [(-speaker_board_probs[board][x]).argsort()[:10].tolist() for x in range(3)]
            a = [list(sample_df["Word"])[z] for y in idx for z in y]
            top10preds = [list(arr) for arr in np.array_split(a, 3)]
            speaker_df_new = pd.DataFrame({'boardnames': [board]*3})
            speaker_df_new["Word1"] = list(target_df[target_df['boardnames']== board]["Word1"])
            speaker_df_new["Word2"] = list(target_df[target_df['boardnames']== board]["Word2"])
            speaker_df_new["alpha"] = [alpha]*3
            speaker_df_new["top10preds"] = top10preds
            speaker_df_new["Model"] = representation
            speaker_board_df = pd.concat([speaker_board_df, speaker_df_new])

Unnamed: 0,Word1,Word2,boardnames,top10preds,alpha,Model
0,void,couch,e1_board1_words,"[David Bowie, United Kingdom, mussel, North Po...",0.0,BERT-sum
1,giggle,abnormal,e1_board1_words,"[David Bowie, United Kingdom, mussel, North Po...",0.0,BERT-sum
2,exam,algebra,e1_board1_words,"[David Bowie, mussel, United Kingdom, North Po...",0.0,BERT-sum
0,beginning,brake,e1_board2_words,"[David Bowie, United Kingdom, mussel, North Po...",0.0,BERT-sum
1,birds,aircraft,e1_board2_words,"[David Bowie, mussel, United Kingdom, North Po...",0.0,BERT-sum
...,...,...,...,...,...,...
1,olive,real,e2_board9_words,"[real, olive, lemon, Thanksgiving, purple, lof...",1.0,BERT-sum
2,candle,wick,e2_board9_words,"[candle, wick, spark, wrench, cookie, hick, wa...",1.0,BERT-sum
0,garage,bone,e2_board10_words,"[bone, garage, fatigue, joints, stump, warehou...",1.0,BERT-sum
1,feet,chapel,e2_board10_words,"[cottage, sandals, tile, nuns, fabric, bathroo...",1.0,BERT-sum


In [None]:
speaker_board_df.to_csv("../data/speaker_boardfunc_df.csv")

## Obtain Clue Score for every possible Clue

In [None]:
## merge expdata with combined_boards so we have the "boardname" correct
expdata_new = pd.merge(expdata,combined_boards_df,on=['Board', 'Experiment'],how='left')
expdata_new["wordpair"] = expdata_new["Word1"] + "-" + expdata_new["Word2"]
expdata_new.head()

Unnamed: 0,Experiment,Board,Word1,Word2,Clue1,clueCount,wordpair,boardwords,boardnames
0,E1,TrialList1,exam,algebra,calculus,1,exam-algebra,"[angry, adore, burglar, rumor, depth, anchor, ...",e1_board1_words
1,E1,TrialList1,exam,algebra,equation,1,exam-algebra,"[angry, adore, burglar, rumor, depth, anchor, ...",e1_board1_words
2,E1,TrialList1,exam,algebra,knowledge,1,exam-algebra,"[angry, adore, burglar, rumor, depth, anchor, ...",e1_board1_words
3,E1,TrialList1,exam,algebra,math,22,exam-algebra,"[angry, adore, burglar, rumor, depth, anchor, ...",e1_board1_words
4,E1,TrialList1,exam,algebra,school,2,exam-algebra,"[angry, adore, burglar, rumor, depth, anchor, ...",e1_board1_words


In [None]:
target_df["wordpair"] = target_df["Word1"] + "-" + target_df["Word2"]
target_df.head()

Unnamed: 0,Word1,Word2,Experiment,boardnames,wordpair
0,void,couch,E1,e1_board1_words,void-couch
1,giggle,abnormal,E1,e1_board1_words,giggle-abnormal
2,exam,algebra,E1,e1_board1_words,exam-algebra
3,tea,bean,E1,e1_board10_words,tea-bean
4,tourist,comedy,E1,e1_board10_words,tourist-comedy


In [None]:
clue_board_df_main = pd.DataFrame(columns=['boardnames','wordpair', 'Clue1', 'clue_score', 'alpha', 'Model'])

for representation in ['bert-sum', 'glove', 'swow']: 
  for alpha in np.arange(0,1.1, 0.1):
    ## for a given alpha, compute the clue similarities 
    beta = board_optimal_params[representation][0]
    speaker_board_probs = {
        board_name : speaker_board_func(board_combos[board_name], boards[board_name], alpha, beta, representations[representation]) 
        for board_name in boards.keys()
    }   
    
    for board in speaker_board_probs.keys():
      
      ## get the clues we need scores for from expdatanew
      clue_main = expdata_new.loc[expdata_new['boardnames'] == board]
      target_main = target_df.loc[target_df['boardnames'] == board]
      
      target_main.reset_index(inplace = True)
      #print(target_main)

      for index, row in clue_main.iterrows():
        if row["Clue1"] in list(sample_df["Word"]):
          #print("clue is:", row["Clue1"])
          clue_index = list(sample_df["Word"]).index(row["Clue1"])
          #print("clue_index:",clue_index)
          wordpair = row["wordpair"]
          ## need to figure out specific wordpair this clue corresponds to
          wordpair_index = target_main.index[(target_main['wordpair'] == wordpair)].tolist()[0]
          #print("wordpair_index:",wordpair_index)
          # get a sorted array of the clue scores
          mainscores = speaker_board_probs[board][wordpair_index]
          sorted_clue_probs = np.argsort(-mainscores).tolist()
          #print("sorted_clue_probs_indices = ", sorted_clue_probs)
          
          # we next obtain the score for each clue for a specific wordpair
          clue_similarity = speaker_board_probs[board][wordpair_index][clue_index]
          # want to find index of this particular clue in the overall distribution
          clue_rank = sorted_clue_probs.index(clue_index)
          #print("clue_rank:",clue_rank)
        else:
          clue_similarity = "NA"
          clue_rank = "NA"
        
        clue_board_df = pd.DataFrame({'boardnames': [board]})
        clue_board_df["wordpair"] = wordpair
        clue_board_df["Clue1"] = row["Clue1"]
        clue_board_df["clue_score"] = clue_similarity
        clue_board_df["clue_rank"] = clue_rank
        clue_board_df["alpha"] = alpha
        clue_board_df["Model"] = representation
          
        clue_board_df_main = pd.concat([clue_board_df_main, clue_board_df])

clue_board_df_main                

Unnamed: 0,boardnames,wordpair,Clue1,clue_score,alpha,Model,clue_rank
0,e1_board1_words,exam-algebra,calculus,7.46495e-05,0.0,bert-sum,3812
0,e1_board1_words,exam-algebra,equation,5.92828e-05,0.0,bert-sum,4913
0,e1_board1_words,exam-algebra,knowledge,3.59289e-05,0.0,bert-sum,8081
0,e1_board1_words,exam-algebra,math,6.25472e-05,0.0,bert-sum,4631
0,e1_board1_words,exam-algebra,school,6.21898e-05,0.0,bert-sum,4657
...,...,...,...,...,...,...,...
0,e2_board10_words,garage-bone,storage,0.000495596,1.0,swow,206
0,e2_board10_words,garage-bone,structure,0.000288999,1.0,swow,388
0,e2_board10_words,garage-bone,tool,0.00229715,1.0,swow,23
0,e2_board10_words,garage-bone,trash,0.00038321,1.0,swow,288


In [None]:
clue_board_df_main.to_csv("../data/speaker_boardfunc_df_ranks_softmax.csv")

# Optimizing model params

Our speaker models have two free parameters. In order to make a fair comparison across different representations, we want to find best version of each model.

In [24]:
softplus = lambda x: np.log1p(np.exp(x))

### Optimize 'board' models (i.e. non-RSA way of incorporating context)

In [25]:
representation = 'glove'
def get_board_speaker_likelihood(params):
    speaker_prob = []
    beta = softplus(params[0])
    alpha = expit(params[1])
    representation_model = representations[representation]
    for index, row in combined_boards_df.iterrows():
        # grab subset of words in given board and their corresponding glove vectors
        boardname = row["boardnames"]
        board_df = sample_df[sample_df['Word'].isin(boards[boardname])]
        board_word_indices = list(board_df.index)
        board_words = board_df["Word"]
        board_vectors = representation_model[board_word_indices]

        ## clue_sims is the similarity of ALL clues in full searchspace (size N) to EACH word on board (size 20)
        clue_sims = (1-scipy.spatial.distance.cdist(board_vectors, representation_model, 'cosine') + 1) / 2
        target_sample = target_df[target_df['Word1'].isin(board_df["Word"]) & target_df['Word2'].isin(board_df["Word"])]
        w1_index = [list(board_df["Word"]).index(row["Word1"]) for index, row in target_sample.iterrows()]
        w2_index = [list(board_df["Word"]).index(row["Word2"]) for index, row in target_sample.iterrows()]
        clue_w1 = clue_sims[w1_index]
        clue_w2 = clue_sims[w2_index]
        clue_prod = np.multiply(clue_w1, clue_w2)
        clue_sims_new = np.array([np.delete(clue_sims, [w1_index[i], w2_index[i]], axis=0) for i in range(len(w1_index))])
        avg_sim = np.mean(clue_sims_new, axis=1)
        func = np.subtract((alpha)*clue_prod, (1-alpha)*avg_sim)
        y = softmax(beta * func, axis=1)
        expdata_board = expdata[(expdata["Board"] == row["Board"]) & (expdata["Experiment"] == row["Experiment"])]
        speaker_word_pairs = list(target_sample['wordpair'])
        for index, row in expdata_board.iterrows():
            wordpair = str(row["wordpair"]).replace(" ", "")
            wordpair_index = speaker_word_pairs.index(wordpair)
            clue1 = row["Clue1"]
            if clue1 in list(sample_df["Word"]):
                # find index of clue
                clue_index = list(sample_df["Word"]).index(clue1)
                clue_probs = y[wordpair_index, clue_index]
                speaker_prob.append(row['clueCount'] * np.log(clue_probs))
    print(beta, alpha, '(', params[1], ')', ':', np.sum(speaker_prob))
    return -np.sum(speaker_prob)

Each of these lines gives a good 'initialization' for each model

In [None]:
#scipy.optimize.minimize(get_board_speaker_likelihood, [24, 3.349]) # optimize for 'swow'
#scipy.optimize.minimize(get_board_speaker_likelihood, [20.6, 3.349]) # optimize for 'glove'
scipy.optimize.minimize(get_board_speaker_likelihood, [1, 5]) # optimize for 'bert-sum'

### Optimize RSA speaker models

In [26]:
representation = 'swow'

def get_rsa_speaker_likelihood(params) :
  # params is a list
  speaker_prob = []
  beta = softplus(params[0])
  costweight = expit(params[1])
  for index, row in combined_boards_df.iterrows():
    board = row["boardwords"]
    boardname = row["boardnames"]
    wordpairlist = get_wordpair_list(boardname)
    speaker_word_pairs = target_df[(target_df["boardnames"] == row["boardnames"]) & 
                                   (target_df["Experiment"] == row["Experiment"])]["wordpair"]
    speaker_model = pragmatic_speaker_np(boardname, beta, costweight, representation)
    y = np.array([speaker_model[wordpairlist.index(wordpair)] for wordpair in speaker_word_pairs])
    expdata_board = expdata[(expdata["Board"] == row["Board"]) & (expdata["Experiment"] == row["Experiment"])]
    speaker_word_pairs = list(speaker_word_pairs)
    for index, row in expdata_board.iterrows():
      wordpair = str(row["wordpair"]).replace(" ", "")
      wordpair_index = speaker_word_pairs.index(wordpair)
      clue1 = row["Clue1"]
      if clue1 in list(sample_df["Word"]):
        clue_index = list(sample_df["Word"]).index(clue1)
        clue_probs = y[wordpair_index, clue_index]
        speaker_prob.append(row['clueCount'] * np.log(clue_probs))
  print(beta, costweight, '(', params[1], ')', ':', np.sum(speaker_prob))
  return -np.sum(speaker_prob)

In [None]:
#scipy.optimize.minimize(get_board_speaker_likelihood, [24, 3.349]) # optimize for 'swow'
#scipy.optimize.minimize(get_board_speaker_likelihood, [20.6, 3.349]) # optimize for 'glove'
scipy.optimize.minimize(get_rsa_speaker_likelihood, [25.399, -3.219]) # optimize for 'swow'