# importing embeddings, vocabulary, & functions file
We load the embeddings, the vocabulary, and all the search functions. The functions are predefined in the search-models subdirectory of the GitHub repository, so we import them directly here.

In [1]:
import networkx as nx
import pandas as pd
import pickle
import json
import numpy as np
from search import SWOW
from pragmatics import RSA, nonRSA, utils
import walker 
%load_ext autoreload
%autoreload 2

# import walk data

In [2]:
# Candidate dictionaries used by the Experiment 2 and 3 cells further down
# (`nonRSA.get_nonRSA_union_int` / `RSA.get_RSA_union_int`). These loads had been
# commented out, which left `union_dict` and `intersection_dict` undefined and made
# those cells fail with NameError after a kernel restart.
# `json` comes from the import cell at the top of the notebook.
with open('../data/walk_data/intersection_candidates.json', 'r') as json_file:
    intersection_dict = json.load(json_file)
with open('../data/walk_data/union_candidates.json', 'r') as json_file:
    union_dict = json.load(json_file)

# Corrections table and vocabulary; the vocabulary's `Word` column is renamed to
# `vocab_word`, the name the later cells index by.
corrections = pd.read_csv('../data/corrections.csv')
vocab = pd.read_csv("../data/vocab.csv").rename(columns={"Word": "vocab_word"})
print(f"vocab is {len(vocab)} words")


vocab is 12218 words


# experiment 1: candidate generation

In [8]:
# Columns identifying one clue-giver trial, and the eight free-response clue columns.
trial_keys = ['clueGiverID', 'wordpair_id', 'Level', 'clueFinal']
clue_option_columns = [f"clueOption{i}" for i in range(1, 9)]

# Wide -> long (one row per listed clue), run the corrections pass, then collapse
# back to one row per trial holding the de-duplicated corrected clues both as a
# comma-joined string (`correctedClue`) and as a list (`clue_list`).
generation_study_data = (
    pd.read_csv('../data/exp1/e1_data.csv')
      .melt(id_vars=trial_keys, value_vars=clue_option_columns)
      .rename(columns={"value": "Clue1"})
      .pipe(utils.apply_corrections, corrections, vocab)
      .dropna()
      .drop_duplicates()
      .groupby(trial_keys, as_index=False)['correctedClue']
      .agg(','.join)
      .assign(clue_list=lambda per_trial: per_trial['correctedClue'].str.split(','))
)

# NOTE(review): the CSV path is presumably where the result is cached/written — confirm
# against SWOW.get_common_candidates.
common_candidates = SWOW.get_common_candidates(
    generation_study_data,
    '../data/exp1/e1_common_candidates.csv',
)

In [9]:
# Show the frame returned by SWOW.get_common_candidates in the previous cell.
common_candidates

Unnamed: 0,clueGiverID,wordpair,Level,clueFinal,budget,behavioral_clue_list,len_cluelist_behavioral,len_cluelist_union,union_common,len_union_common,finalclue_index_union,len_cluelist_intersection,intersection_common,len_intersection_common,finalclue_index_intersection
0,454955,bowl-sun,Medium,acai,budget=2,"['brunch', 'yard game', 'bright', 'acai']",4,12218,"['bright', 'brunch']",2,-1,12218,"['bright', 'brunch']",2,-1
0,454955,bowl-sun,Medium,acai,budget=4,"['brunch', 'yard game', 'bright', 'acai']",4,12218,"['bright', 'brunch']",2,-1,12218,"['bright', 'brunch']",2,-1
0,454955,bowl-sun,Medium,acai,budget=8,"['brunch', 'yard game', 'bright', 'acai']",4,12218,"['bright', 'brunch']",2,-1,12218,"['bright', 'brunch']",2,-1
0,454955,bowl-sun,Medium,acai,budget=16,"['brunch', 'yard game', 'bright', 'acai']",4,12218,"['bright', 'brunch']",2,-1,12218,"['bright', 'brunch']",2,-1
0,454955,bowl-sun,Medium,acai,budget=32,"['brunch', 'yard game', 'bright', 'acai']",4,12218,"['bright', 'brunch']",2,-1,12218,"['bright', 'brunch']",2,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,491095,trauma-weird,Medium,flashbacks,budget=32,"['flashbacks', 'nightmares', 'dreams']",3,12218,['dreams'],1,-1,12218,['dreams'],1,-1
0,491095,trauma-weird,Medium,flashbacks,budget=64,"['flashbacks', 'nightmares', 'dreams']",3,12218,['dreams'],1,-1,12218,['dreams'],1,-1
0,491095,trauma-weird,Medium,flashbacks,budget=128,"['flashbacks', 'nightmares', 'dreams']",3,12218,['dreams'],1,-1,12218,['dreams'],1,-1
0,491095,trauma-weird,Medium,flashbacks,budget=256,"['flashbacks', 'nightmares', 'dreams']",3,12218,['dreams'],1,-1,12218,['dreams'],1,-1


# Experiment 2: original connector

In [None]:
# Experiment 2 clue data and the shared target table.
e2_data = pd.read_csv(
    "../data/exp2/e2_empirical_clues.csv",
    encoding='unicode_escape',
)
target_df = pd.read_csv("../data/targets.csv")
print(target_df.head())

# Representations are keyed by model name; only the SWOW matrix is loaded here,
# transposed relative to its on-disk layout.
representations = {
    'swow': pd.read_csv("../data/swow_associative_embeddings.csv").T.values,
}
print("embeddings are shaped:", representations['swow'].shape)

# Board definitions for Experiment 2.
with open('../data/exp2/e2_boards.json', 'r') as json_file:
    e2_boards = json.load(json_file)


## non-RSA

Here we import the candidates from the JSON files and first compute how the non-RSA model would rank these candidates at each budget level. Next, we merge the resulting probabilities with the actual behavioral data.

In [None]:
# Parameter pair per representation, passed to nonRSA.get_nonRSA_union_int below.
# NOTE(review): the meaning of the two tuple entries is not visible in this notebook,
# and the trailing numeric comments look like fit scores (log-likelihoods?) — confirm.
board_optimal_params = {
    'swow' : (23.488850322875496, 1), # -13204
    'glove' : (20.952928531665275, 1), # -15774.814774380024)
    'bert-sum' : (19.983835225540847, 0.787924454045298),
}

In [None]:
# Corrections pass -> strip spaces from the `wordpair` key -> left-join the target
# table on that key, so every behavioral row is kept.
e2_data = (
    nonRSA.apply_corrections(e2_data, corrections, vocab)
      .assign(wordpair=lambda clues: clues['wordpair'].str.replace(' ', ''))
      .merge(target_df, on='wordpair', how='left')
)

# Persist the corrected data, then preview it.
e2_data.to_csv("../data/exp2/e2_corrected.csv", index=False)
e2_data.head()


In [None]:
# Non-RSA model over the Experiment 2 boards. Requires `union_dict` and
# `intersection_dict` to be present in the kernel.
# NOTE(review): the trailing path is presumably the output CSV — confirm in nonRSA.
candidateprobs_nonRSA = nonRSA.get_nonRSA_union_int(
    union_dict,
    intersection_dict,
    target_df,
    e2_boards,
    board_optimal_params,
    vocab,
    representations,
    e2_data,
    '../data/exp2/nonRSAprobs.csv',
)

## RSA

In [None]:
# Parameter pair per representation, passed to RSA.get_RSA_union_int below.
# NOTE(review): the meaning of the two tuple entries is not visible in this notebook,
# and the trailing numeric comment looks like a fit score — confirm.
rsa_optimal_params = {
    'swow' : (25.1522030761838, 0.03863169001849234),
    'glove' : (82.83019661384789, 0.9997249702731884),
    'bert-sum' : (29.709602301411962, 0.031659060110267576), #-17533
}

In [None]:
# RSA model over the Experiment 2 boards. Requires `union_dict` and
# `intersection_dict` to be present in the kernel.
# NOTE(review): the trailing path is presumably the output CSV — confirm in RSA.
candidateprobs_RSA = RSA.get_RSA_union_int(
    union_dict,
    intersection_dict,
    target_df,
    e2_boards,
    rsa_optimal_params,
    vocab,
    representations,
    e2_data,
    '../data/exp2/RSAprobs.csv',
)

# Experiment 3: targeted endorsements
Here, we look at how the models predict specific target endorsements from the candidate set provided to the speaker.

In [None]:
# Board definitions for Experiment 3.
with open('../data/exp3/e3_boards.json', 'r') as json_file:
    e3_boards = json.load(json_file)

# Stimuli: corrections pass, flag whether each corrected clue is a vocabulary word,
# then left-join the target table on `wordpair`.
e3_stimuli = (
    nonRSA.apply_corrections(
        pd.read_csv('../data/exp3/e3_stimuli.csv'),
        corrections,
        vocab,
    )
      .assign(
          correctedClue_in_vocab=lambda stimuli: stimuli['correctedClue'].isin(vocab['vocab_word'])
      )
      .merge(target_df, on='wordpair', how='left')
)
e3_stimuli.head()

## non-RSA probabilities

In [None]:
# Non-RSA model over the Experiment 3 boards and stimuli. Requires `union_dict` and
# `intersection_dict` to be present in the kernel.
# NOTE(review): the trailing path is presumably the output CSV — confirm in nonRSA.
e3_nonRSA = nonRSA.get_nonRSA_union_int(
    union_dict,
    intersection_dict,
    target_df,
    e3_boards,
    board_optimal_params,
    vocab,
    representations,
    e3_stimuli,
    '../data/exp3/nonRSAprobs.csv',
)

## RSA probabilities

In [None]:
# RSA model over the Experiment 3 boards and stimuli. The result gets its own name:
# it was previously assigned to `e3_nonRSA`, which discarded the non-RSA frame
# computed in the preceding section.
# Requires `union_dict` and `intersection_dict` to be present in the kernel.
# NOTE(review): the trailing path is presumably the output CSV — confirm in RSA.
e3_RSA = RSA.get_RSA_union_int(
    union_dict,
    intersection_dict,
    target_df,
    e3_boards,
    rsa_optimal_params,
    vocab,
    representations,
    e3_stimuli,
    '../data/exp3/RSAprobs.csv',
)

# other code 

In [None]:
# Spell-check helper: look up the dictionary words closest to a given string.
import difflib
from english_words import english_words_set

# Membership probe; it is not the cell's last expression, so nothing is displayed.
'imitate' in english_words_set

# Closest dictionary matches for 'turbine' (this is the cell's displayed output).
difflib.get_close_matches(
    'turbine',
    list(english_words_set),
)


In [None]:
# Build the vocabulary lookup once. The original rebuilt `list(vocab.vocab_word)` for
# every clue and scanned it linearly; a set gives the same membership answers for
# string clues at constant cost per lookup.
vocab_words = set(vocab.vocab_word)

# One row per wordpair: the `value` entries joined with commas, plus the same
# entries as a list in `clue_list`.
expdata_long = e3_stimuli.groupby(['wordpair'], as_index=False)['value'].agg(','.join)
expdata_long['clue_list'] = expdata_long['value'].str.split(',')
expdata_long = expdata_long.merge(target_df, on="wordpair")

# count how many clues for each wordpair are not in vocab
expdata_long['len_cluelist_not_in_vocab'] = expdata_long['clue_list'].apply(
    lambda clues: sum(clue not in vocab_words for clue in clues)
)

# Keep only wordpairs whose clues are all vocabulary words.
expdata_long_subset = expdata_long[expdata_long['len_cluelist_not_in_vocab'] == 0]


# Old: Dedicated functions for full vocab

In [None]:
## create boards and merge with expdata
# NOTE(review): `boards`, `expdata` and `search_funcs` are not defined in any cell
# visible in this notebook — this legacy cell needs them supplied before it can run.

# Ten trial lists per experiment, E1 rows first and E2 rows second.
combined_boards_df = pd.DataFrame(columns=['Experiment', 'Board', 'boardwords'])
combined_boards_df["Experiment"] = ["E1"] * 10 + ["E2"] * 10
combined_boards_df["Board"] = [f"TrialList{i}" for i in range(1, 11)] * 2
# Key into `boards` for each row, e.g. 'e1_board1_words'.
combined_boards_df["boardnames"] = [
    f"{prefix}_board{i}_words"
    for prefix in ("e1", "e2")
    for i in range(1, 11)
]
combined_boards_df["boardwords"] = [
    boards[board_key] for board_key in combined_boards_df["boardnames"]
]

# Attach the board rows to every trial (left join keeps all of expdata), then build
# the "Word1-Word2" key.
expdata_new = expdata.merge(combined_boards_df, on=['Board', 'Experiment'], how='left')
expdata_new["wordpair"] = expdata_new["Word1"] + "-" + expdata_new["Word2"]

# One compute_board_combos result per board name.
board_combos = {
    board_name: search_funcs.RSA.compute_board_combos(board_name, boards)
    for board_name in boards
}

## Non - RSA method


In [None]:
# Re-declares `board_optimal_params` with the same values as the Experiment 2
# section, so this legacy section's non-RSA call has the dict at hand.
# NOTE(review): the meaning of the two tuple entries is not visible in this notebook,
# and the trailing numeric comments look like fit scores — confirm.
board_optimal_params = {
    'swow' : (23.488850322875496, 1), # -13204
    'glove' : (20.952928531665275, 1), # -15774.814774380024)
    'bert-sum' : (19.983835225540847, 0.787924454045298),
}

In [None]:
# Non-RSA clue scores for the 'swow' and 'glove' representations.
# NOTE(review): `board_combos`, `boards` and `expdata_new` come from the legacy
# board-construction cell of this section; `representations` only has a 'swow'
# entry in the visible cells — confirm 'glove' is loaded before running.
cluescoredf = nonRSA.speaker_targetboard_cluescores(
    ['swow', 'glove'],
    board_optimal_params,
    board_combos,
    boards,
    list(vocab.vocab_word),
    vocab,
    representations,
    target_df,
    expdata_new,
)
cluescoredf.head()

## RSA Method

In [None]:
# Rebinds `rsa_optimal_params` for this legacy section. The 'glove' entry differs
# from the one in the Experiment 2 section ((82.83..., 0.9997...) there), so running
# this cell changes what any later RSA call in the same kernel receives.
# NOTE(review): the meaning of the two tuple entries is not visible in this notebook,
# and the trailing numeric comment looks like a fit score — confirm.
rsa_optimal_params = {
    'swow' : (25.1522030761838, 0.03863169001849234),
    'glove' : (22.336514544537227, 0.039),
    'bert-sum' : (29.709602301411962, 0.031659060110267576), #-17533
}

In [None]:
# RSA speaker frame over the full vocabulary (legacy path).
# NOTE(review): `search_funcs` is not imported in any visible cell, and
# `combined_boards_df`, `expdata_new`, `board_combos` and `boards` come from the
# legacy board-construction cell of this section.
pragmaticspeakerdf = search_funcs.RSA.get_speaker_df(
    representations,
    combined_boards_df,
    rsa_optimal_params,
    list(vocab.vocab_word),
    vocab,
    expdata_new,
    board_combos,
    target_df,
    boards,
)
pragmaticspeakerdf.head()