<a href="https://colab.research.google.com/github/abhilasha-kumar/Connector/blob/master/search-models/search_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing embeddings, vocabulary, & functions file
We load the embeddings, the vocabulary, and all the search functions. The functions are predefined and stored in the search-models subdirectory of the GitHub repository, so we import them directly here.

In [6]:
import pandas as pd
import pickle
import json
import numpy as np
import search_funcs
import walker 

%load_ext autoreload
%autoreload 2


We have the following models that can be run on full vocab/candidates and with/without pragmatics:
1.   Target+Board (non-RSA)
2.   Pragmatic speaker (RSA)

We have the following search models that generate candidates:
1.   Union (RW)
2.   Intersection (RW)



In [34]:
# Build the SWOW object from the relative data directory; the cells below use
# its `graph` attribute and its `union_intersection_nwalks` method.
swow = search_funcs.SWOW('../../data')

In [33]:
# Quick check of the walker: one walk of length 10 starting from node 0 of swow.graph.
walker.random_walks(swow.graph, n_walks=1, walk_len=10, start_nodes=[0])

Random walks - T=2.25s


array([[   0,    1,    5, 2915, 3125,  218, 1941, 3139, 5456, 4832]],
      dtype=uint32)

In [53]:
# Seed NumPy's global RNG before sampling.
# NOTE(review): whether `walker` draws from NumPy's global RNG is not visible here —
# confirm that this seed actually makes the walks reproducible.
np.random.seed(44)
# NOTE(review): the saved output of this cell ends with
# "AttributeError: 'dict' object has no attribute 'loc'", i.e. this call failed when last run.
swow.union_intersection_nwalks('man', 'woman', 100, 1)

Random walks - T=2.62s
Random walks - T=2.62s
{'w1_visited_count': array([0., 0., 0., ..., 0., 0., 0.])}


AttributeError: 'dict' object has no attribute 'loc'

# Candidate generation (Union & Intersection)

## keeping n_walks constant

Here we keep the number of random walks constant (n_walks = 1000) and compute the union and intersection of the two words' walks for different numbers of steps (chosen as powers of 2).

In [None]:
## here we generate candidates for each of our wordpairs: stored in target_df

# keep n_walks fixed to a large number
n_walks = 1000

# Collect one frame per word pair and concatenate once after the loop.
# Calling pd.concat inside the loop re-copies every accumulated row on each
# iteration, which is quadratic in the number of word pairs.
candidate_frames = []

for index, row in target_df.iterrows():
  w1 = row["Word1"]
  w2 = row["Word2"]
  print(f"for {w1} and {w2}")
  union_df, int_df = search_funcs.search.union_intersection(w1,w2, n_walks, swow_vocab, swowGraph)
  print(f"union/int calculation complete!")

  # Tag both result frames with the word pair and with the set operation that produced them.
  for result_df, set_type in ((union_df, "union"), (int_df, "intersection")):
    result_df["Word1"] = w1
    result_df["Word2"] = w2
    result_df["n_walks"] = n_walks
    result_df["type"] = set_type
    result_df["wordpair"] = row["wordpair"]

  overall_df = pd.concat([union_df, int_df])
  candidate_frames.append(overall_df)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when target_df has no rows.
candidates_df = pd.concat(candidate_frames) if candidate_frames else pd.DataFrame()

## varying n_walks with powers of 2

Here, we vary the number of walks in powers of 2 and then compute the union and intersection for different numbers of steps within those walks.

In [None]:
## here we generate candidates for each of our wordpairs: stored in target_df

# On a fresh kernel ncandidates_df does not exist yet, so start from an empty frame.
# If it already exists (e.g. an interrupted run is being resumed), keep its rows.
try:
  ncandidates_df
except NameError:
  ncandidates_df = pd.DataFrame()

# value handed to union_intersection_nwalks as its walk-count argument
# NOTE(review): presumably the largest number of walks, with smaller powers of 2
# derived inside the function — confirm against search_funcs.
walkmax = 4096 

# NOTE(review): parentfolder must already be defined when this cell runs; in the
# visible notebook it is only assigned in later cells.
for index, row in target_df.iterrows():
  w1 = row["Word1"]
  w2 = row["Word2"]
  print(f"for {w1} and {w2}")
  wordpair = row["wordpair"]
  # this word pair is deliberately excluded from the run
  if wordpair == "exam-algebra":
    continue

  union_df, int_df = search_funcs.search.union_intersection_nwalks(w1, w2, walkmax, swow_vocab, swowGraph)
  print(f"union/int calculation complete!")

  # Tag both result frames with the word pair and with the set operation that produced them.
  for result_df, set_type in ((union_df, "union"), (int_df, "intersection")):
    result_df["Word1"] = w1
    result_df["Word2"] = w2
    result_df["type"] = set_type
    result_df["wordpair"] = wordpair

  overall_df = pd.concat([union_df, int_df])

  # Append and checkpoint after every word pair so that finished pairs survive
  # an interruption of this long-running loop.
  ncandidates_df = pd.concat([ncandidates_df, overall_df])
  ncandidates_df.to_csv(parentfolder+'candidates_nwalks.csv', index = False)

## non-RSA method (varied walks)

In [None]:
# Drop candidates that are identical to either target word.
# .copy() makes ndf_filtered an independent frame, so adding the "wordpair" column
# below does not trigger SettingWithCopyWarning or depend on slice semantics.
ndf_filtered = ncandidates_df[(ncandidates_df['vocab_word'] != ncandidates_df["Word1"]) & (ncandidates_df['vocab_word'] != ncandidates_df["Word2"])].copy()
ndf_filtered["wordpair"] = ndf_filtered["Word1"] + "-"+ndf_filtered["Word2"]
## group by number of steps in the RW and union/intersection
# One row per (wordpair, type, n_steps); the candidates are joined into a single
# comma-separated string and then split back into a Python list in "clue_list".
cwide = ndf_filtered.groupby(['wordpair', 'type', 'n_steps'], as_index=False)['vocab_word'].agg(','.join)
cwide['clue_list'] = cwide['vocab_word'].str.split(',')
# attach the target_df columns for each word pair
cwide = cwide.merge(target_df, on = "wordpair")
cwide

## non-RSA method (fixed walks)

In [None]:
# Load the saved candidates from swow_candidates.csv.
# NOTE(review): parentfolder is only assigned in later cells of the visible
# notebook, so one of those cells has to run before this one.
candidates_df = pd.read_csv(parentfolder+'swow_candidates.csv')
candidates_df

In [None]:
# Drop candidates that are identical to either target word.
# .copy() makes df_filtered an independent frame, so adding the "wordpair" column
# below does not trigger SettingWithCopyWarning or depend on slice semantics.
df_filtered = candidates_df[(candidates_df['vocab_word'] != candidates_df["Word1"]) & (candidates_df['vocab_word'] != candidates_df["Word2"])].copy()

df_filtered["wordpair"] = df_filtered["Word1"] + "-"+df_filtered["Word2"]
## group by number of steps in the RW and union/intersection
# One row per (wordpair, type, n_steps); the candidates are joined into a single
# comma-separated string and then split back into a Python list in "clue_list".
cwide = df_filtered.groupby(['wordpair', 'type', 'n_steps'], as_index=False)['vocab_word'].agg(','.join)
cwide['clue_list'] = cwide['vocab_word'].str.split(',')
# attach the target_df columns for each word pair
cwide = cwide.merge(target_df, on = "wordpair")
cwide

In [None]:
# Per-model parameter tuples, keyed by model name. Index 0 of the selected
# model's tuple is passed to search_funcs.nonRSA.speaker_targetboard in the next cell.
# NOTE(review): the meaning of index 1 and of the trailing numeric comments is not
# shown in this notebook — presumably fit statistics; confirm.
board_optimal_params = {
    'swow' : (23.488850322875496, 1), # -13204
    'glove' : (20.952928531665275, 1), # -15774.814774380024)
    'bert-sum' : (19.983835225540847, 0.787924454045298),
}

In [None]:
modelname = "swow"
# Index 0 of the model's parameter tuple; it does not depend on alpha, so read it once.
beta = board_optimal_params[modelname][0]

candidateprobs_columns = ["alpha", "Model", "type", "n_steps", "boardnames", "wordpair", "cluelist", "clue_score"]
# Collect one record per (alpha, cwide row) and build the frame once at the end;
# concatenating inside the loop re-copies every accumulated row on each iteration.
candidateprobs_rows = []

for alpha in np.arange(0,1.1, 0.1):
  ## for a given alpha, compute the clue similarities at the board level 
  for index, row in cwide.iterrows():
    boardname = row["boardnames"]
    cluelist = row["clue_list"]
    wordpair = row["wordpair"]
    board_probs = search_funcs.nonRSA.speaker_targetboard(boards[boardname], alpha, beta, cluelist, representations, modelname, vocab, target_df)
    ## obtain the probs for the specific wordpair
    # Position of this word pair among the target_df rows of the same board.
    # reset_index returns a new frame, so the target_df slice is never mutated in place.
    target_main = target_df.loc[target_df['boardnames'] == boardname].reset_index(drop = True)
    wordpair_index = target_main.index[(target_main['wordpair'] == wordpair)].tolist()[0]
    mainscores = board_probs[wordpair_index]

    candidateprobs_rows.append({
      "alpha": alpha,
      "Model": modelname,  # previously hard-coded to "swow"; identical for the current modelname
      "type": row["type"],
      "n_steps": row["n_steps"],
      "boardnames": boardname,
      "wordpair": wordpair,
      # clue list and scores are stored as strings and parsed again when the
      # behavioral data are scored
      "cluelist": str(','.join(cluelist)),
      "clue_score": str(np.round(mainscores,10).tolist()),
    })

# Passing columns= keeps the expected schema even when no records were produced.
candidateprobs_nonRSA = pd.DataFrame(candidateprobs_rows, columns=candidateprobs_columns)

### obtaining probabilities for data

In [None]:
#df_filtered["wordpair"] = df_filtered["Word1"]+"-"+df_filtered["Word2"]
# Records are collected in a list and turned into a frame once at the end; growing
# the frame with pd.concat inside the innermost loop was quadratic in the row count.
# TODO: the per-row boolean-mask lookups below are still linear scans.
nonrsa_columns = ["alpha", "type", "n_steps", "wordpair", "Clue1", "clue_score"]
nonrsa_rows = []

for index, row in expdata.iterrows():
  # expdata writes pairs as "w1 - w2"; the candidate frames use "w1-w2"
  wordpair = row["wordpair"].replace(" - ", "-")
  clue = row["Clue1"]
  clue_df = df_filtered[(df_filtered["vocab_word"] == clue) & (df_filtered["wordpair"] == wordpair)]
  # only the first two matching candidate rows are scored
  for i, j in clue_df[:2].iterrows():
    ctype = j["type"]
    n_steps = j["n_steps"]
    clueprobs_df = candidateprobs_nonRSA[(candidateprobs_nonRSA["wordpair"]== wordpair) & (candidateprobs_nonRSA["type"] == ctype) & (candidateprobs_nonRSA["n_steps"]==n_steps)]
    for x, y in clueprobs_df.iterrows():
      alpha = y["alpha"]
      cluelist = y["cluelist"].split(',')
      clue_index = cluelist.index(clue)
      # clue_score is a stringified list ("[a, b, ...]"): strip the brackets and
      # pick the entry at the clue's position; the value stays a string
      clue_score = y["clue_score"][1:-1].split(', ')[clue_index]

      nonrsa_rows.append({
        "alpha": alpha,
        "type": ctype,
        "n_steps": n_steps,
        "wordpair": wordpair,
        "Clue1": clue,
        "clue_score": clue_score,
      })

# Passing columns= keeps the expected schema even when no records were produced.
nonrsa_probs = pd.DataFrame(nonrsa_rows, columns=nonrsa_columns)

#nonrsa_probs.to_csv(parentfolder+'finalexpdata_nonRSA_speaker.csv', index = False)

## RSA (fixed walks)

In [None]:
# Per-model parameter tuples for the RSA speaker, keyed by model name.
# The RSA cell below reads index 0 as `beta` and index 1 as `cost`.
# NOTE(review): the meaning of the trailing numeric comment is not shown here — confirm.
rsa_optimal_params = {
    'swow' : (25.1522030761838, 0.03863169001849234),
    'glove' : (82.83019661384789, 0.9997249702731884),
    'bert-sum' : (29.709602301411962, 0.031659060110267576), #-17533
}

In [None]:
# Folder on the mounted Google Drive that the read_csv / to_csv calls in this notebook use.
# NOTE(review): the drive.mount cell appears further down in the notebook; it has
# to run before this path is readable.
parentfolder = "/content/drive/My Drive/search-models/"
# Load previously saved speaker probabilities; the loop in the next cell appends
# to this frame and rewrites the same file.
pragmaticspeaker_df = pd.read_csv(parentfolder+'candidates_RSAprobs.csv')

In [None]:
## need to obtain list of candidates for each board separately

# Keep rows already present in pragmaticspeaker_df (e.g. loaded from
# candidates_RSAprobs.csv); start from an empty frame only when it does not exist yet.
try:
  pragmaticspeaker_df
except NameError:
  pragmaticspeaker_df = pd.DataFrame()

modelname = 'swow'

# fitted parameters for the selected model: index 0 -> beta, index 1 -> cost
beta = rsa_optimal_params[modelname][0]
cost = rsa_optimal_params[modelname][1]
for index, row in cwide.iterrows():
  boardname = row["boardnames"]
  cluelist = row["clue_list"]
  wordpair = row["wordpair"]
  # modelname is passed here (previously the literal 'swow') so that the scores and
  # the "Model" column written below always refer to the same model.
  clue_probs = search_funcs.RSA.pragmatic_speaker(boardname, beta, cost, representations, modelname, cluelist, vocab, boards)
  ## obtain the probs for the specific wordpair
  # clue_probs is indexed by the position of the word pair in the board's combos
  combos_df = search_funcs.RSA.compute_board_combos(boardname,boards)
  wordpairlist = list(combos_df["wordpair"])
  mainscores = clue_probs[wordpairlist.index(wordpair)]

  clue_board_df = pd.DataFrame({'Model': [modelname]})
  clue_board_df["boardnames"] = boardname
  clue_board_df["type"] = row["type"]
  clue_board_df["n_steps"] = row["n_steps"]   
  clue_board_df["wordpair"] = wordpair
  # clue list and scores are stored as strings and parsed again when the
  # behavioral data are scored
  clue_board_df["cluelist"] = str(','.join(cluelist))
  clue_board_df["clue_score"] = str(np.round(mainscores,10).tolist())
    
  # Append and checkpoint after every row so that finished rows survive an
  # interruption of this loop.
  pragmaticspeaker_df = pd.concat([pragmaticspeaker_df, clue_board_df])
  pragmaticspeaker_df.to_csv(parentfolder+'candidates_RSAprobs.csv', index = False)

### obtaining probabilities for data

Now that we have the candidate-level probabilities for the pragmatic speaker, we compute the probabilities for the behavioral data.

In [None]:
#candidates_df["wordpair"] = candidates_df["Word1"]+"-"+candidates_df["Word2"]

# On a fresh kernel rsa_probs does not exist yet, so start from an empty frame.
# If it already exists, its rows are kept and the new rows are appended.
try:
  rsa_probs
except NameError:
  rsa_probs = pd.DataFrame()

# Collect the new records first; the frame is extended and the CSV written once
# after the loop instead of once per matched row.
rsa_rows = []

for index, row in expdata.iterrows():
  # expdata writes pairs as "w1 - w2"; the candidate frames use "w1-w2"
  wordpair = row["wordpair"].replace(" - ", "-")
  clue = row["Clue1"]
  clue_df = df_filtered[(df_filtered["vocab_word"] == clue) & (df_filtered["wordpair"] == wordpair)]
  for i, j in clue_df.iterrows():
    ctype = j["type"]
    n_steps = j["n_steps"]
    clueprobs_df = pragmaticspeaker_df[(pragmaticspeaker_df["wordpair"]== wordpair) & (pragmaticspeaker_df["type"] == ctype) & (pragmaticspeaker_df["n_steps"]==n_steps)]
    if(len(clueprobs_df)>0):
      # only the first matching row is used; cluelist / clue_score are stored as
      # strings, so locate the clue and pick the score at the same position
      clue_index = list(clueprobs_df.cluelist)[0].split(',').index(clue)
      clue_score = list(clueprobs_df.clue_score)[0][1:-1].split(', ')[clue_index]

      rsa_rows.append({
        "alpha": "RSA",
        "type": ctype,
        "n_steps": n_steps,
        "wordpair": wordpair,
        "Clue1": clue,
        "clue_score": clue_score,
      })

# As before, nothing is written when no clue was matched.
if rsa_rows:
  rsa_probs = pd.concat([rsa_probs, pd.DataFrame(rsa_rows)])
  rsa_probs.to_csv(parentfolder+'finalexpdata_RSA_speaker.csv', index = False)

In [None]:
# Write the RSA speaker probabilities for the behavioral data to
# finalexpdata_RSA_speaker.csv (without the index column).
rsa_probs.to_csv(parentfolder+'finalexpdata_RSA_speaker.csv', index = False)

# Comparing with online candidates study

In [None]:
# Folder on the mounted Google Drive that holds the CSV files used here.
parentfolder = "/content/drive/My Drive/search-models/"
# Load online_new_coded.csv; the next cell reads its wordpair_id, Level and
# clueOption1..clueOption8 columns.
online = pd.read_csv(parentfolder+'online_new_coded.csv')

In [None]:
# Reshape the online study to long format: one row per
# (wordpair_id, Level, clue-option column), with the distinct non-missing clues
# joined into a comma-separated string and also split into a list ("clue_list").
id_cols = ["wordpair_id", "Level"]
clue_option_cols = ["clueOption1", "clueOption2", "clueOption3", "clueOption4", "clueOption5", "clueOption6", "clueOption7", "clueOption8"]

main_online = (
  online[id_cols + clue_option_cols]
  .melt(id_vars=id_cols, value_vars=clue_option_cols)
  .dropna()
  .drop_duplicates()
  .groupby(['wordpair_id', 'variable', 'Level'], as_index=False)['value']
  .agg(','.join)
  .assign(clue_list=lambda joined: joined['value'].str.split(','))
)

In [None]:
# Compare the human-generated clue lists with every model candidate list for the
# same word pair. Records are collected in a list and turned into a frame once;
# concatenating inside the loop re-copies every accumulated row on each iteration.
common_columns = ["wordpair", "Level", "candidate_type", "type", "n_steps", "n_model_candidates", "n_human_candidates", "n_intersection", "intersection"]
common_rows = []

for index, row in main_online.iterrows():
  wordpair = row["wordpair_id"]
  clue_list = row["clue_list"]
  # find all candidates for that wordpair in cwide
  wp_candidates = cwide[cwide["wordpair"] == wordpair]
  if(len(wp_candidates) == 0):
    # the pair may be stored with its two words in the opposite order
    w1, w2 = wordpair.split("-")
    wordpair = w2 + "-" + w1
    wp_candidates = cwide[cwide["wordpair"] == wordpair]
    
  for i, j in wp_candidates.iterrows():
    candidate_list = j["clue_list"]
    # sorted() makes the stored list reproducible; plain set iteration order for
    # strings can differ from one Python process to the next
    intersection = sorted(set(clue_list).intersection(candidate_list))

    common_rows.append({
      "wordpair": wordpair,
      "Level": row["Level"],
      "candidate_type": row["variable"],
      "type": j["type"],
      "n_steps": j["n_steps"],
      "n_model_candidates": len(candidate_list),
      "n_human_candidates": len(clue_list),
      "n_intersection": len(intersection),
      "intersection": str(intersection),
    })

# Passing columns= keeps the expected schema even when no records were produced.
common_candidates = pd.DataFrame(common_rows, columns=common_columns)

common_candidates.to_csv(parentfolder+'common_candidates.csv', index = False)

# Writing files to CSV

In [None]:
# Colab only: mount Google Drive at /content/drive, which is the root of the
# parentfolder paths used in this notebook. force_remount=True is passed so the
# mount is redone even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

In [None]:
# Folder on the mounted Google Drive that receives the output file.
parentfolder = "/content/drive/My Drive/search-models/"
# Write the human/model candidate overlap table to common_candidates.csv (without the index column).
common_candidates.to_csv(parentfolder+'common_candidates.csv', index = False)

# Old: Dedicated functions for full vocab

In [11]:
## create boards and merge with expdata
# One row per board: 10 boards for experiment E1 followed by 10 for E2.
board_ids = [str(i) for i in range(1,11)]
combined_boards_df = pd.DataFrame({
  "Experiment": ["E1"] * 10 + ["E2"] * 10,
  "Board": ["TrialList" + board_id for board_id in board_ids] * 2,
  "boardnames": (['e1_board' + board_id + '_words' for board_id in board_ids]
                 + ['e2_board' + board_id + '_words' for board_id in board_ids]),
})
# look up the words of every board by its key in `boards`
combined_boards_df["boardwords"] = [boards[n] for n in combined_boards_df["boardnames"]]
# same column order as before: boardwords ahead of boardnames
combined_boards_df = combined_boards_df[['Experiment', 'Board', 'boardwords', 'boardnames']]

# attach the board information to every trial and rebuild the "w1-w2" pair label
expdata_new = expdata.merge(combined_boards_df, on=['Board', 'Experiment'], how='left')
expdata_new["wordpair"] = expdata_new["Word1"] + "-" + expdata_new["Word2"]

# word-pair combinations for every board, keyed by board name
board_combos = {}
for board_name in boards.keys():
  board_combos[board_name] = search_funcs.RSA.compute_board_combos(board_name, boards)

## Non - RSA method


In [12]:
# Per-model parameter tuples, keyed by model name; the whole dict is passed to
# search_funcs.nonRSA.speaker_targetboard_cluescores in the next cell.
# The values repeat the board_optimal_params definition earlier in this notebook.
# NOTE(review): the meaning of the tuple entries and of the trailing numeric
# comments is not shown here — confirm.
board_optimal_params = {
    'swow' : (23.488850322875496, 1), # -13204
    'glove' : (20.952928531665275, 1), # -15774.814774380024)
    'bert-sum' : (19.983835225540847, 0.787924454045298),
}

In [13]:
# Score clues over the full vocabulary for the swow and glove models.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, i.e. the
# last run was stopped before it finished.
cluescoredf = search_funcs.nonRSA.speaker_targetboard_cluescores(
  ['swow', 'glove'],
  board_optimal_params,
  board_combos,
  boards,
  list(vocab.vocab_word),
  vocab,
  representations,
  target_df,
  expdata_new,
)
cluescoredf.head()

for swow and alpha 0.0


KeyboardInterrupt: 

## RSA Method

In [None]:
# Per-model parameter tuples for the RSA speaker, keyed by model name; the whole
# dict is passed to search_funcs.RSA.get_speaker_df in the next cell.
# NOTE(review): the 'glove' entry differs from the rsa_optimal_params defined
# earlier in this notebook (82.83..., 0.9997... there) — confirm which is intended.
rsa_optimal_params = {
    'swow' : (25.1522030761838, 0.03863169001849234),
    'glove' : (22.336514544537227, 0.039),
    'bert-sum' : (29.709602301411962, 0.031659060110267576), #-17533
}

In [None]:
# Build the pragmatic-speaker frame over the full vocabulary and preview its first rows.
pragmaticspeakerdf = search_funcs.RSA.get_speaker_df(
  representations,
  combined_boards_df,
  rsa_optimal_params,
  list(vocab.vocab_word),
  vocab,
  expdata_new,
  board_combos,
  target_df,
  boards,
)
pragmaticspeakerdf.head()