# Calculate predictions of words related to a text using word association norms

Word association norms can, at least in most cases, be defined by a matrix $A$, such that 
$$
A_{ij} \triangleq \text{frequency that word $w_i$ is stated as associated with word $w_j$}.
$$

Therefore, the conditional probability of word $w_i$ given $w_j$ is 
$$
\mathrm{P}(w_i \vert w_j) = \frac{A_{ij}}{\sum_{i^\prime=1}^V A_{i^\prime j}},
$$
where $V$ is the total number of words in our vocabulary of response words.

Given a text 
$$ 
\textrm{text}_{j^\prime} \triangleq w_{j^\prime 1}, w_{j^\prime 2}, \ldots, w_{j^\prime n_{j^\prime}}, 
$$
the predicted probability that word $w_k$ is associated with $\textrm{text}_{j^\prime}$ is 
$$
\mathrm{P}(w_k \vert \textrm{text}_{j^\prime}) = \frac{1}{n_{j^\prime}} \sum_{i = 1}^{n_{j^\prime}} \mathrm{P}(w_k \vert w_{j^\prime i}).
$$

In [1]:
from __future__ import division

from matplotlib import pyplot

import configobj
import numpy
import pandas
import cPickle as pickle

from utils import utils

In [2]:
def text_to_words(text):
    """Tokenize `text` and keep only the tokens found in the vocabulary.

    Tokens come from `utils.tokenize` and are returned in the order it
    yields them, duplicates included.  Membership is tested against the
    module-level `word2index` mapping, which is looked up at call time.
    """
    kept = []
    for token in utils.tokenize(text):
        if token in word2index:
            kept.append(token)
    return kept

In [3]:
# Base URL passed to utils.curl as the location of the shared files.
url_root = 'http://www.lawsofthought.org/shared'

# Local directory passed to utils.curl as its cache location.
cache_directory = '_cache'

# Files needed by this notebook, grouped by role.  Each entry is a
# (filename, hex digest) pair; the 64-character digests are presumably
# SHA-256 checksums -- TODO confirm against utils.curl.
filenames = {
    'experiment_cfg' : [('Brismo.cfg',
                         '909d9f8de483c4547f26fb4c34b91e12908ab5c144e065dc0fe6c1504b1f22c9')],
    'corpus' : [('bnc_78723408_250_500_49328.npz.bz2', 
                 'b9d828f7697871e01a263b8f3978911c70ff45cab9af4c86fbb43c3baef969d9')]
}

# NOTE(review): utils.curl presumably fetches every listed file from
# url_root into the cache directory -- confirm against utils.
utils.curl(url_root, 
                 filenames['experiment_cfg'] + filenames['corpus'], 
                 cache=cache_directory,
                 verbose=False)

In [4]:
# Read the 'text_memoranda' section of the experiment configuration file.
# NOTE(review): the path hard-codes '_cache' instead of reusing
# `cache_directory`; the two must be kept in sync by hand.
stimuli = configobj.ConfigObj('_cache/Brismo.cfg')['text_memoranda']

In [5]:
# Load the cached corpus archive; only its 'vocabulary' entry is used below.
# NOTE(review): cache='_cache' duplicates the value of `cache_directory`.
corpus_data = utils.loadnpz('bnc_78723408_250_500_49328.npz.bz2',  
                               cache='_cache',
                               verbose=False)

# Two-way lookup between vocabulary words and their positions in
# corpus_data['vocabulary'].
word2index = {w:i for i,w in enumerate(corpus_data['vocabulary'])}
index2word = {i:w for i,w in enumerate(corpus_data['vocabulary'])}

In [6]:
# Map every stimulus name to the in-vocabulary tokens of its 'text' field.
texts = {name: text_to_words(entry['text']) for name, entry in stimuli.items()}

The following assumes that the file `associations_en_05_01_2015.csv.bz2`, whose sha256 checksum is `06a527e5c9647f37a4a2ee0744a309f57f259e203238b87e0f466b74f7a6e63e`, is available in the `_cache` directory. This is a compressed CSV file of word association norms collected at https://www.smallworldofwords.org/en and generously shared by Simon De Deyne (https://simondedeyne.me/). I am not at liberty to share this data at present, so please contact Simon De Deyne or Gert Storms to obtain it.

In [7]:
# Load the word association norms from the cached, compressed CSV file.
# NOTE(review): WordAssociations splits every row on ';', so utils.loadcsv
# presumably yields one raw text line per record -- confirm against utils.
word_associations_data = utils.loadcsv('associations_en_05_01_2015.csv.bz2', cache='_cache')

In [8]:
from collections import defaultdict

class WordAssociations(object):
    """Conditional response probabilities estimated from association norms.

    After construction, ``self.associations[stimulus][associate]`` holds the
    estimated probability that ``associate`` is given as a response to
    ``stimulus``; for every stimulus the stored values sum to one.

    Parameters
    ----------
    word_associations_data : iterable of str
        One record per item, formatted as
        ``subject;stimulus;assoc1;assoc2;assoc3``.
    smoothing : float, optional
        Pseudo-count an associate starts from the first time it is observed
        for a stimulus.  Defaults to ``1e-4``.

    Notes
    -----
    ``associations`` is a ``defaultdict`` of ``defaultdict``: indexing it
    with an unseen stimulus or associate inserts that key and returns the
    raw, un-normalized ``smoothing`` value.  Test membership with ``in``
    (or iterate ``.items()``) to avoid this.
    """

    # Class-level default so instances created without __init__ still work.
    smoothing = 1e-4

    def __init__(self, word_associations_data, smoothing=1e-4):
        self.word_associations_data = word_associations_data
        self.smoothing = smoothing
        self.build_associations()

    def build_associations(self):
        """Count the responses given to each stimulus, then normalize them.

        Raises
        ------
        ValueError
            If a non-blank row does not have exactly five ';'-separated
            fields.
        """
        smoothing = self.smoothing
        self.associations = defaultdict(
            lambda: defaultdict(lambda: smoothing))

        for row in self.word_associations_data:
            # Drop the line terminator so it cannot leak into the third
            # associate, and ignore blank lines (e.g. a trailing one).
            row = row.rstrip('\r\n')
            if not row:
                continue

            subject, stimulus, assoc1, assoc2, assoc3 = row.split(';')

            for associate in (assoc1, assoc2, assoc3):
                self.associations[stimulus][associate] += 1

        self._normalize_associations()

    def _normalize_associations(self):
        """Rescale each stimulus' counts in place so that they sum to one."""
        for stimulus in self.associations:
            responses = self.associations[stimulus]
            # list(...) is required on Python 3, where values() is a view
            # that numpy would otherwise wrap as a single opaque object.
            z = numpy.array(list(responses.values())).sum()
            for associate in responses:
                responses[associate] /= z

In [9]:
# Build the normalized stimulus -> associate table once; get_associations
# reads it through this module-level name.
word_association = WordAssociations(word_associations_data)

In [10]:
def get_associations(text_name):
    r'''
    Predict the associates of a text by averaging over its words.

    This implements

     \mathrm{P}(w_k \vert \textrm{text}_{j^\prime}) =
     \frac{1}{n_{j^\prime}} \sum_{i = 1}^{n_{j^\prime}} \mathrm{P}(w_k \vert w_{j^\prime i})

    where `associate` below corresponds to `w_k`: the probability of
    `associate` as a response is summed over the words of the text and the
    sum is then divided by the number of words that contributed.

    Only words of the text that occur as stimuli in
    `word_association.associations` contribute, so `n` counts those words
    rather than every word of the text.

    Parameters
    ----------
    text_name : key of the module-level `texts` dict.

    Returns
    -------
    defaultdict mapping each associate to its averaged probability.  Looking
    up a word that was never an associate yields 0.0 (and inserts it).  The
    result is empty when no word of the text has association norms.

    Raises
    ------
    KeyError if `text_name` is not a key of `texts`.
    '''
    associations = word_association.associations

    d = defaultdict(float)

    n = 0.0
    for word in texts[text_name]:
        # Membership test first: indexing the defaultdict with an unseen
        # word would insert an empty entry for it.
        if word in associations:
            n += 1
            for associate, strength in associations[word].items():
                d[associate] += strength

    # d is non-empty only if at least one word matched, so n > 0 here.
    for associate in d:
        d[associate] /= n

    return d

In [11]:
# Container for the data frames used in this notebook, keyed by name.
Df = {}
# NOTE(review): read_pickle unpickles arbitrary objects; only load this
# file if it comes from a trusted source.
Df['recall'] = pandas.read_pickle('_cache/brisbane_06b643a_recall_results.pkl')

# Sorted, de-duplicated values of the 'word' column that also appear in the
# corpus vocabulary.
recalled_words = sorted(set(Df['recall']['word'].values).intersection(corpus_data['vocabulary']))

In [12]:
# Collect one ('<text number>-<word>', probability) pair for every candidate
# word of every text: its listed inwords and outwords plus all recalled words.
stimuli_words = []
for text_name in stimuli:
    d = get_associations(text_name)
    # The part of the text name after '_' is an integer; the label uses
    # that integer shifted up by one.
    _, n = text_name.split('_')
    n = int(n)+1
    inwords = stimuli[text_name]['inwords'].split(',')
    outwords = stimuli[text_name]['outwords'].split(',')
    for word in inwords+outwords+recalled_words:
        # get_associations returns a defaultdict, so a word that is not an
        # associate reads as 0.0; the lookup never raises KeyError.
        p = d[word]
        stimuli_words.append((str(n) + '-' + word, p))

# Repeated labels collapse to a single entry (the last pair wins).
associations_predictions = dict(stimuli_words)

In [13]:
# Persist the predictions to the cache directory.  Protocol 2 is the highest
# pickle protocol available on Python 2 and is also readable on Python 3.
with open('_cache/associations_predictions.pkl', 'wb') as f:
    pickle.dump(associations_predictions, f, protocol=2)