In [1]:
from collections import Counter, defaultdict, OrderedDict
import configobj
import numpy
import sys

from importlib import reload

import utils

from ernst import sysutils 

## Get vocabulary

Read in the vocabulary, and make a dictionary where the key is the word and value is a unique integer.

In [2]:
# Read the whitespace-separated vocabulary. The context manager closes the
# file handle as soon as the text has been read, instead of leaving it open
# until the garbage collector gets to it.
with open('data/bnc_vocab.txt') as vocab_file:
    vocab = vocab_file.read().strip().split()

# Map every word to its position in `vocab`. If a word occurs more than once
# in the file, the index of its last occurrence is the one that is kept.
vocab_dictionary = {word: i for i, word in enumerate(vocab)}

In [3]:
# Thresholds that restrict which stimulus (cue) words are retained.

# A cue word must occur on at least this many response trials.
minimum_cue_word_count = 75 
# A cue word must have at least this many unique associates ...
minimum_number_of_associates = 20
# ... each of which must occur at least this many times in total.
minimum_associate_count = 3

## Prepare association norm data 

We use the English *small world of words* association norms. We impose the following restrictions:

* The stimulus word and all of its associates must be in the vocabulary defined above. 

* The stimulus word must occur on {{minimum_cue_word_count}} or more response trials.

* There must be at least {{minimum_number_of_associates}} unique associates per stimulus word, each occurring at least {{minimum_associate_count}} times in total.

Load up the associations data.

In [4]:
# Load the association norms through the project-local `utils.SwwNorms` helper.
sww_norms = utils.SwwNorms('data/associations_en_05_01_2015.csv')

# Regression check: `len(sww_norms)` for this data file is expected to be exactly 1074227.
assert len(sww_norms) == 1074227

Filter out the trials where the stimulus word or any of its associates is not in the vocabulary.

In [5]:
# Apply the vocabulary filter. `in_place=True` is passed, so presumably
# `sww_norms` itself is modified rather than a filtered copy being returned
# (confirm against `utils.SwwNorms`).
sww_norms.filter_norms_by_vocabulary(vocab_dictionary, in_place=True)

# Regression check on the size of the norms after the vocabulary filter.
assert len(sww_norms) == 543379

Filter out trials where the stimulus word occurs fewer than the minimum number of times, i.e. {{minimum_cue_word_count}}.

In [6]:
# Apply the stimulus-count filter with the `minimum_cue_word_count` threshold.
# `in_place=True` is passed, so presumably `sww_norms` itself is modified
# (confirm against `utils.SwwNorms`).
sww_norms.filter_norms_by_stimulus_counts(minimum_cue_word_count, in_place=True)

# Regression check on the size of the norms after the stimulus-count filter.
assert len(sww_norms) == 174370 

Collect all the associates of all stimulus words.

In [7]:
# After this call `sww_norms.associates` is read below as a mapping keyed by
# stimulus word (it is queried with `.keys()` and indexed by word).
sww_norms.gather_associates()

# Regression check: 2136 stimulus words are expected at this stage.
assert len(sww_norms.associates.keys()) == 2136

Filter the stimulus words so that each has the required number of associates, each with the required number of counts.

In [8]:
# Apply the associates filter with the two thresholds defined earlier:
# `minimum_number_of_associates` and `minimum_associate_count`.
# `in_place=True` is passed, so presumably `sww_norms` itself is modified
# (confirm against `utils.SwwNorms`).
sww_norms.filter_norms_by_associates(minimum_number_of_associates, minimum_associate_count, in_place=True)

# Regression check: 721 stimulus words are expected to remain.
assert len(sww_norms.associates) == 721

## Why not restrict attention to British English participants?

Given that we are intending to do this experiment with British English participants, why don't we just use the association norms produced by British English speakers? 

In [9]:
# Read the participant table: one record per line, fields separated by ';',
# with the column names on the first line. The context manager closes the
# file handle as soon as the text has been read.
with open('data/usersasso.csv', encoding='utf8') as users_file:
    users = users_file.read().strip().split('\n')


def strip(x):
    '''Return `x` with every double-quote character removed.'''
    return x.replace('"', '')


# Column names from the header line, parsed once rather than once per user.
user_fields = [strip(field) for field in users[0].split(';')]

# One dict per participant, mapping column name -> field value (quotes removed).
users = [
    dict(zip(user_fields, map(strip, user.split(';'))))
    for user in users[1:]
]

# Proportion of participants whose `nativeLanguage` field equals 'GBR'.
gbr_proportion = numpy.mean([user['nativeLanguage'] == 'GBR' for user in users])

# Regression check against the value obtained from the original data file.
assert numpy.allclose(gbr_proportion, 0.092570077113972124)

The percentage of the participants who are native British English speakers is just {{int(gbr_proportion*1000)/10}}%, so we would use only a small proportion of the available data if we restricted attention to just these participants. 

Get the unigram statistics for all words in the BNC. 

In [10]:
# Unigram counts for the BNC: lower-cased word -> number of occurrences.
# Each line of the file is one text, with its words separated by '|'.
V = defaultdict(int)

# The context manager closes the corpus file once it has been read.
with open('data/bnc_texts.txt', encoding='utf-8') as bnc_file:
    for text in bnc_file.read().strip().split('\n'):
        for word in text.split('|'):
            V[word.lower()] += 1

# Regression check on three known counts.
assert (V['woods'], V['milk'], V['military']) == (1647, 3342, 9180)

In [11]:
# A stimulus word is kept only if its BNC count is strictly greater than this ...
minimum_bnc_count = 2500
# ... and strictly less than this.
maximum_bnc_count = 5000 
# Number of stimulus words to collect.
stimulus_set_length = 50
# Intended number of associates per stimulus word; half of it is used as the
# size of the in-list and out-list word sets built further down.
memorandum_length = 20

Randomly select words for the set of acceptable stimulus words (cue words). Keep a word if its number of occurrences in the BNC is greater than {{minimum_bnc_count}} and less than {{maximum_bnc_count}}. Exclude words that end in s, as that would get too many plurals or derivatives of the present tense in verbs, e.g. runs, walks, etc. Collect a total of {{stimulus_set_length}} words this way.

In [12]:
# Seeded generator, so that the same stimulus words are drawn on every run.
seed = 10001
random = numpy.random.RandomState(seed)

# Candidate pool: every stimulus word that survived the filters above.
stimulus_words = list(sww_norms.associates.keys())

# Visit the candidates in a random order, keeping those whose BNC count lies
# strictly between the two bounds and that do not end in 's'; stop as soon as
# enough words have been collected.
final_stimulus_set = []
for index in random.permutation(len(stimulus_words)):

    stimulus_word = stimulus_words[index]
    bnc_count = V[stimulus_word]
    in_frequency_band = minimum_bnc_count < bnc_count < maximum_bnc_count

    if in_frequency_band and stimulus_word[-1] != 's':
        final_stimulus_set.append(stimulus_word)

    if len(final_stimulus_set) >= stimulus_set_length:
        break

assert sww_norms.associates[final_stimulus_set[0]][0] == ('crazy', 24) # If seed is 10001

For each word in the final stimulus set, collect the top {{memorandum_length}} associated words.

In [13]:
# For every stimulus word (the critical lure), keep the first
# `memorandum_length` of its associates as the list of words to be memorised.
#
# The slice is bounded by `memorandum_length`, the constant that defines the
# list length everywhere else, rather than by the unrelated filtering
# threshold `minimum_number_of_associates` (both are currently 20, so the
# output is unchanged; they would silently diverge if either were edited).
memoranda = {}
for critical_lure in final_stimulus_set:
    memoranda[critical_lure] = [item[0] 
                                for item in sww_norms.associates[critical_lure][:memorandum_length]]
    memoranda_str = ','.join(memoranda[critical_lure])
    print(critical_lure + ': ' + memoranda_str)

wild: crazy,animal,nature,feral,animals,beast,west,free,fun,woods,jungle,untamed,wilderness,party,tame,berry,hair,bear,child,forest
ice: cold,water,cream,snow,cube,frozen,freeze,drink,skate,white,winter,cubes,iceberg,solid,pick,hard,fire,hockey,slippery,tea
bear: grizzly,animal,hug,black,brown,teddy,carry,polar,cub,endure,fur,arms,burden,stand,ferocious,furry,dangerous,growl,bare,tolerate
electric: light,power,car,current,shock,exciting,fence,electricity,static,lamp,wires,kettle,wire,company,outlet,eel,avenue,lightning,guitar,energy
chain: link,gang,mail,metal,lock,fence,necklace,letter,links,ball,reaction,gold,rope,iron,bike,smoke,slave,connection,bicycle,prisoner
prison: jail,bars,cell,orange,inmate,penitentiary,rape,prisoner,incarceration,criminal,convict,guards,guard,break,cold,crime,gaol,stripes,sentence,locked
drawing: pencil,art,sketch,paper,painting,picture,board,pen,paint,color,artist,crayons,easel,charcoal,illustration,crayon,child,lottery,sketching,pictures
insurance: car,mo

In [14]:
def get_distractors(target, K=5):

    '''
    Find distractors for a given target word.

    * Take the set of all associates of the target word.
    * Find every word in `final_stimulus_set` whose associates share
      *more than* `K` words with the associates of the target. The
      comparison is strict: an overlap of exactly `K` does not match.
    * Pool the associates of all matching words, keeping only those
      that are in `vocab_dictionary`.
    * If the target word is itself in `final_stimulus_set`, it is also
      compared with itself and so matches whenever it has more than `K`
      associates. That's ok: its own associates are pooled as well, but
      the ones in its memoranda are removed in the last step.
    * Count how often each pooled word occurs and return the
      `(word, count)` pairs in order of decreasing count (words with
      equal counts stay in first-encountered order), leaving out every
      word that is in `memoranda[target]`.

    Raises `KeyError` if `target` has no entry in `sww_norms.associates`,
    or if distractors were found and `target` has no entry in `memoranda`.

    '''

    def associates_as_set(word):
        return {x[0] for x in sww_norms.associates[word]}

    # The target's associates do not change inside the loop: build the set once.
    target_associates = associates_as_set(target)

    pooled = Counter()
    for word in final_stimulus_set:
        if len(target_associates & associates_as_set(word)) > K:
            pooled.update(x[0]
                          for x in sww_norms.associates[word]
                          if x[0] in vocab_dictionary)

    # `most_common()` sorts by decreasing count and keeps ties in insertion order.
    ranked = pooled.most_common()
    if not ranked:
        # Nothing to filter, so `memoranda[target]` is not needed (or looked up).
        return []

    # A set gives constant-time membership tests in the filter below.
    excluded = set(memoranda[target])
    return [item for item in ranked if item[0] not in excluded]
                  
# Regression check (and usage example): with K=5, the first distractor
# returned for 'milk' is expected to be 'water' with a pooled count of 4.
assert get_distractors('milk', K=5)[0] == ('water', 4)

In [15]:
distractors = {}
for critical_lure in memoranda:
    distractors[critical_lure] = [x[0] 
                                  for x in get_distractors(critical_lure)]
    print(critical_lure + ': ' + ', '.join(distractors[critical_lure][:7]))

wild: tree, life, bird, house, wood, love, home
ice: wet, box, block, arctic, baby, yummy, blue
bear: hair, bird, tree, strong, woods, forest, wood
electric: horse, green, blue, lights, money, gas, dog
chain: heavy, box, steel, key, cold, rock, dance
prison: house, black, hard, metal, lock, government, dance
drawing: table, line, water, ink, hard, hand, horse
insurance: house, grey, death, hard, happy, free, gas
fun: life, house, family, dance, love, game, sun
chemical: water, gas, hot, strong, gray, light, factory
comfortable: life, house, love, hard, dance, sun, calm
urban: life, house, home, nature, free, cat, tree
comfort: fire, house, safety, life, safe, comfort, water
abuse: happy, pain, hate, sex, red, animal, animals
card: party, money, life, house, happy, dance, hard
block: house, hard, black, dance, box, time, play
teeth: ache, hurt, body, hard, eating, gold, bone
heat: bed, happy, light, blanket, home, steam, weather
motion: blue, life, cars, power, train, city, line
smoke: 

In [16]:
# Separately seeded generator, used by `get_inwords` below.
# NOTE(review): this rebinds the module-level names `seed` and `random` that
# the stimulus-selection cell set earlier (seed 10001), so re-running that
# cell after this one changes which generator `get_inwords` draws from.
seed = 424242

random = numpy.random.RandomState(seed)

def get_inwords(critical_word, K=5):
    '''
    Pick up to `K` distinct entries, in random order, from the memoranda
    of `critical_word`.

    The draw uses the module-level `random` generator, so every call
    advances that generator's state.
    '''
    candidates = memoranda[critical_word]
    shuffled_positions = random.permutation(len(candidates))

    chosen = []
    for position in shuffled_positions[:K]:
        chosen.append(candidates[position])
    return chosen

def check_stimuli(critical_lure, stimuli):
    '''
    Validate the word lists built for one critical lure.

    `stimuli` is a mapping with the keys 'memoranda', 'inwords' and
    'outwords', each holding a list of words. The lists are looked up by
    key, so the outcome does not depend on the order in which the keys
    were inserted into the mapping.

    The following must all hold:

    * every in-word is one of the memoranda;
    * no out-word is one of the memoranda (and hence none is an in-word);
    * the critical lure is an out-word, and is neither an in-word nor
      one of the memoranda.

    Returns True when every check passes, and raises `AssertionError` at
    the first check that fails. Raises `KeyError` if one of the three
    keys is missing from `stimuli`.
    '''

    memoranda_words = set(stimuli['memoranda'])
    inwords = set(stimuli['inwords'])
    outwords = set(stimuli['outwords'])

    assert memoranda_words.issuperset(inwords)
    assert memoranda_words.isdisjoint(outwords)
    assert inwords.isdisjoint(outwords)

    assert critical_lure in outwords
    assert critical_lure not in inwords
    assert critical_lure not in memoranda_words

    return True
    
    

# Assemble, for every critical lure, the three word lists that are written
# to the configuration file: the memoranda, the in-list test words sampled
# from the memoranda, and the out-list test words (the lure itself followed
# by its first distractors).
stimuli = OrderedDict()

# Size of the in-list and out-list word sets: half the memorandum length.
K = int(memorandum_length/2)

for critical_lure, lure_memoranda in memoranda.items():

    wordlists_for_lure = OrderedDict([
        ('memoranda', lure_memoranda),
        ('inwords', get_inwords(critical_lure, K=K)),
        # The lure takes one of the K slots, leaving K-1 for distractors.
        ('outwords', [critical_lure] + distractors[critical_lure][:K-1]),
    ])

    assert check_stimuli(critical_lure, wordlists_for_lure)

    stimuli[critical_lure] = wordlists_for_lure

In [17]:
# Config object bound to the file name 'stimuli.cfg'; nested sections are
# indented by four spaces.
C = configobj.ConfigObj('stimuli.cfg', indent_type=' '*4)

# All word lists go under a single top-level 'wordlists' section.
C['wordlists'] = stimuli

# No output file is given here; presumably this writes to the 'stimuli.cfg'
# file name passed above, which is the file checked below.
C.write()

# Regression check: the written file must match the known checksum
# (a 64-hex-digit digest; presumably SHA-256 -- see `ernst.sysutils`).
assert sysutils.checksum('stimuli.cfg') == '942167cb194cb938c6f89dedc811e5110d82c67befc72ae8f6aa4fd5f4028a7d'

In [23]:
# Echo the generated configuration to the cell output for inspection.
# `write(outfile=...)` sends the text to the given stream itself and returns
# None, so wrapping the call in `print()` only appended a stray "None".
C.write(outfile=sys.stdout)

[wordlists]
    [[wild]]
        memoranda = crazy, animal, nature, feral, animals, beast, west, free, fun, woods, jungle, untamed, wilderness, party, tame, berry, hair, bear, child, forest
        inwords = berry, west, animal, beast, bear, feral, party, hair, nature, untamed
        outwords = wild, tree, life, bird, house, wood, love, home, warm, green
    [[ice]]
        memoranda = cold, water, cream, snow, cube, frozen, freeze, drink, skate, white, winter, cubes, iceberg, solid, pick, hard, fire, hockey, slippery, tea
        inwords = white, hard, iceberg, frozen, solid, tea, cream, skate, cold, cubes
        outwords = ice, wet, box, block, arctic, baby, yummy, blue, vanilla, gas
    [[bear]]
        memoranda = grizzly, animal, hug, black, brown, teddy, carry, polar, cub, endure, fur, arms, burden, stand, ferocious, furry, dangerous, growl, bare, tolerate
        inwords = bare, ferocious, hug, cub, tolerate, arms, brown, endure, dangerous, fur
        outwords = bear, hair, b