In [1]:
from collections import Counter, defaultdict, OrderedDict
import configobj
import numpy

In [2]:
vocab = open('data/bnc_vocab.txt').read().split()

In [3]:
associations = open('data/associations_en_05_01_2015.csv').read().strip().split('\n')

In [4]:
vocab_dictionary = {word:i for i, word in enumerate(vocab)}

In [5]:
def stimuli_in_bnc_vocab(row):
    
    ctr, word, assoc_1, assoc_2, assoc_3 = row.split(';')
    
    return all([v.lower() in vocab_dictionary for v in (word, assoc_1, assoc_2, assoc_3)])

In [6]:
associations_redux = filter(stimuli_in_bnc_vocab, associations)

In [7]:
cue_words = []
for row in associations_redux:
    ctr, word, assoc_1, assoc_2, assoc_3 = row.split(';')
    
    cue_words.append(word.lower())
cue_words = Counter(cue_words)

cue_words_redux = [word_tuple[0] for word_tuple in filter(lambda arg: arg[1]> 75, cue_words.items())]

In [8]:
def frequent_associates(item):
    
    try:
        return item[1][14][1] >= 3
    except IndexError: # less than 15 items
        return False
    

In [9]:
associates = defaultdict(list)

for row in associations_redux:
    ctr, word, assoc_1, assoc_2, assoc_3 = row.split(';')
    associates[word].extend([assoc_1, assoc_2, assoc_3])

associates_redux = {}
for cue_word in cue_words_redux:
    associates_redux[cue_word] = sorted(Counter(associates[cue_word]).items(),
                                        key=lambda item: item[1],
                                        reverse=True)

cue_words_redux = sorted(dict(filter(frequent_associates, associates_redux.items())).keys())

In [10]:
V = defaultdict(int)
for text in open('data/bnc_texts.txt').read().strip().split('\n'):
    words = text.split('|')
    for word in words:
        V[word] += 1

In [11]:
seed = 10001

random = numpy.random.RandomState(seed)

final_cue_words = []
for k in random.permutation(len(cue_words_redux)):
    
    cue_word = cue_words_redux[k]
    if V[cue_word] >= 2500 and V[cue_word] <= 5000 and cue_word[-1] != 's':
        final_cue_words.append(cue_word)

    if len(final_cue_words) >= 40:
        break

In [12]:
memoranda = {}
for critical_lure in final_cue_words:
    memoranda[critical_lure] = [item[0] for item in associates_redux[critical_lure][:15]]
    memoranda_str = ','.join(memoranda[critical_lure])
    print(critical_lure + ': ' + memoranda_str)

steel: metal,iron,strong,hard,cold,sword,mill,wool,shiny,silver,beams,blue,industry,stainless,grey
abuse: hurt,child,hit,violence,neglect,harm,verbal,domestic,children,bad,drug,women,sexual,rape,bruise
clinical: doctor,medical,hospital,trial,study,cold,clean,sterile,medicine,depression,psychology,white,precise,clinic,research
beach: sand,sun,ocean,water,ball,waves,sea,surf,tan,umbrella,whale,shore,summer,swim,towel
fuel: gas,car,fire,petrol,energy,oil,gasoline,food,tank,diesel,pump,power,wood,coal,expensive
eat: food,drink,consume,dine,hungry,merry,restaurant,dinner,meal,chew,taste,fat,meat,devour,love
injury: hurt,pain,hospital,wound,blood,accident,sports,damage,ouch,death,break,bandage,fall,bleeding,lawyer
throat: sore,neck,swallow,cough,voice,deep,sing,cancer,cut,tonsils,mouth,dry,tongue,singing,frog
urban: city,rural,outfitters,dictionary,sprawl,concrete,suburban,black,cowboy,town,building,planning,busy,ghetto,urbane
afraid: scared,fear,dark,frightened,terrified,scary,fearful,anxio

In [13]:
def associates_as_set(word, K=15):
    return set([x[0] for x in associates_redux[word][:K]])

def get_distractors(target, K=15):
    words = []
    for word in final_cue_words:
        if len(as_set(target, K=K).intersection(as_set(word, K=K))) > 5:
            words.extend([x[0] for x in associates_redux[word] if x[0] in vocab_dictionary])
    return list(set(words).difference(memoranda[target]))

In [14]:
distractors = {}
for critical_lure in final_cue_words:
    distractors[critical_lure] = get_distractors(critical_lure)[:5]
    print(critical_lure + ': ' + ','.join(distractors[critical_lure]))

NameError: global name 'as_set' is not defined

In [None]:
seed = 10001

random = numpy.random.RandomState(seed)

def get_inwords(critical_word, K=5):
    _memoranda = memoranda[critical_word]
    return [_memoranda[k] for k in random.permutation(len(_memoranda))[:K]]

stimuli = OrderedDict()

for critical_lure in memoranda.keys():
    
    _stimuli = OrderedDict()
    
    inwords = get_inwords(critical_lure)
    outwords = distractors[critical_lure]
    _memoranda = memoranda[critical_lure]
    
    _stimuli['memoranda'] = _memoranda
    _stimuli['inwords'] = inwords
    _stimuli['outwords'] = outwords
    
    stimuli[critical_lure] = _stimuli

In [None]:
! rm -f stimuli.cfg

In [None]:
C = configobj.ConfigObj('stimuli.cfg', indent_type=' '*4)

In [None]:
C['wordlists'] = all_stimuli

In [None]:
C.write()