In [55]:
import pandas as pd
import numpy as np
from random import shuffle, sample, randint
from itertools import combinations, chain

# Pilot triads/trials
This notebook generates triads (and trials) for a pilot experiment in the multilingual semantic triads project.

## How many triads/trials are needed
With current methods (see simulation notebooks) we need to sample about 50% of all possible triads before the amount of semantic similarity structure we can recover plateaus and increases no further. Because the total number of possible triads scales very poorly with the number of concepts used, we start with a set of only 50 concepts.

Below we briefly demonstrate how many trials per participant are theoretically needed to recover a reasonable amount of semantic similarity structure for 50 concepts.

In [213]:
concepts = list(range(50))  # numbers to use as stand-ins for concepts
max_triads = len(list(combinations(concepts, 3)))
print(f'Number of possible triads for 50 concepts: {max_triads}')
print(f'Number of triads we need to sample to recover structure for 50 concepts: {max_triads / 2:.0f}')
print(f'Number of participants needed, assuming 4 triads per trial and 100 trials per participant: '
        + f'{(max_triads / 2) / (4 * 120):.0f}')

Number of possible triads for 50 concepts: 19600
Number of triads we need to sample to recover structure for 50 concepts: 9800
Number of participants needed, assuming 4 triads per trial and 100 trials per participant: 20


From an experimenter's perspective it might make sense to gather 2 or more observations per triad, because we usually do not trust single observations very much. It's important to keep in mind, however, that the triads are not independent, since they describe a structured space. Triads are therefore strongly interdependent (e.g. if we have collected a number of triads that suggest _cat_ is similar to various animals while being dissimilar from vehicles, then a new participant rating _cat_ as being more similar to _train_ than to _dog_ won't carry much weight in the determination of the overall structure.

## Picking pilot concepts
In order to have good materials to compare the recovered semantic similarity structure to, we want concepts that are represented in other lexical semantics datasets. There are a number of such datasets available, but some of the most-used our field are the McRae feature norms (2005), the Buchanan feature norms (?), and the Small World of Words data (?).

Ideally, we want to use words that are represented in all three of these datasets, but specifically the Buchanan and McRae norms don't have much overlap. To see what we have to work with, we compute the intersection of concepts represented in all three datasets.

In [5]:
df_mcrae = pd.read_csv('../datasets/mcrae_concepts.txt', sep='\t')
df_swow = pd.read_csv('../datasets/SWOW-EN.R100.csv')
df_buchanan = pd.read_csv('../datasets/buchanan_words.csv')
display(df_mcrae)
display(df_swow)
display(df_buchanan)

Unnamed: 0,Concept,Pronunciation,Phon_1st,KF,ln(KF),BNC,ln(BNC),Familiarity,Length_Letters,Length_Phonemes,...,Num_Func,Num_Vis_Mot,Num_VisF&S,Num_Vis_Col,Num_Sound,Num_Taste,Num_Smell,Num_Tact,Num_Ency,Num_Tax
0,accordion,[@][kO:][dj@n],@,1,0.00,2,0.69,2.90,9,7,...,2,0,2,0,2,0,0,0,2,1
1,airplane,[E@][pleIn],E@,21,3.04,108,4.68,6.55,8,5,...,3,3,5,0,0,0,0,0,2,0
2,alligator,[&][lI][geI][t@r*],&,4,1.39,114,4.74,3.75,9,8,...,0,2,6,1,0,0,0,0,5,2
3,ambulance,[&m][bjU][l@ns],&,7,1.95,1846,7.52,6.45,9,9,...,7,1,4,3,1,0,0,0,1,2
4,anchor,[&N][k@r*],&,17,2.83,700,6.55,3.85,6,5,...,3,0,3,0,0,0,0,1,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,wrench,[rEntS],r,2,0.69,213,5.36,4.70,6,4,...,8,0,3,0,0,0,0,1,1,1
537,yacht,[jOt],j,7,1.95,1426,7.26,3.85,5,3,...,5,0,3,0,0,0,0,0,5,1
538,yam,[j&m],j,1,0.00,51,3.93,3.30,3,3,...,2,0,0,1,0,1,0,0,2,1
539,zebra,[zE][br@],z,1,0.00,276,5.62,2.60,5,5,...,2,2,5,2,0,0,0,0,2,3


Unnamed: 0.1,Unnamed: 0,id,participantID,age,gender,nativeLanguage,country,education,created_at,cue,R1,R2,R3
0,1,29,3,33,Fe,United States,Australia,,2011-08-12 02:19:38,although,nevertheless,yet,but
1,2,30,3,33,Fe,United States,Australia,,2011-08-12 02:19:38,deal,no,cards,shake
2,3,31,3,33,Fe,United States,Australia,,2011-08-12 02:19:38,music,notes,band,rhythm
3,4,32,3,33,Fe,United States,Australia,,2011-08-12 02:19:38,inform,tell,rat on,
4,5,33,3,33,Fe,United States,Australia,,2011-08-12 02:19:38,way,path,via,method
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1228195,1228196,1530300,132506,29,Ma,Canada,Australia,5.0,2018-08-10 01:56:27,strange,mask,weird,stranger
1228196,1228197,1530290,132506,29,Ma,Canada,Australia,5.0,2018-08-10 01:56:27,sunset,sea,sky,clause
1228197,1228198,1530291,132506,29,Ma,Canada,Australia,5.0,2018-08-10 01:56:27,useless,pitty,worthless,worth
1228198,1228199,1530284,132506,29,Ma,Canada,Australia,5.0,2018-08-10 01:56:27,volume,loud,music,key


Unnamed: 0,where,cue,feature,translated,frequency_feature,Unnamed: 5
0,top,abandon,desert,desert,9,
1,top,abandon,give,give,19,
2,top,abandon,leave,leave,26,
3,top,abandon,leaving,leave,1,
4,top,abandon,left,leave,5,
...,...,...,...,...,...,...
49041,top,TRUE,rightly,right,1,
49042,top,TRUE,truth,truth,10,
49043,top,TRUE,unfaithful,faith,1,
49044,top,TRUE,unreal,real,1,


In [6]:
concepts_mcrae = set(df_mcrae['Concept'])
concepts_swow = set(df_swow['cue'])
concepts_buchanan = set(df_buchanan['cue'])
print(f'Number of concepts in McRae feature norms: {len(concepts_mcrae)}')
print(f'Number of concepts in Small World of Words: {len(concepts_swow)}')
print(f'Number of concepts in Buchanan feature norms: {len(concepts_buchanan)}')
print()
print(f'Number of concepts in both McRae and SWoW: {len(concepts_mcrae & concepts_swow)}')
print(f'Number of concepts in both McRae and Buchanan: {len(concepts_mcrae & concepts_buchanan)}')
print(f'Number of concepts in both Buchanan and SWoW: {len(concepts_buchanan & concepts_swow)}')
print()
intersect = concepts_mcrae & concepts_swow & concepts_buchanan
print(f'Number of concepts in all three datasets: {len(intersect)}')

Number of concepts in McRae feature norms: 541
Number of concepts in Small World of Words: 12282
Number of concepts in Buchanan feature norms: 3722

Number of concepts in both McRae and SWoW: 500
Number of concepts in both McRae and Buchanan: 63
Number of concepts in both Buchanan and SWoW: 3529

Number of concepts in all three datasets: 63


### Categorizing and selecting concepts
From the intersection of the three datasets, we can choose our 50 concepts. It makes sense to not just do this at random, but put the concepts into rough semantic categories and ensure good coverage of a few of these categories. It's easiest to store them in a TSV file now, work on them outside this notebook, and them load them in again afterwards.

We will also insert a few verbs into the list, as the intersection concepts are all fairly basic concrete nouns.

In [112]:
df = pd.DataFrame(list(intersect), columns=['concept'])
df['category'] = ''
display(df)
df.to_csv('pilot_items_uncategorized.csv', index=False)

Unnamed: 0,concept,category
0,wagon,
1,hammer,
2,shirt,
3,buffalo,
4,snail,
...,...,...
58,fox,
59,pen,
60,elephant,
61,garlic,


For the verbs, we do something similar to the intersection of datasets trick we used before, except the McRae feature norms don't really contain verbs, so we replace that dataset with a dataset that specifically contains verb similarity ratings. (This dataset isn't very big, but it will be helpful to have another reference point for the semantic similarity structure we're attempting to recover.)

In [117]:
df_verbs = pd.read_csv('../datasets/en-verb143.tsv', sep='\t', comment='#')
concepts_verbs = set(df_verbs['word1']) | set(df_verbs['word2'])
print(f'Number of unique verbs in SimVerb dataset: {len(concepts_verbs)}')
verb_intersect = concepts_swow & concepts_verbs & concepts_buchanan
print(f'Number of verbs in intersection of SWoW, Buchanan, and SimVerb: {len(verb_intersect)}')
print()
print('Verbs in intersection of SWoW, Buchanan, and SimVerb:\n')
for verb in verb_intersect:
    print(verb)

Number of unique verbs in SimVerb dataset: 117
Number of verbs in intersection of SWoW, Buchanan, and SimVerb: 19

Verbs in intersection of SWoW, Buchanan, and SimVerb:

find
cause
set
giving
exist
help
found
use
form
make
seem
protect
start
strike
show
develop
work
happening
allow


### Inspecting the categorized concepts
We can now load the concepts we categorized by hand, so that we can tabulate how large the categories are (and use them to make triads for the pilot study).

In [118]:
df = pd.read_csv('pilot_items_categorized.csv').sort_values('category')
display(df)
display(df.groupby('category').count())

Unnamed: 0,concept,category
24,clam,animal
29,turtle,animal
40,fox,animal
35,mouse,animal
15,alligator,animal
13,squirrel,animal
42,elephant,animal
8,zebra,animal
37,lion,animal
6,dolphin,animal


Unnamed: 0_level_0,concept
category,Unnamed: 1_level_1
animal,13
clothing,6
furniture,6
household item,6
kitchenware,6
vehicle,6
verb,7


## Triads
### Within- and between-categories triads
Triads can consist of various permutations of either same- or different-category items, and we can choose which of these permutations to present to participants. Realistically, these semantic categories are continuous, rather than discrete, and there is no real reason to believe that the level of categorization or the groupings chosen here are any more or less correct than whatever categorization someone else can come up with, but since we want to do a basic test of whether specific types of triads are more informative than others, we'll just stick with the groupings we have here.

At the most basic level, a triad can consist of either:
1. All same-category items (e.g. _car-ship-train_)
2. A different-category slider on same-category anchors (e.g. _car-lion-train_)
3. Slider and one anchor from same category, with one different-category anchor (e.g. _car-ship-lion_)
4. All different-category items (e.g. _car-lion-desk_)

The latter triads are fairly difficult to make a judgment about because of the disparity between the items; the axis on which to position the slider feels poorly defined.  
Furthermore, it is conceivable that if we sample triads of type 2 and 3, we get a decent idea of the global structure of our semantic space (i.e. the relationships between categories) but a fairly underspecified representation of the local structure (i.e. relationships of items within a category). Conversely, if we sample only triads from of the first type, we'll know about local structure, but we won't get any idea of global structure at all!

It seems then that choosing which types of triads to present merits some further consideration. Choosing triads at random from all possible triads will result in a mix from all four types of triads described above, but not a balanced mix: There are simply many more triads of type 2 and 3 than of type 1, and even more of type 4 than of types 2 and 3 together.

How to balance the different type of triads to recover the best representation of the semantic similarity structure of our set of concepts is an open question. If we knew the ground truth (i.e. what the semantic similarity structure in the minds of our participants looks like) we could figure out the correct balance of triad types through simulations, but this introduces a circularity problem, because we don't know very much about the true semantic structure until we run our experiment on a large sample of participants.

The intermediate solution, then, is to present groups of participants with a different mix of triad types, and see which of these groups produces a similarity structure most similar to the overall average similarity structure, and to our reference datasets (McRae & Buchanan feature norms, Small World of Words, word embeddings, etc.). Since we know presenting _only_ type 1 triads will recover **no** global structure at all, it makes sense to present two groups with a mix of type 1 and type 3 triads (in either a 5:1 or 3:3 composition), and one group with a completely random sampling (which will include mostly type 4 trials, as well as type 2 and 3 trials).

### Generating toy triads
First, we write some functions to generate triads

In [185]:
def generate_type1_trials(categories, num_trials, triads_per_trial):
    trials = []
    
    # randomize order of categories and loop over them
    shuffle(categories)
    for i in range(num_trials):
        left = []
        right = []
        slider = []
        
        # grab concepts in category and shuffle them
        concepts = categories[i % len(categories)]
        shuffle(concepts)
        
        # loop over required number of triads
        for j in range(triads_per_trial):
            
            # make the first two concepts the anchors
            left.append(concepts[0])
            right.append(concepts[1])
            # index remaining within-category concepts to grab the slider item
            slider.append(concepts[j + 2])
            
        # append trial to list
        trials.append([','.join(left), ','.join(slider), ','.join(right), 'type1'])
            
    return trials

def generate_type3_trials(categories, num_trials, triads_per_trial):
    trials = []
    
    # randomize order of categories and loop over them
    shuffle(categories)
    for i in range(num_trials):
        left = []
        right = []
        slider = []
        
        # grab concepts in category and shuffle them
        concepts = categories[i % len(categories)]
        shuffle(concepts)
        
        # make the first concept the left anchor
        anchor_left = concepts[0]
        
        # grab the right anchor from a random other category
        # (compare to current concepts to make sure it's a different category)
        random_cat = sample(categories, 1)[0]
        while random_cat == concepts:
            random_cat = sample(categories, 1)[0]
        anchor_right = sample(random_cat, 1)[0]
        
        # 50% chance of switching the left and right anchors
        if randint(0, 1):
            anchor_left, anchor_right = anchor_right, anchor_left
        
        # loop over required number of triads
        for j in range(triads_per_trial):
            
            # append anchors
            left.append(anchor_left)
            right.append(anchor_right)
            # index remaining within-category concepts to grab the slider item
            slider.append(concepts[j + 1])
            
        # append trial to list
        trials.append([','.join(left), ','.join(slider), ','.join(right), 'type3'])
            
    return trials

def generate_random_trials(categories, num_trials, triads_per_trial):
    trials = []
    
    # concatenate all categories together to form one list of concepts
    concepts = list(chain(*categories))
    
    # loop over number of trials
    for i in range(num_trials):
        left = []
        right = []
        slider = []
        
        # shuffle concepts each time
        shuffle(concepts)
        
        # loop over numbers of triads needed
        for j in range(triads_per_trial):
            
            # make the first two concepts the anchors
            left.append(concepts[0])
            right.append(concepts[1])
            # use subsequent concepts as sliders
            slider.append(concepts[j + 2])
            
        # append trial to list
        trials.append([','.join(left), ','.join(slider), ','.join(right), 'random'])
            
    return trials

Then, we generate a set of toy concepts in toy categories, to test the triad-generating functions.

In [186]:
categories = [[letter + str(q) for q in range(8)] for letter in ['a', 'b', 'c', 'd']]
display(categories)

[['a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7'],
 ['b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
 ['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7'],
 ['d0', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7']]

In [187]:
generate_type1_trials(categories, 4, 4)

[['b1,b1,b1,b1', 'b5,b0,b3,b7', 'b4,b4,b4,b4', 'type1'],
 ['c6,c6,c6,c6', 'c2,c7,c5,c0', 'c4,c4,c4,c4', 'type1'],
 ['d0,d0,d0,d0', 'd5,d4,d7,d2', 'd1,d1,d1,d1', 'type1'],
 ['a3,a3,a3,a3', 'a2,a4,a5,a6', 'a0,a0,a0,a0', 'type1']]

In [188]:
generate_type3_trials(categories, 4, 4)

[['c5,c5,c5,c5', 'd5,d0,d6,d2', 'd3,d3,d3,d3', 'type3'],
 ['a6,a6,a6,a6', 'b7,b6,b3,b1', 'b2,b2,b2,b2', 'type3'],
 ['a6,a6,a6,a6', 'a7,a1,a5,a3', 'd1,d1,d1,d1', 'type3'],
 ['a4,a4,a4,a4', 'c3,c7,c5,c6', 'c2,c2,c2,c2', 'type3']]

In [189]:
generate_random_trials(categories, 4, 4)

[['b3,b3,b3,b3', 'a4,d3,c3,b4', 'd6,d6,d6,d6', 'random'],
 ['d7,d7,d7,d7', 'b6,b3,c5,a0', 'c2,c2,c2,c2', 'random'],
 ['c5,c5,c5,c5', 'a7,d1,d2,b1', 'd5,d5,d5,d5', 'random'],
 ['b6,b6,b6,b6', 'd2,b2,c1,b7', 'd5,d5,d5,d5', 'random']]

That all looks exactly as it should, so now we can generate the actual trial files using our set of 50 concepts.

### Generating real triads
As a quick test, we'll generate some triads with real concepts.

In [190]:
categories = [list(df.loc[df['category'] == cat, 'concept']) for cat in df['category'].unique()]
print(categories)

[['clam', 'turtle', 'fox', 'mouse', 'alligator', 'squirrel', 'elephant', 'zebra', 'lion', 'dolphin', 'octopus', 'snail', 'buffalo'], ['necklace', 'gloves', 'coat', 'shirt', 'belt', 'dress'], ['couch', 'door', 'cabinet', 'chair', 'desk', 'cushion'], ['toy', 'brush', 'doll', 'pen', 'hammer', 'radio'], ['oven', 'scissors', 'knife', 'dish', 'pot', 'fork'], ['wagon', 'airplane', 'train', 'bike', 'ship', 'submarine'], ['use', 'protect', 'find', 'exist', 'make', 'work', 'allow']]


In [194]:
generate_type1_trials(categories, 4, 3)

[['hammer,hammer,hammer', 'brush,toy,radio', 'pen,pen,pen', 'type1'],
 ['pot,pot,pot', 'dish,scissors,fork', 'knife,knife,knife', 'type1'],
 ['belt,belt,belt', 'necklace,coat,shirt', 'gloves,gloves,gloves', 'type1'],
 ['desk,desk,desk', 'cabinet,couch,chair', 'cushion,cushion,cushion', 'type1']]

In [197]:
generate_type3_trials(categories, 4, 3)

[['use,use,use', 'airplane,bike,wagon', 'train,train,train', 'type3'],
 ['use,use,use', 'coat,necklace,gloves', 'dress,dress,dress', 'type3'],
 ['make,make,make', 'work,allow,find', 'gloves,gloves,gloves', 'type3'],
 ['hammer,hammer,hammer', 'cabinet,couch,cushion', 'door,door,door', 'type3']]

In [210]:
generate_random_trials(categories, 4, 3)

[['airplane,airplane,airplane',
  'squirrel,protect,desk',
  'knife,knife,knife',
  'random'],
 ['protect,protect,protect', 'dish,work,snail', 'train,train,train', 'random'],
 ['make,make,make', 'cabinet,pot,dress', 'lion,lion,lion', 'random'],
 ['buffalo,buffalo,buffalo', 'radio,knife,elephant', 'use,use,use', 'random']]

## Generating pilot trial lists

In [236]:
practice_trials = [
    ['dog,dog,dog,dog', 'pony,cat,cow,seagull', 'horse,horse,horse,horse', 'practice'],
    ['donut,donut,donut,donut', 'orange,pancake,coffee,potato', 'apple,apple,apple,apple', 'practice'],
]

catch_trials = [
    ['same', 'same', 'different', 'catch'],
    ['yes', 'yes', 'no', 'catch'],
    ['left', 'left', 'right', 'catch'],
    ['left', 'right', 'right', 'catch'],
]

start = practice_trials + catch_trials

# generate 80% type 1 trials, 20% type 3 trials
for i in range(0, 20):
    trials = start + generate_type1_trials(categories, 48, 4) + generate_type3_trials(categories, 12, 4)
    trials = pd.DataFrame(trials, columns=['left_words', 'slider_words', 'right_words', 'question_type'])
    trials['trial'] = range(1, len(trials) + 1)
    trials['max_duration'] = 10
    trials.loc[trials['question_type'] == 'practice', 'max_duration'] = 100
    trials['min'] = 0
    trials['max'] = 100
    trials['default'] = 50
    trials['triad_mix'] = '80% type 1/20% type 3'
    trials.to_csv(f'trials/trials_{i + 1:02d}.tsv', sep='\t', index=False)
    
# generate 50% type 1, 50% type 3 trials
for i in range(20, 40):
    trials = start + generate_type1_trials(categories, 30, 4) + generate_type3_trials(categories, 30, 4)
    trials = pd.DataFrame(trials, columns=['left_words', 'slider_words', 'right_words', 'question_type'])
    trials['trial'] = range(1, len(trials) + 1)
    trials['max_duration'] = 10
    trials.loc[trials['question_type'] == 'practice', 'max_duration'] = 100
    trials['min'] = 0
    trials['max'] = 100
    trials['default'] = 50
    trials['triad_mix'] = '50% type 1/50% type 3'
    trials.to_csv(f'trials/trials_{i + 1:02d}.tsv', sep='\t', index=False)

# generate random triad trials
for i in range(40, 60):
    trials = start + generate_random_trials(categories, 60, 4)
    trials = pd.DataFrame(trials, columns=['left_words', 'slider_words', 'right_words', 'question_type'])
    trials['trial'] = range(1, len(trials) + 1)
    trials['max_duration'] = 10
    trials.loc[trials['question_type'] == 'practice', 'max_duration'] = 100
    trials['min'] = 0
    trials['max'] = 100
    trials['default'] = 50
    trials['triad_mix'] = 'random'
    trials.to_csv(f'trials/trials_{i + 1:02d}.tsv', sep='\t', index=False)

In [237]:
display(trials)

Unnamed: 0,left_words,slider_words,right_words,question_type,trial,max_duration,min,max,default,triad_mix
0,"dog,dog,dog,dog","pony,cat,cow,seagull","horse,horse,horse,horse",practice,1,100,0,100,50,random
1,"donut,donut,donut,donut","orange,pancake,coffee,potato","apple,apple,apple,apple",practice,2,100,0,100,50,random
2,same,same,different,catch,3,10,0,100,50,random
3,yes,yes,no,catch,4,10,0,100,50,random
4,left,left,right,catch,5,10,0,100,50,random
...,...,...,...,...,...,...,...,...,...,...
61,"zebra,zebra,zebra,zebra","fork,find,snail,hammer","desk,desk,desk,desk",random,62,10,0,100,50,random
62,"octopus,octopus,octopus,octopus","knife,elephant,bike,radio","brush,brush,brush,brush",random,63,10,0,100,50,random
63,"fork,fork,fork,fork","belt,fox,necklace,elephant","couch,couch,couch,couch",random,64,10,0,100,50,random
64,"dolphin,dolphin,dolphin,dolphin","snail,fox,protect,oven","radio,radio,radio,radio",random,65,10,0,100,50,random
