# Analysis pipeline for Specificity

## Preliminaries

In [1]:
%cd ~/code/textrec/

/Users/kcarnold/code/textrec


In [2]:
import datetime
import json
import pathlib

import pandas as pd
import numpy as np
import toolz
from IPython.display import Image, HTML

In [3]:
from importlib import reload  # Python 3 has no builtin `reload`; import it so this cell runs on a fresh kernel

from textrec.paths import paths
from textrec import analysis_util, util
# Re-import the project modules so that edits made on disk are picked up without restarting the kernel.
reload(analysis_util), reload(util)

(<module 'textrec.analysis_util' from '/Users/kcarnold/code/textrec/src/textrec/analysis_util.py'>,
 <module 'textrec.util' from '/Users/kcarnold/code/textrec/src/textrec/util.py'>)

Download Karpathy's version of the COCO captions dataset. This has the train-test split that is more commonly used in the literature, as well as pre-tokenized captions.

In [4]:
images = util.get_coco_captions()
images_by_split = toolz.groupby('split', images)

In [5]:
id2img = {img['cocoid']: img for img in images}

In [6]:
id2url = util.get_coco_id2url()

In [7]:
def show_images(indices):
    """Build an HTML snippet that shows each requested image with its captions.

    indices: iterable of keys into `id2img`.

    Returns one string containing an inline-block <div> per image (split/id
    header, the image itself via `id2url`, then one <div> per raw caption),
    joined by newlines. The caller is expected to wrap it in `HTML(...)`.
    """
    card_template = '<div style="display: inline-block;"><div>{}/{}</div><img src="{}">{}</div>'

    def render_card(image_id):
        record = id2img[image_id]
        caption_divs = [
            '<div>{}</div>'.format(raw_caption)
            for raw_caption in toolz.pluck('raw', record['sentences'])
        ]
        captions_html = '\n'.join(caption_divs)
        cocoid = record['cocoid']
        return card_template.format(record['split'], cocoid, id2url[cocoid], captions_html)

    cards = [render_card(image_id) for image_id in indices]
    return '\n'.join(cards)

In [8]:
HTML(show_images([images_by_split['val'][0]['cocoid']]))

## Load results of writing experiment

In [9]:
%run -m textrec.batch_analysis

In [10]:
participants = get_participants_by_batch()

In [11]:
participants.keys()

dict_keys(['2018-04-09', '2018-04-24', '2018-04-27', '2018-05-02-invalid', '2018-05-02'])

In [12]:
# summarize('2018-04-27')
summarize('2018-05-02')


h52x67
practice-0:specific:a black cat napping on a sunny unpainted wood bench in front of a red wall
final-0-0:specific:a cat sitting next to a glass bowl, looking up to the camera
final-0-1:specific:a shower with dirty glass doors has a beige towel hanging on the outside
final-0-2:specific:there is no image here 
practice-1:norecs:a man with black hair and glasses placing a large turkey into an upper oven
final-1-0:norecs:a bride and groom cutting their wedding cake, while a photographer guides them
final-1-1:norecs:a man helping his children fly a multicolor butterfly kite on a clear day
final-1-2:norecs:a passenger train approaching a small quaint station with a blue and white building on the background
practice-2:general:a black and red vehicle with bikes on top and people standing nearby with umbrellas. 
final-2-0:general:a busy street in a historic town with a red bus driving on the street. 
final-2-1:general:a tennis player hits a ball during a game 
final-2-2:general:a surfer

practice-0:norecs:a black cat napping on a sunny unpainted wood bench in front of a red wall
final-0-0:norecs:a glass of red wine sitting in fronf of a brown cat with brown stripes on a brown mat
final-0-1:norecs:a beige towel hangs over the rightmost shower door both of which are wet with water
final-0-2:norecs:a toilet with the seat down and a roll of toilet paper on top is next to a white sink
practice-1:specific:a man with black hair and glasses placing a large turkey into an upper oven
final-1-0:specific:a husband, bride and female all stand in front of a table holding a knife cutting a cake
final-1-1:specific:two children and one adult stand on the beach holding kites and flying them into the sky 
final-1-2:specific:a train on the tracks passes a white house with blue paint before approaching another white house
practice-2:general:a black and red vehicle with bikes on top and people standing nearby with umbrellas. 
final-2-0:general:a red double-decker bus passes a group of peopl

practice-0:norecs:a black cat napping on a sunny unpainted wood bench in front of a red wall
final-0-0:norecs:a gray and beige cat looks upward as a half full glass of wine can be seen in the foreground
final-0-1:norecs:a closed shower door with crackled glass encases some hanging colored toiletries
final-0-2:norecs:a white bathroom sink and toilet with a mirror and a roll of unopened toilet paper
practice-1:general:a man with black hair and glasses placing a large turkey into an upper oven 
final-1-0:general:a man in a tuxexo and a woman in a gown are show how to cut their wedding cake by a woman holding a camera 
final-1-1:general:a man in a red shirt and two children stand on the beach and fly kites
final-1-2:general:an old train runs on the tracks in front of powder blue and white buildings
practice-2:specific:a black and red vehicle with bikes on top and people standing nearby with umbrellas. 
final-2-0:specific:a gorgeous european city with tall gothic buildings and a red trolley

practice-0:general:a black cat napping on a sunny unpainted wood bench in front of a red wall 
final-0-0:general:a wine glass with red wine less than half full and an orange striped tabby cat in the background behind the glass
final-0-1:general:sliding glass, frosted, shower doors with a tan towel hanging on the handle and a white toilet with a blue floor rug
final-0-2:general:a bathroom sink and mirror and toilet with a silver handle attached to the wall behind it
practice-1:specific:a man with black hair and glasses placing a large turkey into an upper oven
final-1-0:specific:a groom and bride slicing a white wedding cake on the banquet food table 
final-1-1:specific:some people on a sandy beach flying kites with a clear blue sky 
final-1-2:specific:some train tracks with a train on it traveling beside some white and blue buildings with green trees in the background and a clear blue sky 
practice-2:norecs:a black and red vehicle with bikes on top and people standing nearby with umbre

In [13]:
trial_data = get_trial_data('2018-05-02')

In [14]:
for trial in trial_data:
    trial['text'] = trial['text'].strip()

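One participant (`h52x67`) was shown the wrong URL for one image (see their "there is no image here" response above), so all of that participant's trials are dropped below; the narrower filter that removes only the affected trial is left commented out.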

In [15]:
# trial_data = [trial for trial in trial_data if not (trial['stimulus'] == 431140 and trial['participant'] == 'h52x67')]
trial_data = [trial for trial in trial_data if not trial['participant'] == 'h52x67']

In [16]:
len(list(toolz.pluck('text', trial_data)))

162

In [17]:
len(set(toolz.pluck('participant', trial_data)))

18

In [18]:
# pd.DataFrame(trial_data).to_clipboard()

# Generate nAFC task

For each stimulus image, choose a foil set. It should be about equally difficult for each condition. Simplest approach: concatenate all the captions we collected for that image, and take the images whose captions are most similar to that concatenation.

TODO: should we be computing similarities of individual captions, rather than complete images?

In [19]:
cap_vectorizer, caption_vecs = util.get_vectorized_captions()

In [20]:
caption_vecs.shape

(123287, 9952)

In [21]:
id2url[570528]

'http://images.cocodataset.org/train2017/000000570528.jpg'

In [22]:
concat_captions = {stimulus: '\n'.join(toolz.pluck('text', trials))
                   for stimulus, trials in toolz.groupby('stimulus', trial_data).items()}
concat_captions.keys()

dict_keys([275449, 396295, 431140, 227326, 200451, 223777, 247576, 71815, 240275])

In [23]:
# print(concat_captions[71815])
# print(concat_captions[275449])
print(concat_captions[396295])

a small bath with a shower with a blue mat on the floor
someome is using a shower but it's hard to see due to the opaque glass
brown towel is hanging on a sliding shower door
a beige towel hangs over the rightmost shower door both of which are wet with water
a tan towel hangs in front of a glass shower
i see a standing shower with two hazy sliding glass doors and a towel hanging off of one.
a rusty and dirty shower in the bathroom has a tan towel over its handle
a sliding glass shower door with a bath mat hanging on it
a closed shower door with crackled glass encases some hanging colored toiletries
a shower with a towel hanging on the handle of the door
a tan towel is hanging from a chrome handle on a textured glass shower door
a person is taking a shower in a shower with very opaque sliding doors
a beige towel hanging on a translucent glass shower door
sliding glass, frosted, shower doors with a tan towel hanging on the handle and a white toilet with a blue floor rug
a toilet and a sh

For 275449, the foil images are fixated on the 'wine'. That is unsurprising: all but one caption mentions it, and "wine" is probably a less common word than "cat".

In [24]:
def get_similar_images(caption, n=10):
    """Return the COCO ids of the `n` images most similar to `caption`, best match first.

    The caption is vectorized with `cap_vectorizer` and scored against every row
    of `caption_vecs` by dot product; row i is looked up as `images[i]`.

    caption: text to search for.
    n: number of image ids to return; 0 returns an empty list.
    """
    query_vec = cap_vectorizer.transform([caption])
    # `.toarray()` rather than the `.A` shorthand, which recent SciPy releases deprecate
    # for sparse matrices (presumably what `caption_vecs` is — TODO confirm).
    similarity = caption_vecs.dot(query_vec.T).toarray().ravel()
    # Reverse, then truncate: identical to `[-n:][::-1]` for n >= 1, but n == 0 now
    # yields nothing instead of every image (`[-0:]` selects the whole array).
    ranked = np.argsort(similarity)[::-1][:n]
    return [images[idx]['cocoid'] for idx in ranked]
query_caption = concat_captions[396295].replace('wine', '') #trial_data[0]['text']
# query_caption = "a rusty and dirty shower in the bathroom has a tan towel over its handle"
# query_caption = "a sliding glass shower door with a bath mat hanging on it"
query_caption = "a closed shower door with crackled glass encases some hanging colored toiletries"
print(query_caption)
HTML(show_images(get_similar_images(query_caption)))

a closed shower door with crackled glass encases some hanging colored toiletries


In [25]:
def get_foil_set(*, stimulus, caption, rs):
    """Return a shuffled list of 10 image ids that is guaranteed to contain `stimulus`.

    All arguments are keyword-only.

    stimulus: id of the target image.
    caption: query text passed to `get_similar_images`.
    rs: object with an in-place `shuffle(list)` method (e.g. `np.random.RandomState`).

    If the target is not already among the similar images, it overwrites the
    last (least similar) one and a message is printed.
    """
    foil_ids = get_similar_images(caption, n=10)
    target_missing = stimulus not in foil_ids
    if target_missing:
        print("Inserting", stimulus, 'into foil set')
        foil_ids[-1] = stimulus
    # Shuffle in place so the target's position carries no information.
    rs.shuffle(foil_ids)
    return foil_ids
stimulus = trial_data[1]['stimulus']
get_foil_set(stimulus=stimulus, caption=concat_captions[stimulus], rs=np.random.RandomState(1234))

[510852, 212082, 372775, 396295, 409842, 262284, 233737, 503200, 490872, 98257]

In [26]:
rs = np.random.RandomState(1234)
foil_sets = {
    stimulus: get_foil_set(stimulus=stimulus, caption=concat_captions[stimulus], rs=rs)
    for stimulus in sorted(concat_captions.keys())}

Inserting 200451 into foil set
Inserting 223777 into foil set
Inserting 240275 into foil set
Inserting 431140 into foil set


Group tasks so that (1) each annotator never gets the same target image twice and (2) each annotator never sees two captions from the same person. Do this by assigning trials round-robin to annotators:

annotator 1: p0i0,p1i1,p2i2,...p9i9
annotator 2: p10i0,p11i1,...p19i9

or...

In [27]:
import random
def shuffled(lst, rng=None):
    """Return a shuffled copy of `lst` as a new list, leaving the input untouched.

    lst: any iterable (list, tuple, dict view, ...); it is copied into a list first.
    rng: optional object with an in-place `shuffle(list)` method, such as
        `random.Random(seed)`, for reproducible results. When omitted, the
        global `random` module is used, exactly as before.
    """
    result = list(lst)
    source = random if rng is None else rng
    source.shuffle(result)
    return result

In [28]:
# Seed the global RNG (same seed as the foil-set RandomState) so that the assignment
# of captions to annotators is reproducible across kernel restarts.
random.seed(1234)
trials_by_img = {
    stimulus: shuffled(trials)
    for stimulus, trials in toolz.groupby('stimulus', trial_data).items()}
annotators = []
# Deal one caption per image to each annotator until some image runs out of captions.
# NOTE(review): this guarantees each annotator sees every target image once; nothing here
# prevents two captions by the same writer from landing with one annotator — confirm
# whether that constraint still needs enforcing.
while all(trials for trials in trials_by_img.values()):
    trials_for_annotator = [trials.pop(0) for trials in trials_by_img.values()]
    annotators.append(shuffled(trials_for_annotator))
# Every image must have had the same number of captions; otherwise some are left undealt.
assert all(len(trials) == 0 for trials in trials_by_img.values())

In [29]:
trials_by_img

{275449: [],
 396295: [],
 431140: [],
 227326: [],
 200451: [],
 223777: [],
 247576: [],
 71815: [],
 240275: []}

In [30]:
annotators[4]

[{'participant': '4ggxj8',
  'block': 1,
  'idx_in_block': 1,
  'idx': 4,
  'condition': 'general',
  'text': 'a man in a red shirt and two children stand on the beach and fly kites',
  'stimulus': 200451},
 {'participant': '3267ww',
  'block': 2,
  'idx_in_block': 1,
  'idx': 7,
  'condition': 'general',
  'text': 'an adult is holding a tennis racket trying to hit a ball',
  'stimulus': 71815},
 {'participant': 'qmwvwv',
  'block': 2,
  'idx_in_block': 0,
  'idx': 6,
  'condition': 'general',
  'text': 'a red two story city bus is driving in a european city with pedestrians all around',
  'stimulus': 247576},
 {'participant': '9f5xwx',
  'block': 0,
  'idx_in_block': 0,
  'idx': 0,
  'condition': 'specific',
  'text': 'a cat behind a glass vase',
  'stimulus': 275449},
 {'participant': '26w4jv',
  'block': 0,
  'idx_in_block': 1,
  'idx': 1,
  'condition': 'specific',
  'text': 'a shower with a towel hanging on the handle of the door',
  'stimulus': 396295},
 {'participant': '5c39rx',

In [32]:
len(annotators)

18

In [33]:
def make_task(stimulus, text):
    """Describe one guess-the-image task as a plain dict.

    stimulus: id of the target image; must be a key of `foil_sets`.
    text: the caption shown to the annotator.

    Returns a dict with `description` (the caption), `correct_idx` (position of
    the target within its foil set) and `images` (URLs of the foil set, in order).
    """
    candidates = foil_sets[stimulus]
    target_position = candidates.index(stimulus)
    image_urls = []
    for image_id in candidates:
        image_urls.append(id2url[image_id])
    return {
        'description': text,
        'correct_idx': target_position,
        'images': image_urls,
    }

In [34]:
trial = annotators[0][0]
make_task(trial['stimulus'], trial['text'])

{'description': 'a black and white photo or a large man and a woman cutting thwir wedding cake',
 'correct_idx': 7,
 'images': ['http://images.cocodataset.org/train2017/000000352892.jpg',
  'http://images.cocodataset.org/val2017/000000263969.jpg',
  'http://images.cocodataset.org/train2017/000000564058.jpg',
  'http://images.cocodataset.org/train2017/000000082990.jpg',
  'http://images.cocodataset.org/train2017/000000466456.jpg',
  'http://images.cocodataset.org/train2017/000000312289.jpg',
  'http://images.cocodataset.org/train2017/000000086147.jpg',
  'http://images.cocodataset.org/train2017/000000227326.jpg',
  'http://images.cocodataset.org/train2017/000000119065.jpg',
  'http://images.cocodataset.org/train2017/000000561454.jpg']}

In [36]:
pd.DataFrame([
    json.dumps([make_task(trial['stimulus'], trial['text']) for trial in annotator_trials])
    for annotator_trials in annotators], columns=['task']
).to_csv(str(paths.data / 'anno-tasks' / 'guesses.csv'), index=False)

### Load MTurk results so far

In [None]:
result_files = list((paths.data / 'mturk').glob('*-guesstheimage.csv'))
mturk_nafc_results = (
    pd.concat([pd.read_csv(str(f)) for f in result_files], axis=0, ignore_index=True)
    if len(result_files)
    else pd.DataFrame([], columns=['Answer.description', 'Answer.guesses', 'Input.correct_idx']))
mturk_nafc_results = mturk_nafc_results.rename(columns={'Input.correct_idx': 'correctIdx'})
print("Loaded", len(mturk_nafc_results), "guess task results")

In [None]:
mturk_nafc_results['guesses'] = mturk_nafc_results['Answer.guesses'].map(json.loads)

In [None]:
mturk_nafc_results['guess_indices'] = [[guess['idx'] for guess in row.guesses] for row in mturk_nafc_results.itertuples()]
mturk_nafc_results['guessed_right_sometime'] = [row.correctIdx in row.guess_indices for row in mturk_nafc_results.itertuples()]

In [None]:
# Keep only the responses in which the worker eventually picked the target image.
# `.copy()` makes the result an independent frame, so adding columns to it in later
# cells does not trigger pandas' SettingWithCopyWarning.
mturk_nafc_results = mturk_nafc_results[mturk_nafc_results['guessed_right_sometime']].copy()
print(len(mturk_nafc_results), "remain")

In [None]:
mturk_nafc_results['num_guesses'] = [row.guess_indices.index(row.correctIdx) + 1 for row in mturk_nafc_results.itertuples()]

In [None]:
num_responses_by_caption = mturk_nafc_results.groupby('Answer.description').size().to_dict()

### Generate tasks remaining

In [None]:
trials_todo = [trial for trial in trial_data if num_responses_by_caption.get(trial['text'], 0) < 3]
len(trial_data), len(trials_todo)

In [None]:
i = 0
while True:
    out_fn = paths.top_level / 'HITs' / f'{datetime.date.today().isoformat()}-{i}-nAFC.csv'
    if not out_fn.exists():
        break
    i += 1
out_fn

In [None]:
# make_task takes (stimulus, text) and is deterministic given `foil_sets`, so the
# RandomState that used to be passed here is no longer needed.
# NOTE(review): the HIT template and the MTurk result columns refer to image_0_url ...
# image_9_url, whereas make_task returns a single `images` list — confirm the CSV
# layout before uploading.
pd.DataFrame(
    [make_task(trial['stimulus'], trial['text']) for trial in trials_todo]
).to_csv(out_fn, index=False)

### Generate the actual HIT text...

In [None]:
from jinja2 import Template
# Render the Jinja template, filling each field with a literal ${...} placeholder.
template_path = paths.top_level / 'HITs' / '2018-05-04-image-description-match.jinja.html'
with open(template_path) as template_file:  # context manager closes the handle promptly
    template_source = template_file.read()
html = Template(template_source).render(dict(
    description='${description}',
    images=['${image_%d_url}' % i for i in range(10)]))

In [None]:
html2 = html
trial = trial_data[18+7*9]
for k, v in make_task(trial['stimulus'], trial['text']).items():
    html2 = html2.replace('${' + k + '}', str(v))
HTML('<div style="height: 1000px; position: relative;">'+html2+'</div>')
# print(html2)

In [None]:
import subprocess
subprocess.Popen('pbcopy', stdin=subprocess.PIPE).communicate(html.encode('utf-8'))

# Analyze MTurk results

In [None]:
mturk_nafc_results.groupby('Answer.description').num_guesses.mean().to_dict()

In [None]:
%matplotlib inline

In [None]:
(mturk_nafc_results['WorkTimeInSeconds'][mturk_nafc_results['WorkTimeInSeconds'] < 5*60] / 60).hist(bins=30)

In [None]:
np.median(mturk_nafc_results['WorkTimeInSeconds'] / 60) * 9/60

In [None]:
# Number of guess-task annotations to collect.
num_annotations = (
    15 # participants
    * 3 # conditions per participant
    * 3 # captions per condition
    - 1 # image not shown
) * 3 # annotators per description

# Total cost. The closing lines of this expression had been split off into a cell of
# their own, where they were a syntax error; the calculation is rejoined here.
estimated_cost = (
    num_annotations
    * .24 # reward per annotator
) * 1.2 # MTurk 20% fee
num_annotations, estimated_cost

Did the same worker see the same target image multiple times?

In [None]:
# Peek at one input URL. `data` is not defined until later cells, so read the first
# row of the results frame directly.
mturk_nafc_results.iloc[0]['Input.image_0_url']

In [None]:
mturk_nafc_results['target_image_url'] = [row['Input.image_'+str(row['correctIdx'])+"_url"] for _, row in mturk_nafc_results.iterrows()]

In [None]:
# A (worker, target image) pair is "bad" when it occurs more than once, i.e. the same
# worker was shown the same target image in several tasks. Counting with a groupby
# avoids the row-by-row loop, which also rebound the notebook-level names `data` and `img`.
pair_counts = mturk_nafc_results.groupby(['WorkerId', 'target_image_url']).size()
bad_worker_image_pairs = set(pair_counts[pair_counts > 1].index)

# Flag every row that belongs to a repeated pair (all occurrences, not only the repeats).
annotation_row_is_bad = [
    pair in bad_worker_image_pairs
    for pair in zip(mturk_nafc_results['WorkerId'], mturk_nafc_results['target_image_url'])
]
mturk_nafc_results['row_is_bad'] = annotation_row_is_bad

In [None]:
mturk_nafc_results['row_is_bad'].mean()

In [None]:
guess_results = mturk_nafc_results[~mturk_nafc_results['row_is_bad']].rename(columns={'Answer.description': 'text'})

In [None]:
len(mturk_nafc_results)

In [None]:
len(guess_results), len(trial_data)

In [None]:
trial_data

In [None]:
annotator_level_data = pd.merge(
    pd.DataFrame(trial_data).rename(columns={'participant': 'writer'}),
    guess_results.rename(columns={'WorkerId': 'guesser'}).drop(['HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'RequesterAnnotation', 'guesses'], axis=1),
    on='text', validate='1:m', how='right')
annotator_level_data
    #.groupby().num_guesses.mean().to_frame('mean_num_guesses'),
#     left_on='text', right_index=True).groupby('condition').mean_num_guesses.aggregate(['mean', 'std'])

In [None]:
annotator_level_data.columns

In [None]:
annotator_level_data.to_csv('annotator_level_data_2018-05-15.csv', index=False)

In [None]:
%load_ext rpy2.ipython

In [None]:
%%R
library(lme4)
library(ggplot2)

In [None]:
%%R -i annotator_level_data
(model = lmer(num_guesses ~ condition + (1|writer) + (1|guesser) + (1|target_image_url), annotator_level_data))

In [None]:
%%R -i annotator_level_data
(null_model = lmer(num_guesses ~ (1|writer) + (1|guesser) + (1|target_image_url), annotator_level_data))

In [None]:
%%R
library(pbkrtest)

In [None]:
%%R
(kr <- KRmodcomp(model, null_model))

In [None]:
%%R -i annotator_level_data
(glm.full = glmer(num_guesses ~ condition + (1|writer) + (1|target_image_url), annotator_level_data, family=poisson()))
#  (1|guesser)

In [None]:
%%R
(glm.null = glmer(num_guesses ~ (1|writer) + (1|target_image_url), annotator_level_data, family=poisson()))

In [None]:
%%R
confint(glm.full)

In [None]:
%%R
anova(glm.full, glm.null, test="Chisq")

In [None]:
# One row per trial, with a `specificity` value looked up by the stripped caption text.
# NOTE(review): `specificity_lookup` is not defined anywhere in the visible notebook;
# presumably a mapping from caption text to a specificity score — confirm where it
# comes from before running this on a fresh kernel.
results = pd.DataFrame([dict(trial, specificity=specificity_lookup[trial['text'].strip()]) for trial in trial_data])
# Store the grouping columns as categoricals.
for col in ['condition', 'participant']:
    results[col] = results[col].astype('category')
results.info()

In [None]:
results.groupby('condition').specificity.mean()

In [None]:
pd.DataFrame(trial_data).sample(frac=1.0).sort_values('stimulus').to_csv('trial_data_by_stimulus.csv')

# How many images does this caption apply to?

In [None]:
data = pd.read_csv("/Users/kcarnold/Downloads/Submitted Captions - Sheet1.csv").iloc[:,:6]

In [None]:
data

In [None]:
data.shape

In [None]:
data = data.dropna().copy()
data.shape

In [None]:
data['is_unique'] = (data.iloc[:,5] == '1')
data.is_unique.mean()

In [None]:
data.groupby(['block', 'idx_in_block', 'condition']).is_unique.mean()

In [None]:
data['participant'] = data['participant'].astype('category')
data['condition'] = data['condition'].astype('category')

In [None]:
data.groupby('condition').is_unique.mean()

In [None]:
%%R -i data
# ARTool is only attached in a later cell of this notebook; load it here so that
# art() resolves when the notebook is run top to bottom on a fresh kernel.
library(ARTool)
transformed <- art(is_unique ~ condition + (1|participant), data=data)
summary(transformed)
anova(transformed)

In [None]:
example = trial_data[-1]['text']
example

Concepts: traffic light. COCO doesn't have "pedestrian crossing sign". There are 4330 images with traffic lights in them in COCO. That's far too many, so we look at Visual Genome instead.

Visual Genome synsets are potentially best, but they're sometimes inaccurate. e.g., "18 wheeler" is "cyclist.n.01". So let's consider an object a match if it matches either the synset or the object name.

In [None]:
vg_base = pathlib.Path('/Data/VisualGenome')
# Use a context manager so the file handle is closed once the JSON has been parsed.
with open(vg_base / 'objects.json') as objects_file:
    image_objects = json.load(objects_file)

In [None]:
img_by_id = {img['image_id']: img for img in image_objects}

In [None]:
# Use a context manager so the file handle is closed once the JSON has been parsed.
with open(vg_base / 'object_synsets.json') as synsets_file:
    obj_synsets = json.load(synsets_file)

In [None]:
# Use a context manager so the file handle is closed once the JSON has been parsed.
with open(vg_base / 'attributes.json') as attributes_file:
    obj_attributes = json.load(attributes_file)

In [None]:
len(obj_attributes), len(image_objects)

In [None]:
obj_attributes[0].keys()

In [None]:
attributes_by_img = {att['image_id']: att['attributes'] for att in obj_attributes}

In [None]:
attributes_by_img[61514]

In [None]:
# def has_object(obj_name):
#     return {
#         img['image_id'] for img in image_objects
#         if any(obj_name in '\n'.join(obj['names']) for obj in img['objects'])
#            }
def has_object(imgid, obj_name):
    """Return True if any object recorded for image `imgid` has `obj_name` in its names.

    The match is a substring test against the object's names joined by newlines,
    so 'sign' also matches e.g. 'pedestrian sign'.
    """
    for obj in attributes_by_img[imgid]:
        joined_names = '\n'.join(obj['names'])
        if obj_name in joined_names:
            return True
    return False

In [None]:
# def has_synset(obj_synset):
#     return {
#         img['image_id'] for img in image_objects
#         if any(obj_synset in obj['synsets'] for obj in img['objects'])}
def has_synset(imgid, obj_synset):
    """Return True if any object recorded for image `imgid` lists `obj_synset` among its synsets."""
    for obj in attributes_by_img[imgid]:
        if obj_synset in obj['synsets']:
            return True
    return False

In [None]:
def _name_matches(obj, obj_name):
    """True if `obj_name` is a substring of the object's newline-joined names."""
    return obj_name in '\n'.join(obj['names'])


def _synset_matches(obj, obj_synset):
    """True if `obj_synset` is one of the object's synsets."""
    return obj_synset in obj['synsets']


def _has_attr(obj, attr):
    """True if `attr` is listed in the object's attributes (a missing list counts as empty)."""
    return attr in obj.get('attributes', [])


def has_obj_with_attr(imgid, obj_name, attr):
    """Return True if image `imgid` has an object whose names match `obj_name` and that carries `attr`."""
    for obj in attributes_by_img[imgid]:
        if _name_matches(obj, obj_name) and _has_attr(obj, attr):
            return True
    return False


def has_synset_with_attr(imgid, obj_synset, attr):
    """Return True if image `imgid` has an object with synset `obj_synset` that carries `attr`."""
    for obj in attributes_by_img[imgid]:
        if _synset_matches(obj, obj_synset) and _has_attr(obj, attr):
            return True
    return False


def has_obj_without_attr(imgid, obj_name, attr):
    """Return True if image `imgid` has an object whose names match `obj_name` and that lacks `attr`."""
    for obj in attributes_by_img[imgid]:
        if _name_matches(obj, obj_name) and not _has_attr(obj, attr):
            return True
    return False


def has_synset_without_attr(imgid, obj_synset, attr):
    """Return True if image `imgid` has an object with synset `obj_synset` that lacks `attr`."""
    for obj in attributes_by_img[imgid]:
        if _synset_matches(obj, obj_synset) and not _has_attr(obj, attr):
            return True
    return False

In [None]:
obj_synsets['pedestrian sign']

In [None]:
# candidates = (
#     (has_object('pedestrian sign') | has_object('pedestrian crossing sign') | has_object('crossing sign') | has_object('sign')) &
#     (has_object('traffic light') | has_synset('traffic_light.n.01'))
# )
# Images that contain some kind of pedestrian/crossing sign (or any yellow sign) together
# with a traffic light that is not marked red. Boolean `or`/`and` replace the bitwise
# `|`/`&`: the result is the same for these bool-returning helpers, but evaluation now
# short-circuits instead of running all six scans for every image.
candidates = {
    imgid for imgid in attributes_by_img
    if (
        has_object(imgid, 'pedestrian sign')
        or has_object(imgid, 'pedestrian crossing sign')
        or has_object(imgid, 'crossing sign')
        or has_obj_with_attr(imgid, 'sign', 'yellow')
    ) and (
        has_obj_without_attr(imgid, 'traffic light', 'red')
        or has_synset_without_attr(imgid, 'traffic_light.n.01', 'red')
    )}
len(candidates)

In [None]:
# Wrap in HTML() like the other show_images calls; a bare string would be shown as text.
# NOTE(review): `candidates` holds keys of `attributes_by_img` (Visual Genome image ids),
# while show_images looks ids up in `id2img`, which is keyed by COCO id — confirm the ids
# are comparable, or map them first.
HTML(show_images(candidates))

In [None]:
[img['image_id'] for img in image_objects if '61514' in img.get('image_url', '')]
#Image(img_by_id[61514]['image_url'])

In [None]:
img_by_id[61514]

# Measuring Specificity

We use paired comparisons to analyze specificity and accuracy. For a target image $x$ and a fixed set of imposter images $Y$, the **specific accuracy** of a caption is the fraction of comparisons that chose $x$. 

We start with our dataset of paired comparisons.

In [None]:
captions = [
    "exactly how are both the dog and the person going to fit on that skateboard?",
    "the dark haired dog is trying to ride on the skateboard.",
    "a person in shorts and a black dog both have one foot on a skateboard.",
    "a dog with a black head and black legs and ears standing up has one black paw on a black skateboard with white wheels and a guy with black and white shoes and white socks has one foot on the skateboard also and there are bikes and other people in the background"
]

In [None]:
alternatives = 'dog-and-guy-on-skateboard just-dog-on-skateboard guy-on-skateboard-holding-dog dog-and-guy-next-to-skateboard'.split()
target = alternatives[0]
imposters = alternatives[1:]
applies_to = [
    'dog-and-guy-on-skateboard dog-and-guy-next-to-skateboard'.split(),
    'just-dog-on-skateboard'.split(),
    'dog-and-guy-on-skateboard'.split(),
    'dog-and-guy-on-skateboard just-dog-on-skateboard guy-on-skateboard-holding-dog dog-and-guy-next-to-skateboard'.split()
]
applies_to = {cap: tgts for cap, tgts in zip(captions, applies_to)}
applies_to

In [None]:
import random
random.seed(0)
pairs = [[target, imposter] for imposter in imposters]
for pair in pairs:
    random.shuffle(pair)
pairs

In [None]:
def fake_answer_pairs_for_caption(applies, pairs):
    """Simulate one annotator's pick (0 or 1) for every pair of images.

    applies: collection of images the caption applies to.
    pairs: iterable of two-element (first, second) image pairs.

    For each pair the pick is drawn with `random.choice` from the positions whose
    image is in `applies`; if neither image is, it is drawn from both positions.
    Returns the list of picked positions, one per pair.
    """
    def pick(first, second):
        eligible = [
            position
            for position, image in enumerate((first, second))
            if image in applies
        ]
        return random.choice(eligible or [0, 1])

    return [pick(first, second) for first, second in pairs]
fake_answer_pairs_for_caption(applies_to[captions[0]], pairs)

In [None]:
fake_comparisons_data = []
for caption in captions:
    for annotator in range(5):
        for pair, outcome in zip(pairs, fake_answer_pairs_for_caption(applies_to[caption], pairs)):
            picked = pair[outcome]
            fake_comparisons_data.append(dict(
                caption=caption,
                annotator=annotator,
                pair=pair,
                picked=picked))

In [None]:
data = pd.DataFrame(fake_comparisons_data)
len(data)

In [None]:
data['picked_correct'] = data['picked'] == 'dog-and-guy-on-skateboard'
data.groupby('caption').picked_correct.mean().sort_values()

# Final analyses

We find a main effect of writing condition on outcome specificity.

In [None]:
%load_ext rpy2.ipython

In [None]:
results = pd.DataFrame([
    dict(participant_id=participant_id, condition=condition)
    for participant_id in 'abc def ghi'.split() for condition in 'general specific norecs'.split()
])
results['participant_id'] = results['participant_id'].astype('category')
results['condition'] = results['condition'].astype('category')
results['specificity'] = np.random.randn(len(results))

In [None]:
%%R
#install.packages("ARTool")
library(ARTool)

In [None]:
%%R -i results
summary(results)

In [None]:
%%R -i results
# The random-effect term must name a column that exists in `results`; the frame is
# built with `participant_id`, not `participant`.
transformed <- art(specificity ~ condition + (1|participant_id), data=results)
summary(transformed)
anova(transformed)