## Preliminaries

In [1]:
%cd ~/code/textrec/

/Users/kcarnold/code/textrec


In [2]:
%matplotlib inline
import datetime
import json
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import toolz
from IPython.display import Image, HTML

In [3]:
# `reload` is not a builtin on Python 3; without this import the cell raises
# NameError on a fresh kernel.
from importlib import reload

from textrec.paths import paths
from textrec import analysis_util, util, notebook_util, automated_analyses
# Reload the project modules so the kernel does not keep using stale copies.
reload(analysis_util), reload(util), reload(notebook_util), reload(automated_analyses)
# Imported after the reload so these names bind to the reloaded module's objects.
from textrec.notebook_util import images, id2img, id2url, show_images

Loading COCO captions
Loading COCO id2url
Done


  from ._conv import register_converters as _register_converters


Loading ONMT models...
coco_lm_adam_acc_46.00_ppl_16.32_e10_nooptim.pt
Loading model parameters.
coco_cap_adam_acc_48.73_ppl_12.56_e10_nooptim.pt
Loading model parameters.
Ready.
Loading SpaCy...done
Loading COCO captions
Loading COCO id2url
Done
Loading SpaCy...done


In [4]:
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import onmt.io

## Load results of writing experiment

Prerequisites: Run `textrec.logs_to_csv {batch_name}` and `textrec.gruntwork {batch_name}`.

In [5]:
batch = 'spec1'
experiment_level_data = pd.read_csv(paths.analyzed / f'experiment_{batch}.csv')
block_level_data = pd.read_csv(paths.analyzed / f'block_{batch}.csv')
trial_level_data = pd.read_csv(paths.analyzed / f'trial_withmanual_{batch}.csv')
helpful_ranks_by_condition = pd.read_csv(paths.analyzed / f'helpful_ranks_by_condition_{batch}.csv')

# Output interventions

**Concept**: One reason that writing is difficult is that we have to "simulate the reader", imagining what's going on in their mind, so that we can construct the desired concept there. A system could help by simulating the reader for us and giving us a peek into what our writing is doing inside their mind. That peek could look like:

* what image is the writing conjuring in their mind? -> *show an image*
* what inferences are they drawing? -> *show statements that are entailed by our writing*
* what questions do they have? -> *show questions that writing similar to ours answers in different ways*

See "Chapter Intros" for a more fleshed-out version of this concept.

In [6]:
joined_captions = {img['cocoid']: util.join_captions(img) for img in notebook_util.images}

In [7]:
cap_vectorizer, caption_vecs = util.get_vectorized_captions()

In [8]:
caption_vecs.shape

(123287, 9952)

In [9]:
# Disabled cell: would build one newline-joined string of all trial texts per stimulus.
# NOTE(review): `concat_captions` is read by later cells (the `get_foil_set` call and
# `foil_sets`), but this is the only place it is assigned and the branch never runs.
# `trial_data` is not defined in the visible notebook either — confirm where both are
# expected to come from before relying on Restart & Run All.
if False:
    concat_captions = {stimulus: '\n'.join(toolz.pluck('text', trials))
                       for stimulus, trials in toolz.groupby('stimulus', trial_data).items()}
    concat_captions.keys()

In [30]:
query_word = 'wheelchair'
from collections import Counter
# Tally which token comes immediately after each occurrence of `query_word`
# across every tokenized caption sentence. An occurrence that is the last token
# of its sentence has no follower and contributes nothing.
follows = Counter()
for img in notebook_util.images:
    for sent in img['sentences']:
        tokens = sent['tokens']
        # Walk adjacent (token, successor) pairs.
        for current_token, next_token in zip(tokens, tokens[1:]):
            if current_token == query_word:
                follows[next_token] += 1

In [31]:
follows.most_common()

[('is', 11),
 ('with', 6),
 ('on', 5),
 ('holding', 5),
 ('sitting', 5),
 ('and', 4),
 ('playing', 4),
 ('while', 3),
 ('pulling', 3),
 ('sits', 3),
 ('in', 2),
 ('walking', 2),
 ('has', 2),
 ('accessible', 1),
 ('plays', 1),
 ('waiting', 1),
 ('looks', 1),
 ('near', 1),
 ('prepares', 1),
 ('are', 1),
 ('at', 1),
 ('dragging', 1),
 ('riders', 1),
 ('sets', 1),
 ('works', 1),
 ('sign', 1),
 ('driving', 1),
 ('ramp', 1),
 ('watched', 1),
 ('as', 1),
 ('shakes', 1),
 ('get', 1),
 ('next', 1),
 ('happily', 1),
 ('bound', 1),
 ('opening', 1),
 ('poses', 1),
 ('taking', 1),
 ('having', 1),
 ('beside', 1),
 ('parked', 1),
 ('flipping', 1),
 ('down', 1),
 ('looking', 1),
 ('doesnt', 1),
 ('stops', 1),
 ('desk', 1),
 ('users', 1)]

In [29]:
len(notebook_util.images)

123287

In [26]:
follows.most_common()

[('parking', 6),
 ('bathroom', 5),
 ('accessible', 4),
 ('man', 3),
 ('restroom', 3),
 ('toilet', 3),
 ('sign', 3),
 ('bar', 2),
 ('people', 2),
 ('area', 2),
 ('with', 2),
 ('spot', 1),
 ('bars', 1),
 ('person', 1),
 ('accessory', 1),
 ('bus', 1),
 ('equipped', 1),
 ('urinal', 1),
 ('sticker', 1),
 ('chair', 1),
 ('jockeys', 1),
 ('rails', 1),
 ('access', 1),
 ('public', 1),
 ('veteran', 1),
 ('skier', 1),
 ('athletes', 1)]

In [20]:
def get_similar_images(caption, n=10):
    """Return the COCO ids of the `n` images whose captions best match `caption`.

    The caption is vectorized with the notebook-level `cap_vectorizer` and scored
    against every row of `caption_vecs` by dot product. Row indices are used to
    look up entries of `images`.

    Args:
        caption: Query text.
        n: Maximum number of ids to return. Values <= 0 yield an empty list.

    Returns:
        A list of `cocoid` values, highest similarity first.
    """
    if n <= 0:
        # Without this guard `[-0:]` selects the whole array, so n=0 would
        # return every image instead of none.
        return []
    query_vec = cap_vectorizer.transform([caption])
    # `.toarray()` is the documented dense conversion; the `.A` shorthand is not
    # available on every SciPy sparse container.
    similarity = caption_vecs.dot(query_vec.T).toarray().ravel()
    best_first = np.argsort(similarity)[-n:][::-1]
    return [images[idx]['cocoid'] for idx in best_first]
#query_caption = concat_captions[396295].replace('wine', '') #trial_data[0]['text']
# query_caption = "a rusty and dirty shower in the bathroom has a tan towel over its handle"
# query_caption = "a sliding glass shower door with a bath mat hanging on it"
# query_caption = "a closed shower door with crackled glass encases some hanging colored toiletries"
# query_caption = "a brown train pulls into the tracks next to some colorful buildings"
# query_caption = "a red city bus heading down the street"
# query_caption = "a red double-decker bus heading down the street"
# query_caption = "a red double-decker bus heading down the wide street with buildings on both sides"
query_caption = "disabled"
# print(query_caption)
HTML(show_images(get_similar_images(query_caption, n=50)))

In [10]:
# Score one example caption against every stored caption vector (dot product),
# flattened to a 1-D array with one score per row of `caption_vecs`.
query_caption = "a brown train pulls into the tracks next to some colorful buildings"
query_vec = cap_vectorizer.transform([query_caption])
similarity_column = caption_vecs.dot(query_vec.T)
similarity = similarity_column.toarray().ravel()

How similar should we count as similar?

In [25]:
similarity_argsort = np.argsort(similarity)

In [34]:
HTML(show_images([images[similarity_argsort[int(.978*len(similarity_argsort))]]['cocoid']]))

tf-idf similarity mixes relevant (trains at stations) with irrelevant (colorful kites). I wonder if we need visual similarity as well, or if just better caption similarity (e.g., sentence vectors, Transformer LM, etc.) would do it... or maybe better data, like Visual Genome? Let's push ahead though.

### Approach 1: just the most similar images

In [38]:
print(query_caption)
HTML(show_images([images[idx]['cocoid'] for idx in similarity_argsort[-5:][::-1]]))


a brown train pulls into the tracks next to some colorful buildings


These aren't terribly inspiring, because it's immediately obvious that for all but one, the train isn't brown... the listener "doesn't get it". If that's important, then we probably need better data (VG). Is there a simple way to filter those by what actually applies? How about argmaxes(p(img|caption))?

### Approach 1a: Use captioning model to find images that match caption.

In [41]:
from textrec import onmt_model_2

In [42]:
coco_cap = onmt_model_2.models['coco_cap']

In [47]:
import h5py
# Open read-only. This notebook only reads features, and h5py releases before
# 3.0 default to a read/write mode when no mode is given.
f = h5py.File(str(paths.imgdata_h5_all), 'r')

def load_vecs(imgids, num_objs=36, feature_dim=2048):
    """Stack the stored feature arrays for `imgids` into one tensor.

    Each id is looked up (as a string key) in the notebook-level store `f`, and
    its array is written into column `i` of a `(num_objs, len(imgids),
    feature_dim)` buffer.

    Args:
        imgids: Sequence of image ids; `str(id)` must be a key of `f`.
        num_objs: Size of the first axis of the buffer.
        feature_dim: Size of the last axis of the buffer.

    Returns:
        A `Variable` wrapping a `FloatTensor` copy of the buffer, created with
        `volatile=True`.
    """
    features = np.empty((num_objs, len(imgids), feature_dim))
    for column, image_id in enumerate(imgids):
        features[:, column, :] = f[str(image_id)][:]
    feature_tensor = torch.FloatTensor(features)
    return Variable(feature_tensor, volatile=True)


def encode_vecs(self, vecs):
    """Build initial decoder states from image feature vectors.

    The features are averaged over the first axis, passed through
    `self.init_hidden` and `self.init_cell`, and squashed with tanh. Each
    result gets a new leading axis so the pair resembles the final state of a
    sequence RNN (one entry per decoder layer).

    Args:
        self: Object providing `init_hidden` and `init_cell` callables.
        vecs: Feature tensor; laid out as objs x batch_size x feature_dim
            according to the original comments.

    Returns:
        `((hidden, cell), vecs)`, where `vecs` is returned unchanged.
    """
    # Average over axis 0 (the object axis, per the layout noted above).
    pooled = torch.mean(vecs, dim=0)

    def initial_state(projection):
        # tanh(projection(pooled)) with an extra leading axis.
        return F.tanh(projection(pooled)).unsqueeze(0)

    hidden = initial_state(self.init_hidden)
    cell = initial_state(self.init_cell)
    return (hidden, cell), vecs

In [73]:
# Alternative queries kept for quick switching:
# query_caption = "a brown train pulls into the tracks next to some colorful buildings"
# query_caption = "a train on the tracks in front of blue buildings"
query_caption = "a shower with a blue mat on the floor in front of it"
query_vec = cap_vectorizer.transform([query_caption])
similarity = caption_vecs.dot(query_vec.T).toarray().ravel()

# Shortlist the `n_similar` best text matches (best first), then load and encode
# their image features.
n_similar = 500
most_similar_rows = np.argsort(similarity)[::-1][:n_similar]
image_set = [images[idx]['cocoid'] for idx in most_similar_rows]
vecs = load_vecs(image_set)
encoder_final, memory_bank = encode_vecs(coco_cap.model.encoder, vecs)

def eval_logprobs_varying_image(model, imgids, tgt_field, tgt_text):
    """Score one caption against many images with a captioning model.

    Features for every id in `imgids` are loaded and encoded, the decoder is fed
    the same `tgt_text` for each image, and the per-token `nll_loss` values are
    summed over the sequence.

    Args:
        model: Model exposing `encoder`, `decoder` and `generator`.
        imgids: Image ids to score, passed to `load_vecs`.
        tgt_field: Field whose `process` method numericalizes the caption.
        tgt_text: Preprocessed target caption, repeated once per image.

    Returns:
        A tensor with one summed loss per image, in `imgids` order. Assuming the
        generator emits log-probabilities (which `nll_loss` expects — confirm),
        a lower value means the caption is more likely for that image.
    """
    batch_size = len(imgids)

    # Use the arguments. The previous version read the notebook globals
    # `image_set` and `coco_cap` here, so `imgids` and `model` were ignored.
    vecs = load_vecs(imgids)
    encoder_final, memory_bank = encode_vecs(model.encoder, vecs)

    decoder_state = model.decoder.init_decoder_state(vecs, memory_bank=memory_bank, encoder_final=encoder_final)

    # "process" handles padding and numericalization
    tgt = tgt_field.process([tgt_text] * batch_size, device=-1, train=False)

    # Decoder wants an extra dim for extra features.
    dec_out, dec_states, attn = model.decoder(tgt[:-1].unsqueeze(2), memory_bank, decoder_state)
    log_probs = model.generator(dec_out).contiguous()
    seq_len, batch_size_2, num_vocab = log_probs.shape
    assert batch_size == batch_size_2
    losses = F.nll_loss(
        log_probs.view(seq_len * batch_size, num_vocab), tgt[1:].view(seq_len * batch_size), reduce=False
    ).view(seq_len, batch_size)
    # No padding mask is applied: every batch row is the same caption, so all
    # rows have the same length. Masking would be needed for differing targets:
    #     mask = tgt[1:].eq(tgt_field.vocab.stoi[tgt_field.pad_token])
    #     losses = losses.masked_fill(mask, 0)
    return losses.data.sum(0)

# Re-rank the text-similarity shortlist by the captioning model's loss for the
# query caption, then show both rankings side by side.
caption_field = coco_cap.fields['tgt']
losses_by_img = eval_logprobs_varying_image(
    coco_cap.model, image_set, caption_field, caption_field.preprocess(query_caption)
).numpy()
# losses_by_img

print(query_caption)
lowest_loss_first = np.argsort(losses_by_img)
model_ranked_html = show_images(np.array(image_set)[lowest_loss_first[:10]])
text_ranked_html = show_images(image_set[:10])
HTML("<h1>Captioning Model</h1>" + model_ranked_html + "<h1>Text Similarity</h1>" + text_ranked_html)

a shower with a blue mat on the floor in front of it


In [69]:
HTML(show_images(np.array(image_set)[np.argsort(losses_by_img)[-10:]]))

### Approach 1c: Visual Genome

### Approach 2: relatively similar images that are most different from each other

In [None]:
def get_foil_set(*, stimulus, caption, rs):
    """Build a shuffled set of candidate images that contains `stimulus`.

    Starts from the 10 images returned by `get_similar_images(caption)`. If the
    stimulus is not among them it replaces the last entry, and a message is
    printed. The list is then shuffled in place with `rs`.

    Args:
        stimulus: Id of the target image that must appear in the set.
        caption: Text used to retrieve similar images.
        rs: Random state providing `shuffle` (e.g. `np.random.RandomState`).

    Returns:
        The shuffled list of image ids.
    """
    foils = get_similar_images(caption, n=10)
    stimulus_missing = stimulus not in foils
    if stimulus_missing:
        print("Inserting", stimulus, 'into foil set')
        foils[-1] = stimulus
    rs.shuffle(foils)
    return foils
stimulus = trial_data[1]['stimulus']
get_foil_set(stimulus=stimulus, caption=concat_captions[stimulus], rs=np.random.RandomState(1234))

In [None]:
rs = np.random.RandomState(1234)
foil_sets = {
    stimulus: get_foil_set(stimulus=stimulus, caption=concat_captions[stimulus], rs=rs)
    for stimulus in sorted(concat_captions.keys())}

Group tasks so that (1) each annotator never gets the same target image twice and (2) each annotator never sees two captions from the same person. The latter criterion cannot always be met, though, since the number of annotators may not evenly divide the number of participants.

In [None]:
import random
def shuffled(lst, rng=None):
    """Return a shuffled copy of `lst`, leaving the input untouched.

    Args:
        lst: Any iterable. It is copied into a new list first, so tuples and
            ranges work as well as lists.
        rng: Optional object with a `shuffle(list)` method, for example
            `random.Random(seed)` or `np.random.RandomState(seed)`. Pass one to
            make the result reproducible. Defaults to the global `random` module.

    Returns:
        A new list with the same elements in random order.
    """
    items = list(lst)
    shuffler = random if rng is None else rng
    shuffler.shuffle(items)
    return items

In [None]:
# Randomly deal trials to annotators, one trial per stimulus each, preferring a
# different participant for every trial an annotator receives. When some
# annotator still ends up with a repeated participant, reshuffle and try again.
# The retry count is bounded so an unsatisfiable constraint cannot hang the kernel.
MAX_ASSIGNMENT_ATTEMPTS = 10000
for attempt in range(MAX_ASSIGNMENT_ATTEMPTS):
    trials_by_img = toolz.groupby('stimulus', shuffled(trial_data))
    if not trials_by_img:
        # With no stimuli the inner loop below would never terminate.
        raise ValueError("trial_data is empty; nothing to assign to annotators")
    annotators = []
    # Keep creating annotators until some stimulus has run out of trials.
    while not any(len(trials) == 0 for trials in trials_by_img.values()):
        trials_for_annotator = []
        participants_seen_by_annotator = set()
        for stimulus, trials in trials_by_img.items():
            # Take the first trial for this stimulus from an unseen participant.
            for i in range(len(trials)):
                participant = trials[i]['participant']
                if participant not in participants_seen_by_annotator:
                    trials_for_annotator.append(trials.pop(i))
                    participants_seen_by_annotator.add(participant)
                    break
            else:
#                 print("Have to use the same participant again")
                trials_for_annotator.append(trials.pop(0))

        annotators.append(shuffled(trials_for_annotator))
    if all(len(set(toolz.pluck('participant', trials))) == len(trials) for trials in annotators):
        break
    # NOTE(review): this check only runs on the retry path; a successful attempt
    # breaks out above without verifying that every trial was handed out.
    assert all(len(trials) == 0 for trials in trials_by_img.values())
else:
    raise RuntimeError(
        "Could not give every annotator distinct participants after "
        "%d shuffles" % MAX_ASSIGNMENT_ATTEMPTS)

In [None]:
a = annotators[:]

In [None]:
trials_by_img

Check that each annotator never gets the same target image twice

In [None]:
assert all(len(set(toolz.pluck('stimulus', trials))) == len(trials) for trials in annotators)

Check that each annotator never sees two captions from the same person.

In [None]:
assert all(len(set(toolz.pluck('participant', trials))) == len(trials) for trials in annotators)

In [None]:
[len(trials) for trials in annotators]

In [None]:
len(annotators)

In [None]:
def make_task(stimulus, text):
    """Assemble one annotation task for a description of `stimulus`.

    Args:
        stimulus: Id of the target image; must be a key of `foil_sets` and a
            member of its own foil set.
        text: The description shown to the annotator.

    Returns:
        A dict with the description, the position of the target image within
        its foil set, and the URL of every image in that set (same order).
    """
    candidates = foil_sets[stimulus]
    image_urls = []
    for image_id in candidates:
        image_urls.append(id2url[image_id])
    task = {
        'description': text,
        'correct_idx': candidates.index(stimulus),
        'images': image_urls,
    }
    return task

In [None]:
trial = annotators[0][0]
make_task(trial['stimulus'], trial['text'])

In [None]:
# One row per annotator: a JSON-encoded list of that annotator's tasks.
task_payloads = [
    json.dumps([make_task(trial['stimulus'], trial['text']) for trial in annotator_trials])
    for annotator_trials in annotators]
guesses_task = pd.DataFrame(task_payloads, columns=['task'])

# Write three files: the first row on its own, the remaining rows, and all rows.
anno_task_dir = paths.data / 'anno-tasks'
csv_exports = [
    ('guesses_test.csv', guesses_task.iloc[:1]),
    ('guesses_remain.csv', guesses_task.iloc[1:]),
    ('guesses.csv', guesses_task),
]
for file_name, rows in csv_exports:
    rows.to_csv(str(anno_task_dir / file_name), index=False)

### Load MTurk results so far

In [None]:
# Sorted so the concatenated row order is the same on every run; glob order
# depends on the filesystem.
result_files = sorted((paths.data / 'mturk').glob('*-guesses.csv'))
if not result_files:
    # pd.concat on an empty list fails with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(
        "No '*-guesses.csv' result files found in %s" % (paths.data / 'mturk'))
batched_guesses_results = (
    pd.concat([pd.read_csv(str(f)) for f in result_files], axis=0, ignore_index=True)
)

In [None]:
%matplotlib inline

In [None]:
(batched_guesses_results['WorkTimeInSeconds']/60).hist(bins=30)

In [None]:
def get_active_dur(results):
    """Minutes between the first and last guess recorded in one HIT answer.

    Args:
        results: JSON string encoding a list of pages, each with a `guesses`
            list whose items carry a `timestamp` (treated as milliseconds).

    Returns:
        `(last - first) / 1000 / 60`, or None when there are no pages or the
        first or last page has no guesses.
    """
    pages = json.loads(results)
    try:
        start_ms = pages[0]['guesses'][0]['timestamp']
        end_ms = pages[-1]['guesses'][-1]['timestamp']
    except IndexError:
        # Something failed in the UI probably...
        return None
    return (end_ms - start_ms) / 1000 / 60
# Parse each answer once and reuse the result for both the histogram and the summary.
active_durations = batched_guesses_results['Answer.results'].apply(get_active_dur)
active_durations.hist()
active_durations.describe()
        

In [None]:
batched_guesses_results[batched_guesses_results['Answer.results'].apply(lambda x: '"guesses":[]' in x)].T

In [None]:
%debug

In [None]:
# Flatten the MTurk answers into one record per (worker, page): how many guesses
# the worker made for that description, and the URL of the correct image.
guesses_results = []
for row_index, row in batched_guesses_results.iterrows():
    for page in json.loads(row['Answer.results']):
        guess_indices = [guess['idx'] for guess in page['guesses']]
        if not guess_indices:
            # A page with no recorded guesses is reported and skipped.
            print("UI fail", row['WorkerId'])
            continue
        correct_images = [img for img in page['images'] if img['isCorrect']]
        stimulus_url = correct_images[0]['url']
        guesses_results.append({
            'guesser': row['WorkerId'],
            'description': page['description'],
            'num_guesses': len(guess_indices),
            'stimulus_url': stimulus_url,
        })
pd.DataFrame(guesses_results)

In [None]:
# Number of result rows per description, as a {description: count} dict.
# NOTE(review): `mturk_nafc_results` is never assigned in the visible notebook
# (the loading cell above produces `batched_guesses_results`) — confirm where it
# is expected to come from; this cell will raise NameError on a fresh kernel.
num_responses_by_caption = mturk_nafc_results.groupby('Answer.description').size().to_dict()

### Generate tasks remaining

In [None]:
# trials_todo = [trial for trial in trial_data if num_responses_by_caption.get(trial['text'], 0) < 3]
# len(trial_data), len(trials_todo)

In [None]:
# i = 0
# while True:
#     out_fn = paths.data / 'anno-tasks' / f'{datetime.date.today().isoformat()}-{i}-nAFC.csv'
#     if not out_fn.exists():
#         break
#     i += 1
# out_fn

In [None]:
# rs = np.random.RandomState(1234)
# pd.DataFrame([make_task(trial, rs) for trial in trials_todo]).to_csv(out_fn, index=False)

### Generate the actual HIT text...

In [None]:
from jinja2 import Template
# Render the HIT template with MTurk-style `${...}` placeholders left in place of
# the real description and the ten image URLs.
hit_template_path = paths.top_level / 'HITs' / '2018-05-04-image-description-match.jinja.html'
# Read inside a context manager so the file handle is closed promptly.
with open(hit_template_path) as template_file:
    hit_template = Template(template_file.read())
html = hit_template.render(dict(
    description='${description}',
    images=['${image_%d_url}' % i for i in range(10)]))

In [None]:
# Preview the rendered HIT for one trial by filling in its placeholders.
html2 = html
trial = trial_data[18+7*9]
task = make_task(trial['stimulus'], trial['text'])
substitutions = dict(task)
# The template uses one `${image_N_url}` placeholder per image, while `make_task`
# returns the URLs as a single `images` list. Expand the list so the image
# placeholders are actually replaced.
substitutions.update(
    {'image_%d_url' % position: url for position, url in enumerate(task['images'])})
for k, v in substitutions.items():
    html2 = html2.replace('${' + k + '}', str(v))
HTML('<div style="height: 1000px; position: relative;">'+html2+'</div>')
# print(html2)

In [None]:
import subprocess
subprocess.Popen('pbcopy', stdin=subprocess.PIPE).communicate(html.encode('utf-8'))

# Analyze MTurk results

In [None]:
mturk_nafc_results.groupby('Answer.description').num_guesses.mean().to_dict()

In [None]:
%matplotlib inline

In [None]:
(mturk_nafc_results['WorkTimeInSeconds'][mturk_nafc_results['WorkTimeInSeconds'] < 5*60] / 60).hist(bins=30)

In [None]:
np.median(mturk_nafc_results['WorkTimeInSeconds'] / 60) * 9/60

In [None]:
(
    15 # participants
    * 3 # conditions per participant
    * 3 # captions per condition
    - 1 # image not shown
) * 3 # annotators per description


In [None]:
    # NOTE(review): this cell is an orphaned fragment and is a syntax error on its
    # own. It looks like the tail of the previous cell's expression (annotation
    # count * reward * fee) with the opening lines lost — restore or delete it.
    * .24 # reward per annotator
) * 1.2 # MTurk 20% fee

Did the same worker see the same target image multiple times?

In [None]:
next(data.iterrows())[1]['Input.image_0_url']

In [None]:
mturk_nafc_results['target_image_url'] = [row['Input.image_'+str(row['correctIdx'])+"_url"] for _, row in mturk_nafc_results.iterrows()]

In [None]:
# Collect every (worker, target image) pair where the worker was shown the same
# target image more than once, then flag all rows belonging to such a pair.
bad_worker_image_pairs = set()
for worker_id, data in mturk_nafc_results.groupby('WorkerId'):
    target_images = list(data['target_image_url'])
    if len(set(target_images)) == len(target_images):
        # No repeats for this worker.
        continue
    value_counts = pd.Series(target_images).value_counts()
    value_counts = value_counts[value_counts > 1]
    bad_worker_image_pairs.update((worker_id, img) for img in value_counts.index)
bad_worker_image_pairs

annotation_row_is_bad = [
    pair in bad_worker_image_pairs
    for pair in zip(mturk_nafc_results['WorkerId'], mturk_nafc_results['target_image_url'])
]
mturk_nafc_results['row_is_bad'] = annotation_row_is_bad

In [None]:
mturk_nafc_results['row_is_bad'].mean()

In [None]:
guess_results = mturk_nafc_results[~mturk_nafc_results['row_is_bad']].rename(columns={'Answer.description': 'text'})

In [None]:
len(mturk_nafc_results)

In [None]:
len(guess_results), len(trial_data)

In [None]:
trial_data

In [None]:
annotator_level_data = pd.merge(
    pd.DataFrame(trial_data).rename(columns={'participant': 'writer'}),
    guess_results.rename(columns={'WorkerId': 'guesser'}).drop(['HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'RequesterAnnotation', 'guesses'], axis=1),
    on='text', validate='1:m', how='right')
annotator_level_data
    #.groupby().num_guesses.mean().to_frame('mean_num_guesses'),
#     left_on='text', right_index=True).groupby('condition').mean_num_guesses.aggregate(['mean', 'std'])

In [None]:
guesses_results[0]

In [None]:
annotator_level_data = pd.merge(
    pd.DataFrame(trial_data).rename(columns={'participant': 'writer'}),
    pd.DataFrame(guesses_results).rename(columns={'description': 'text'}),
    on='text', validate='1:m', how='right')
annotator_level_data
    #.groupby().num_guesses.mean().to_frame('mean_num_guesses'),
#     left_on='text', right_index=True).groupby('condition').mean_num_guesses.aggregate(['mean', 'std'])

In [None]:
annotator_level_data.columns

In [None]:
annotator_level_data.to_csv('annotator_level_data_2018-05-22v2.csv', index=False)

In [None]:
(annotator_level_data['num_guesses'] == 1).mean()

In [None]:
%pwd

In [None]:
%load_ext rpy2.ipython

In [None]:
%%R
library(lme4)
library(ggplot2)

In [None]:
%%R -i annotator_level_data
(model = lmer(num_guesses ~ condition + (1|writer) + (1|guesser) + (1|stimulus), annotator_level_data))

In [None]:
%%R -i annotator_level_data
(null_model = lmer(num_guesses ~ (1|writer) + (1|guesser) + (1|stimulus), annotator_level_data))

In [None]:
%%R
library(pbkrtest)

In [None]:
%%R
(kr <- KRmodcomp(model, null_model))

In [None]:
%%R -i annotator_level_data
# Poisson GLMM for the number of guesses. Guesser is a random intercept,
# `(1|guesser)`, matching the lmer models above; `(1+guesser)` would instead add
# guesser as a fixed effect.
(glm.full = glmer(num_guesses ~ condition + (1|writer) + (1|guesser) + (1|stimulus), annotator_level_data, family=poisson()))

In [None]:
%%R
# Null model: same random-effect structure as glm.full, without `condition`.
(glm.null = glmer(num_guesses ~ (1|writer) + (1|guesser) + (1|stimulus), annotator_level_data, family=poisson()))

In [None]:
%%R
#confint(glm.full)

In [None]:
%%R
anova(glm.full, glm.null, test="Chisq")

In [None]:
%%R -i annotator_level_data
# Negative-binomial GLMM for the number of guesses. Guesser is a random
# intercept, `(1|guesser)`, matching the lmer models above; `(1+guesser)` would
# instead add guesser as a fixed effect.
(nb_model = glmer.nb(num_guesses ~ condition + (1|writer) + (1|guesser) + (1|stimulus), annotator_level_data))

In [None]:
%%R -i annotator_level_data
# Null model: same random-effect structure as nb_model, without `condition`.
(nb_model.null = glmer.nb(num_guesses ~ (1|writer) + (1|guesser) + (1|stimulus), annotator_level_data))

In [None]:
%%R
anova(nb_model, nb_model.null, test="Chisq")

In [None]:
results = pd.DataFrame([dict(trial, specificity=specificity_lookup[trial['text'].strip()]) for trial in trial_data])
for col in ['condition', 'participant']:
    results[col] = results[col].astype('category')
results.info()

In [None]:
results.groupby('condition').specificity.mean()

In [None]:
pd.DataFrame(trial_data).sample(frac=1.0).sort_values('stimulus').to_csv('trial_data_by_stimulus.csv')