In [18]:
import datetime
import json
import os
import pathlib
from contextlib import contextmanager

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import toolz
import yaml

from textrec.paths import paths

from IPython.display import display, HTML

In [14]:
# Allow up to 200 characters per cell before pandas truncates displayed text.
pd.options.display.max_colwidth = 200

# Find good examples from existing data

In [4]:
# Load the combined trial-level data, reading the stimulus column as strings.
trial_level_data = pd.read_csv(
    paths.analyzed / "combined_data.csv", dtype={"stimulus": str}
)

# Every trial is expected to have a corrected text.
assert not trial_level_data["corrected_text"].isnull().any()

# Conditions are an ordered categorical, listed in the order given here.
condition_type = pd.api.types.CategoricalDtype(
    categories=["gated", "contextual", "standard", "norecs"], ordered=True
)

# Convert the identifier columns and the condition column in a single pass.
trial_level_data = trial_level_data.astype(
    {
        "participant": "category",
        "stimulus": "category",
        "condition": condition_type,
    }
)



In [5]:
# Keep only the columns needed for the text analysis; the corrected text is
# referred to simply as `text` from here on.
dataset = (
    trial_level_data
    .loc[:, ['participant', 'stimulus', 'condition', 'corrected_text']]
    .rename(columns={'corrected_text': 'text'})
)

In [6]:
def strip_uninformative(text):
    """Remove boilerplate openers (e.g. "there is", "a photo of") from a caption.

    Surrounding whitespace is trimmed first. Openers are matched only when
    followed by a space, and are removed repeatedly, so stacked openers such as
    "there is a photo of ..." are stripped completely.

    Args:
        text: The caption string to clean.

    Returns:
        The caption with leading openers and surrounding whitespace removed.
    """
    openers = tuple(
        phrase + ' '
        for phrase in ['there is', 'there are', 'a view of', 'a photo of', 'a photo shows']
    )
    text = text.strip()
    while True:
        # First opener (in list order) that the text currently starts with.
        matched = next((opener for opener in openers if text.startswith(opener)), None)
        if matched is None:
            return text
        # Drop the opener, re-trim, and scan again from the first opener.
        text = text[len(matched):].strip()
# Trim whitespace, drop trailing periods, then remove boilerplate openers.
dataset['text'] = (
    dataset['text']
    .str.strip()
    .str.rstrip('.')
    .apply(strip_uninformative)
)

In [7]:
import wordfreq

# Tokenize each text once and look up each token's Zipf frequency once; the
# three summary columns below are all derived from these per-text lists.
# (There is exactly one frequency per token, so len(freqs) is the word count.)
token_freqs = [
    [wordfreq.zipf_frequency(tok, 'en') for tok in wordfreq.tokenize(text, 'en')]
    for text in dataset.text]

dataset['num_words'] = [len(freqs) for freqs in token_freqs]

# A text with no tokens has no defined min/mean frequency: record NaN instead
# of letting np.min raise ValueError on an empty list.
dataset['min_freq'] = [
    np.min(freqs) if freqs else np.nan
    for freqs in token_freqs]

dataset['mean_freq'] = [
    np.mean(freqs) if freqs else np.nan
    for freqs in token_freqs]

In [8]:
# Rescale mean Zipf frequency so larger values mean rarer words: (7 - f) / 7.
dataset['mean_rarity'] = dataset['mean_freq'].rsub(7).div(7)
# Mean rarity times word count gives rarity summed over the words of the text.
dataset['total_rarity'] = dataset['num_words'] * dataset['mean_rarity']

In [11]:
# Linear score: one term for the word count and one for the total rarity.
# NOTE(review): the coefficients are hard-coded; presumably from a model fit elsewhere -- confirm.
words_term = 0.148274 * dataset['num_words']
rarity_term = 1.198227 * dataset['total_rarity']
dataset['predicted_details'] = words_term + rarity_term

In [12]:
# The total_rarity coefficient divided by 7 (the divisor used in mean_rarity):
# the change in predicted_details per unit of a word's Zipf frequency.
1.198227/7

0.1711752857142857

What is the marginal effect of adding an additional word?

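(0.148274 + 1.198227) - 0.171175 * zipf_freq(word) = 1.346501 - 0.171175 * zipf_freq(word)

(Each added word contributes 0.148274 through `num_words` and 1.198227 * (7 - zipf_freq(word)) / 7 through `total_rarity`.)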

In [23]:
# The ten texts with the highest predicted detail score, in ascending order.
top_predicted = dataset.sort_values('predicted_details').tail(10)
with pd.option_context('display.max_colwidth', 500):
    display(top_predicted)

Unnamed: 0,participant,stimulus,condition,text,num_words,min_freq,mean_freq,mean_rarity,total_rarity,predicted_details
191,4pc77f,275449,standard,"a glass of red wine is in the foreground. a tabby cat is crouched on the table behind it, looking up cheekily at the camera holder. you can see the picture taker reflected in the glass",36,1.91,5.801111,0.17127,6.165714,12.725789
229,5cgvvc,396295,norecs,two shower doors closed with opaque glass. a beige towel hangs on the handle of the door on the right. visible through the glass are various showering implements hanging from a rack,32,3.11,5.537813,0.208884,6.684286,12.75406
1462,7h666q,236272,norecs,a baseball player wearing a black helmet and a black shirt with a red stripe on the sleeve and a number eight on the back with grey pants swinging a bat in front of a catcher and his teammates,39,3.39,5.917436,0.154652,6.031429,13.009707
736,g55cwj,71815,standard,a man is lunging to his right and swinging his racket at a tennis ball. he is wearing a pink polo and brown shorts with white shoes. the word toronto is written underneath his knee,35,2.26,5.686286,0.187673,6.568571,13.06023
1730,533r6c,431140,standard,a bathroom with one white sink and a white toilet with a toilet paper roll on the back lid sits next to the sink which has a mirror above it which shows the exact same toilet across the room,39,3.78,5.905128,0.15641,6.1,13.091871
231,5cgvvc,227326,norecs,the bride and groom cut wedding cake together. the groom is on the left and the bride on the right. a photographer to their left is guiding their hands. another photographer in the left background snaps a picture from behind,40,3.58,5.94475,0.15075,6.03,13.156269
238,5cgvvc,236272,gated,a baseball player swings at homeplate. he wears a uniform with a black top and grey pants. the number eight is visible on his back in orange. the catcher's arm and mitt are visible on the left. other players watch in the background,43,1.14,5.745116,0.179269,7.708571,15.6124
235,5cgvvc,71815,standard,a tennis player is preparing to backhand a ball that is in midair in front of him. he wears a white shirt and green shorts with white sneakers. he behind the baseline on what looks like grass. the word toronto is printed on the grass in white,47,2.7,5.85,0.164286,7.721429,16.220902
230,5cgvvc,431140,norecs,this is a bathroom with a white commode on the left and a white sink with a mirror above on the right. there is a roll of toilet paper on top of the toilet tank and behind this is a stainless steel grab bar mounted to the wall. another commode is visible in the mirror,55,2.25,6.013455,0.140935,7.751429,17.443041
734,g55cwj,527375,norecs,"a very old apple computer is sitting on a wooden l desk surrounded by speakers, an apple keyboard and an apple mouse. there is also a small laptop with a picture from the movie cars displayed on it. there is a speaker next to the laptop and a shelf with lots of trinkets above it",55,2.82,5.892909,0.158156,8.698571,18.577933


That's mostly just the long descriptions. But I wonder if there are some words that characterize them that we could target.

Wait... what I really want is a _local_ measure of specificity. Can I predict the _overall_ specificity from a slice of, say, 5 words of the original?