In [12]:
import pandas as pd
import sqlite3
import numpy as np
from tqdm.auto import tqdm
import spacy
import spacy_transformers
import spacy_curated_transformers
from functools import lru_cache
from collections import Counter

In [21]:
# Install the transformer pipeline from inside the kernel so it lands in the same
# environment this notebook imports spaCy from (a bare `!python` may resolve to a
# different interpreter), and skip the ~460 MB download when it is already present.
# NOTE(review): the load error recorded for the next cell reports a missing
# `curated_transformer` factory; presumably `spacy-curated-transformers` must be
# installed in this environment as well - confirm.
if not spacy.util.is_package("en_core_web_trf"):
    spacy.cli.download("en_core_web_trf")

  _torch_pytree._register_pytree_node(
Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


In [22]:
# Load the transformer-based English pipeline; `get_nouns` calls this `nlp` object.
# NOTE(review): the output recorded for this cell is an E002 error (no factory for
# 'curated_transformer'), yet later cells ran with `nlp` defined - presumably the
# cell was re-run after the environment was fixed; confirm it loads on a fresh kernel.
nlp = spacy.load("en_core_web_trf")

ValueError: [E002] Can't find factory for 'curated_transformer' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a Transformer, make sure to install 'spacy-transformers'. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, entity_ruler, tagger, morphologizer, ner, beam_ner, senter, sentencizer, spancat, spancat_singlelabel, span_finder, future_entity_ruler, span_ruler, textcat, textcat_multilabel, en.lemmatizer

In [17]:
# Read the concreteness-ratings spreadsheet; its 'Word' and 'Conc.M' columns feed
# the `concreteness` lookup.
# NOTE(review): pandas' default NA parsing may turn entries such as "null" or "nan"
# in the 'Word' column into missing values - confirm whether that matters here.
df_conc = pd.read_excel('../Concreteness_ratings_Brysbaert_et_al_BRM.xlsx')

In [18]:
# Lookup table: word -> value of its 'Conc.M' column (if a word is listed more
# than once, the last row wins).
concreteness = dict(zip(df_conc['Word'], df_conc['Conc.M']))

In [19]:
def is_concrete(noun, t=4.5):
    """Tell whether `noun` has a concreteness rating strictly above `t`.

    Args:
        noun: Word to look up in the module-level `concreteness` mapping.
        t: Threshold the rating must exceed (strict comparison).

    Returns:
        The result of `rating > t` for a listed word; False for a word that is
        not in `concreteness`.
    """
    try:
        rating = concreteness[noun]
    except KeyError:
        # Words without a rating are never treated as concrete.
        return False
    return rating > t

In [37]:
@lru_cache(maxsize=None)
def get_nouns(cap):
    """Collect the lemmas of every token tagged 'NOUN' in a caption.

    The caption is lower-cased before it is passed to the module-level `nlp`
    pipeline.

    Args:
        cap: Caption text (must be hashable, since results are memoised).

    Returns:
        A set of lemma strings. Because of the unbounded cache, repeated calls
        with the same caption hand back the very same set object, so callers
        should treat it as read-only.
    """
    lemmas = set()
    for tok in nlp(cap.lower()):
        if tok.pos_ == 'NOUN':
            lemmas.add(tok.lemma_)
    return lemmas

In [39]:
@lru_cache(maxsize=None)
def cap2objs(cap):
    """Return the concrete nouns of a caption.

    Args:
        cap: Caption text (must be hashable, since results are memoised).

    Returns:
        The subset of `get_nouns(cap)` for which `is_concrete` holds at its
        default threshold, as a new set. The cache returns the same set object
        on repeated calls, so callers should not mutate it.
    """
    return set(filter(is_concrete, get_nouns(cap)))

In [42]:
# Register tqdm's pandas integration; it provides the `progress_apply` method
# used on `df.cap` further down.
tqdm.pandas()

In [15]:
# Load every row of the `data` table from the local SQLite file into a DataFrame.
# sqlite3's connection context manager only commits or rolls back a transaction;
# it does not close the connection, so close it explicitly once the read is done.
con = sqlite3.connect('out.db')
try:
    df = pd.read_sql('select * from data', con)
finally:
    con.close()

In [16]:
# Preview the caption table that was just read from SQLite.
df

Unnamed: 0,cap
0,A bride and groom cutting a cake on their wedd...
1,a mother giving a child a hug.
2,A woman in a long dress standing with a large ...
3,A hand is holding a mug of tea on a kitchen co...
4,a close up of a bowl of pasta with a spoon in it.
...,...
135847,an open-air cafe with a few patrons.
135848,A close-up of a jar of honey and a butter knife.
135849,A view of a large shelf with numerous food items.
135850,A close-up of three colorful glasses of juice.


In [172]:
# Noun lemmas of every caption, computed with a progress bar. `get_nouns` is
# memoised, so a caption text that occurs several times is only processed once.
df['nouns'] = df.cap.progress_apply(get_nouns)

  0%|          | 0/135852 [00:00<?, ?it/s]

In [173]:
# Concrete nouns ("objects") of every caption. `cap2objs` calls the memoised
# `get_nouns` internally, so captions already handled there are not re-parsed.
df['objs'] = df.cap.progress_apply(cap2objs)

  0%|          | 0/135852 [00:00<?, ?it/s]

In [174]:
# Number of concrete objects per caption: `.str.len()` is applied to a column of
# sets here, so it yields the size of each set rather than a string length.
df['n_objs'] = df.objs.str.len()

In [175]:
# Preview the table with the derived `nouns`, `objs` and `n_objs` columns.
df

Unnamed: 0,cap,nouns,objs,n_objs
0,A bride and groom cutting a cake on their wedd...,"{bride, day, cake, groom, wedding}","{bride, groom, cake}",3
1,a mother giving a child a hug.,"{child, mother, hug}","{mother, child}",2
2,A woman in a long dress standing with a large ...,"{woman, tree, dress}","{tree, dress}",2
3,A hand is holding a mug of tea on a kitchen co...,"{kitchen, hand, mug, counter, tea}","{hand, kitchen, tea, mug}",4
4,a close up of a bowl of pasta with a spoon in it.,"{pasta, bowl, spoon, close}","{pasta, bowl, spoon}",3
...,...,...,...,...
135847,an open-air cafe with a few patrons.,"{cafe, patron, air}",{cafe},1
135848,A close-up of a jar of honey and a butter knife.,"{knife, honey, butter, up, jar, close}","{jar, honey, knife, butter}",4
135849,A view of a large shelf with numerous food items.,"{item, shelf, view, food}","{shelf, food}",2
135850,A close-up of three colorful glasses of juice.,"{up, juice, close, glass}","{juice, glass}",2


In [176]:
# Number of distinct concrete objects across all captions. Starting from an empty
# set keeps this working when `df` has no rows, whereas `set.union()` called with
# no arguments raises a TypeError.
len(set().union(*df.objs))

2401

In [177]:
# Spot-check ten random rows; the fixed seed makes the displayed sample
# reproducible when the notebook is re-run.
df.sample(10, random_state=0)

Unnamed: 0,cap,nouns,objs,n_objs
114357,A group of people are hiking through a forest.,"{people, group, forest}","{people, forest}",2
90656,Two men are riding a motorcycle together in th...,"{winter, motorcycle, man}","{motorcycle, man}",2
118006,A group of red-green-blue trolls are standing ...,"{unicorn, troll, group}",{},0
37044,A pile of red tomatoes on a wooden surface.,"{surface, tomato, pile}","{tomato, pile}",2
104681,a large variety of fruit hanging from a tree.,"{tree, fruit, variety}","{tree, fruit}",2
12708,"A large, bearded man is shown holding a wooden...","{sword, man, shore}","{sword, man, shore}",3
21566,a stack of papers with a pair of scissors stuc...,"{scissor, pair, paper, middle, stack}","{scissor, paper}",2
45963,A pair of binoculars is being held by a man.,"{pair, binocular, man}",{man},1
12600,A close up of a red flower growing on a tree.,"{tree, close, flower}","{tree, flower}",2
129527,A man holding a fishing rod while standing in ...,"{man, fishing, water, rod, beach}","{water, beach, man}",3


In [178]:
# For each concrete object, count the rows whose `objs` set contains it (each row
# contributes at most once per object because `objs` holds sets).
c = Counter(
    name
    for row_objs in df.objs
    for name in row_objs
)

In [179]:
# Rank captions by their rarest object: keep rows with at least one object (min()
# over an empty set would raise), attach the smallest corpus-wide count among the
# row's objects as `min_count`, and sort ascending so the rarest come first.
df_ = (
    df.loc[df.n_objs > 0]
    .assign(
        min_count=lambda frame: frame.objs.map(
            lambda row_objs: min(c[name] for name in row_objs)
        )
    )
    .sort_values(by='min_count')
)

In [180]:
# Preview the captions ordered by `min_count` (rarest object first).
df_

Unnamed: 0,cap,nouns,objs,n_objs,min_count
122511,A bee is chasing a wasp away from a flower.,"{bee, flower, wasp}","{bee, flower, wasp}",3,1
108091,A man in the process of loading a cartridge in...,"{cartridge, process, man, rifle}","{cartridge, rifle, man}",3,1
104130,A woman with a handful of tapioca pearls holds...,"{woman, tea, handful, bubble, tapioca, cup, pe...","{tea, bubble, tapioca, cup, pearl}",5,1
127135,"A computer server with a plywood cover, with t...","{plywood, server, wire, cover, computer}","{computer, plywood, server, wire}",4,1
91366,A lab with a computer and an oscilloscope.,"{computer, oscilloscope, lab}","{computer, oscilloscope, lab}",3,1
...,...,...,...,...,...
29566,A man is about to shoot a canonball.,"{canonball, man}",{man},1,28513
74915,A man and a woman dressed as business executives.,"{woman, executive, business, man}",{man},1,28513
74889,The profile of a young man smiling.,"{man, profile}",{man},1,28513
54416,A man and a woman are hugging tightly.,"{woman, man}",{man},1,28513


In [181]:
# Keep the first tenth of the sorted table, i.e. the captions whose rarest object
# has the lowest corpus-wide count.
df_small = df_.iloc[: len(df_) // 10]

In [182]:
# Number of rows kept in the rare-object subset.
len(df_small)

13211

In [183]:
# Spot-check ten random rows of the rare-object subset; the fixed seed makes the
# displayed sample reproducible when the notebook is re-run.
df_small.sample(10, random_state=0)

Unnamed: 0,cap,nouns,objs,n_objs,min_count
55999,A vaulted cellar filled with barrelled wine.,"{wine, cellar}","{wine, cellar}",2,22
108417,A skier in head-to-toe snow gear is posing for...,"{head, snow, skier, gear, toe, photo}","{snow, toe, head, photo}",4,15
66173,A closeup of a woman's face with a black eye a...,"{woman, face, closeup, eye, bandage}","{eye, bandage, face}",3,20
74741,Focused lights on a road through the mist.,"{mist, light, road}","{mist, road}",2,20
70817,The hanging tapestry displays a variety of geo...,"{pattern, shape, tapestry, variety}",{tapestry},1,18
83458,An attractive blonde with a red shirt and a po...,"{ponytail, shirt, blonde}","{ponytail, shirt, blonde}",3,6
21111,A bunch of different sized shrimp in a container.,"{shrimp, container, bunch}","{shrimp, container}",2,33
105685,A vintage telegram is shown.,{telegram},{telegram},1,2
65950,A man is holding a pair of horseshoes up close.,"{pair, horseshoe, man}","{horseshoe, man}",2,12
114368,A child with a bandage over his head being hel...,"{adult, child, bandage, head}","{child, bandage, head}",3,20


In [184]:
# How often each caption text occurs in the subset; counts above 1 mean the same
# caption appears in several rows.
df_small.cap.value_counts()

cap
A giant joystick is being maneuvered by a player.                         7
A woman standing in a field flying a hawk.                                6
An open satchel filled with a variety of snacks.                          4
A cowboy riding a bucking bronco.                                         4
A man is using a chainsaw to cut down a large tree.                       3
                                                                         ..
a group of people enjoying the outdoors                                   1
the interior of a car, showing the dashboard and the instrument panel.    1
A picture of a man's face on a postage stamp.                             1
A girl is lying on a bed, showing her little breasts.                     1
An aerial shot of a large ship in a harbor.                               1
Name: count, Length: 13158, dtype: int64