In [245]:
# NOTE: `!cmd` runs in a temporary subshell, so `!cd` never changes the kernel's
# working directory. Use the `%cd` magic if a directory change is actually needed.
!cd ../src

zsh:cd:1: no such file or directory: ../src


In [246]:
# Enable IPython's autoreload extension so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [247]:
# Mode 2: reload all modules (except those excluded with %aimport) before each cell runs.
%autoreload 2

In [248]:
import numpy as np
import spacy

In [249]:
# spaCy pipeline used below for part-of-speech tagging and lemmatisation.
# Requires the "en_core_web_lg" model package to be installed.
nlp = spacy.load("en_core_web_lg")

### Parameters


In [250]:
# Experiment
# Selects which set of descriptions is loaded; also used as the prefix of the
# descriptor files written at the end of the notebook.
group = "combined"  # "controlled" or "free" or "combined"

# Minimum absolute pattern frequency of a descriptor:
# a lemma is kept only if it occurs in at least this many distinct patterns.
MIN_APF = 7

### Load descriptions

In [251]:
from src.data import get_descriptions

# Iterated below as pattern -> description (outer axis: patterns, inner axis: descriptions).
# NOTE(review): the shape given in the trailing comment may not hold for every `group` value — confirm for "combined".
descriptions = get_descriptions(group)  # 32 patterns X 12 occurrences (= n/2 participants)
# Displays the whole array; consider `descriptions.shape` or a small slice to keep the output short.
descriptions

array([["This one feels like a train or a horse. Excitement is not necessarily the first word I'd be like --. I feel excited, but it's not relaxing particularly. It feels like you're jogging or something. Yes, it's quite rhythmic, I would say. It's quite pulsating, quite up-beat or urgent or whatever. But it doesn't feel alarming to me anyway.",
        "I'm so sorry, I'm thinking of a vibrator again, but the ones that come with different vibration patterns because the switch is so dramatic. It feels very dramatic this one. Other than a vibrator? An engine again, maybe. Something slightly unstable. I don't think my finger feels all that much. I think I'd describe it as touching an engine that's sputtering right in the beginning. I think you need to think of an old-school engine that's just coming to life. And it doesn't like that. I feel a bit like that. That felt a bit different. Very jaggy. Spiky. Not very comfortable. It makes me feel a little anxious.",
        "It's more sporadic.

### Identify descriptors

In [252]:
def count_lemmas_by_pos(descriptions, nlp, pos_tags=("NOUN", "ADJ")):
    """Collect, per part-of-speech tag, how often and where each lemma is used.

    A lemma is counted at most once per description, so the counts are
    "number of descriptions containing the lemma", not raw token occurrences.

    Args:
        descriptions: Iterable of patterns; each pattern is an iterable of
            descriptions (each is converted with ``str()`` before parsing).
        nlp: Callable that turns a string into an iterable of tokens exposing
            ``pos_`` and ``lemma_`` (e.g. a loaded spaCy pipeline).
        pos_tags: Part-of-speech tags to collect.

    Returns:
        A ``(counts, patterns)`` tuple of dicts keyed by POS tag:
        ``counts[tag]`` maps lemma -> number of descriptions containing it,
        ``patterns[tag]`` maps lemma -> set of 1-based pattern numbers in
        which the lemma occurs.
    """
    counts = {tag: {} for tag in pos_tags}
    patterns = {tag: {} for tag in pos_tags}

    for pattern_number, pattern in enumerate(descriptions, start=1):
        for description in pattern:
            doc = nlp(str(description))

            # 1. Identify "local" lemmas of the current description
            #    (sets, so repeated words within one description count once)
            local_lemmas = {tag: set() for tag in pos_tags}
            for token in doc:
                if token.pos_ in local_lemmas:
                    local_lemmas[token.pos_].add(token.lemma_.lower())

            # 2. Merge into the "global" statistics
            for tag, lemmas in local_lemmas.items():
                for lemma in lemmas:
                    # a. Number of descriptions the lemma occurs in
                    counts[tag][lemma] = counts[tag].get(lemma, 0) + 1
                    # b. Patterns the lemma occurs in
                    patterns[tag].setdefault(lemma, set()).add(pattern_number)

    return counts, patterns


descriptor_counts, descriptor_patterns = count_lemmas_by_pos(descriptions, nlp)

# k: lemma, v: number of descriptions the lemma occurs in
global_nouns_count = descriptor_counts["NOUN"]
global_adjectives_count = descriptor_counts["ADJ"]

# k: lemma, v: set of patterns the lemma occurs in
global_nouns_patterns = descriptor_patterns["NOUN"]
global_adjectives_patterns = descriptor_patterns["ADJ"]

global_nouns_min = min(global_nouns_count.values())
global_nouns_max = max(global_nouns_count.values())
print(f"nouns: {len(global_nouns_count.keys())}, min = {global_nouns_min}, max = {global_nouns_max}")

global_adjectives_min = min(global_adjectives_count.values())
global_adjectives_max = max(global_adjectives_count.values())
print(f"adjectives: {len(global_adjectives_count.keys())}, min = {global_adjectives_min}, max = {global_adjectives_max}")

# Sanity check: both statistics must cover exactly the same lemmas
assert global_nouns_count.keys() == global_nouns_patterns.keys()
assert global_adjectives_count.keys() == global_adjectives_patterns.keys()

nouns: 967, min = 1, max = 188
adjectives: 620, min = 1, max = 138


### Save to disk

In [253]:
from src.utils import descriptors_dir

# Keep only lemmas that occur in at least MIN_APF distinct patterns.
noun_descriptors_dict = {k: v for k, v in global_nouns_patterns.items() if len(v) >= MIN_APF}
# "onomatopoeia" is dropped from the saved list only; it stays in noun_descriptors_dict.
# NOTE(review): presumably a transcription placeholder rather than a real descriptor — confirm.
noun_descriptors = [k for k in noun_descriptors_dict if k != "onomatopoeia"]
print(f"noun descriptors: {len(noun_descriptors)}")
np.savetxt(descriptors_dir() / f"{group}_nouns.txt", noun_descriptors, delimiter=",", fmt="%s")

adjective_descriptors_dict = {k: v for k, v in global_adjectives_patterns.items() if len(v) >= MIN_APF}
adjective_descriptors = list(adjective_descriptors_dict)
print(f"adjective descriptors: {len(adjective_descriptors)}")
np.savetxt(descriptors_dir() / f"{group}_adjectives.txt", adjective_descriptors, delimiter=",", fmt="%s")

# TODO: Careful with combining (e.g., "light")
# A lemma that qualifies both as noun and as adjective collapses into a single entry here.
# sorted() makes the file content reproducible: plain set iteration order for strings
# changes between kernel sessions because of hash randomisation.
all_descriptors = sorted(set(noun_descriptors + adjective_descriptors))
print(f"all descriptors: {len(all_descriptors)}")
np.savetxt(descriptors_dir() / f"{group}_all.txt", all_descriptors, delimiter=",", fmt="%s")

noun descriptors: 77
adjective descriptors: 80
all descriptors: 155


In [254]:
# Re-order both descriptor dicts by pattern frequency (number of patterns a lemma
# occurs in), most widespread lemma first. The sort is stable, so ties keep their
# previous relative order.
noun_descriptors_dict = {
    lemma: noun_descriptors_dict[lemma]
    for lemma in sorted(
        noun_descriptors_dict,
        key=lambda lemma: len(noun_descriptors_dict[lemma]),
        reverse=True,
    )
}
adjective_descriptors_dict = {
    lemma: adjective_descriptors_dict[lemma]
    for lemma in sorted(
        adjective_descriptors_dict,
        key=lambda lemma: len(adjective_descriptors_dict[lemma]),
        reverse=True,
    )
}

In [255]:
# Nouns that reached the MIN_APF threshold. Unlike `noun_descriptors`, this dict
# is not filtered for "onomatopoeia".
noun_descriptors_dict.keys()

dict_keys(['one', 'bit', 'finger', 'sensation', 'feeling', 'way', 'glass', 'vibration', 'thing', 'time', 'lot', 'sound', 'friction', 'kind', 'person', 'surface', 'hand', 'beat', 'onomatopoeia', 'pattern', 'light', 'sort', 'phone', 'pulse', 'stuff', 'sense', 'screen', 'music', 'rhythm', 'emotion', 'skin', 'action', 'window', 'alarm', 'bump', 'attention', 'reason', 'word', 'head', 'plate', 'part', 'mind', 'wave', 'buzz', 'adjective', 'place', 'heartbeat', 'clock', 'day', 'object', 'noise', 'resistance', 'movement', 'ice', 'texture', 'water', 'excitement', 'fingertip', 'background', 'touch', 'people', 'shock', 'other', 'frequency', 'ball', 'game', 'point', 'heart', 'paper', 'life', 'direction', 'danger', 'difference', 'interval', 'line', 'beating', 'motor', 'picture'])

In [256]:
# Adjectives that reached the MIN_APF threshold.
adjective_descriptors_dict.keys()

dict_keys(['little', 'more', 'other', 'good', 'hard', 'small', 'nice', 'smooth', 'first', 'much', 'same', 'soft', 'subtle', 'weird', 'constant', 'light', 'tiny', 'last', 'big', 'similar', 'slow', 'electrical', 'long', 'intense', 'pleasant', 'strong', 'sticky', 'rough', 'annoying', 'bad', 'different', 'slight', 'fast', 'electric', 'sure', 'normal', 'high', 'calm', 'like', 'rhythmic', 'anxious', 'wrong', 'easy', 'gentle', 'low', 'excited', 'uncomfortable', 'happy', 'actual', 'continuous', 'heavy', 'least', 'second', 'quiet', 'sweaty', 'stuck', 'fun', 'aware', 'alert', 'few', 'deep', 'wet', 'super', 'quick', 'angry', 'satisfied', 'funny', 'textured', 'whole', 'aggressive', 'most', 'slippery', 'calming', 'short', 'natural', 'cold', 'specific', 'okay', 'frequent', 'difficult'])

### Nice to have: intersecting elements

In [257]:
# Lemmas present in both saved descriptor lists, i.e. tagged as a noun in some
# descriptions and as an adjective in others.
noun_set = set(noun_descriptors)
adjective_set = set(adjective_descriptors)
noun_adjective = noun_set & adjective_set
overlap_size = len(noun_adjective)
print(f"descriptors occurring as noun and adjective: {overlap_size}, {noun_adjective}")

descriptors occurring as noun and adjective: 2, {'other', 'light'}
