In [17]:
from language_processing import *
import pprint as pp
import pandas as pd
import re
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn
import json
import os

In [18]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vonbecker/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/vonbecker/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vonbecker/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vonbecker/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/vonbecker/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [19]:
# read unique exhibit phenomena into a list

phenomena = pd.read_csv('data/phenomena/phenomena.csv').squeeze('columns').to_list()
pp.pprint(phenomena)

['Natural Frequency',
 'Electric Fields and Forces',
 'Motion: Accelerated Gravitational Motion',
 'Afterimage',
 'Judgment',
 'Thin Films',
 'Frame of Reference',
 'Nodes and Antinodes',
 'Sublimation',
 'Bernoulli Effect',
 'Color Mixing: Subtractive',
 'Size-Distance',
 'Upwelling',
 'Nonlinear Behavior',
 'Information Processing and Encoding',
 'Resonance',
 'Perception: Spatial',
 'Attention',
 'Melting',
 'Visual Edge Effects',
 'Superposition',
 'Exponentials',
 'Surface Tension of Liquids',
 'Numbers',
 'Color: Complimentary',
 'Binocular Vision',
 'Crystallization',
 'Fatigue',
 'Color Vision',
 'Oscillation',
 'Iridescence',
 'Conservation of Angular Momentum',
 'Decomposition',
 'Polarization',
 'Evaporation',
 'Close Packing',
 'Ferromagnetism',
 'Parabolas',
 'Metacognition',
 'Albedo',
 'Depth Perception',
 'Waves: Transverse',
 'Probability',
 'Shadows',
 'Rhythm',
 'Motion: Composition of Perpendicular Motion',
 'Motion: Simple Harmonic Motion',
 'Social Loafing',
 'Gra

In [20]:
# read unique keywords into a list

keywords = pd.read_csv('data/keywords/keywords.csv').squeeze('columns').to_list()
pp.pprint(keywords)

['lenses',
 'center of gravity',
 'vibration',
 'weather',
 'identity',
 'cornsweet illusion',
 'acoustics',
 'evaporation',
 'chicken wire',
 'Ocean',
 'Sharks',
 'Neon',
 'Migrations',
 "Huygens' principle",
 'mirrors',
 'turbulence',
 'carbon dioxide',
 'parallel',
 'phase angle',
 'parabolas',
 'organisms',
 'cosmic rays',
 'simultaneous contrast',
 'series',
 'chaos',
 'amplification',
 'genetics',
 'lasers',
 'artwork',
 'membrane',
 "Newton's Laws",
 'reflections',
 'ratio',
 'circuit',
 'ping pong balls',
 'DNA',
 'levitation',
 'navigation',
 'curves',
 'illusions',
 'model organisms',
 'Whales',
 'gases',
 'portraits',
 'sun',
 'bicycle wheel',
 'speakers',
 'identical twins',
 'pendulums',
 'dissection',
 'eyes',
 'nerves',
 'wind',
 'zebrafish',
 'heat transfer',
 'volcanoes',
 'Marine',
 'Pacific',
 'complexity',
 'Turtles',
 'condensation',
 'appearance',
 'unicellular organisms',
 'tides',
 'ganzfeld effect',
 'afterimage',
 'fragmentation',
 'CMY retina',
 'balance',
 '

In [21]:
# clean up phenomena

# express 'color mixing: subtractive' as 'subtractive color mixing', etc.
# express 'motion: simple harmonic motion' as 'simple harmonic motion', etc.
# express 'motion: visual motion detection as 'visual motion detection'
# express 'this-that' and 'this/that' as 'this that'
# convert to lower case
# remove apostrophes

phenomena_direct = []
for phenomenon in phenomena:
    this_phenom = phenomenon.lower().replace("'","")
    if ':' in phenomenon:
        split = re.split(': ', this_phenom)
        reordered = ' '.join(split[::-1])
        double_word = r'\b(\w+)\s+\1\b'
        separated_double = r'\b(\w+)\s(\w+)\s+\1\b'
        doubles = re.search(double_word, reordered)
        separated_doubles = re.search(separated_double, reordered)
        if doubles:
            reordered = reordered.replace(doubles.group(), doubles.group().split()[0])
        if separated_doubles:
            reordered = reordered.replace(separated_doubles.group(), ' '.join(separated_doubles.group().split()[:2]))
        this_phenom = reordered
    if any(x in phenomenon for x in ['-', '/']):
        split = re.split('-|/', this_phenom)
        this_phenom = ' '.join(split)
    phenomena_direct.append(this_phenom)

pp.pprint(phenomena_direct)

['natural frequency',
 'electric fields and forces',
 'accelerated gravitational motion',
 'afterimage',
 'judgment',
 'thin films',
 'frame of reference',
 'nodes and antinodes',
 'sublimation',
 'bernoulli effect',
 'subtractive color mixing',
 'size distance',
 'upwelling',
 'nonlinear behavior',
 'information processing and encoding',
 'resonance',
 'spatial perception',
 'attention',
 'melting',
 'visual edge effects',
 'superposition',
 'exponentials',
 'surface tension of liquids',
 'numbers',
 'complimentary color',
 'binocular vision',
 'crystallization',
 'fatigue',
 'color vision',
 'oscillation',
 'iridescence',
 'conservation of angular momentum',
 'decomposition',
 'polarization',
 'evaporation',
 'close packing',
 'ferromagnetism',
 'parabolas',
 'metacognition',
 'albedo',
 'depth perception',
 'transverse waves',
 'probability',
 'shadows',
 'rhythm',
 'composition of perpendicular motion',
 'simple harmonic motion',
 'social loafing',
 'gravitational forces',
 'torque

In [22]:
# clean up keywords
# express 'this-that' and 'this/that' as 'this that'
# convert to lower case
# remove apostrophes

keywords_direct = []
for keyword in keywords:
    this_keyword = keyword.lower().replace("'", "")
    if any(x in keyword for x in ['/']):
        split = re.split('-|/', this_keyword)
        this_keyword = ' '.join(split)
    keywords_direct.append(this_keyword)

pp.pprint(keywords_direct)

['lenses',
 'center of gravity',
 'vibration',
 'weather',
 'identity',
 'cornsweet illusion',
 'acoustics',
 'evaporation',
 'chicken wire',
 'ocean',
 'sharks',
 'neon',
 'migrations',
 'huygens principle',
 'mirrors',
 'turbulence',
 'carbon dioxide',
 'parallel',
 'phase angle',
 'parabolas',
 'organisms',
 'cosmic rays',
 'simultaneous contrast',
 'series',
 'chaos',
 'amplification',
 'genetics',
 'lasers',
 'artwork',
 'membrane',
 'newtons laws',
 'reflections',
 'ratio',
 'circuit',
 'ping pong balls',
 'dna',
 'levitation',
 'navigation',
 'curves',
 'illusions',
 'model organisms',
 'whales',
 'gases',
 'portraits',
 'sun',
 'bicycle wheel',
 'speakers',
 'identical twins',
 'pendulums',
 'dissection',
 'eyes',
 'nerves',
 'wind',
 'zebrafish',
 'heat transfer',
 'volcanoes',
 'marine',
 'pacific',
 'complexity',
 'turtles',
 'condensation',
 'appearance',
 'unicellular organisms',
 'tides',
 'ganzfeld effect',
 'afterimage',
 'fragmentation',
 'cmy retina',
 'balance',
 'bo

In [23]:
# tokenize and pos tag each phenomenon

phenom_token_tag = []
for phenomenon in phenomena_direct:
    words = word_tokenize(phenomenon)
    tags = pos_tag(words)
    phenom_token_tag.append(tags)

pp.pprint(phenom_token_tag)

[[('natural', 'JJ'), ('frequency', 'NN')],
 [('electric', 'JJ'), ('fields', 'NNS'), ('and', 'CC'), ('forces', 'NNS')],
 [('accelerated', 'VBN'), ('gravitational', 'JJ'), ('motion', 'NN')],
 [('afterimage', 'NN')],
 [('judgment', 'NN')],
 [('thin', 'JJ'), ('films', 'NNS')],
 [('frame', 'NN'), ('of', 'IN'), ('reference', 'NN')],
 [('nodes', 'NNS'), ('and', 'CC'), ('antinodes', 'NNS')],
 [('sublimation', 'NN')],
 [('bernoulli', 'NNS'), ('effect', 'NN')],
 [('subtractive', 'JJ'), ('color', 'NN'), ('mixing', 'NN')],
 [('size', 'NN'), ('distance', 'NN')],
 [('upwelling', 'VBG')],
 [('nonlinear', 'JJ'), ('behavior', 'NN')],
 [('information', 'NN'),
  ('processing', 'NN'),
  ('and', 'CC'),
  ('encoding', 'NN')],
 [('resonance', 'NN')],
 [('spatial', 'JJ'), ('perception', 'NN')],
 [('attention', 'NN')],
 [('melting', 'VBG')],
 [('visual', 'JJ'), ('edge', 'NN'), ('effects', 'NNS')],
 [('superposition', 'NN')],
 [('exponentials', 'NNS')],
 [('surface', 'NN'), ('tension', 'NN'), ('of', 'IN'), ('li

In [24]:
# tokenize and tag each keyword

keyword_token_tag = []
for keyword in keywords_direct:
    words = word_tokenize(keyword)
    tags = pos_tag(words)
    keyword_token_tag.append(tags)

pp.pprint(keyword_token_tag)

[[('lenses', 'NNS')],
 [('center', 'NN'), ('of', 'IN'), ('gravity', 'NN')],
 [('vibration', 'NN')],
 [('weather', 'NN')],
 [('identity', 'NN')],
 [('cornsweet', 'NN'), ('illusion', 'NN')],
 [('acoustics', 'NNS')],
 [('evaporation', 'NN')],
 [('chicken', 'NN'), ('wire', 'NN')],
 [('ocean', 'NN')],
 [('sharks', 'NNS')],
 [('neon', 'NN')],
 [('migrations', 'NNS')],
 [('huygens', 'NNS'), ('principle', 'NN')],
 [('mirrors', 'NNS')],
 [('turbulence', 'NN')],
 [('carbon', 'NN'), ('dioxide', 'NN')],
 [('parallel', 'NN')],
 [('phase', 'NN'), ('angle', 'NN')],
 [('parabolas', 'NNS')],
 [('organisms', 'NNS')],
 [('cosmic', 'JJ'), ('rays', 'NNS')],
 [('simultaneous', 'JJ'), ('contrast', 'NN')],
 [('series', 'NN')],
 [('chaos', 'NN')],
 [('amplification', 'NN')],
 [('genetics', 'NNS')],
 [('lasers', 'NNS')],
 [('artwork', 'NN')],
 [('membrane', 'NN')],
 [('newtons', 'NNS'), ('laws', 'NNS')],
 [('reflections', 'NNS')],
 [('ratio', 'NN')],
 [('circuit', 'NN')],
 [('ping', 'VBG'), ('pong', 'NN'), ('ba

In [25]:
# define master list of search terms:
# join keywords and phenomena
# keep unique elements

all_token_tag = phenom_token_tag + keyword_token_tag

unique_token_tag = []
[unique_token_tag.append(x) for x in all_token_tag if x not in unique_token_tag]

print('{} unique phenomena'.format(len(phenom_token_tag)))
print('{} unique keywords'.format(len(keyword_token_tag)))
print('{} unique terms in phenemena and keywords combined'.format(len(unique_token_tag)))

144 unique phenomena
114 unique keywords
252 unique terms in phenemena and keywords combined


In [26]:
# investigate distribution of tag sequences for the search terms
# a vast majority of the terms are noun phrases

all_tag_seq = []
for item in all_token_tag:
    tag_seq = ' '.join([subitem[1] for subitem in item])
    all_tag_seq.append(tag_seq)
all_tag_seq_pd = pd.Series(all_tag_seq)

print(all_tag_seq_pd.value_counts())

NN                           104
NNS                           38
JJ NN                         28
NN NN                         23
JJ NNS                        12
NN IN NN                       7
VBG                            6
NN NNS                         5
JJ NN NN                       4
VBN JJ NN                      3
NNS NN                         3
NNS CC NNS                     2
NN VBG                         2
VBG NNS                        2
NN IN JJ NN                    2
NN NNS IN NN                   1
JJ                             1
NNS NNS                        1
VBG NN NNS                     1
JJ VBZ NNS NN NN NN NN NN      1
NNS TO VB DT JJ JJ             1
JJ JJ                          1
RB VBG                         1
NN CC NN                       1
NN NN NN                       1
NNS VBP                        1
NN JJ NN                       1
JJ NNS CC NNS                  1
NN NN IN NNS                   1
JJ NN NNS                      1
NN NN CC N

In [27]:
# define related search terms (and the pos tags) for each main search terms

all_plus_related = []

for item in all_token_tag:
    text = ' '.join([word[0] for word in item])
    data = {
        'text': text,
        'primary': item
    }
    adj_nouns = get_adj_nouns(item)
    noun_nouns = get_noun_noun(item)
    vb_nouns = get_vb_nouns(item)
    adj_verbs = get_adj_verbs(item)
    nouns = get_nouns(item)
    verbs = get_verbs(item)
    adjectives = get_adjectives(item)
    related_items = adj_nouns
    related_items += nouns
    related_items += noun_nouns
    related_items += vb_nouns
    related_items += verbs
    related_items += adjectives
    related_items = [x for x in related_items if x]
    data['related'] = related_items
    all_plus_related.append(data)

pp.pprint(all_plus_related)

[{'primary': [('natural', 'JJ'), ('frequency', 'NN')],
  'related': [[('frequency', 'NN')], [('natural', 'JJ')]],
  'text': 'natural frequency'},
 {'primary': [('electric', 'JJ'),
              ('fields', 'NNS'),
              ('and', 'CC'),
              ('forces', 'NNS')],
  'related': [[('electric', 'JJ'), ('field', 'NN')],
              [('electric', 'JJ'), ('force', 'NN')],
              [('field', 'NN')],
              [('force', 'NN')],
              [('electric', 'JJ')]],
  'text': 'electric fields and forces'},
 {'primary': [('accelerated', 'VBN'),
              ('gravitational', 'JJ'),
              ('motion', 'NN')],
  'related': [[('gravitational', 'JJ'), ('motion', 'NN')],
              [('motion', 'NN')],
              [('accelerated', 'VBN'), ('motion', 'NN')],
              [('accelerated', 'VBN')],
              [('gravitational', 'JJ')]],
  'text': 'accelerated gravitational motion'},
 {'primary': [('afterimage', 'NN')],
  'related': [[('afterimage', 'NN')]],
  'text'

In [28]:
# separate lists of primary search terms and related search terms
# keep only related search terms which don't coincide with a primary search term

primary_items = []
related_items = []
related_items_unique = []

for item in all_plus_related:
    primary_items += [item['primary']]
    related_items += item['related']

[related_items_unique.append(x) for x in related_items if x not in related_items_unique + primary_items];

primary_items_pd = pd.Series(primary_items)
related_items_pd = pd.Series(related_items_unique)

In [29]:
print('There are {} unique main search terms'.format(len(primary_items)))
print('There are {} unique related search terms, none of which coincides with a main search term'.format((len(related_items_unique))))

There are 258 unique main search terms
There are 255 unique related search terms, none of which coincides with a main search term


In [30]:
# get wordnet synonym sets (with lemma names and definitions) for each word in each search phrase

terms_with_word_syns = []

for term in primary_items:
    this_term = []
    for word in term:
        syns = wn.synsets(word[0])
        syn_names = [syn.name() for syn in syns]
        syn_defs = [syn.definition() for syn in syns]
        this_word = {
            'word': word,
            'syns': syn_names,
            'defs': syn_defs
        }
        this_term.append(this_word)
    terms_with_word_syns.append(this_term)

In [31]:
pp.pprint(terms_with_word_syns)

[[{'defs': ['someone regarded as certain to succeed',
            'a notation cancelling a previous sharp or flat',
            '(craps) a first roll of 7 or 11 that immediately wins the stake',
            'in accordance with nature; relating to or concerning nature',
            'existing in or produced by nature; not artificial or imitation',
            'existing in or in conformity with nature or the observable world; '
            'neither supernatural nor magical',
            'functioning or occurring in a normal way; lacking abnormalities '
            'or deficiencies',
            '(of a musical note) being neither raised nor lowered by one '
            'chromatic semitone',
            'unthinking; prompted by (or as if by) instinct',
            '(used especially of commodities) being unprocessed or '
            'manufactured using only simple or minimal processes',
            'related by blood; not adopted',
            'being talented through inherited qualities',
   

In [32]:
# save all_plus_related to json file
# save primary_items and related_items_unique to csv files
# save terms_with_word_syns to json file

path = 'data/search_terms'
if not os.path.exists(path):
                os.makedirs(path)

all_filepath = os.path.join(path, 'primary_and_related.json')
if os.path.exists(all_filepath):
    os.remove(all_filepath)
with open(all_filepath, "w") as f:
    f.write(json.dumps(all_plus_related, indent=2))

primary_filepath = os.path.join(path, 'primary.csv')
primary_items_pd.to_csv(primary_filepath, index=False)

related_filepath = os.path.join(path, 'related.csv')
related_items_pd.to_csv(related_filepath, index=False)

syns_filepath = os.path.join(path, 'terms_with_word_syns.json')
if os.path.exists(syns_filepath):
    os.remove(syns_filepath)
with open(syns_filepath, 'w') as file:
    file.write(json.dumps(terms_with_word_syns, indent=2))

In [35]:
with open('data/search_terms/terms_with_word_syns.json') as file:
    terms_raw = json.load(file)

pp.pprint(terms_raw)

[[{'defs': ['someone regarded as certain to succeed',
            'a notation cancelling a previous sharp or flat',
            '(craps) a first roll of 7 or 11 that immediately wins the stake',
            'in accordance with nature; relating to or concerning nature',
            'existing in or produced by nature; not artificial or imitation',
            'existing in or in conformity with nature or the observable world; '
            'neither supernatural nor magical',
            'functioning or occurring in a normal way; lacking abnormalities '
            'or deficiencies',
            '(of a musical note) being neither raised nor lowered by one '
            'chromatic semitone',
            'unthinking; prompted by (or as if by) instinct',
            '(used especially of commodities) being unprocessed or '
            'manufactured using only simple or minimal processes',
            'related by blood; not adopted',
            'being talented through inherited qualities',
   

In [38]:
print(type(terms_raw[0][0]['word']))

<class 'list'>
