In [131]:
import json
import pandas as pd
import os
import pprint
from random import randrange
import collections

In [118]:
pp = pprint.PrettyPrinter(indent=2)

In [119]:
# import exhibit-level data

with open('data/scraped_data/exhibits.json', 'r') as f:
    exhibits = json.load(f)

with open('data/entities/exhibits.json', 'r') as f:
    exhibit_entities = json.load(f)

In [120]:
exhibit_fields = list(exhibits[0].keys())
print('Exhibit fields:', exhibit_fields)

Exhibit fields: ['id', 'title', 'aliases', 'location', 'byline', 'collection_id', 'related_id', 'tagline', 'description', 'whats_going_on', 'going_further', 'details', 'phenomena', 'keywords']


In [141]:
N = randrange(len(exhibits))
rand_exhibit_url = 'www.exploratorium.edu/exhibits/' + exhibit_entities[N]['id']   # fix this pointer!
print('Random exhibit:{}'.format(rand_exhibit_url))
print('Random exhibit:')
pp.pprint(exhibits[N])

Random exhibit:www.exploratorium.edu/exhibits/aeolian-harp
Random exhibit:
{ 'aliases': 'Praxinoscope Tower',
  'byline': 'Richard O. Brown and Jessica Strick , 2012',
  'collection_id': [],
  'description': '',
  'details': '',
  'going_further': '',
  'id': 'animation-tower',
  'keywords': [],
  'location': 'Osher Gallery 1: Human Phenomenon',
  'phenomena': [],
  'related_id': [ 'robot-dance',
                  'animation-station',
                  'magic-wand',
                  'flourish-bloom'],
  'tagline': 'Quick-changing views create the illusion of motion.',
  'title': 'Animation Tower',
  'whats_going_on': ''}


In [122]:
exhibit_entity_fields = list(exhibit_entities[0].keys())
print('Exhibit entity fields:', exhibit_entity_fields)

Exhibit entity fields: ['id', 'grouping', 'category', 'entities']


In [140]:
N = randrange(len(exhibit_entities))
rand_exhibit_entities_url = 'www.exploratorium.edu/exhibits/' + exhibit_entities[N]['id']   # fix this pointer!
print('Random exhibit:{}'.format(rand_exhibit_entities_url))
print('Random exhibit entities:')
pp.pprint(exhibit_entities[N])

Random exhibit:www.exploratorium.edu/exhibits/give-heart-cells-a-beat
Random exhibit entities:
{ 'category': 'tagline',
  'entities': [ { 'name': 'heart rate',
                  'salience': 0.46025702357292175,
                  'source': 'gcp',
                  'type': 'OTHER'},
                { 'name': 'beating',
                  'salience': 0.3046810030937195,
                  'source': 'gcp',
                  'type': 'OTHER'},
                { 'name': 'human heart cells',
                  'salience': 0.23506198823451996,
                  'source': 'gcp',
                  'type': 'OTHER'}],
  'grouping': 'exhibits',
  'id': 'give-heart-cells-a-beat'}


In [124]:
# delete items in exhibit_entities with an empty 'entities' field
# (these result fom empty text files, should have been fixed earlier)

empty_items = []
for item in exhibit_entities:
    if len(item['entities']) == 0:
        empty_items.append(exhibit_entities.index(item))

exhibit_entities = [item for item in exhibit_entities if exhibit_entities.index(item) not in empty_items]

In [125]:
# add a 'source' key to the entities dict, and assign a value of 'gcp' to the exisiting entity items

for item in exhibit_entities:
    for entity in item['entities']:
        entity.update({'source': 'gcp'})

In [126]:
# find unique keywords (from scraped data)

keywords = [exhibit['keywords'] for exhibit in exhibits]
keywords = [item for sublist in keywords for item in sublist]
unique_keywords = pd.Series(list(set(keywords)))

print('Number of unique keywords: {}'.format(len(unique_keywords)))
print('Unique keywords: {}'.format(unique_keywords.tolist()))

Number of unique keywords: 114
Unique keywords: ['artwork', 'balance', 'cells movement animation', 'human generosity project', 'membrane', 'musical instruments', 'portraits', 'ratio', 'soap', 'phase angle', 'pressure', 'zebrafish', 'Ocean', 'fragmentation', 'unicellular organisms', 'magnification', 'illusions', 'carbon dioxide', 'attentive listening', 'Sharks', 'eyes', 'ping pong balls', 'chaos', 'proportion', 'boundary', 'Whales', 'Marine', 'identical twins', 'focal point', 'bubbles', 'afterimage', 'ganzfeld effect', 'speakers', 'wind', 'complexity', 'sundial', 'parallel', 'comets', 'magnet', 'appearance', 'air currents', 'prisms', 'magnetic field', 'habitat', 'reflections', 'fish eggs', 'Turtles', 'cornsweet illusion', 'genetics', 'amplification', 'cells', 'photography', 'tides', 'turbulence', 'mirrors', 'scale', 'vibration', 'nerves', 'plankton', 'cooperation', 'dissection', 'Migrations', 'rotation', 'gravity', 'organisms', 'acoustics', 'pendulums', 'identity', 'electrolysis', 'spin

In [127]:
# add entries corresponding to each instance of each keyword above (some keywords correspond to more than one exhibit)
# to the exhibit_entiteis data

for exhibit in exhibits:
    if len(exhibit['keywords']) == 0:
        continue
    exhibit_id = exhibit['id']
    these_entities = []
    for keyword in exhibit['keywords']:
        these_entities.append(
            {'name': keyword,
             'type': 'keyword',
             'salience': 1.0,
             'source': 'exploratorium.com'
            }
        )
    exhibit_entities.append(
            {'id': exhibit_id,
             'grouping': 'exhibits',
             'field': 'exhibit',
             'entities': these_entities
            }
    )

In [128]:
# find unique phenomena (from scraped data)

phenomena = [exhibit['phenomena'] for exhibit in exhibits]
phenomena = [item for sublist in phenomena for item in sublist]
unique_phenomena = pd.Series(list(set(phenomena)))

print('Number of unique phenemena: {}'.format(len(unique_phenomena)))
print('Unique phenomena: {}'.format(unique_phenomena.tolist()))

Number of unique phenemena: 144
Unique phenomena: ['Modeling Landscapes', 'Refraction', 'Metacognition', 'Reproduction', 'Frequency', 'Judgment', 'Motion: Visual Motion Detection', 'Waves: Transverse', 'Fatigue', 'Polarization', 'Binocular Vision', 'Electromagnetic Forces', 'Surface Tension of Liquids', 'Embryonic Development', 'Fluorescence', 'Harmonics', 'Motion: Rotational Motion', 'Numbers', 'Magnetization', 'Images: Real', 'Upwelling', 'Wave Excitation', 'Torque', 'Shadows', 'Electric Fields and Forces', 'Thin Films', 'Phase', 'Persistence of Vision', 'Energy: Transformation of Energy', 'Visual Edge Effects', 'Vortices', 'Depth Perception', 'Projections', 'Melting', 'Randomness', 'Exponentials', 'Conservation of Angular Momentum', 'Parallax', 'Microscopy', 'Temperature: Effect of', 'Sublimation', 'Decomposition', 'Motion: Accelerated Linear Motion', 'Evaporation', 'Image Formation', 'Waves: Standing', 'Dispersion', 'Absorption: Light', 'Attention', 'Motion: Accelerated Rotational 

In [129]:
# add entries corresponding to each instance of each phenomenon above (some phenomena correspond to more than one exhibit)
# to the exhibit_entiteis data

for exhibit in exhibits:
    if len(exhibit['phenomena']) == 0:
        continue
    exhibit_id = exhibit['id']
    these_entities = []
    for phenomenon in exhibit['phenomena']:
        these_entities.append(
            {'name': phenomenon,
             'type': 'phenomenon',
             'salience': 1.0,
             'source': 'exploratorium.com'
            }
        )
    exhibit_entities.append(
            {'id': exhibit_id,
             'grouping': 'exhibits',
             'field': 'exhibit',
             'entities': these_entities
            }
    )

In [134]:
# some items in exhibit_entities list the same entity name more than once
# this is an anttempt to combine these instances
# needs to be fixed

def merge_entities(entities):
    entities_deque = collections.deque(entities)
    orig_entities = [entities[k]['name'] for k in range(len(entities))]
    for name, count in collections.Counter(orig_entities).items():

        for i in range(count):
            # find ith instance of  with given name

            # ind = index of this instance
            # Compute the max, etc. of the salience for entity
        {'name': name},

for item in exhibit_entities:
    orig_entities = []
    for entity in item['entities']:
        orig_entities.append(entity['name'])
    dupes = [item for item, count in collections.Counter(orig_entities).items() if count > 1]
    unique_entities = []
    for entity in item['entities']:
        if entity['name'] in dupes:
            max_salience = max()
            unique_entities.append(
                {'name': entity['name'],

                }
            )


[]
['1986']
['1979']
['1976']
['1977']
['2003']
['2002']
['1975']
['2013']
['2013']
['2001']
['2010']
['2001']
['1988']
['2016']
['1986']
['1986']
['2009']
['2000']
['2012']
['2013']
['1998']
['2013']
[]
['2013']
['1999']
['1979']
['2001']
['1977']
['2003']
['1983']
['2010']
['1984']
[]
['2017']
['2013']
['2000']
['2001']
['2013']
['1971']
['2012']
['1994']
['1989']
['2019']
['1975']
['1980']
['2013']
['1993']
['2016']
['2017']
['1969']
['2012']
['2013']
['2013']
['2006']
['1978']
['1973']
['1985']
['2013']
['1969']
['2019']
['1987']
['2013']
['1976']
[]
['2013']
['2013']
['2013']
['2014']
['2017']
['2014']
['2014']
['1997']
['2006']
['1981']
[]
['2019']
['1994']
['2012']
['2001']
['2016']
['1973']
['1969']
['2015']
['2017']
['2012']
['2006']
[]
[]
['1996']
['2017']
['2019']
['2005']
['2013']
['2013']
['2006']
['2010']
['2013']
['2011']
['1980']
['1970']
['2009']
['2006']
['2014']
['1979']
['2005']
[]
['2019']
['1980']
['1992']
['1993']
['2006']
[]
['2001']
['2007']
['2010']
['2001']
[

In [None]:
# import gallery-level data

with open('data/scraped_data/galleries.json', 'r') as f:
    galleries = json.load(f)

with open('data/entities/galleries.json', 'r') as f:
    gallery_entities = json.load(f)

In [62]:
gallery_fields = list(galleries[0].keys())
print('Gallery fields:', gallery_fields)

del gallery_entities['curator_url']

Gallery fields: ['id', 'title', 'tagline', 'description', 'curator_url', 'curator_statement']


In [143]:
N = randrange(len(galleries))
rand_gallery_url = 'www.exploratorium.edu/visit/' + galleries[N]['id']   # fix this pointer!
print('Random gallery:{}'.format(rand_gallery_url))
print('Random gallery:')
pp.pprint(galleries[N])

Random gallery:www.exploratorium.edu/visit/gallery-5
Random gallery:
{ 'curator_statement': 'The guiding principle of the Gallery 5 is to support '
                       'and expand the Exploratorium’s role as a community '
                       'museum dedicated to awareness. Helping to reinvent the '
                       'civic role of a public museum as a place to gather and '
                       'exchange ideas, the gallery also exemplifies how '
                       'direct observations of natural and urban phenomena can '
                       'blossom into artistic endeavors, scientific '
                       'investigations, and open-ended inquiries. The gallery '
                       'features a combination of large- and small-scale '
                       'exhibits, rotating art installations, and public '
                       'programs  (including vendors, performance artists, and '
                       'public exhibitions). Our defining location on the '


In [64]:
gallery_entity_fields = list(gallery_entities[0].keys())
print('Gallery entity fields:', gallery_entity_fields)

Gallery entity fields: ['id', 'grouping', 'category', 'entities']


In [144]:
N = randrange(len(gallery_entities))
rand_gallery_entities_url = 'www.exploratorium.edu/visit/' + gallery_entities[N]['id']   # fix this pointer!
print('Random gallery:{}'.format(rand_gallery_entities_url))
print('Random gallery entities:')
pp.pprint(gallery_entities[N])

Random gallery:www.exploratorium.edu/visit/gallery-1
Random gallery entities:
{ 'category': 'description',
  'entities': [ { 'name': 'Humans',
                  'salience': 0.4206271469593048,
                  'type': 'PERSON'},
                { 'name': 'phenomena',
                  'salience': 0.16239593923091888,
                  'type': 'OTHER'},
                { 'name': 'exploration',
                  'salience': 0.1154911071062088,
                  'type': 'OTHER'},
                { 'name': 'investigation',
                  'salience': 0.11413095146417618,
                  'type': 'EVENT'},
                { 'name': 'others',
                  'salience': 0.05896306410431862,
                  'type': 'PERSON'},
                { 'name': 'others',
                  'salience': 0.04463204741477966,
                  'type': 'PERSON'},
                { 'name': 'interactions',
                  'salience': 0.039267830550670624,
                  'type': 'OTHER'},
         

In [None]:
# delete items in gallery_entities which have an empty 'entities' field

empty_items = []
for item in gallery_entities:
    if len(item['entities']) == 0:
        empty_items.append(gallery_entities.index(item))

gallery_entities = [item for item in gallery_entities if gallery_entities.index(item) not in empty_items]