In [233]:
# stdlib access to environment variables and path handling
import os
from dotenv import load_dotenv
# read variables from a .env file into the process environment
# (presumably where PROJ_HOME, read below via os.getenv, is defined -- confirm)
load_dotenv()

import json

# project-local helpers: `scripts` holds the text post-processing functions and
# `pretrained_models` the calls to the Google entity and OpenAI completion models
from preprocessing import scripts, pretrained_models
from importlib import reload
# re-execute the project modules so edits made on disk take effect
# without restarting the kernel
reload(scripts)
reload(pretrained_models)

import pprint
# pretty-printer (2-space indent) for inspecting nested records interactively
pp = pprint.PrettyPrinter(indent=2)

In [95]:
# import exhibit, gallery data

# PROJ_HOME is the project root directory. os.getenv returns None when the
# variable is unset, which would otherwise surface as an opaque TypeError
# inside os.path.join, so fail fast with an actionable message instead.
home = os.getenv("PROJ_HOME")
if home is None:
    raise RuntimeError(
        "PROJ_HOME environment variable is not set; "
        "define it in the environment or in the .env file"
    )
exhibits_filepath = os.path.join(home, "data/raw/exhibits.json")
galleries_filepath = os.path.join(home, "data/raw/galleries.json")

# pin the encoding: open() otherwise falls back to the platform locale,
# which can mis-decode non-ASCII text in the JSON files
with open(exhibits_filepath, 'r', encoding="utf-8") as file:
    exhibits = json.load(file)

with open(galleries_filepath, 'r', encoding="utf-8") as file:
    galleries = json.load(file)

In [140]:
# transform raw exhibit data into data containing the following fields:
# id, title, aliases (list),
# tagline, description, details,
# creators, year (both obtained with Cloud Natural Language models),
# location (coded), related exhibits, collections,
# keywords (some defined in raw data, some obtained with OpenAI models),
# short-summary, medium-summary (both obtained with OpenAI models),
# fun-facts (obtained with OpenAI models)

def init_exhibit(exhibit):
    """Build a clean exhibit record from one raw exhibit record.

    Args:
        exhibit: mapping with the keys "id", "title", "aliases", "tagline",
            "description", "location", "whats_going_on", "going_further",
            "details", "related_id", "collection_id", "phenomena",
            "keywords" and "byline".

    Returns:
        A new dict with the keys "id", "title", "aliases", "tagline",
        "description", "location", "details", "related_exhibits",
        "collections", "keywords", "creators", "year", "short-summary",
        "medium-summary" and "fun-facts". "medium-summary" is derived from an
        empty completion when the exhibit has 200 characters of free text
        or fewer.

    Raises:
        KeyError: if `exhibit` lacks one of the keys listed above.
    """
    # initialize clean exhibit
    this_exhibit = {
        "id": exhibit["id"],
        "title": exhibit["title"],
        "aliases": scripts.parse_aliases(exhibit["aliases"]),
        "tagline": exhibit["tagline"],
        "description": scripts.remove_lang_settings(exhibit["description"]),
        "location": scripts.get_location_code(exhibit["location"]),
        "details": " ".join([exhibit["whats_going_on"], exhibit["going_further"], exhibit["details"]]).strip(),
        "related_exhibits": exhibit["related_id"],
        "collections": exhibit["collection_id"],
        "keywords": exhibit["phenomena"] + exhibit["keywords"]
    }

    # parse byline into creators and year
    byline_entities = pretrained_models.get_google_entities(exhibit["byline"])
    creators = scripts.get_creators(byline_entities)
    year = scripts.get_year(byline_entities)

    this_exhibit.update({
        "creators": creators,
        "year": year,
    })

    # define text field that will be passed to OpenAI models
    # NOTE(review): this uses the raw description, not the cleaned one stored
    # in this_exhibit["description"] -- confirm that is intended.
    text_parts = [
        exhibit["description"],
        exhibit["details"],
        exhibit["whats_going_on"],
        exhibit["going_further"],
    ]
    # separate the fields with spaces (skipping empty ones) so the last word of
    # one field is not fused with the first word of the next in the prompt
    text_info = " ".join(part for part in text_parts if part)

    all_text = " ".join(this_exhibit["aliases"] +
                        [
                            exhibit["title"],
                            exhibit["tagline"],
                            text_info
                        ] +
                        exhibit["keywords"] + exhibit["phenomena"]
                        )

    # extract short summary of exhibit, then process it
    short_summary = pretrained_models.get_openai_completion(
        engine="text-davinci-001",
        prompt_type="short-summary",
        domain="exhibit",
        text=all_text,
        audience="an 8th grader",
        temp=0.2
    )
    short_summary = scripts.remove_frag_start(short_summary)

    # extract medium summary of exhibit, if enough data about the exhibit is available; then process
    # (the threshold counts field characters only, not the separators added
    # above, so the same exhibits qualify as before)
    enough_data = sum(len(part) for part in text_parts) > 200
    if enough_data:
        medium_summary = pretrained_models.get_openai_completion(
            engine="text-davinci-001",
            prompt_type="medium-summary",
            domain="exhibit",
            text=all_text,
            audience="an 8th grader",
            temp=0.2
        )
    else:
        medium_summary = ""
    medium_summary = scripts.remove_frag_start(medium_summary)

    # extract new keywords from OpenAI model
    new_keywords = pretrained_models.get_openai_completion(
        engine="text-davinci-001",
        prompt_type="keywords",
        domain="exhibit",
        text=all_text,
        temp=0.2
    )
    new_keywords = scripts.find_items(new_keywords)
    all_keywords = scripts.process_keywords(this_exhibit["keywords"], new_keywords)

    # get fun facts about exhibit from OpenAI
    fun_facts = pretrained_models.get_openai_completion(
        engine="text-davinci-001",
        prompt_type="fun-facts",
        domain="exhibit",
        text=all_text,
        temp=0.2
    )
    fun_facts = scripts.find_items(fun_facts, short=False)

    # "keywords" already exists, so it is overwritten in place with the merged list
    this_exhibit.update({
        "short-summary": short_summary,
        "medium-summary": medium_summary,
        "keywords": all_keywords,
        "fun-facts": fun_facts,
    })

    return this_exhibit

In [248]:
# turn a raw gallery record into a clean one with these fields:
# id, title, tagline, description, curator_statement (copied from the raw data),
# short-summary, medium-summary, keywords, fun-facts (all obtained with OpenAI models)

def init_gallery(gallery):
    """Return a clean gallery record built from one raw gallery record.

    The raw "id", "title", "tagline", "description" and "curator_statement"
    values are copied over, then four OpenAI completions (short summary,
    medium summary, keywords, fun facts) are requested for the combined text
    and post-processed with the helpers in `scripts`.
    """
    gallery_id = gallery["id"]
    title = gallery["title"]
    tagline = gallery["tagline"]
    description = gallery["description"]
    curator_statement = gallery["curator_statement"]

    # single text blob handed to every OpenAI prompt
    prompt_text = " ".join([
        title,
        tagline,
        description,
        "Curator Statement:",
        curator_statement,
    ])

    def complete(prompt_type, **options):
        # every request shares engine, domain and text; only the prompt type
        # and the extra options differ
        return pretrained_models.get_openai_completion(
            engine="text-davinci-001",
            prompt_type=prompt_type,
            domain="gallery",
            text=prompt_text,
            **options
        )

    # summaries: request, then post-process with remove_frag_start
    short_summary = scripts.remove_frag_start(
        complete("short-summary", audience="an 8th grader")
    )
    medium_summary = scripts.remove_frag_start(
        complete("medium-summary", audience="an 8th grader")
    )

    # keywords and fun facts come back as text that find_items splits into items
    keywords = scripts.find_items(complete("keywords", temp=0.2))
    fun_facts = scripts.find_items(complete("fun-facts", temp=0.5), short=False)

    return {
        "id": gallery_id,
        "title": title,
        "tagline": tagline,
        "description": description,
        "curator_statement": curator_statement,
        "short-summary": short_summary,
        "medium-summary": medium_summary,
        "keywords": keywords,
        "fun-facts": fun_facts,
    }

In [253]:
# run every raw exhibit through init_exhibit, collecting the clean records in order

init_exhibits = []
# extend() consumes the map lazily, so records finished before a failing call stay in the list
init_exhibits.extend(map(init_exhibit, exhibits))

In [None]:
# run every raw gallery through init_gallery, collecting the clean records in order

init_galleries = []
# extend() consumes the map lazily, so records finished before a failing call stay in the list
init_galleries.extend(map(init_gallery, galleries))

In [None]:
# save a copy of exhibit and gallery data to disk

init_exhibits_cache_path = os.path.join(home, "data/cache/init_exhibits.json")
# create the cache directory if it is missing, so the records built above are
# not lost to a FileNotFoundError at write time
os.makedirs(os.path.dirname(init_exhibits_cache_path), exist_ok=True)
with open(init_exhibits_cache_path, "w", encoding="utf-8") as outfile:
    json.dump(init_exhibits, outfile, indent=2)

init_galleries_cache_path = os.path.join(home, "data/cache/init_galleries.json")
os.makedirs(os.path.dirname(init_galleries_cache_path), exist_ok=True)
with open(init_galleries_cache_path, "w", encoding="utf-8") as outfile:
    json.dump(init_galleries, outfile, indent=2)

In [None]:
# TODO: the following fixes are still needed:
# - check for duplicate keywords (does this matter?)
# - decide whether non-ASCII text should be transliterated (e.g. with unidecode)
# - add sentence-parsing to fun-facts processing