In [5]:
import os

import process_keywords
import json
import numpy as np
from nltk.corpus import stopwords
import pprint

from importlib import reload
# Re-import the local process_keywords module so edits made to it on disk
# take effect without restarting the kernel.
reload(process_keywords)

# Pretty-printer with a 2-space indent, available for inspecting nested data.
pp = pprint.PrettyPrinter(indent=2)



In [2]:
# import keyword data
#
# Builds `keyword_data`: one record per exhibit and per gallery, each holding
# the source item's id, its type ("exhibit" or "gallery") and its keywords.

HOME = os.getenv("PROJ_HOME")
if HOME is None:
    # Fail with an explicit message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise below.
    raise RuntimeError("PROJ_HOME environment variable is not set")

exhibits_filepath = os.path.join(HOME, "data/institutional/exhibits.json")
galleries_filepath = os.path.join(HOME, "data/institutional/galleries.json")

with open(exhibits_filepath, 'r') as file:
    exhibits = json.load(file)

keyword_data = []
for exhibit in exhibits:
    this_data = {
        "id": exhibit["id"],
        "type": "exhibit",
        "keywords": exhibit["keywords"]
    }
    keyword_data.append(this_data)

with open(galleries_filepath, 'r') as file:
    galleries = json.load(file)

for gallery in galleries:
    this_data = {
        # Fixed: this previously stored the literal list ["id"] rather than
        # the gallery's own id.
        "id": gallery["id"],
        "type": "gallery",
        "keywords": gallery["keywords"]
    }
    keyword_data.append(this_data)

In [3]:
# import article metadata
#
# Loads the article metadata JSON into `metadata`. Later cells iterate over it
# and read each item's "title" and "articleId".

# Fixed: the project root is stored in HOME (upper case); the lower-case
# `home` was never defined and raised NameError on a fresh kernel.
metadata_filepath = os.path.join(HOME, "data/subject_matter/metadata/advanced.json")

with open(metadata_filepath, "r") as infile:
    metadata = json.load(infile)

In [4]:
# create lists of all keywords, and all unique keywords

# Flatten every record's keyword list into one list (duplicates kept).
all_keywords = [kw for entry in keyword_data for kw in entry["keywords"]]

# Deduplicate; the resulting order is whatever the set yields.
unique_keywords = list(set(all_keywords))

for template, collection in (
    ("There are {} keywords", all_keywords),
    ("There are {} unique keywords", unique_keywords),
):
    print(template.format(len(collection)))

There are 1657 keywords
There are 938 unique keywords


In [6]:
# tokenize and vectorize keywords
#
# Produces two lists parallel to `unique_keywords`:
#   keyword_tokens[i] - tokens returned by clean_tokenize for keyword i
#   keyword_vecs[i]   - vectors returned by vectorize for those tokens

# Load the English stopword list once instead of once per keyword.
# NOTE(review): assumes clean_tokenize does not mutate the list it is given.
english_stopwords = stopwords.words("english")

keyword_tokens = []
keyword_vecs = []

for keyword in unique_keywords:
    tokens = process_keywords.clean_tokenize(keyword, stopwords=english_stopwords)
    keyword_tokens.append(tokens)
    vectors = process_keywords.vectorize(tokens)
    keyword_vecs.append(vectors)

In [7]:
# get GloVe embeddings for titles
#
# Maps each article title to the mean of its token vectors. Titles whose
# tokens yield no vectors, or whose mean vector is all zeros, are left out
# of `title_embeddings`. `unique_titles` still lists every distinct title.

# Load the English stopword list once instead of once per title.
# NOTE(review): assumes clean_tokenize does not mutate the list it is given.
english_stopwords = stopwords.words("english")

title_embeddings = {}

for item in metadata:
    title = item["title"]
    tokens = process_keywords.clean_tokenize(title, stopwords=english_stopwords)
    vectors = np.array(process_keywords.vectorize(tokens))
    if vectors.size == 0:
        # Averaging an empty array gives NaN (with a RuntimeWarning), which
        # would either break the check below or be stored as a bogus
        # "non-trivial" embedding; skip such titles instead.
        continue
    avg_vector = np.mean(vectors, axis=0)
    # Keep the title only if at least one component of the mean is non-zero.
    if np.any(avg_vector):
        title_embeddings[title] = avg_vector

unique_titles = list({item["title"] for item in metadata})

In [8]:
# Number of titles kept in title_embeddings, i.e. those whose averaged token
# vector was not all zeros.
print("There are {} non-trivial title embeddings".format(len(title_embeddings)))

There are 70872 non-trivial title embeddings


In [13]:
# find "nearest" titles for each unique keyword

title_matches = []

for keyword in unique_keywords:
    # Query both matching strategies, asking each for num=1 result.
    candidates = []
    for strategy in ("embedding", "fuzz-matching"):
        candidates += process_keywords.find_nearest_titles(
            keyword,
            titles=unique_titles,
            title_embeddings=title_embeddings,
            method=strategy,
            num=1,
        )
    # Collapse duplicates when both strategies return the same title, then
    # add this keyword's matches to the running list.
    title_matches.extend(set(candidates))

In [14]:
# The same title can be matched by several keywords; deduplicate across them.
unique_title_matches = [*set(title_matches)]

for template, collection in (
    ("There are {} titles matched", title_matches),
    ("There are {} unique titles matched", unique_title_matches),
):
    print(template.format(len(collection)))

There are 1024 titles matched
There are 919 unique titles matched


In [15]:
# get article_ids for title matches
#
# Collects the "articleId" of every metadata item whose title is one of the
# matched titles. Several items may share a title, so the result can be
# longer than `unique_title_matches`.

# Index the metadata in one pass rather than rescanning the whole list for
# every matched title (the original was O(titles x metadata)).
wanted_titles = set(unique_title_matches)
ids_by_title = {}
for item in metadata:
    title = item["title"]
    if title in wanted_titles:
        ids_by_title.setdefault(title, []).append(item["articleId"])

# Emit ids grouped by title in unique_title_matches order, and in metadata
# order within each title, which is the same ordering the nested loops gave.
article_id_matches = [
    article_id
    for title in unique_title_matches
    for article_id in ids_by_title.get(title, [])
]

In [16]:
# Every metadata item whose title matched contributes one articleId, so this
# count can be larger than the number of unique matched titles.
print("There are {} matched articles".format(len(article_id_matches)))

There are 969 matched articles


In [17]:
# cache the list of article_ids
#
# Writes `article_id_matches` as a JSON array to the cache file, replacing
# any previous contents.

article_ids_filepath = os.path.join(HOME, "data/cache/article_ids.json")

# Create the cache directory if it is missing so a fresh checkout does not
# fail when opening the file.
os.makedirs(os.path.dirname(article_ids_filepath), exist_ok=True)

# Mode "w" truncates an existing file, so no explicit os.remove is needed.
with open(article_ids_filepath, "w") as outfile:
    json.dump(article_id_matches, outfile)