In [257]:
import os
import re

from dotenv import load_dotenv
load_dotenv()

import json
from bs4 import BeautifulSoup

from get_articles import get_data
from get_articles import process_articles

from importlib import reload
reload(get_data)
reload(process_articles)

import pprint
pp = pprint.PrettyPrinter(indent=2)

In [268]:
# Project root is taken from the PROJ_HOME environment variable (which may be
# supplied through the .env file read by load_dotenv()). Fail immediately with
# a readable message when it is missing, instead of letting os.path.join(None, ...)
# raise an opaque TypeError on the next line.
home = os.getenv("PROJ_HOME")
if home is None:
    raise RuntimeError(
        "PROJ_HOME is not set; define it in the environment or in the .env file "
        "read by load_dotenv()"
    )

# All data locations used by this notebook, relative to the project root.
institutional_data_path = os.path.join(home, "data/institutional")
subject_matter_path = os.path.join(home, "data/subject_matter")
article_ids_path = os.path.join(home, "data/cache/article_ids.json")
xml_dir = os.path.join(home, "data/subject_matter/xml")
text_dir = os.path.join(home, "data/cache/subject_matter_text")

In [368]:
# get encyclopedia metadata

def get_metadata(sources=["advanced"], path=None):
    """Fetch encyclopedia metadata per source, drop untitled entries, save as JSON.

    Args:
        sources: iterable of source names. Each name is passed to
            ``get_data.get_encyclopedia_metadata`` and is also used as the
            stem of the output file (``<source>.json``). The default list is
            never mutated here.
        path: directory for the JSON files; created if missing. It is also
            forwarded unchanged to ``get_data.get_encyclopedia_metadata``
            (what that helper does with it is not visible from this cell).
            When ``None``, nothing is written to disk.

    Returns:
        dict mapping each source name to its list of article records whose
        ``'title'`` is not the empty string.

    Raises:
        KeyError: if a fetched record has no ``'title'`` key.
    """
    articles = {}

    for source in sources:
        fetched = get_data.get_encyclopedia_metadata(source, path)
        # Build a filtered copy. Calling list.remove() while iterating the same
        # list skips the element that follows each removal, so runs of
        # consecutive empty-title articles were only partially removed.
        articles[source] = [article for article in fetched if article['title'] != '']

    if path is not None:
        os.makedirs(path, exist_ok=True)

        for source in sources:
            filepath = os.path.join(path, source + '.json')
            # Mode "w" truncates an existing file, so no explicit delete is needed.
            with open(filepath, "w") as outfile:
                outfile.write(json.dumps(articles[source], indent=2))

    return articles

In [176]:
# Load the "advanced" metadata that was previously saved to disk.
# To rebuild the file from the source first, run:
#     advanced_metadata = get_metadata(path=subject_matter_path)

filepath = os.path.join(subject_matter_path, "advanced.json")
with open(filepath, "r") as infile:
    advanced_metadata = json.loads(infile.read())

In [177]:
# Read the stored collection of matched article ids.

with open(article_ids_path, "r") as infile:
    matched_article_ids = json.load(infile)

# Keep only the metadata records whose articleId is among the matched ids,
# and rename the camelCase keys to snake_case along the way.

matched_metadata = [
    {
        "article_id": record["articleId"],
        "title": record["title"],
        "last_updated": record["lastUpdated"],
    }
    for record in advanced_metadata
    if record["articleId"] in matched_article_ids
]

In [149]:
# Request the XML content of every matched article from the API.
# xml_dir is handed to the helper as dir_path; presumably the helper stores
# one file per article there (the parsing cell below reads from xml_dir).

for article_id in matched_article_ids:
    xml = get_data.get_article_xml(dir_path=xml_dir, article_id=article_id)

In [262]:
# Extract the text of each downloaded article as a list of strings, one string
# per paragraph. The extraction and storage are done by
# get_data.get_article_paragraphs, which is given text_dir as dir_path.

for filename in os.listdir(xml_dir):
    stem, ext = os.path.splitext(filename)
    # Only "<article_id>.xml" entries are articles; skip anything else in the
    # directory (hidden files, checkpoints, ...) instead of crashing on int().
    if ext != ".xml":
        continue
    # Use the real stem rather than str.rstrip(".xml"): rstrip removes any
    # trailing run of the characters '.', 'x', 'm', 'l', so a name such as
    # "12x.xml" would silently be read as id 12.
    article_id = int(stem)
    filepath = os.path.join(xml_dir, filename)
    # Read raw bytes so BeautifulSoup can work out the document's encoding
    # itself instead of relying on the platform's default text encoding.
    # NOTE(review): assumes the files hold the XML bytes as received - confirm
    # against how get_data.get_article_xml writes them.
    with open(filepath, "rb") as infile:
        data = infile.read()
    xml_data = BeautifulSoup(data, "xml")
    get_data.get_article_paragraphs(xml_data, article_id=article_id, dir_path=text_dir)

In [263]:
# Join each matched metadata record with its paragraph list, read from
# "<article_id>.json" in text_dir. The result, matched_data, holds one new dict
# per record; the dicts in matched_metadata are left untouched.

matched_data = []
for item in matched_metadata:
    article_id = item["article_id"]
    filepath = os.path.join(text_dir, "{}.json".format(article_id))
    with open(filepath, "r") as infile:
        paragraphs = json.load(infile)
    matched_data.append(dict(item, paragraphs=paragraphs))

In [264]:
# Clean every matched article. The first paragraph goes through clean_par_0
# together with the title; its five return values are unpacked as the cleaned
# paragraph, a new title, aliases, a grouping (stored under "field") and an
# is_person value. All remaining paragraphs go through clean_par_n.
# NOTE: records in matched_data are modified in place, so running this cell a
# second time sends already-cleaned titles and paragraphs through the cleaners.

for item in matched_data:
    title = item["title"]
    raw_paragraphs = item["paragraphs"]
    par_0_new, title_new, aliases, grouping, is_person = process_articles.clean_par_0(
        paragraph=raw_paragraphs[0], title=title
    )
    item["title"] = title_new
    item["paragraphs"] = [par_0_new] + [
        process_articles.clean_par_n(par) for par in raw_paragraphs[1:]
    ]
    item["aliases"] = aliases
    item["field"] = grouping
    item["is_person"] = is_person

In [269]:
# Save the combined, cleaned records as indented JSON in the subject-matter
# data directory.
outfile_path = os.path.join(subject_matter_path, "article_data.json")
with open(outfile_path, "w") as outfile:
    outfile.write(json.dumps(matched_data, indent=2))