In [1]:
import pymongo 
import json 
import pandas as pd
from tqdm import tqdm
from pprint import pprint

In [2]:
# Open a client against the local MongoDB server and select the database
# that holds the scraped SEP entries.
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)
db = client['visualizing_sep']

In [3]:
def _collect_domain_tags(entries):
    """Return the sorted, de-duplicated domain tags found across ``entries``.

    Each entry's ``domain_tags`` string is split on ``;`` and then on ``,``.
    Fragments are stripped *before* the emptiness check so that whitespace-only
    fragments (e.g. from a trailing ``"; "``) never become an empty menu item.
    """
    tags = set()
    for entry in entries:
        for semisplit_tag in entry['domain_tags'].split(';'):
            for commasplit_tag in semisplit_tag.split(','):
                tag = commasplit_tag.strip()
                if tag:
                    tags.add(tag)
    return sorted(tags)


def create_sep_network_json(sep_collection,
                            output_path='../../static/sep_network.json',
                            edition='Winter 2020',
                            edition_url='https://plato.stanford.edu/archives/win2020'):
    """Export the JSON file that makes up the network graph.

    Parameters
    ----------
    sep_collection : iterable of dict
        The entries to export. Each entry is read for the keys ``inpho_json``
        (a mapping with a ``type`` key), ``preamble_text``, ``main_text``,
        ``page_url``, ``toc``, ``title``, ``author``, ``pubdate``,
        ``primary_domain``, ``domain_tags`` and ``outlinks`` (an iterable of
        mappings with a ``link`` key).
    output_path : str, optional
        Where the JSON document is written. Defaults to the original
        hard-coded location.
    edition, edition_url : str, optional
        Archive edition label and base URL; update these for each archive
        update. ``edition_url`` is prepended to each entry's ``page_url``.

    Returns
    -------
    None
        The result is written to ``output_path`` as UTF-8 JSON with the
        top-level keys ``sepData``, ``articles`` (``nodes`` / ``links``) and
        ``domains``.

    Raises
    ------
    KeyError
        If an entry is missing one of the keys listed above.
    """
    # Materialise once: the entries are walked several times below, which
    # would exhaust a one-shot iterable.
    entries = list(sep_collection)

    sepData = {
        'Edition': edition,
        'EditionURL': edition_url
    }

    # Extract the unique list of domain tags for the primary domain menu.
    # This is done separately from 'primary_domain' because there aren't any
    # articles where 'jewish philosophy' is the primary domain, but jewish
    # philosophy still has to show up in the menu. The tags come from the
    # entries being exported rather than from a module-level database handle.
    primary_domains = _collect_domain_tags(entries)

    # Lets each link carry the title of the entry it points at.
    title_by_url = {entry['page_url']: entry['title'] for entry in entries}

    nodes_list = []
    links_list = []
    # Set-based de-duplication: membership tests on the growing list of link
    # dicts made the export quadratic in the number of links.
    seen_links = set()

    for entry in tqdm(entries, desc='Processing'):
        entry_type = entry['inpho_json']['type']
        preamble_text = entry['preamble_text']
        word_count = len(preamble_text.split()) + len(entry['main_text'].split())
        article_url = sepData['EditionURL'] + entry['page_url']
        # Open TOC links in a new tab and prefix every href with the article URL.
        toc_text = (entry['toc']
                    .replace('<a ', '<a target="_blank" ')
                    .replace('href="', 'href="' + article_url))

        nodes_list.append({
            'id': entry['page_url'],
            'article_url': article_url,
            'title': entry['title'],
            'author': entry['author'],
            'toc': toc_text,
            'pubdate': entry['pubdate'],
            'entry_type': entry_type,
            'preamble_text': preamble_text,
            'word_count': f"{word_count:,}",
            'primary_domain': entry['primary_domain'],
            'domain_tags': entry['domain_tags'].replace(', ', ',').strip()
        })

        for link in entry['outlinks']:
            target = link['link']
            # Use the *target* entry's title; fall back to the previous value
            # (the source entry's title) when the target is not among the
            # exported entries, so the key is always present.
            target_title = title_by_url.get(target, entry['title'])
            link_key = (entry['page_url'], target, target_title)
            if link_key in seen_links:
                continue
            seen_links.add(link_key)
            links_list.append({'source': entry['page_url'],
                               'target': target,
                               'targetTitle': target_title})

    articles_object = {'nodes': nodes_list, 'links': links_list}
    network_object = {'sepData': sepData, 'articles': articles_object, 'domains': primary_domains}
    with open(output_path, 'w', encoding='UTF-8') as f:
        json.dump(network_object, f, ensure_ascii=False, indent=4)

In [4]:
# Fetch every SEP entry stored in MongoDB, ordered by title, and export the
# network JSON from that list.
collection_to_export = db['sep_entries']

sep_articles = [article for article in collection_to_export.find({}).sort('title')]
create_sep_network_json(sep_articles)

Processing: 100%|██████████| 1705/1705 [00:05<00:00, 293.95it/s]
