In [1]:
### THIS SCRIPT CLEANS THE COLLECTED DATA AND ORGANIZES MULTIPLE MONGODB-RELATED DATABASE OPERATIONS ###

In [2]:
import pymongo 
import tqdm
import requests
import json

from bson.objectid import ObjectId
from pprint import pprint

##### GLOBAL HANDLES SHARED BY EVERY CELL BELOW #####

# connection to the local MongoDB server
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# project database
db = client['visualizing_sep']

# SEP table of contents
toc_collection = db['sep_toc_win2019']

# one document per SEP entry
entries_collection = db['sep_entries']

# InPhO API records, one document per InPhO entity
inpho_collection = db['inpho_apidata']

In [52]:
#After downloading all of the JSON data from InPhO and storing it inside the related SEP entry's document, we have to test the downloaded data for any errors or mistakes. 

#Start by listing the SEP records that had no associated InPhO entry:
#titles only, in alphabetical order.
no_inpho_filter = {'inpho_api': 'Error: No InPhO entry'}
sep_no_inpho = entries_collection.find(
    filter=no_inpho_filter,
    projection=['title'],
    sort=[('title', pymongo.ASCENDING)],
)
list_sep_no_inpho = list(sep_no_inpho)
pprint(list_sep_no_inpho)

#There were 14 SEP articles that originally didn't link into InPhO at all. I went into InPhO, found the appropriate API endpoint for each article that does have an InPhO entry, and recorded its URL below. Not all of them had InPhO entries, though. When we run this test now, we only see 4 articles left that don't link into InPhO.

#Corrected Articles:
# 'Algebraic Propositional Logic', inpho: idea/6267
# 'Basil [Cardinal] Bessarion', thinker/6297
# 'Dynamic Epistemic Logic', idea/5642
# 'Ibn Sina [Avicenna]', thinker/2577
# 'Identity and Individuality in Quantum Theory', idea/1191
# 'Legalism in Chinese Philosophy', idea/5595
# 'Martin Buber', thinker/2718
# 'Mental Causation', idea/1240
# 'Religious Daoism', idea/5663
# 'Supererogation', idea/1128.html

#Articles that don't link
# {'title': 'Hegel’s Dialectics'}, NONE
# {'title': 'Ibn Sina’s Natural Philosophy'}, NONE
# {'title': 'Nietzsche’s Life and Works'} NONE
# {'title': 'Sophie de Grouchy'}, NONE

#There was also an SEP article that linked to the WRONG entry in InPhO.
#Mencius (SEP) properly linked to Mencius (InPhO)
#Wang Yangming (SEP) improperly linked to Mencius (InPhO) -- this has been updated

[{'_id': ObjectId('5e8298d880ca4dfd7911d03e'), 'title': 'Hegel’s Dialectics'},
{'_id':ObjectId('5e82997380ca4dfd7911d06e'),
'title':'Ibn Sina’s Natural Philosophy'},
{'_id':ObjectId('5e8392bc42ea6bed61c109b5'),
'title':'Nietzsche’s Life and Works'},
{'_id': ObjectId('5e83977c42ea6bed61c10b33'), 'title': 'Sophie de Grouchy'}]


In [56]:
def update_sep_json(id_title,inpho_api):
    """Re-download one InPhO record and store it on the matching SEP entry.

    Parameters
    ----------
    id_title : str
        Value of the ``title`` field of the SEP entry document to update.
    inpho_api : str
        InPhO endpoint path such as ``'thinker/4092'``; the host prefix and the
        ``.json`` suffix are added here.

    Raises
    ------
    requests.HTTPError
        If InPhO answers with an error status code.

    Side effects
    ------------
    Sets ``inpho_api`` (the full URL) and ``inpho_json`` (the decoded payload)
    on the first ``entries_collection`` document whose title matches, and
    prints a short confirmation.
    """

    # Build the URL from the *argument*. The endpoint used to be read from a
    # global named `api_endpoint`, so the function only worked when a cell
    # defining that global had been run first.
    inpho_url = f'http://inpho.cogs.indiana.edu/{inpho_api}.json'

    # Fetch the record; fail loudly on HTTP errors instead of trying to parse
    # an error page as JSON, and never hang indefinitely on a dead connection.
    response = requests.get(inpho_url, timeout=30)
    response.raise_for_status()
    inpho_json = response.json()

    # fields to overwrite on the SEP entry
    new_values = {
        'inpho_api': inpho_url,
        'inpho_json': inpho_json
    }

    # update MongoDB
    result = entries_collection.update_one(
        { 'title': id_title },
        { '$set': new_values}
    )

    # `acknowledged` only says the server accepted the write ...
    print (f"acknowledged: {inpho_url}\n", result.acknowledged)

    # ... so also report when no document carried this title.
    if result.matched_count == 0:
        print(f"warning: no SEP entry titled {id_title!r} was found; nothing was updated")

In [57]:
#Re-link one of the individual records identified above
title = "Wang Yangming"
api_endpoint = 'thinker/4092'

update_sep_json(title,api_endpoint)

#read the record back (selected fields only) to check the update
updated_cursor = entries_collection.find(
    filter={'title': title},
    projection=['title','inpho_api', 'inpho_json'])
doc = list(updated_cursor)

pprint(doc)

acknowledged: http://inpho.cogs.indiana.edu/thinker/4092.json
True
[{'_id':ObjectId('5e83992b42ea6bed61c10bbb'),
'inpho_api':'http://inpho.cogs.indiana.edu/thinker/4092.json',
'inpho_json':{'ID':4092,
'aliases':[],
'birth':[{'day': 0, 'month': 0, 'year': 1472}],
'birth_strings':['1472'],
'death':[{'day': 0, 'month': 0, 'year': 1529}],
'death_strings':['1529'],
'influenced':[],
'influenced_by':[4173],
'label':'Wang Yangming',
'nationalities':['Chinese'],
'professions':[],
'related_ideas':[6347,
148,
1870,
1343,
5338,
5510,
5602,
1544,
681],
'related_thinkers':[4173,
3563,
2844,
4203,
4195,
3060,
3177,
4198],
'sep_dir':'wang-yangming',
'students':[],
'teachers':[],
'type':'thinker',
'url':'/thinker/4092',
'wiki':'Wang_Yangming'},
'title':'Wang Yangming'}]


In [None]:
# Move the JSON data for each SEP entry into its own collection. We do this because we want to download additional API data from InPhO for those entities that are NOT in SEP. This will give us a broader picture of the history of philosophy than what exists in SEP alone.

#HOWEVER: we also have to clean up the JSON. In 95 cases, the JSON data from InPhO was structured slightly differently: those records were wrapped inside ["responseData"]["result"] keys, while the rest of the data was not. We want a uniform dataset, so we strip the ["responseData"]["result"] wrapper from those records so that every InPhO object has the same structure.

#get all SEP entries
sep_entries = list(entries_collection.find({}))

# NOTE: insert_one() adds a new document on every call, so each run of this
# cell inserts one more InPhO record per SEP entry.
for entry in sep_entries:
    #get "inpho_json" object from SEP; .get() so an entry that lacks the field
    #does not abort the whole migration with a KeyError
    json_data = entry.get('inpho_json')

    if not isinstance(json_data, dict):
        #Covers the 'Error: No InPhO entry' marker string as well as a missing
        #field or any other non-document value, none of which can be inserted
        #as a record. The placeholder spelling is kept exactly as before so the
        #stored placeholder documents stay uniform.
        final_json_data = {'Error': 'No InhPhoEntry'}
    elif 'responseData' in json_data:
        #wrapped record: keep only the payload under ["responseData"]["result"]
        final_json_data = json_data["responseData"]["result"]
    else:
        #already in the uniform shape
        final_json_data = json_data

    inpho_collection.insert_one(final_json_data)


In [None]:
#compare how many InPhO records carry an ID with how many distinct IDs exist
has_id_filter = {'ID': {'$exists': True}}
count_ids = inpho_collection.count_documents(has_id_filter)
distinct_ids = inpho_collection.distinct('ID')

print(count_ids, len(distinct_ids), sep='\n')

In [None]:
# Group the InPhO records by ID and keep only IDs that occur more than once.
# (An optional final stage, {"$project": {"ID": "$_id", "_id": 0}}, would
# rename the group key to "ID"; it is left out here.)
duplicate_id_pipeline = [
    {"$match": {"ID": {"$exists": True}}},
    {"$group": {"_id": "$ID", "count": {"$sum": 1}}},
    {"$match": {"count": {"$gt": 1}}},
]
dups = inpho_collection.aggregate(duplicate_id_pipeline)

for dup in dups:
    print(dup)

In [None]:
# look at one InPhO record stored under ID 3563
duplicate_id_filter = {'ID': 3563}
dup_entry = inpho_collection.find_one(duplicate_id_filter)
print(dup_entry)

In [None]:
# SEP entries whose stored InPhO label contains "Mencius"
mencius_filter = {'inpho_json.label': {'$regex': 'Mencius'}}

# The count used to be computed and then discarded (it was not the last
# expression of the cell, so nothing was shown); print it explicitly.
mencius_count = entries_collection.count_documents(mencius_filter)
print(f"SEP entries with an InPhO label matching 'Mencius': {mencius_count}")

mencius_entries = entries_collection.find(mencius_filter)
for entry in mencius_entries:
    pprint(entry)


In [None]:
#count docs between collections
#
# The two collections store the same information in different places:
#   * entries_collection keeps the InPhO payload nested under `inpho_json`
#     and marks a missing InPhO entry with the string 'Error: No InPhO entry';
#   * inpho_collection holds the payload at the top level (so the ID lives in
#     `ID`) and marks a missing entry with the document {'Error': 'No InhPhoEntry'}.
# The inpho_collection queries therefore must not look for `inpho_json.*`
# fields: no record in that collection has them, which made those counts
# trivially "all documents" or zero.

sep_yes = entries_collection.count_documents({'inpho_json.ID': {'$exists': True}})
inp_yes = inpho_collection.count_documents({'ID': {'$exists': True}})
sep_no = entries_collection.count_documents({'inpho_json.ID': {'$exists': False}})
inp_no = inpho_collection.count_documents({'ID': {'$exists': False}})
sep_error = entries_collection.count_documents({'inpho_json': 'Error: No InPhO entry'})
sep_noerror = entries_collection.count_documents({'inpho_json': {'$ne': 'Error: No InPhO entry'}})
inp_error = inpho_collection.count_documents({'Error': 'No InhPhoEntry'})
inp_noerror = inpho_collection.count_documents({'Error': {'$ne': 'No InhPhoEntry'}})

# labelled output, same order as before
print(f"sep_yes: {sep_yes}")
print(f"sep_no: {sep_no}")
print(f"inp_yes: {inp_yes}")
print(f"inp_no: {inp_no}")
print(f"sep_error: {sep_error}")
print(f"sep_noerror: {sep_noerror}")
print(f"inp_error: {inp_error}")
print(f"inp_noerror: {inp_noerror}")