In [1]:
### THIS SCRIPT CLEANS THE COLLECTED DATA AND ORGANIZES MULTIPLE MONGODB-RELATED DATABASE OPERATIONS ###

In [2]:
import pymongo 
import tqdm
import requests
import json

from bson.objectid import ObjectId
from pprint import pprint

##### GLOBAL VARIABLES SHARED BY THE CELLS BELOW #####

# MongoDB connection string and client
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# handle to the project database
db = client['visualizing_sep']

In [3]:
##### Cleaning the Winter 2019 Data ########

#1. Check for errors with InPhO apis.

# Titles of the sep_entries documents whose InPhO lookup was recorded as an
# error, sorted by title.
sep_no_inpho = list(
    db.sep_entries.find(
        filter={'inpho_api': 'Error: No InPhO entry'},
        projection=['title'],
        sort=[('title', 1)],
    )
)
pprint(sep_no_inpho)

#There were 14 SEP articles that originally didn't link into InPhO at all. I went into InPhO, found the appropriate API endpoint for each article that does have an InPhO entry, and recorded its URL. Not all of them had InPhO entries, though. When we run this test now, we only see 4 articles left that don't link into InPhO.

#Corrected Articles:
# 'Algebraic Propositional Logic', inpho: idea/6267
# 'Basil [Cardinal] Bessarion', thinker/6297
# 'Dynamic Epistemic Logic', idea/5642
# 'Ibn Sina [Avicenna]', thinker/2577
# 'Identity and Individuality in Quantum Theory', idea/1191
# 'Legalism in Chinese Philosophy', idea/5595
# 'Martin Buber', thinker/2718
# 'Mental Causation', idea/1240
# 'Religious Daoism', idea/5663
# 'Supererogation', idea/1128.html

#Articles that don't link
# {'title': 'Hegel’s Dialectics'}, NONE
# {'title': 'Ibn Sina’s Natural Philosophy'}, NONE
# {'title': 'Nietzsche’s Life and Works'} NONE
# {'title': 'Sophie de Grouchy'}, NONE


[{'_id': ObjectId('5e8298d880ca4dfd7911d03e'), 'title': 'Hegel’s Dialectics'},
{'_id':ObjectId('5e82997380ca4dfd7911d06e'),
'title':'Ibn Sina’s Natural Philosophy'},
{'_id':ObjectId('5e8392bc42ea6bed61c109b5'),
'title':'Nietzsche’s Life and Works'},
{'_id': ObjectId('5e83977c42ea6bed61c10b33'), 'title': 'Sophie de Grouchy'}]


In [None]:
##### Cleaning the Winter 2019 Data ########

#2. Check for duplicate InPhO JSON data.

#There was also an article that linked to the WRONG article in InPhO.
#Mencius (SEP) properly linked to Mencius (InPhO)
#Wang Yangming (SEP) improperly linked to Mencius (InPhO) -- this has been updated


In [45]:
def update_sep_json(id_title,inpho_api,collection_to_update):
    """Refresh the stored InPhO data for one individually passed article.

    Downloads the JSON for the given InPhO endpoint, writes it (together with
    the full API url) onto the document whose ``title`` equals ``id_title``,
    then prints the updated document so the change can be checked by eye.

    Parameters
    ----------
    id_title : str
        Value of the ``title`` field identifying the document to update.
    inpho_api : str
        InPhO endpoint path without the ``.json`` suffix, e.g. ``'idea/1383'``.
    collection_to_update : pymongo collection
        Collection that holds the document.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If InPhO answers with an error status (nothing is written in that case).
    """

    #create correct api endpoint url from the endpoint passed in
    #(not from a notebook-level variable, so the function works on a fresh kernel)
    api_url = f'http://inpho.cogs.indiana.edu/{inpho_api}.json'

    #get JSON data from InPhO; fail loudly rather than storing an error page
    response = requests.get(api_url, timeout=30)
    response.raise_for_status()
    inpho_json = json.loads(response.text)

    #create update document
    new_values = {
        'inpho_api': api_url,
        'inpho_json': inpho_json
    }

    #update MongoDB
    result = collection_to_update.update_one(
        { 'title': id_title },
        { '$set': new_values}
    )

    # boolean confirmation that the server acknowledged the write
    print (f"acknowledged: {api_url}\n", result.acknowledged)

    # an acknowledged write can still have matched nothing (e.g. a misspelled title)
    if result.matched_count == 0:
        print (f"warning: no document with title {id_title!r} was found; nothing was updated")

    #get projection to test update, using the same title the update used
    doc = list(collection_to_update.find(
            filter={'title': id_title},
            projection=['title','inpho_api', 'inpho_json']))

    pprint(doc)

In [4]:
##### Cleaning the Spring 2020 Data ########

#1. Check for errors with InPhO apis.

# Titles of the sep_entries_2020_spring documents whose InPhO lookup was
# recorded as an error, sorted by title.
sep_no_inpho = list(
    db.sep_entries_2020_spring.find(
        filter={'inpho_api': 'Error: No InPhO entry'},
        projection=['title'],
        sort=[('title', 1)],
    )
)
pprint(sep_no_inpho)

#Two articles were missing JSON Data, but were then properly updated
# [{'_id': ObjectId('5e917f180688f7a16fb5b4fe'), 'title': 'Divine Illumination'},
# {'_id': ObjectId('5e917f210688f7a16fb5b501'), 'title': 'Kumārila'}]

[]


In [46]:
#Patch one of the individual records identified above
collection_to_update = db.sep_entries_2020_spring
title = "Divine Illumination"
api_endpoint = 'idea/1383'

update_sep_json(
    id_title=title,
    inpho_api=api_endpoint,
    collection_to_update=collection_to_update,
)

acknowledged: http://inpho.cogs.indiana.edu/idea/1383.json
True
[{'_id':ObjectId('5e91f66aa152a41eedca074f'),
'inpho_api':'http://inpho.cogs.indiana.edu/idea/1383.json',
'inpho_json':{'ID':1383,
'classes':[],
'hyponyms':[1709,
1345,
1544,
1539,
1504,
1854,
1490,
1257,
2166],
'instances':[],
'label':'Divine Illumination',
'links':[],
'nodes':[],
'occurrences':[5701, 989, 1253],
'related':[1709,
1345,
1544,
1539,
5701,
1504,
1854,
1257,
1490],
'related_thinkers':[4180,
2543,
3186,
2680,
2553,
2493,
3128,
2915,
3516],
'sep_dir':'illumination',
'type':'idea',
'url':'/idea/1383'},
'title':'Divine Illumination'}]


In [6]:
### Upsert data from Spring 2020  into the main SEP collection ###
### Completed: 4.11.2020 (58 Document Updates and 9 New Documents added)####

# target: the main SEP collection; source: every Spring 2020 document
sep_collection = db.sep_entries
new_collection = list(db.sep_entries_2020_spring.find({}))

for doc in new_collection:
    # page_url is the key used to match documents between the two collections
    page_url = doc['page_url']
    query = {'page_url': page_url}

    # work on a copy without the source "_id", so only the data fields are written
    doc = dict(doc)
    doc.pop("_id")

    # replace the document with the same page_url, or insert it when none matches
    result = sep_collection.replace_one(query, doc, upsert=True)

    # report that the write was acknowledged
    print (f"acknowledged: {page_url}\n", result.acknowledged)


acknowledged: /entries/abraham-daud/
True
acknowledged: /entries/al-kindi/
True
acknowledged: /entries/albert-great/
True
acknowledged: /entries/albo-joseph/
True
acknowledged: /entries/altruism-empirical/
True
acknowledged: /entries/arabic-islamic-influence/
True
acknowledged: /entries/authenticity/
True
acknowledged: /entries/biology-developmental/
True
acknowledged: /entries/brouwer/
True
acknowledged: /entries/carnap/
True
acknowledged: /entries/change/
True
acknowledged: /entries/chinese-room/
True
acknowledged: /entries/cohen/
True
acknowledged: /entries/computational-mind/
True
acknowledged: /entries/computational-philosophy/
True
acknowledged: /entries/confirmation/
True
acknowledged: /entries/connectives-logic/
True
acknowledged: /entries/consciousness-17th/
True
acknowledged: /entries/conway/
True
acknowledged: /entries/crescas/
True
acknowledged: /entries/descartes-ontological/
True
acknowledged: /entries/discourse-representation-theory/
True
acknowledged: /entries/epistemol

In [7]:
# Move the JSON data for each SEP entry into its own collection.#####

# We do this because we want to download additional API data from InPhO for those entities that are NOT in SEP. This will give us a broader picture of the history of philosophy than simply what exists in SEP.
 
#HOWEVER: we also have to clean up the JSON. In 95 documents in the Winter 2019 edition, the JSON data from InPhO was structured slightly differently: those data were wrapped inside ["responseData"]["result"] keys, while the rest of the data was not. We want a uniform dataset, so we remove the ["responseData"]["result"] wrapper from those data so that every InPhO object has the same structure. This was fixed for the Spring 2020 dataset.

# source and destination collections
sep_collection = db.sep_entries
inpho_collection = db.inpho_data

# load every SEP entry up front
sep_entries = list(sep_collection.find({}))

for entry in sep_entries:
    # the InPhO payload stored on the SEP entry
    json_data = entry['inpho_json']

    if json_data == 'Error: No InPhO entry':
        # entry without an InPhO reference: store a placeholder document instead
        final_json_data = {'Error': 'No InPhO entry'}
    elif 'responseData' in json_data:
        # payload is wrapped in ["responseData"]["result"]: keep only the inner object
        final_json_data = json_data["responseData"]["result"]
    else:
        # payload already has the plain structure
        final_json_data = json_data

    inpho_collection.insert_one(final_json_data)


In [8]:
# Compare the number of documents that have an 'ID' field with the number of
# distinct 'ID' values in the collection.
distinct_ids = inpho_collection.distinct('ID')
count_ids = inpho_collection.count_documents({'ID': {'$exists': True}})

print(count_ids, len(distinct_ids), sep="\n")

1676
1676
