In [1]:
import time
import pymongo
import re
import json
import requests
import pandas as pd

from tqdm import tqdm
from pprint import pprint
from pathlib import Path
from bs4 import BeautifulSoup

#import local libraries
import lib_sepinpho as sep
import lib_fileops as io

##### INIT GLOBAL VARIABLES#####

#init Mongo
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

#connect to database
db = client.visualizing_sep

In [None]:
#export list of all articles with domain_tags and primary_domain

collection_to_update = db.sep_entries
all_domains = pd.DataFrame(collection_to_update.find({},
                                              {'title':1, 'page_url', 'author': 1, 'primary_domain':1, 'domain_tags':1}))
all_domains.to_csv('domains.csv')

In [None]:
#import updated domain data

collection_to_update = db.sep_entries


updated_domains = pd.read_csv('domains.csv')

for index,row in tqdm(updated_domains.iterrows()):
    title = row['title']
    domain_tags = row['domain_tags']
    primary_domain = row['primary_domain']
    # update function to include page_url
    sep.update_domain_info(title,domain_tags, primary_domain, collection_to_update)

In [49]:
#get distinct list of primary_domains
db.sep_entries.distinct('primary_domain')

[&#39;Aesthetics and Philosophy of Art&#39;,
 &#39;African and African-American Philosophy&#39;,
 &#39;Arabic and Islamic Philosophy&#39;,
 &#39;Chinese Philosophy&#39;,
 &#39;Continental Philosophy&#39;,
 &#39;Epistemology&#39;,
 &#39;Ethics&#39;,
 &#39;Feminist Philosophy&#39;,
 &#39;Indian Philosophy&#39;,
 &#39;Japanese Philosophy&#39;,
 &#39;Latin American Philosophy&#39;,
 &#39;Logic&#39;,
 &#39;Metaphysics&#39;,
 &#39;Philosophy of Biology&#39;,
 &#39;Philosophy of Computer Science&#39;,
 &#39;Philosophy of Economics&#39;,
 &#39;Philosophy of Language&#39;,
 &#39;Philosophy of Law&#39;,
 &#39;Philosophy of Mathematics&#39;,
 &#39;Philosophy of Mind&#39;,
 &#39;Philosophy of Physics&#39;,
 &#39;Philosophy of Religion&#39;,
 &#39;Philosophy of Science&#39;,
 &#39;Social and Political Philosophy&#39;,
 &#39;Thinker&#39;]

In [43]:
#extract unique list of domain tags
domain_tags = list(db.sep_entries.find( filter={},
                                     projection={'domain_tags':1, '_id':0}))
domain_tags_individual = []

for tag in domain_tags:
    semisplit_tags = tag['domain_tags'].split(';')
    for semisplit_tag in semisplit_tags:
        commasplit_tags = semisplit_tag.split(',')
        for commasplit_tag in commasplit_tags:
            if commasplit_tag != '':
                domain_tags_individual.append(commasplit_tag.strip())

individual_tags = sorted(set(domain_tags_individual))
pprint(individual_tags)

[&#39;Aesthetics and Philosophy of Art&#39;,
 &#39;African and African-American Philosophy&#39;,
 &#39;Arabic and Islamic Philosophy&#39;,
 &#39;Chinese Philosophy&#39;,
 &#39;Continental Philosophy&#39;,
 &#39;Epistemology&#39;,
 &#39;Ethics&#39;,
 &#39;Feminist Philosophy&#39;,
 &#39;Indian Philosophy&#39;,
 &#39;Japanese Philosophy&#39;,
 &#39;Latin American and Iberian Philosophy&#39;,
 &#39;Logic&#39;,
 &#39;Metaphysics&#39;,
 &#39;Philosophy of Biology&#39;,
 &#39;Philosophy of Computer Science&#39;,
 &#39;Philosophy of Economics&#39;,
 &#39;Philosophy of Language&#39;,
 &#39;Philosophy of Law&#39;,
 &#39;Philosophy of Mathematics&#39;,
 &#39;Philosophy of Mind&#39;,
 &#39;Philosophy of Physics&#39;,
 &#39;Philosophy of Religion&#39;,
 &#39;Philosophy of Science&#39;,
 &#39;Social and Political Philosophy&#39;,
 &#39;Thinker&#39;]


In [48]:
#update db with new domain names
collection_to_update = db.sep_entries

search_filter = 'Latin American Philosophy'
replace_with = 'Latin American and Iberian Philosophy'

results = list(collection_to_update.find({'primary_domain': re.compile(search_filter)}, 
                                         {'page_url': 1, 'title':1, 'primary_domain':1}))

for result in results:
    print(result['primary_domain'])
    new_domain_tags = result['primary_domain'].replace(search_filter, replace_with)
# # # # #     # print(new_domain_tags)
    collection_to_update.update_one({'page_url': result['page_url']}, { '$set': {'domain_tags': new_domain_tags}})

new_results = list(collection_to_update.find({'domain_tags': re.compile(search_filter)}, 
                                         {'page_url': 1, 'title':1, 'domain_tags':1}))
pprint(new_results)

Latin American Philosophy
Latin American Philosophy
Latin American Philosophy
Latin American Philosophy
Latin American Philosophy
Latin American Philosophy
Latin American Philosophy
Latin American Philosophy
Latin American Philosophy
[]


In [None]:
# create text search on preamble_text and main_text
# completed 10.1.2020

collection_to_update = db.sep_entries

collection_to_update.create_index([('preamble_text','text'), ('main_text','text')], name='preamble_textindex')
collection_to_update.index_information()

In [None]:
#Prior Updates
# African American Philosophy: Completed 10.2.2020
# text_filter = 'African African-American'
# sep_filter = 'African and African-American Philosophy'
#########################################################

In [52]:
#generate dataframes of new search and current db designations, and then export them to csv for content updates

collection_to_update = db.sep_entries

text_filter = 'Existentialism Phenomenology Hermeneutics Intentionality Postmodernism Post-Modernism Post-Structuralism'
sep_filter = 'Continental Philosophy'

text_results = pd.DataFrame(collection_to_update.find(
                                         {'$text':{'$search': text_filter }},
                                         {'score': { '$meta': 'textScore'}, 
                                            'title':1, 
                                            'page_url':1,
                                            'primary_domain':1, 
                                            'domain_tags':1}
                                         ))
text_results_above1 = text_results[text_results.score > 1]

text_results_above1.to_csv('continental.csv')

sep_results = pd.DataFrame(collection_to_update.find({ 'domain_tags': { '$regex': sep_filter } },
                                              {'title':1, 'primary_domain':1, 'domain_tags':1}))

sep_results.to_csv('continental_sep.csv')

In [55]:
#update domains from csv 
collection_to_update = db.sep_entries
domains_to_update = pd.read_csv('continental.csv')

for index,row in tqdm(domains_to_update.iterrows()):
    title = row['title']
    domain_tags = row['domain_tags']
    primary_domain = row['primary_domain']
    page_url = row['page_url']

    sep.update_domain_info(page_url,domain_tags, primary_domain, collection_to_update)

29it [00:00, 279.47it/s]acknowledged: /entries/merleau-ponty/
 True
[{&#39;_id&#39;: ObjectId(&#39;5f1bb5bd896f82cdbda612bd&#39;), &#39;title&#39;: &#39;Maurice Merleau-Ponty&#39;, &#39;domain_tags&#39;: &#39;Thinker,Continental Philosophy&#39;, &#39;primary_domain&#39;: &#39;Thinker&#39;}]
acknowledged: /entries/existentialism/
 True
[{&#39;_id&#39;: ObjectId(&#39;5f1bb2ad896f82cdbda610bb&#39;), &#39;title&#39;: &#39;Existentialism&#39;, &#39;domain_tags&#39;: &#39;Continental Philosophy&#39;, &#39;primary_domain&#39;: &#39;Continental Philosophy&#39;}]
acknowledged: /entries/heidegger-aesthetics/
 True
[{&#39;_id&#39;: ObjectId(&#39;5f1bb3a6896f82cdbda61152&#39;), &#39;title&#39;: &#39;Heidegger’s Aesthetics&#39;, &#39;domain_tags&#39;: &#39;Thinker,Aesthetics and Philosophy of Art,Continental Philosophy&#39;, &#39;primary_domain&#39;: &#39;Thinker&#39;}]
acknowledged: /entries/femapproach-continental/
 True
[{&#39;_id&#39;: ObjectId(&#39;5f1bb2c5896f82cdbda610c9&#39;), &#39;title&#3