In [1]:
import pymongo 
import json 
import pandas as pd
from tqdm import tqdm
from pprint import pprint

In [2]:
# Connect to the MongoDB server on localhost and select the database that
# the rest of the notebook reads SEP entries from.
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)
db = client['visualizing_sep']

In [5]:
def return_nodelinks(page_url, outlinks, inlinks):
    """Classify every page linked to or from ``page_url`` by link direction.

    Args:
        page_url: identifier of the page being described; stored unchanged as
            the ``'source'`` of every returned link object.
        outlinks: list of link targets found on ``page_url``.
        inlinks: list of pages that link to ``page_url``.
            Links must be hashable (they are used as set members).

    Returns:
        A list of dicts ``{'source': page_url, 'target': link, 'dir': ...}``
        sorted by ``'target'``, where ``'dir'`` is:

        * ``'both'`` - the link occurs in both ``outlinks`` and ``inlinks``
          (emitted once per distinct link);
        * ``'out'``  - the link occurs only in ``outlinks``;
        * ``'in'``   - the link occurs only in ``inlinks``.

        One-directional links are emitted once per occurrence in the input,
        so duplicates inside ``outlinks`` or ``inlinks`` are preserved.
    """
    # Set intersection replaces the pairwise list scans, which cost
    # O(len(outlinks) * len(inlinks)) for every page.
    linkdirection_both = set(outlinks) & set(inlinks)

    def make_link(target, direction):
        # Key order (source, target, dir) is kept so the exported JSON layout
        # does not change.
        return {'source': page_url, 'target': target, 'dir': direction}

    nodelinks = [make_link(link, 'both') for link in linkdirection_both]
    nodelinks.extend(make_link(link, 'out')
                     for link in outlinks if link not in linkdirection_both)
    nodelinks.extend(make_link(link, 'in')
                     for link in inlinks if link not in linkdirection_both)

    # The three groups have disjoint targets, so sorting by target yields a
    # deterministic order regardless of set iteration order.
    return sorted(nodelinks, key=lambda k: k['target'])


def create_sep_network_json(sep_collection, output_path='static/sep_network_test.json'):
    """Export the JSON that makes up the network graph.

    Args:
        sep_collection: iterable of SEP entry dicts. Every entry must provide
            the keys 'page_url', 'title', 'first_paragraph', 'inpho_api',
            'primary_domain' and 'domain_tags', plus 'outlinks' and 'inlinks',
            each an iterable of dicts carrying a 'link' key.
        output_path: file the JSON is written to (UTF-8, overwritten if it
            exists). Defaults to the path that used to be hard-coded. The
            parent directory is not created and must already exist.

    Returns:
        None. The result is the file at ``output_path`` holding
        ``{'nodes': [...], 'links': [...]}``: one node per entry (including
        its per-node 'links' from return_nodelinks) and one
        ``{'source', 'target'}`` link per outlink whose URL has no '//'.

    Things to watch for in the export file:
    1. entries with 'entries//'
    2. depiction needs a fix at 'reference'
    3. type theory needs a fix for 'russell'
    4. sartre links to himself
    """
    nodes_list = []
    links_list = []

    for entry in tqdm(sep_collection, desc='Processing'):
        page_url = entry['page_url']
        outlinks = [value['link'] for value in entry['outlinks']]
        inlinks = [value['link'] for value in entry['inlinks']]

        # An entry is a thinker when 'thinker' is found in its InPhO data;
        # everything else is treated as an idea.
        entry_type = 'thinker' if 'thinker' in entry['inpho_api'] else 'idea'

        nodes_list.append({
            'id': page_url,
            'title': entry['title'],
            'entry_type': entry_type,
            'first_paragraph': entry['first_paragraph'],
            'num_outlinks': len(outlinks),
            'num_inlinks': len(inlinks),
            'primary_domain': entry['primary_domain'],
            'domain_tags': entry['domain_tags'],
            'links': return_nodelinks(page_url, outlinks, inlinks),
        })

        # Outlinks containing '//' (watch-item 1 above) are left out of the
        # graph-level link list; they still appear in the node's own 'links'.
        links_list.extend({'source': page_url, 'target': link}
                          for link in outlinks if '//' not in link)

    network_object = {'nodes': nodes_list, 'links': links_list}

    with open(output_path, 'w', encoding='UTF-8') as f:
        json.dump(network_object, f, ensure_ascii=False, indent=4)

In [4]:
# Load every SEP entry stored in MongoDB, ordered by title, and write the
# network-graph JSON from them.
sep_entries = list(db.sep_data.find(filter={}, sort=[('title', pymongo.ASCENDING)]))
create_sep_network_json(sep_entries)



Processing: 100%|██████████| 1680/1680 [00:00<00:00, 46789.98it/s]


In [9]:
# Write the page text of every SEP entry to texttraining/<title>.txt

sep_entries = list(db.sep_data.find(filter={}, sort=[('title', pymongo.ASCENDING)]))
for sep in sep_entries:
    page_name = sep['title'] + '.txt'
    # a '/' inside a title would be read as a directory separator, so it is
    # replaced with '_' before the path is assembled
    file_name = f'texttraining/{page_name.replace("/", "_")}'
    # leading/trailing whitespace is dropped from the stored text
    page_text = sep['pagetext'].strip()
    with open(file_name, mode='w', encoding='UTF-8') as f:
        f.write(page_text)


In [7]:
# Export the _id and title of every SEP entry to sep.csv.
# The projection is spelled as an explicit mapping (the documented form)
# instead of a set literal; _id is still returned because it is not excluded.
sep_entries = list(db.sep_data.find( filter={},
                                     projection={'title': 1},
                                     sort=[('title',1)]))

sep_df = pd.DataFrame(sep_entries)
sep_df.to_csv('sep.csv')
# Preview goes last: a notebook only renders the cell's final expression, so
# the earlier mid-cell head() call was silently discarded.
sep_df.head()

In [5]:
# Extract the unique, alphabetically sorted list of domain tags.
domain_tags = list(db.sep_data.find( filter={},
                                     projection={'domain_tags':1, '_id':0}))
domain_tags_individual = []

for tag in domain_tags:
    # a document's tag string is split on ';' first and then on ','
    for semisplit_tag in tag['domain_tags'].split(';'):
        for commasplit_tag in semisplit_tag.split(','):
            # Strip BEFORE testing for emptiness: a whitespace-only fragment
            # (e.g. after a trailing "; ") used to pass the != '' check and
            # was then stored as an empty tag.
            stripped_tag = commasplit_tag.strip()
            if stripped_tag:
                domain_tags_individual.append(stripped_tag)

individual_tags = sorted(set(domain_tags_individual))
pprint(individual_tags)

['Aesthetics',
 'African and African-American Philosophy',
 'Arabic and Islamic Philosophy',
 'Biology',
 'Chinese Philosophy',
 'Computer Science',
 'Economics',
 'Epistemology',
 'Ethics and Morality',
 'Evolution',
 'Existentialism and Phenomenology',
 'Feminism',
 'Genetics',
 'Indian Philosophy',
 'Japanese Philosophy',
 'Language',
 'Latin American Philosophy',
 'Law',
 'Logic',
 'Mathematics',
 'Metaphysics',
 'Mind',
 'Physics',
 'Political and Social Theory',
 'Quantum Mechanics',
 'Religion',
 'Scientific Methods',
 'Thinker',
 'Tibetan Philosophy']
