In [32]:
import pandas as pd 
import pymongo
from pprint import pprint

import numpy as np
import scipy.sparse as ss
import matplotlib.pyplot as plt

from corextopic import corextopic as ct
from corextopic import vis_topic as vt # jupyter notebooks will complain matplotlib is being loaded twice

In [33]:
##### INIT GLOBAL VARIABLES#####

#init Mongo
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

#connect to database
db = client.visualizing_sep

In [34]:
sep_df = pd.DataFrame(list(db.sep_entries.find({}, 
                                                projection=['title', 'main_text'], 
                                                sort=[('title',1)])))

In [35]:
sep_df.head()

Unnamed: 0,_id,title,main_text
0,5f1bb235896f82cdbda61077,17th and 18th Century Theories of Emotions,1. Introduction 1.1 Difficulties of Approach: ...
1,5f1baf2c896f82cdbda60ed2,18th Century British Aesthetics,1. Internal-Sense Theories 1.1 Shaftesbury Sh...
2,5f1baf2d896f82cdbda60ed3,18th Century French Aesthetics,1. The Classical Legacy French thinkers consi...
3,5f1baf2f896f82cdbda60ed4,18th Century German Aesthetics,1. Leibniz and Wolff: Perfection and Truth T...
4,5f1baf09896f82cdbda60ebf,18th Century German Philosophy Prior to Kant,1. Christian Thomasius (1655–1728) Although ...


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    max_df=.5,
    min_df=10,
    max_features=20000,
    ngram_range=(1, 2),
    norm=None,
    binary=True,
    use_idf=False,
    sublinear_tf=False
)
vectorizer = vectorizer.fit(sep_df['main_text'])
tfidf = vectorizer.transform(sep_df['main_text'])
vocab = vectorizer.get_feature_names()


In [37]:
pprint(tfidf)
pprint(len(vocab))

<1692x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 4651783 stored elements in Compressed Sparse Row format>
20000


In [38]:
# Anchors designed to nudge the model towards measuring specific genres
anchor_words = [
    ["Political Philosophy", "Political Theory",  "Social Theory", "Government", "Justice", "Rights", "Civic", "Political", "Political Philosopher", "Democracy"],
    ["Philosophy of Science", "Science", "natural philosophy", "scientific"],
    ["Philosophy of Biology", 'biology', 'biological'],
    ["Evolution", "biology", "evolutionary"],
    ["Genetics", "DNA", "genes"],
    ["Physics", "relativity", "space", "time"],
    ['Quantum', "Quantum Theory", "Quantum Mechanics"],
    ["Philosophy of Religion", "religion", "god", "faith", "theology"],
    ["Philosophy of Mind", "Metaphysics of Mind", "mental", "consciousness", "perception", "cognition", "emotion"],
    ["Philosophy of Mathematics", "Mathematics", "mathematical", "algebra", "calculus", "geometry", "probability", "statistics", "set theory"],
    ["Philosophy of Law", "law", "tort", "legal", "testimony"],
    ["Philosophy of Language", "semantics", "pragmatics", "linguistics", "grammar"],
    ["Metaphysics", "causation", "ontology", "metaphysical", "mereology"],
    ["Logic", "inference", "logician", "conditionals", "classical logic", "modal logic"],
    ["Latin American Philosophy", "Latin America", "Iberian Peninsula", "mexico", "chile"],
    ["Jewish philosophy", "Judaic Philosophy", "Hebrew Philosophy", "rabbi", "Jewish Philosopher"],
    ["Japanese philosophy"],
    ["Feminist Philosophy", "feminism", "feminist", "political"],
    ["Ethics", "Ethical Philosophy", "Moral Philosophy", "Morals", "Morality"],
    ["Epistemology",  "epistemic", "knowledge", "truth"],
    ["Existentialism", 'existential', "Jean-Paul Sartre", "Camus", "Kierkagaard", "Nietzsche"],
    ["Phenomenolgy", "Intentionality", "structures of experience", "Edmund Husserl", "Martin Heidegger", "Maurice Merleau-Ponty", "Jean-Paul Sartre"],
    ["Hermeneutics", "Gadamer"],
    ["Critical Theory", "Habermas", "Adorno"],
    ["PostModernism", "Foucault", "Derrida"],
    ["Chinese Philosophy", "Chinese Medicine", "Tibet"],
    ["Arabic and Islamic", "Islamic", "Middle Eastern", "Koran", "Muslim"],
    ["Africa", "African", "Africana", "African-American", "political"],
    ["Aesthetic", "Aesthetics", "Art", "Film", "Music", "Dance", "Theater"],
    # history of philosophy
    ['latin', 'greek', 'ancient', 'ancient philosophy'],
    ['medieval philosophy', 'medieval', 'middle ages'],
    ['modern philosophy', '16th Century', 'Sixteenth Century', '17th Century', 'Seventeenth Century', '18th Century', 'Eighteenth Century', '19th Century', 'Nineteenth Century'],
    ['contemporary philosophy', '20th Century', 'Twentieth Century']
]
anchors = [
    [a for a in topic if a in vocab]
    for topic in anchor_words
]

model = ct.Corex(n_hidden=35, seed=42)
model = model.fit(
    tfidf,
    words=vocab,
    anchors=anchors, # Pass the anchors in here
    anchor_strength=10 # Tell the model how much it should rely on the anchors
)

In [39]:
for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: natural philosophy, he was, was born, wrote, published, writings, he had, the university, became, university
Topic #2: biological, biology, the biological, of biological, organisms, organism, biologically, genetic, the organism, reproduction
Topic #3: biology, evolutionary, biology and, of biology, the evolutionary, natural selection, evolution, of evolution, selection, darwin
Topic #4: relativity, properties of, atomic, relative to, primitive, lewis, be represented, representation, paper, specified
Topic #5: god, theology, religion, faith, of god, divine, god is, god and, that god, theological
Topic #6: consciousness, perception, cognition, emotion, experience of, experience and, experiences, of consciousness, conscious, of experience
Topic #7: mathematical, calculus, geometry, algebra, set theory, probability, statistics, mathematics, theorem, axioms
Topic #8: legal, morally, of moral, the moral, morality, harm, goods, well being, justice, is morally
Topic #9: semantics, gr

In [40]:
topic_df = pd.DataFrame(
    model.transform(tfidf), 
    columns=["topic_{}".format(i+1) for i in range(35)]
).astype(float)
topic_df.index = sep_df.index
df = pd.concat([sep_df, topic_df], axis=1)

In [41]:
df.sample(5, random_state=42)

Unnamed: 0,_id,title,main_text,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_26,topic_27,topic_28,topic_29,topic_30,topic_31,topic_32,topic_33,topic_34,topic_35
989,5f1bb603896f82cdbda612ec,Moral Particularism,1. Two Conceptions of Moral Principles If we...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1637,5f1baffe896f82cdbda60f4d,Walter Benjamin,1. Biographical Sketch Walter Bendix Schoenfli...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
1217,5f1bb777896f82cdbda613b9,Proclus,1. Life and Works Since Proclus’ extant work...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
1175,5f1bb724896f82cdbda6138a,Plato’s Timaeus,1. Overview of the Dialogue The opening conve...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1373,5f1bb8c7896f82cdbda6147b,Shared Agency,1. The traditional ontological problem and the...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
df.loc[df['topic_23']==1.0]

Unnamed: 0,_id,title,main_text,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_26,topic_27,topic_28,topic_29,topic_30,topic_31,topic_32,topic_33,topic_34,topic_35
3,5f1baf2f896f82cdbda60ed4,18th Century German Aesthetics,1. Leibniz and Wolff: Perfection and Truth T...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,5f1baf30896f82cdbda60ed5,19th Century Romantic Aesthetics,1. The Primacy of the Aesthetic One common co...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
9,5f1baf0e896f82cdbda60ec2,Abhidharma,1. Abhidharma: its origins and texts The ear...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
12,5f1baf13896f82cdbda60ec6,Abraham Ibn Daud,1. Introduction In the introduction to ha-Em...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
13,5f1bb3f3896f82cdbda6117a,Abraham Ibn Ezra,1. Life and Works Ibn Ezra was likely born i...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1685,5f1baf4b896f82cdbda60ee6,al-Kindi,1. Life and Works 1.1 Life Al-Kindi was a me...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1686,5f1bb22e896f82cdbda61073,Émilie du Châtelet,1. Du Châtelet's Magnum Opus on Natural Philos...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
1687,5f1bb0fb896f82cdbda60fd4,Étienne Bonnot de Condillac,1. Life and Works Condillac was born on Septe...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
1690,5f1bb8c5896f82cdbda6147a,Śāntideva,1. Biography: History and Legends There is ve...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
