In [1]:
import pandas as pd 
import pymongo
from pprint import pprint

import numpy as np
import scipy.sparse as ss
import matplotlib.pyplot as plt

from corextopic import corextopic as ct
from corextopic import vis_topic as vt # jupyter notebooks will complain matplotlib is being loaded twice

In [2]:
##### INIT GLOBAL VARIABLES#####

#init Mongo
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

#connect to database
db = client.visualizing_sep

In [3]:
sep_df = pd.DataFrame(list(db.sep_entries.find({}, 
                                                projection=['title', 'pagetext'], 
                                                sort=[('title',1)])))

In [4]:
sep_df.head()

Unnamed: 0,_id,title,pagetext
0,5f109ada789a3d802df9a75b,17th and 18th Century Theories of Emotions,Early modern philosophy in Europe and Great Br...
1,5f10994d789a3d802df9a5b6,18th Century British Aesthetics,18th-century British aesthetics addressed itse...
2,5f10994e789a3d802df9a5b7,18th Century French Aesthetics,French philosophers of the Ancien Régime wrote...
3,5f10994f789a3d802df9a5b8,18th Century German Aesthetics,The philosophical discipline of aesthetics did...
4,5f10993b789a3d802df9a5a3,18th Century German Philosophy Prior to Kant,"In Germany, the eighteenth century was the age..."


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    max_df=.5,
    min_df=10,
    max_features=20000,
    ngram_range=(1, 2),
    norm=None,
    binary=True,
    use_idf=False,
    sublinear_tf=False
)
vectorizer = vectorizer.fit(sep_df['pagetext'])
tfidf = vectorizer.transform(sep_df['pagetext'])
vocab = vectorizer.get_feature_names()


In [7]:
pprint(tfidf)
pprint(len(vocab))

<1692x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 4312997 stored elements in Compressed Sparse Row format>
20000


In [10]:
# Anchors designed to nudge the model towards measuring specific genres
anchor_words = [
    ["Political Philosophy", "Political Theory",  "Social Theory", "Government", "Justice", "Rights", "Civic", "Political", "Political Philosopher", "Democracy"],
    ["Philosophy of Science", "Science", "natural philosophy", "scientific"],
    ["Philosophy of Religion", "religion", "god", "faith", "theology"],
    ["Philosophy of Mind", "Metaphysics of Mind", "mental", "consciousness", "perception", "cognition", "emotion"],
    ["Philosophy of Mathematics", "Mathematics", "mathematical", "algebra", "calculus", "geometry", "probability", "statistics", "set theory"],
    ["Philosophy of Law", "law", "tort", "legal", "testimony"],
    ["Philosophy of Language", "semantics", "pragmatics", "linguistics", "grammar"],
    ["Metaphysics", "causation", "ontology"],
    ["Logic", "inference", "paradox"],
    ["Latin American Philosophy", "Latin America", "Iberian Peninsula", "mexico", "chile"],
    ["Jewish philosophy", "Judaic Philosophy", "Hebrew Philosophy"],
    ["Japanese philosophy"],
    ["Feminist Philosophy", "feminism", "feminist"],
    ["Ethics", "Ethical Philosophy", "Moral Philosophy", "Morals", "Morality"],
    ["Epistemology",  "epistemic", "knowledge", "truth"],
    ["Existentialism", "Phenomenolgy", "Intentionality", "Hermeneutics", "Critical Theory", "PostModernism"],
    ["Chinese Philosophy", "Chinese Medicine", "Tibet"],
    ["Arabic and Islamic", "Islamic", "Middle Eastern", "Koran", "Muslim"],
    ["Africa", "African", "Africana", "African-American"],
    ["Aesthetic", "Aesthetics", "Art", "Film", "Music", "Dance", "Theater"]
]
anchors = [
    [a for a in topic if a in vocab]
    for topic in anchor_words
]

model = ct.Corex(n_hidden=20, seed=42)
model = model.fit(
    tfidf,
    words=vocab,
    anchors=anchors, # Pass the anchors in here
    anchor_strength=10 # Tell the model how much it should rely on the anchors
)

In [11]:
for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: scientific, natural philosophy, sciences, of science, of scientific, science and, philosophy of, the scientific, ofscience, the sciences
Topic #2: god, theology, faith, religion, divine, of god, god is, god and, soul, the divine
Topic #3: consciousness, perception, cognition, mental, emotion, experience of, the mind, experiences, sensory, conscious
Topic #4: mathematical, calculus, set theory, algebra, probability, geometry, statistics, theorem, theorems, axioms
Topic #5: legal, morally, the moral, of moral, harm, morality, justice, goods, well being, persons
Topic #6: semantics, grammar, linguistics, semantic, sentences, sentence, semantics of, the semantics, the sentence, quantifiers
Topic #7: ontology, causation, ontological, entities, metaphysical, spatial, entity, of objects, properties and, causal
Topic #8: inference, paradox, if and, is false, true in, only if, for any, be true, iff, operator
Topic #9: feminist, feminism, political, society, rights, and political, econ

In [15]:
topic_df = pd.DataFrame(
    model.transform(tfidf), 
    columns=["topic_{}".format(i+1) for i in range(21)]
).astype(float)
topic_df.index = sep_df.index
df = pd.concat([sep_df, topic_df], axis=1)

In [16]:
df.sample(5, random_state=42)

Unnamed: 0,_id,title,pagetext,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21
989,5f109d41789a3d802df9a9d0,Moral Particularism,"Moral Particularism, at its most trenchant, is...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1637,5f1099be789a3d802df9a631,Walter Benjamin,Walter Benjamin's importance as a philosopher ...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
1217,5f109e05789a3d802df9aa9d,Proclus,Proclus of Athens (*412–485 C.E.) was the most...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1175,5f109dd6789a3d802df9aa6e,Plato’s Timaeus,In the Timaeus Plato presents an elaborately w...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1373,5f109eca789a3d802df9ab5f,Shared Agency,"Sometimes individuals act together, and someti...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
