In [3]:
# Soft-reset the interactive namespace without asking for confirmation
# (equivalent to the `%reset -fs` line magic: -f = force, -s = soft).
# Invoked through get_ipython() instead of the bare word `reset` so it does not
# depend on IPython's automagic setting or on no variable being named `reset`.
get_ipython().run_line_magic("reset", "-fs")

In [4]:
import pandas as pd

In [5]:
# Read the pickled article corpus from disk into `df`.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
corpus_path = "../../../corpora/nyt_articles.pkl"
df = pd.read_pickle(corpus_path)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Configure the TF-IDF featurizer for the article text.
vectorizer = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    max_df=0.95,           # ignore terms found in more than 95% of documents
    min_df=2,              # ignore terms found in fewer than 2 documents
    max_features=1000,     # keep only the 1000 most frequent remaining terms
)

# Learn the vocabulary and build the document-term matrix in a single pass.
vectorized = vectorizer.fit_transform(df["content"])

In [8]:
from sklearn.decomposition import LatentDirichletAllocation


In [27]:
# Number of latent topics the model should learn.
N_TOPICS = 10

# The topic count is passed positionally on purpose: its keyword was renamed
# from `n_topics` to `n_components` in scikit-learn 0.19 and the old spelling
# was later removed, so `n_topics=...` raises TypeError on current releases.
# It is the first positional parameter under either name.
lda = LatentDirichletAllocation(
    N_TOPICS,
    max_iter=5,                # at most 5 passes over the data
    learning_method='online',  # mini-batch (online) variational Bayes updates
    learning_offset=50.,       # down-weights the earliest online iterations
    random_state=42,           # fixed seed so the fitted topics are reproducible
)

In [28]:
# Fit the topic model on the TF-IDF document-term matrix built above.
# fit() returns the estimator itself, so its repr is displayed as the cell output.
# NOTE(review): LDA is usually fit on raw term counts rather than TF-IDF
# weights — confirm that using the TF-IDF matrix here is intentional.
lda.fit(vectorized)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=1, n_topics=10, perp_tol=0.1, random_state=42,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [29]:
def print_top_words(model, feature_names, n_top_words=20):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, an iterable of 1-D weight arrays
        (one array per topic, one weight per vocabulary term).
    feature_names : sequence of str
        Vocabulary terms, indexable by the positions used in ``components_``.
    n_top_words : int, default 20
        How many terms to show per topic.

    Output
    ------
    For each topic (numbered from 1) a ``Topic #k:`` header line followed by a
    line with the terms joined by single spaces, heaviest first. One blank
    line is printed after the last topic.
    """
    for topic_n, weights in enumerate(model.components_, start=1):
        # argsort() is ascending, so slicing it backwards from the end walks
        # the term indices from the largest weight downwards.
        heaviest_first = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[idx] for idx in heaviest_first]

        print("Topic #{}:".format(topic_n))
        print(" ".join(top_terms))
    print()
    

In [30]:
print("Topics in LDA model:")
# scikit-learn 1.0 introduced `get_feature_names_out()` and 1.2 removed
# `get_feature_names()`. Prefer the new accessor and fall back to the old one
# so the cell runs on both older and current installs. The new accessor
# returns an array instead of a list; the names are only indexed by position
# below, so either works.
if hasattr(vectorizer, "get_feature_names_out"):
    tf_feature_names = vectorizer.get_feature_names_out()
else:
    tf_feature_names = vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names)

Topics in LDA model:
Topic #1:
republican cruz debt house shutdown percent senate bond government investor senator ceiling stock congress said democrat vote boehner market spending
Topic #2:
twitter follow visit web view site international work police chinese internet merkel party europe pakistan mr attack said government golden
Topic #3:
mr said new ms like music work year art people time sept company city york world life night series school
Topic #4:
game season team said player league yankee cup yard play coach year win second run rivera inning time race sunday
Topic #5:
korea south north oil said music festival government award defense military giant economy gas game nuclear mr cut meeting new
Topic #6:
song race film bank mr said music new year art like time government design history people republican political work ms
Topic #7:
miller drug federal county rule law department said court final judge state race employee group year vice nbc company play
Topic #8:
mr said government st

-----

Explore Document-by-Topic

In [31]:
# Number of documents: the document-topic matrix has one row per document.
len(lda.transform(vectorized))

1405

In [32]:
# Pick a random document (by row position in the document-topic matrix).
import random

n_documents = lda.transform(vectorized).shape[0]

# random.randint(a, b) includes b, so using the row count as the upper bound
# could yield an index one past the last document. randrange(n) draws from
# 0 .. n-1, which are exactly the valid row positions.
doc_id = random.randrange(n_documents)

In [33]:
# doc_id is a row *position* in the matrix produced from df.content, so the
# matching snippet is looked up positionally with .iloc. Plain [] on a Series
# is a label lookup and only agrees with the position when the index happens
# to be 0..n-1.
print("For Document:", df.snippet.iloc[doc_id])
print("-"*10)

# Topic mixture for the chosen document: one weight per topic.
topic_weights = lda.transform(vectorized)[doc_id]
for topic_number, topic_weighting in enumerate(topic_weights, 1):
    # {:.2} keeps two significant digits of each weight.
    print("topic number: {} \t weight: {:.2}".format(topic_number, topic_weighting))

For Document: The National Coalition Against Censorship disputes claims that fantasy violence leads to antisocial behavior. Readers are invited to respond.
----------
topic number: 1 	 weight: 0.013
topic number: 2 	 weight: 0.013
topic number: 3 	 weight: 0.88
topic number: 4 	 weight: 0.013
topic number: 5 	 weight: 0.013
topic number: 6 	 weight: 0.013
topic number: 7 	 weight: 0.013
topic number: 8 	 weight: 0.013
topic number: 9 	 weight: 0.013
topic number: 10 	 weight: 0.013


In [35]:
# Show the text of the selected document. doc_id is a row position, so use
# .iloc rather than a label lookup, which could return a different row (or
# raise KeyError) if df's index is not 0..n-1.
df.content.iloc[doc_id]

'to editor recent article sunday review “does medium violence lead real thing aug. 25 claim “a consensus exposure medium violence linked actual violent behavior.” the claim fantasy violence cause real violence debated endlessly not long ago question came supreme court far finding consensus court found evidence support “not compelling.” at best show correlation “minuscule real-world effects” indistinguishable effect watching “cartoons starring bug bunny road runner.” even study article refers equivocal the 2013 pediatrics article studied “excessive” television watching new zealand child noted “we know program viewed therefore certain viewing violence contributes antisocial behavior.” it also said “the relationship television viewing negative outcome may complicated simple violence-begets-violence model.” good point given crime rate fallen medium consumption increased there may good reason parent limit amount time kid spend front tv computer screen game console but justify dispensing hal