In [1]:
reset -fs

In [2]:
import pandas as pd

In [3]:
# Load the NYT articles corpus into a DataFrame.
# NOTE(review): read_pickle unpickles arbitrary Python objects, so this file
# must come from a trusted source. The relative path presumably assumes the
# kernel's working directory is the notebook's own folder — confirm.
df = pd.read_pickle("../../../corpora/nyt_articles.pkl")

In [4]:
# Peek at the first two rows to see which columns are available.
df.head(2)

Unnamed: 0,document_type,web_url,lead_paragraph,abstract,snippet,news_desk,word_count,source,section_name,subsection_name,_id,pub_date,print_page,headline,content
0,article,http://www.nytimes.com/2013/10/03/sports/footb...,You would think that in a symmetric zero-sum s...,,You would think that in a symmetric zero-sum s...,Sports,347,The New York Times,Sports,Pro Football,524d4e3a38f0d8198974001f,2013-10-03T00:00:00Z,,Week 5 Probabilities: Why Offense Is More Impo...,the original goal building model football fore...
1,article,http://www.nytimes.com/2013/10/03/us/new-immig...,House Democrats on Wednesday unveiled an immig...,House Democrats unveil immigration bill that p...,House Democrats on Wednesday unveiled an immig...,National,83,The New York Times,U.S.,,524cf71338f0d8198973ff7b,2013-10-03T00:00:00Z,21.0,New Immigration Bill Put Forward,house unveiled immigration bill provides path ...


In [5]:
# Show the first 100 characters of the text stored under index label 0.
df["content"][0][:100]

'the original goal building model football forecasting weigh importance facet game in particular want'

-----

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Bag-of-words features: skip terms that occur in more than 95% of the
# documents or in fewer than 2 documents, drop English stop words, and keep
# at most 1000 terms in the vocabulary.
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary and build the document-term count matrix in one pass.
vectorized = vectorizer.fit_transform(df["content"])

In [8]:
# Report the matrix dimensions as rows x columns.
print("Created document-term matrix of size {} x {}".format(*vectorized.shape))

Created document-term matrix of size 1405 x 1000


In [9]:
import re

import sklearn

# Require scikit-learn 0.18 or newer. Only the leading major/minor numbers are
# compared, so patch releases ("0.18.1"), later versions ("1.3.0") and
# pre-release suffixes ("0.19.dev0") all pass instead of tripping a strict
# string-equality check.
_sklearn_major_minor = tuple(
    int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2]
)
assert _sklearn_major_minor >= (0, 18), (
    "scikit-learn >= 0.18 is required, found {}".format(sklearn.__version__)
)

In [10]:
from sklearn.decomposition import NMF

In [11]:
# Factorise the document-term matrix into 4 components.
# W is the fit_transform output (one row per document, one column per
# component); H is model.components_ (one row per component, one column per
# vocabulary term).
model = NMF(init="nndsvd",
            n_components=4,
            max_iter=200,
            random_state=0)  # fixed seed so Restart & Run All reproduces the factors
W = model.fit_transform(vectorized)
H = model.components_

In [12]:
# Report the shapes of both factor matrices, W first and then H.
print("Generated factor W of size {} and factor H of size {}".format(
    *(factor.shape for factor in (W, H))
))

Generated factor W of size (1405, 4) and factor H of size (4, 1000)


Store the list of all terms whose indices correspond to the columns of the document-term matrix.

In [13]:
# vocabulary_ maps each term to its column index in the document-term matrix;
# ordering the terms by that index yields a list where terms[i] names column i.
terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

In [14]:
# Sanity check: show the last five entries of the term list.
terms[-5:]

['yard', 'year', 'york', 'young', 'zone']

In [15]:
import numpy as np

In [16]:
# For every row of H, print the 10 terms with the largest weights,
# strongest first.
for topic_index, term_weights in enumerate(H):
    descending = np.argsort(term_weights)[::-1]  # column indices, largest weight first
    term_ranking = [terms[column] for column in descending[:10]]
    print("Topic {}: {}".format(topic_index, ", ".join(term_ranking)))

Topic 0: said, year, new, people, state, company, gun, work, like, percent
Topic 1: game, season, said, team, year, player, time, play, yankee, league
Topic 2: republican, government, house, health, law, care, party, shutdown, senate, president
Topic 3: mr, said, iran, rouhani, united, nuclear, president, obama, state, netanyahu


In [None]:
# Same report as the index-based loop, written with enumerate: for each row of
# H, print its 10 highest-weighted terms.
for topic_index, row in enumerate(H):
    # Rank the columns of this row by weight (largest first). The previous
    # version iterated over the integer topic_index, which raises TypeError.
    top_indices = np.argsort(row)[::-1][:10]
    term_ranking = [terms[i] for i in top_indices]
    print("Topic {}: {}".format(topic_index, ", ".join(term_ranking)))

-----

In [17]:
# Number of distinct news_desk values (a missing value counts as one of them).
n_ny_topics = df["news_desk"].nunique(dropna=False)

In [18]:
# Display the count of distinct news_desk values.
n_ny_topics

18

In [19]:
# Refit NMF with one component per distinct news_desk value.
# W is the fit_transform output (documents x components); H is
# model.components_ (components x vocabulary terms). Both replace the factors
# from the earlier 4-component fit.
model = NMF(init="nndsvd",
            n_components=n_ny_topics,
            max_iter=200,
            random_state=0)  # fixed seed so Restart & Run All reproduces the factors
W = model.fit_transform(vectorized)
H = model.components_

In [20]:
# Print the 10 highest-weighted terms of each row of H.
for topic_index in range(len(H)):
    ranked_columns = np.argsort(H[topic_index])[::-1]  # largest weight first
    top_terms = ", ".join(terms[col] for col in ranked_columns[:10])
    print("Topic {}: {}".format(topic_index, top_terms))

Topic 0: said, year, official, day, added, time, wednesday, police, week, area
Topic 1: team, season, game, player, year, point, league, coach, giant, cup
Topic 2: republican, house, government, party, senate, president, shutdown, obama, democrat, law
Topic 3: mr, state, official, leader, minister, later, year, member, work, party
Topic 4: work, worker, job, student, new, course, people, ms, american, program
Topic 5: gun, child, death, year, law, state, shooting, time, old, killed
Topic 6: iran, rouhani, nuclear, obama, iranian, netanyahu, israel, united, president, nation
Topic 7: davis, state, story, texas, woman, democratic, city, republican, thing, hour
Topic 8: percent, year, government, market, bank, economy, rate, month, country, price
Topic 9: united, syria, government, weapon, chemical, state, security, attack, official, nation
Topic 10: new, york, president, executive, agency, director, vice, post, station, joined
Topic 11: health, care, insurance, law, state, exchange, plan

-----

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# TF-IDF features with the same filtering as the count-based run: skip terms
# that occur in more than 95% of the documents or in fewer than 2 documents,
# drop English stop words, and keep at most 1000 terms.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary and build the TF-IDF document-term matrix; this
# replaces the count-based `vectorizer` and `vectorized` from earlier cells.
vectorized = vectorizer.fit_transform(df["content"])

In [23]:
# Refit NMF on the TF-IDF matrix. The component count comes from n_ny_topics
# (the number of distinct news_desk values) instead of a hard-coded 18, so it
# stays in step with the data and with the count-based run it is compared to.
# W is the fit_transform output (documents x components); H is
# model.components_ (components x vocabulary terms).
model = NMF(init="nndsvd",
            n_components=n_ny_topics,
            max_iter=200,
            random_state=0)  # fixed seed so Restart & Run All reproduces the factors
W = model.fit_transform(vectorized)
H = model.components_

In [24]:
# Rebuild the column-index -> term list from the vectorizer currently bound to
# `vectorizer` (the one that produced the matrix behind H), instead of reusing
# a list built in an earlier cell from a different vectorizer instance.
terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Print the 10 highest-weighted terms of each row of H, strongest first.
for topic_index in range(H.shape[0]):
    top_indices = np.argsort(H[topic_index, :])[::-1][0:10]
    term_ranking = [terms[i] for i in top_indices]
    print("Topic {}: {}".format(topic_index, ", ".join(term_ranking)))

Topic 0: company, said, china, oil, executive, business, plant, gas, year, agency
Topic 1: team, season, game, player, said, coach, league, play, year, played
Topic 2: republican, house, health, government, senate, care, shutdown, obama, law, democrat
Topic 3: iran, rouhani, nuclear, iranian, israel, obama, netanyahu, united, president, sanction
Topic 4: mr, said, official, year, president, john, minister, later, prime, interview
Topic 5: percent, bank, market, rate, economy, government, debt, investor, price, growth
Topic 6: yankee, rivera, pettitte, game, stadium, season, baseball, inning, fan, pitch
Topic 7: attack, said, police, official, killed, people, mall, militant, kenya, shabab
Topic 8: party, merkel, election, ms, germany, political, german, european, government, europe
Topic 9: cup, race, team, club, america, san, francisco, won, match, world
Topic 10: sept, editor, 2013, health, child, school, care, new, writer, york
Topic 11: syria, chemical, weapon, syrian, united, natio