In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from time import time
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

n_samples = 2000
n_features = 1000
n_topics = 3
n_top_words = 20


In [2]:
print("Loading dataset...")

t0 = time()

dataset = fetch_20newsgroups(shuffle=False, random_state=1, categories=['comp.graphics', 
                                                                       'rec.sport.baseball', 
                                                                       'talk.politics.guns'], 
                             remove=('headers', 'footera', 'quotes'))
data_samples = dataset.data[:n_samples]
data_samples_target = dataset.target[:n_samples]
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 1.429s.


In [3]:
data_samples

["\n\nmorgan and guzman will have era's 1 run higher than last year, and\n the cubs will be idiots and not pitch harkey as much as hibbard.\n castillo won't be good (i think he's a stud pitcher)\n\n       This season so far, Morgan and Guzman helped to lead the Cubs\n       at top in ERA, even better than THE rotation at Atlanta.\n       Cubs ERA at 0.056 while Braves at 0.059. We know it is early\n       in the season, we Cubs fans have learned how to enjoy the\n       short triumph while it is still there.\n",
 '\n\n\n\n\nUm, the header said *career.*  Hodapp managed about 3000 PA in his\nnine years in the majors.\n\nAs for his "consistently over .300," make that "three years in a row, \npreceded by a part-time year, plus his last year, with Boston."  Hodapp\nonly qualified for the batting title five times.  \n\nWas he injured?  He retired right around his 28th birthday.\n\nAnyway, Hodapp put up flashy numbers the year *everybody* put up\nflashy numbers.  That was his only really goo

In [4]:
dataset.target

array([1, 1, 0, ..., 2, 0, 2])

In [5]:
print(OneHotEncoder().fit_transform(dataset.target.reshape(-1,1)))

  (0, 1)	1.0
  (1, 1)	1.0
  (2, 0)	1.0
  (3, 2)	1.0
  (4, 1)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 1)	1.0
  (8, 2)	1.0
  (9, 1)	1.0
  (10, 0)	1.0
  (11, 2)	1.0
  (12, 2)	1.0
  (13, 2)	1.0
  (14, 0)	1.0
  (15, 1)	1.0
  (16, 0)	1.0
  (17, 1)	1.0
  (18, 1)	1.0
  (19, 1)	1.0
  (20, 1)	1.0
  (21, 2)	1.0
  (22, 1)	1.0
  (23, 0)	1.0
  (24, 1)	1.0
  :	:
  (1702, 1)	1.0
  (1703, 0)	1.0
  (1704, 2)	1.0
  (1705, 0)	1.0
  (1706, 2)	1.0
  (1707, 2)	1.0
  (1708, 2)	1.0
  (1709, 1)	1.0
  (1710, 0)	1.0
  (1711, 1)	1.0
  (1712, 1)	1.0
  (1713, 1)	1.0
  (1714, 0)	1.0
  (1715, 2)	1.0
  (1716, 2)	1.0
  (1717, 1)	1.0
  (1718, 2)	1.0
  (1719, 0)	1.0
  (1720, 0)	1.0
  (1721, 0)	1.0
  (1722, 0)	1.0
  (1723, 0)	1.0
  (1724, 2)	1.0
  (1725, 0)	1.0
  (1726, 2)	1.0


In [6]:
len(dataset.data)

1727

In [7]:
print("tfidf")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, 
                                   max_features=n_features, 
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

tfidf
done in 0.280s.


In [8]:
adj_mat = cosine_similarity(tfidf)

In [9]:
adj_mat

array([[1.        , 0.04984018, 0.0074477 , ..., 0.        , 0.04102849,
        0.        ],
       [0.04984018, 1.        , 0.03133302, ..., 0.01573643, 0.        ,
        0.04293862],
       [0.0074477 , 0.03133302, 1.        , ..., 0.0204962 , 0.04945468,
        0.04779618],
       ...,
       [0.        , 0.01573643, 0.0204962 , ..., 1.        , 0.        ,
        0.01095564],
       [0.04102849, 0.        , 0.04945468, ..., 0.        , 1.        ,
        0.08075372],
       [0.        , 0.04293862, 0.04779618, ..., 0.01095564, 0.08075372,
        1.        ]])

In [11]:
#NMF

print('NMF')
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1, 
         alpha=.1, l1_ratio=.5).fit(tfidf)

print("done in %0.3fs." % (time() - t0))

print('Topics: ')
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

def print_top_words(model, feature_name, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_name[i]
                       for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

print_top_words(nmf, tfidf_feature_names, n_top_words)

NMF
done in 0.046s.
Topics: 
Topic #0:
gun people don guns government think just right law weapons com like edu make state fbi rights crime firearms say

Topic #1:
thanks graphics edu image files program file know mail does help format need looking use windows ftp advance software email

Topic #2:
year team game games think runs good pitching hit baseball win season players don better just like edu time braves



In [32]:
a[:,1].reshape(-1,1)

array([[0.61992745],
       [0.51477403],
       [0.23968668]])

In [33]:
import numpy as np
a=np.random.random((3,1))

from sklearn.metrics import pairwise_distances
b = pairwise_distances(X=a, metric=lambda x, y: (x+y)/2)
b

array([[0.27214584, 0.28741123, 0.22486629],
       [0.28741123, 0.30267661, 0.24013167],
       [0.22486629, 0.24013167, 0.17758673]])

In [38]:

def adj_to_bias(adj, sizes, nhood=1):
    nb_graphs = adj.shape[0]
    mt = np.empty(adj.shape)
    for g in range(nb_graphs):
        mt[g] = np.eye(adj.shape[1])
        for _ in range(nhood):
            mt[g] = np.matmul(mt[g], (adj[g] + np.eye(adj.shape[1])))
        for i in range(sizes[g]):
            for j in range(sizes[g]):
                if mt[g][i][j] > 0.0:
                    mt[g][i][j] = 1.0
    return -1e9 * (1.0 - mt)
adj_to_bias(b[np.newaxis],[3], nhood=1)

array([[[-0., -0., -0.],
        [-0., -0., -0.],
        [-0., -0., -0.]]])

In [79]:
nmf.transform(tfidf).shape

(1727, 3)

In [72]:
nmf.components_.shape

(3, 1000)

In [73]:
tfidf.shape

(1727, 1000)