In [27]:
import pandas as pd

In [1]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [8]:
n_samples = 2500
n_features = 1000
n_components = 10
n_top_words = 20

In [9]:
random_documents = documents[:n_samples]

In [10]:
# max_df - sets threshold for words that are filtered out according to their frequency of appearance in the documents
#          with 1.0 corresponding to appearance in all docs
vectorizer = CountVectorizer(max_df=0.95, min_df = 2, max_features=n_features, stop_words='english')

In [11]:
doc_vectors = vectorizer.fit_transform(random_documents)

In [12]:
doc_vectors.shape

(2500, 1000)

In [18]:
[(key, vectorizer.vocabulary_[key]) for key in vectorizer.vocabulary_.keys()[:25]]

[(u'usenet', 934),
 (u'want', 951),
 (u'wrong', 989),
 (u'fit', 369),
 (u'service', 811),
 (u'needed', 622),
 (u'saying', 787),
 (u'lots', 549),
 (u'nature', 619),
 (u'dave', 266),
 (u'begin', 138),
 (u'os2', 659),
 (u'choice', 200),
 (u'ground', 410),
 (u'address', 78),
 (u'working', 981),
 (u'following', 374),
 (u'years', 996),
 (u'didn', 286),
 (u'seriously', 809),
 (u'internet', 477),
 (u'types', 922),
 (u'turned', 919),
 (u'printer', 708),
 (u'wants', 953)]

In [22]:
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

In [23]:
lda = lda.fit(doc_vectors)

In [24]:
lda_allocation = lda.components_

In [25]:
lda_allocation.shape

(10, 1000)

In [29]:
allocations = pd.DataFrame(lda_allocation.T, index=vectorizer.vocabulary_.keys())

In [37]:
allocations.stack().groupby(level=1).nlargest(10).loc[0:5]

0  worked      0    211.727966
   excellent   0    192.410831
   follow      0    165.539641
   added       0    161.833748
   claim       0    151.827698
   completely  0    146.279304
   medical     0    135.931053
   went        0    127.027810
   heads       0    124.866495
   ones        0    114.008946
1  heads       1    629.161479
   follow      1    564.071298
   money       1    509.455978
   added       1    440.065241
   went        1    431.489990
   worked      1    430.308725
   just        1    309.023552
   political   1    305.457884
   earth       1    272.837920
   copies      1    265.157250
2  wrong       2    345.237320
   video       2    204.642010
   books       2    186.736562
   service     2    180.904424
   didn        2    171.701434
   sell        2    158.369609
   paul        2    154.681324
   usenet      2    142.233546
   allowed     2    138.491948
   dave        2    137.395122
3  31          3    132.353373
   rate        3    112.980009
   clear

In [38]:
allocations.stack().groupby(level=1).nlargest(10).loc[5:]

5  rights        5    409.982091
   try           5    141.722691
   ok            5    127.767619
   words         5    119.137755
   looks         5    104.497460
   message       5    102.685036
   money         5    101.649764
   cards         5     92.742219
   later         5     90.792592
   california    5     81.789969
6  let           6    270.039255
   set           6    242.852168
   read          6    237.013240
   22            6    236.553809
   rates         6    220.494282
   required      6    202.911669
   great         6    193.660760
   states        6    192.185932
   sorry         6    186.992189
   received      6    178.120694
7  view          7    148.878403
   ones          7    126.892980
   ac            7    122.847683
   84            7    122.740312
   money         7    110.088729
   paul          7    102.990334
   agree         7     86.450001
   certainly     7     73.609572
   human         7     71.618176
   lk            7     68.376092
8  rate   