## NLP Topic Modeling Exercise

In [1]:
# import TfidfVectorizer and CountVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import fetch_20newsgroups from sklearn.datasets
from sklearn.datasets import fetch_20newsgroups

# import NMF and LatentDirichletAllocation from sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
# Fetch the 20 Newsgroups posts in a reproducible shuffled order, asking the
# loader to strip headers, footers and quoted replies from each post.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data

* create a variable called `'no_features'` and set its value to 100.

In [3]:
# Vocabulary cap: passed as `max_features` to both vectorizers below.
no_features = 100

* create a variable called `'no_topics'` and set its value to 100.

In [4]:
# Number of topics: passed as `n_components` to both the NMF and LDA models below.
no_topics = 100

## NMF

* instantiate a TfidfVectorizer with the following parameters:


    * max_df = 0.95
    * min_df = 2
    * max_features = no_features
    * stop_words = 'english'

In [5]:
# TF-IDF vectorizer configured exactly as the exercise specifies; the
# vocabulary size is capped by `no_features`.
tf_idf = TfidfVectorizer(
    stop_words='english',
    max_features=no_features,
    min_df=2,
    max_df=0.95,
)

* use fit_transform method of TfidfVectorizer to transform the documents

In [6]:
# Learn the vocabulary from the corpus and keep the TF-IDF matrix; this is the
# input the NMF model is fitted on.
tf_fit = tf_idf.fit_transform(documents)

* get the feature names from TfidfVectorizer

In [9]:
# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out() (scikit-learn >= 1.0), returns an ndarray, so wrap it
# in list() to keep the plain-list display.
list(tf_idf.get_feature_names_out())

['00',
 '10',
 '12',
 '14',
 '15',
 '16',
 '20',
 '25',
 'a86',
 'available',
 'ax',
 'b8f',
 'believe',
 'best',
 'better',
 'bit',
 'case',
 'com',
 'come',
 'course',
 'data',
 'day',
 'did',
 'didn',
 'different',
 'does',
 'doesn',
 'don',
 'drive',
 'edu',
 'fact',
 'far',
 'file',
 'g9v',
 'god',
 'going',
 'good',
 'got',
 'government',
 'help',
 'information',
 'jesus',
 'just',
 'key',
 'know',
 'law',
 'let',
 'like',
 'line',
 'list',
 'little',
 'll',
 'long',
 'look',
 'lot',
 'mail',
 'make',
 'max',
 'mr',
 'need',
 'new',
 'number',
 'people',
 'point',
 'power',
 'probably',
 'problem',
 'program',
 'question',
 'read',
 'really',
 'right',
 'run',
 'said',
 'say',
 'second',
 'set',
 'software',
 'space',
 'state',
 'sure',
 'tell',
 'thanks',
 'thing',
 'things',
 'think',
 'time',
 'true',
 'try',
 'use',
 'used',
 'using',
 've',
 'want',
 'way',
 'windows',
 'work',
 'world',
 'year',
 'years']

* instantiate NMF and fit transformed data

In [10]:
# The `alpha` keyword was deprecated in scikit-learn 1.0 and removed in 1.2.
# Its replacements `alpha_W` / `alpha_H` are multiplied internally by
# n_features / n_samples respectively, so divide by those sizes to keep the
# same penalty strength the old unscaled `alpha=.1` applied to both factors.
# Requires scikit-learn >= 1.0.
nmf = NMF(
    n_components=no_topics,
    random_state=1,
    alpha_W=.1 / tf_fit.shape[1],
    alpha_H=.1 / tf_fit.shape[0],
    l1_ratio=.5,
    init='nndsvd',
).fit(tf_fit)


## LDA w/ Sklearn

* instantiate a CountVectorizer with the following parameters:


    * max_df = 0.95
    * min_df = 2
    * max_features = no_features
    * stop_words = 'english'

In [11]:
# Raw term-count vectorizer with the same settings as the TF-IDF one above;
# its output feeds the LDA model.
cv = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=no_features,
    stop_words='english',
)

* use fit_transform method of CountVectorizer to transform documents

In [12]:
# Learn the vocabulary from the corpus and keep the term-count matrix; this is
# the input the LDA model is fitted on.
cv_fit = cv.fit_transform(documents)

* get the feature names from CountVectorizer

In [14]:
# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out() (scikit-learn >= 1.0), returns an ndarray, so wrap it
# in list() to keep the plain-list display.
list(cv.get_feature_names_out())

['00',
 '10',
 '12',
 '14',
 '15',
 '16',
 '20',
 '25',
 'a86',
 'available',
 'ax',
 'b8f',
 'believe',
 'best',
 'better',
 'bit',
 'case',
 'com',
 'come',
 'course',
 'data',
 'day',
 'did',
 'didn',
 'different',
 'does',
 'doesn',
 'don',
 'drive',
 'edu',
 'fact',
 'far',
 'file',
 'g9v',
 'god',
 'going',
 'good',
 'got',
 'government',
 'help',
 'information',
 'jesus',
 'just',
 'key',
 'know',
 'law',
 'let',
 'like',
 'line',
 'list',
 'little',
 'll',
 'long',
 'look',
 'lot',
 'mail',
 'make',
 'max',
 'mr',
 'need',
 'new',
 'number',
 'people',
 'point',
 'power',
 'probably',
 'problem',
 'program',
 'question',
 'read',
 'really',
 'right',
 'run',
 'said',
 'say',
 'second',
 'set',
 'software',
 'space',
 'state',
 'sure',
 'tell',
 'thanks',
 'thing',
 'things',
 'think',
 'time',
 'true',
 'try',
 'use',
 'used',
 'using',
 've',
 'want',
 'way',
 'windows',
 'work',
 'world',
 'year',
 'years']

* instantiate LatentDirichletAllocation and fit transformed data 

In [15]:
# Fit LDA on the term counts using the online learning method, with a fixed
# seed so the topics are reproducible.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(cv_fit)

* create a function `display_topics` that is able to display the top words in a topic for different models

In [16]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` is treated as one topic's per-term
        weights and must support ``argsort()``.
    feature_names : sequence of str
        The term for each column index of ``components_``.
    no_top_words : int
        How many terms to print per topic.

    For each topic this prints a ``Topic <index>:`` header line followed by a
    line of space-separated terms, heaviest first. Nothing is returned.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        # argsort is ascending, so reverse it to rank the heaviest terms first.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print(" ".join(top_terms))

* display top 10 words from each topic from NMF model

In [18]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) returns the same terms as an array indexable by column.
display_topics(nmf, tf_idf.get_feature_names_out(), 10)

Topic 0:
did didn make time like know people just say said
Topic 1:
thanks mail know help does god doesn don drive edu
Topic 2:
does know like use thanks work make say just way
Topic 3:
edu mail new got don drive fact far file g9v
Topic 4:
know don does like thanks let need just want help
Topic 5:
like just don know use make people does think new
Topic 6:
just like don think way good ve say people right
Topic 7:
use used like using does need work just way want
Topic 8:
people think don just like make say government know time
Topic 9:
good just like think don really time make better thing
Topic 10:
think don people just good like really say way time
Topic 11:
god jesus believe say people does things fact said think
Topic 12:
time long like just years good don think know did
Topic 13:
windows using file thanks use program run problem software help
Topic 14:
drive thanks problem work years god doesn don edu fact
Topic 15:
problem using help know time just work god doesn don
Topic 16:
don 

* display top 10 words from each topic from LDA model

In [19]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) returns the same terms as an array indexable by column.
display_topics(lda, cv.get_feature_names_out(), 10)

Topic 0:
point state right need long second fact does people things
Topic 1:
day tell like going just read problem need think know
Topic 2:
sure point want good help did government say use question
Topic 3:
think need best 00 a86 tell want using new people
Topic 4:
ax max b8f g9v a86 14 mr ll 25 probably
Topic 5:
god true say jesus believe things people does did know
Topic 6:
said second years tell work new right ll true like
Topic 7:
probably look like tell need used point want long run
Topic 8:
available software run like new need people used probably work
Topic 9:
mail list like new time tell does different use 15
Topic 10:
think 16 program point com space case let probably use
Topic 11:
help set use like does tell things work used new
Topic 12:
com know g9v way mr let look don point try
Topic 13:
don like know just think say ll tell need try
Topic 14:
time years long like far just work make better people
Topic 15:
good like tell just day think probably lot believe right
Topic 16:
j

### Stretch: Use LDA w/ Gensim to do the same thing.