In [48]:
from sklearn import datasets

In [49]:
# Fetch both predefined splits of the 20 Newsgroups corpus in one expression.
# The generator is unpacked positionally, so the first result is the training
# split and the second is the test split.
newsgroups_train, newsgroups_test = (
    datasets.fetch_20newsgroups(subset=split_name, shuffle=True)
    for split_name in ('train', 'test')
)

In [50]:
# List the attribute names exposed by the fetched training object.
dir(newsgroups_train)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [51]:
# Show every category label that comes with the training split.
print([*newsgroups_train.target_names])

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [52]:
# Print the raw text of the first training post (the output shows that the
# e-mail style headers are part of the text).
print(newsgroups_train.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [53]:
# Number of posts in the train and test splits, one count per line.
print(len(newsgroups_train.data), len(newsgroups_test.data), sep='\n')

11314
7532


### Preprocessing

In [54]:
import gensim
# Tokenizer: turns a document into a list of lowercase tokens
from gensim.utils import simple_preprocess
# Collection of stopwords; it removes nothing by itself and is used below as a
# membership filter (`token not in STOPWORDS`)
from gensim.parsing.preprocessing import STOPWORDS
# Standard-library string constant holding the ASCII punctuation characters
from string import punctuation
# WordNet lemmatizer and Snowball stemmer used to normalize tokens
from nltk.stem import WordNetLemmatizer, SnowballStemmer

import numpy as np
# Seed NumPy's global random number generator so stochastic steps are repeatable
np.random.seed(400)

In [63]:
# Tokenize the first training post using the directly imported helper.
simple_preprocess(newsgroups_train.data[0])

['from',
 'lerxst',
 'wam',
 'umd',
 'edu',
 'where',
 'my',
 'thing',
 'subject',
 'what',
 'car',
 'is',
 'this',
 'nntp',
 'posting',
 'host',
 'rac',
 'wam',
 'umd',
 'edu',
 'organization',
 'university',
 'of',
 'maryland',
 'college',
 'park',
 'lines',
 'was',
 'wondering',
 'if',
 'anyone',
 'out',
 'there',
 'could',
 'enlighten',
 'me',
 'on',
 'this',
 'car',
 'saw',
 'the',
 'other',
 'day',
 'it',
 'was',
 'door',
 'sports',
 'car',
 'looked',
 'to',
 'be',
 'from',
 'the',
 'late',
 'early',
 'it',
 'was',
 'called',
 'bricklin',
 'the',
 'doors',
 'were',
 'really',
 'small',
 'in',
 'addition',
 'the',
 'front',
 'bumper',
 'was',
 'separate',
 'from',
 'the',
 'rest',
 'of',
 'the',
 'body',
 'this',
 'is',
 'all',
 'know',
 'if',
 'anyone',
 'can',
 'tellme',
 'model',
 'name',
 'engine',
 'specs',
 'years',
 'of',
 'production',
 'where',
 'this',
 'car',
 'is',
 'made',
 'history',
 'or',
 'whatever',
 'info',
 'you',
 'have',
 'on',
 'this',
 'funky',
 'looking',


In [55]:
# Build the stemmer and the lemmatizer once at module level. Creating a new
# WordNetLemmatizer for every token (as before) is wasted work, because
# lemmatize_stemming is called once per surviving token of every document.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result.

    Args:
        text: One token (a string).

    Returns:
        The English Snowball stem of the verb lemma of ``text``.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))


def preprocess(text):
    """Tokenize a document and return its normalized content tokens.

    A token is kept only if it is not in ``STOPWORDS``, is longer than three
    characters, and passes the punctuation check. Kept tokens are lemmatized
    and stemmed via ``lemmatize_stemming``.

    Args:
        text: The raw document text.

    Returns:
        A list of normalized tokens in their original order (duplicates kept).
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        # NOTE: `token not in punctuation` is a substring test against the
        # punctuation string, not a per-character check; kept unchanged so the
        # filter accepts exactly the same tokens as before.
        if (token not in STOPWORDS) and (len(token) > 3) and (token not in punctuation)
    ]

In [56]:
# Run the preprocessing pipeline over every training post, preserving corpus order.
processed_docs = [preprocess(raw_post) for raw_post in newsgroups_train.data]

print(processed_docs[:2])

[['lerxst', 'thing', 'subject', 'nntp', 'post', 'host', 'organ', 'univers', 'maryland', 'colleg', 'park', 'line', 'wonder', 'enlighten', 'door', 'sport', 'look', 'late', 'earli', 'call', 'bricklin', 'door', 'small', 'addit', 'bumper', 'separ', 'rest', 'bodi', 'know', 'tellm', 'model', 'engin', 'spec', 'year', 'product', 'histori', 'info', 'funki', 'look', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst'], ['guykuo', 'carson', 'washington', 'subject', 'clock', 'poll', 'final', 'summari', 'final', 'clock', 'report', 'keyword', 'acceler', 'clock', 'upgrad', 'articl', 'shelley', 'qvfo', 'innc', 'organ', 'univers', 'washington', 'line', 'nntp', 'post', 'host', 'carson', 'washington', 'fair', 'number', 'brave', 'soul', 'upgrad', 'clock', 'oscil', 'share', 'experi', 'poll', 'send', 'brief', 'messag', 'detail', 'experi', 'procedur', 'speed', 'attain', 'rat', 'speed', 'card', 'adapt', 'heat', 'sink', 'hour', 'usag', 'floppi', 'disk', 'function', 'floppi', 'especi', 'request', 'summar', 'day',

### Create the Dictionary and the Bag-of-Words Corpus

In [57]:
# Map every distinct token in the processed training documents to an integer id.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Prune the vocabulary (thresholds count documents, not total occurrences):
# - no_below=15: drop tokens that appear in fewer than 15 documents
# - no_above=0.1: drop tokens that appear in more than 10% of all documents
# - keep_n=100000: after the two filters above, keep only the 100000 most frequent tokens
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)

In [58]:
# Peek at the first 11 (id, token) pairs of the filtered dictionary.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

0 addit
1 bodi
2 bring
3 bumper
4 call
5 colleg
6 door
7 earli
8 engin
9 enlighten
10 histori


In [59]:
# Convert each tokenized post into its bag-of-words form: a list of
# (token id, count) pairs for the tokens that survived dictionary filtering.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [60]:
# Inspect one sample document of the bag-of-words corpus.
bow_doc_x = bow_corpus[20]
print(len(bow_doc_x))
# Iterate over a slice rather than indexing with range(10): a document with
# fewer than 10 (token id, count) entries would otherwise raise IndexError.
for token_id, token_count in bow_doc_x[:10]:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     token_count))

36
Word 18 ("rest") appears 1 time.
Word 166 ("clear") appears 1 time.
Word 336 ("refer") appears 1 time.
Word 350 ("true") appears 1 time.
Word 391 ("technolog") appears 1 time.
Word 437 ("christian") appears 1 time.
Word 453 ("exampl") appears 1 time.
Word 476 ("jew") appears 1 time.
Word 480 ("lead") appears 1 time.
Word 482 ("littl") appears 3 time.


## Running LDA using Bag of Words

In [61]:
# Train an 8-topic LDA model on the bag-of-words corpus.
# random_state is passed explicitly (same value as the NumPy seed set earlier)
# so the model's initialization does not depend on how many random numbers
# other cells consumed or on cells being re-run. The hand-written topic labels
# further down are only meaningful if topic numbering is stable.
# NOTE(review): with workers > 1 the order in which chunks are merged may still
# vary between runs — confirm whether bit-identical topics are required.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=8,
                                       id2word=dictionary,
                                       passes=10,
                                       workers=2,
                                       random_state=400)

In [62]:
# Fetch the (topic id, formatted top-words string) pairs once and reuse them.
# The previous code passed -1 as `num_words`, never used the result, and then
# called print_topics a second time; -1 is meant for `num_topics`, where it
# selects every topic.
topics = lda_model.print_topics(num_topics=-1)

for idx, topic in topics:
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print()

Topic: 0 
Words: 0.006*"bike" + 0.006*"presid" + 0.005*"game" + 0.005*"team" + 0.004*"run" + 0.004*"player" + 0.003*"play" + 0.003*"clinton" + 0.003*"pitch" + 0.003*"virginia"

Topic: 1 
Words: 0.009*"govern" + 0.007*"armenian" + 0.006*"israel" + 0.005*"kill" + 0.005*"isra" + 0.004*"american" + 0.004*"turkish" + 0.004*"countri" + 0.004*"weapon" + 0.004*"jew"

Topic: 2 
Words: 0.015*"game" + 0.013*"team" + 0.010*"play" + 0.008*"hockey" + 0.008*"player" + 0.005*"canada" + 0.005*"season" + 0.004*"leagu" + 0.004*"score" + 0.004*"andrew"

Topic: 3 
Words: 0.010*"card" + 0.010*"window" + 0.007*"driver" + 0.006*"sale" + 0.005*"price" + 0.005*"appl" + 0.005*"speed" + 0.005*"video" + 0.005*"engin" + 0.005*"monitor"

Topic: 4 
Words: 0.014*"file" + 0.010*"program" + 0.009*"window" + 0.006*"encrypt" + 0.006*"chip" + 0.006*"imag" + 0.006*"data" + 0.006*"avail" + 0.005*"version" + 0.004*"code"

Topic: 5 
Words: 0.012*"space" + 0.009*"nasa" + 0.006*"scienc" + 0.005*"orbit" + 0.005*"research" + 0.004

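### Classification of topics

The labels below were assigned by hand from each topic's highest-weighted words. They refer to the topic numbering of the run shown above and must be revisited whenever the model is retrained.

* 0: Sports
* 1: Politics
* 2: Hockey
* 3: Graphics cards
* 4: Computers
* 5: Space
* 6: Automobiles
* 7: Atheism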


In [92]:
# Pick one post from the held-out test split and show its text.
num = 2
unseen_document = newsgroups_test.data[num]
print(unseen_document)

# Resolve the true label through the test split's own target_names so the
# index and the name list come from the same object (the previous code
# indexed the training split's names with a test-split target).
print(newsgroups_test.target_names[newsgroups_test.target[num]])

From: mathew <mathew@mantis.co.uk>
Subject: Re: STRONG & weak Atheism
Organization: Mantis Consultants, Cambridge. UK.
X-Newsreader: rusnews v1.02
Lines: 9

acooper@mac.cc.macalstr.edu (Turin Turambar, ME Department of Utter Misery) writes:
> Did that FAQ ever got modified to re-define strong atheists as not those who
> assert the nonexistence of God, but as those who assert that they BELIEVE in 
> the nonexistence of God?

In a word, yes.


mathew

alt.atheism


In [94]:
# Preprocess the unseen post the same way as the training posts, convert it to
# bag-of-words form, and display the topics the model assigns to it.
unseen_tokens = preprocess(unseen_document)
bow_vector = dictionary.doc2bow(unseen_tokens)
lda_model[bow_vector]

[(7, 0.9634918)]

In [98]:
# Id of the most probable topic for the unseen post.
# The previous `i[0]` relied on a variable `i` left behind by another cell's
# loop, so it only worked after out-of-order execution; this expression is
# computed directly from the model output instead.
max(lda_model[bow_vector], key=lambda topic_prob: topic_prob[1])[0]

7

In [109]:
# Print the five highest-weighted words of every topic assigned to the unseen post.
for topic_id, _topic_weight in lda_model[bow_vector]:
    print(lda_model.print_topic(topic_id, 5))

0.012*"christian" + 0.008*"jesus" + 0.006*"exist" + 0.005*"moral" + 0.005*"bibl"
