In [1]:
# Download news group dataset from sklearn
from sklearn.datasets import fetch_20newsgroups

In [2]:
# assign train and test data
ng_train = fetch_20newsgroups(subset = "train",shuffle=True)
ng_test = fetch_20newsgroups(subset = "train",shuffle=True)

In [3]:
print(list(ng_train.target_names))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [4]:
# sample news
ng_train.data[:3]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [5]:
# shape of the subsets
print(ng_train.filenames.shape,ng_train.target.shape)

(11314,) (11314,)


In [6]:
# Data Preprocessing
# Tokenization
# Loading gensim and nltk libraries.............
import gensim

# Converts into token (Alternative to word_tokenize)
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer

# from nltk.stem import *
import numpy as np
np.random.seed(400)

In [7]:
# import nltk
# nltk.download('wordnet')

In [8]:
# testing the steming part before preprocessing. this should be charging any plural into singular
import pandas as pd
stemmer = SnowballStemmer("english")
original_words=['caresses', 'files', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
                'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization', 'sensational',
                'traditional', 'reference', 'colonizer', 'plotted']

singles = [WordNetLemmatizer().lemmatize(plural,pos='v') for plural in original_words]

pd.DataFrame(data={'Original Words':original_words, 'Lemma':singles})

# Stemma is not performing well
# singles = [stemmer.stem(plural) for plural in original_word]
# singles2 = [Stemmer.stem(plural2) for plural2 in singles]
# Stemma = pd.DataFrame(data={'Lemma':singles, 'Stemmed':singles2})

Unnamed: 0,Original Words,Lemma
0,caresses,caress
1,files,file
2,dies,die
3,mules,mules
4,denied,deny
5,died,die
6,agreed,agree
7,owned,own
8,humbled,humble
9,sized,size


In [9]:
# writing function for the entire dataset
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text,pos='v'))

# Tokenize and Lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token)>3:
            result.append(lemmatize_stemming(token))
            
    return result

In [10]:
# preview a document before preprocessing

# document_num = 50
doc_sample = 'this disk has failed many times. I would like to get it replaced.'

print("Original Document: ")
words=[]
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original Document: 
['this', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replac']


In [11]:
processed_docs = []

for doc in ng_train.data:
    processed_docs.append(preprocess(doc))

In [12]:
len(processed_docs)

11314

In [13]:
print(processed_docs[:2])

[['lerxst', 'thing', 'subject', 'nntp', 'post', 'host', 'organ', 'univers', 'maryland', 'colleg', 'park', 'line', 'wonder', 'enlighten', 'door', 'sport', 'look', 'late', 'earli', 'call', 'bricklin', 'door', 'small', 'addit', 'bumper', 'separ', 'rest', 'bodi', 'know', 'tellm', 'model', 'engin', 'spec', 'year', 'product', 'histori', 'info', 'funki', 'look', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst'], ['guykuo', 'carson', 'washington', 'subject', 'clock', 'poll', 'final', 'summari', 'final', 'clock', 'report', 'keyword', 'acceler', 'clock', 'upgrad', 'articl', 'shelley', 'qvfo', 'innc', 'organ', 'univers', 'washington', 'line', 'nntp', 'post', 'host', 'carson', 'washington', 'fair', 'number', 'brave', 'soul', 'upgrad', 'clock', 'oscil', 'share', 'experi', 'poll', 'send', 'brief', 'messag', 'detail', 'experi', 'procedur', 'speed', 'attain', 'rat', 'speed', 'card', 'adapt', 'heat', 'sink', 'hour', 'usag', 'floppi', 'disk', 'function', 'floppi', 'especi', 'request', 'summar', 'day',

In [14]:
# bag of words on the data set
dictionary = gensim.corpora.Dictionary(processed_docs)
print(dictionary)

Dictionary(61411 unique tokens: ['addit', 'bodi', 'bricklin', 'bring', 'bumper']...)


In [15]:
# Lets see if dictionary created successfully
count = 0
for k,v in dictionary.iteritems():
    print(k, v)
    count +=1
    if count >10:
        break

0 addit
1 bodi
2 bricklin
3 bring
4 bumper
5 call
6 colleg
7 door
8 earli
9 engin
10 enlighten


In [16]:
# remove rare and repeatative words
dictionary.filter_extremes(no_below=15,no_above=0.1,keep_n=100000)

In [17]:
# Create the bag-of-words model for each document i.e for each document we create a dictionary reporting how many
# words and how many times those words appear. Save theis to 'bow_corpus'
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[:5]

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 2),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1)],
 [(24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 2),
  (33, 5),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 2),
  (39, 1),
  (40, 2),
  (41, 2),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 3),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 2),
  (62, 1),
  (63, 1),
  (64, 3),
  (65, 1),
  (66, 4)],
 [(8, 2),
  (11, 2),
  (23, 1),
  (26, 1),
  (36, 2),
  (40, 2),
  (43, 1),
  (49, 1),
  (63, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 3),
  (72, 1),
  (73, 1),
  (74, 2),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 3),
  (81, 1),
  (82, 1

In [18]:
# preview
document_num = 20
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0],
                                                    dictionary[bow_doc_x[i][0]],
                                                    bow_doc_x[i][1]))

Word 18 ("rest") appears 1 time.
Word 166 ("clear") appears 1 time.
Word 336 ("refer") appears 1 time.
Word 350 ("true") appears 1 time.
Word 391 ("technolog") appears 1 time.
Word 437 ("christian") appears 1 time.
Word 453 ("exampl") appears 1 time.
Word 476 ("jew") appears 1 time.
Word 480 ("lead") appears 1 time.
Word 482 ("littl") appears 3 time.
Word 520 ("wors") appears 2 time.
Word 721 ("keith") appears 3 time.
Word 732 ("punish") appears 1 time.
Word 803 ("california") appears 1 time.
Word 859 ("institut") appears 1 time.
Word 917 ("similar") appears 1 time.
Word 990 ("allan") appears 1 time.
Word 991 ("anti") appears 1 time.
Word 992 ("arriv") appears 1 time.
Word 993 ("austria") appears 1 time.
Word 994 ("caltech") appears 2 time.
Word 995 ("distinguish") appears 1 time.
Word 996 ("german") appears 1 time.
Word 997 ("germani") appears 3 time.
Word 998 ("hitler") appears 1 time.
Word 999 ("livesey") appears 2 time.
Word 1000 ("motto") appears 2 time.
Word 1001 ("order") appear

In [19]:
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                      num_topics = 2,
                                      id2word = dictionary,
                                      passes = 10,
                                      workers = 2)

In [20]:
for idx, topic in lda_model.print_topics():
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.005*"drive" + 0.005*"game" + 0.004*"space" + 0.004*"team" + 0.003*"card" + 0.003*"play" + 0.003*"nasa" + 0.003*"engin" + 0.003*"power" + 0.002*"player"


Topic: 1 
Words: 0.005*"file" + 0.004*"window" + 0.003*"govern" + 0.003*"program" + 0.003*"christian" + 0.003*"public" + 0.002*"exist" + 0.002*"group" + 0.002*"jesus" + 0.002*"armenian"




In [21]:
num = 15
unseen_document = ng_test.data[num]
print(unseen_document)

From: mathew <mathew@mantis.co.uk>
Subject: Re: <Political Atheists?
Organization: Mantis Consultants, Cambridge. UK.
X-Newsreader: rusnews v1.01
Lines: 22

kmr4@po.CWRU.edu (Keith M. Ryan) writes:
> ( I am almost sure that Zyklon-B is immediate and painless method of 
> death. If not, insert soem other form. )
> 
>         And, ethnic and minority groups have been killed, mutilated and 
> exterminated through out history, so I guess it was not unusual.
> 
>         So, you would agree that the holocost would be allowed under the US 
> Constitution?  [ in so far, the punishment. I doubt they recieved what would 
> be considered a "fair" trial by US standards.

Don't be so sure.  Look what happened to Japanese citizens in the US during
World War II.  If you're prepared to say "Let's round these people up and
stick them in a concentration camp without trial", it's only a short step to
gassing them without trial.  After all, it seems that the Nazis originally
only intended to imprison the

In [22]:
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 10)))

Score: 0.08362928777933121	 Topic: 0.005*"drive" + 0.005*"game" + 0.004*"space" + 0.004*"team" + 0.003*"card" + 0.003*"play" + 0.003*"nasa" + 0.003*"engin" + 0.003*"power" + 0.002*"player"
Score: 0.9163707494735718	 Topic: 0.005*"file" + 0.004*"window" + 0.003*"govern" + 0.003*"program" + 0.003*"christian" + 0.003*"public" + 0.002*"exist" + 0.002*"group" + 0.002*"jesus" + 0.002*"armenian"


In [23]:
#bow_vector = dictionary.doc2bow(preprocess(unseen_document))

#for index, score in sorted(lda_model[bow_vector], key=Lambda x: x[1]):
    #print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

In [24]:
# print(ng_test.target[1])