In [1]:
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phrases
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.models import LdaModel

In [2]:
api.info("20-newsgroups")

{'num_records': 18846,
 'record_format': 'dict',
 'file_size': 14483581,
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/20-newsgroups/__init__.py',
 'license': 'not found',
 'fields': {'topic': 'name of topic (20 variant of possible values)',
  'set': "marker of original split (possible values 'train' and 'test')",
  'data': '',
  'id': 'original id inferred from folder name'},
 'description': 'The notorious collection of approximately 20,000 newsgroup posts, partitioned (nearly) evenly across 20 different newsgroups.',
 'checksum': 'c92fd4f6640a86d5ba89eaad818a9891',
 'file_name': '20-newsgroups.gz',
 'read_more': ['http://qwone.com/~jason/20Newsgroups/'],
 'parts': 1}

In [3]:
# Gensim data returns an iterator interface to the tar file
# This is really cool for big files because it doesn't require holding the full dataset in RAM
# It also downloads it locally on your machine, whcih you can read directly
dataset = api.load("20-newsgroups")
print(dataset)

<20-newsgroups.Dataset object at 0x11e7f9a58>


In [4]:
# Need to get the generator from the dataset to use with gensim
generator = dataset.__iter__()
print(generator.__next__()) # example doc

{'topic': 'soc.religion.christian', 'set': 'train', 'data': 'From: db7n+@andrew.cmu.edu (D. Andrew Byler)\nSubject: Re: Serbian genocide Work of God?\nOrganization: Freshman, Civil Engineering, Carnegie Mellon, Pittsburgh, PA\nLines: 61\n\nVera Shanti Noyes writes;\n\n>this is what indicates to me that you may believe in predestination.\n>am i correct?  i do not believe in predestination -- i believe we all\n>choose whether or not we will accept God\'s gift of salvation to us.\n>again, fundamental difference which can\'t really be resolved.\n\nOf course I believe in Predestination.  It\'s a very biblical doctrine as\nRomans 8.28-30 shows (among other passages).  Furthermore, the Church\nhas always taught predestination, from the very beginning.  But to say\nthat I believe in Predestination does not mean I do not believe in free\nwill.  Men freely choose the course of their life, which is also\naffected by the grace of God.  However, unlike the Calvinists and\nJansenists, I hold that gr

In [5]:
# Gensim can directly read the iterator
model = Word2Vec(dataset)
print(model.vocabulary.cum_table)

[ 536870912 1073741824 1610612735 2147483647]


## Pre-processing

In [6]:
import json
import gzip

corpusJson = []
file = gzip.open("/Users/jaybooth/gensim-data/20-newsgroups/20-newsgroups.gz")
corpusJson = [json.loads(line) for line in file]
file.close()
print(len(corpusJson))

18846


In [7]:
print(corpusJson[0])

{'topic': 'soc.religion.christian', 'set': 'train', 'data': 'From: db7n+@andrew.cmu.edu (D. Andrew Byler)\nSubject: Re: Serbian genocide Work of God?\nOrganization: Freshman, Civil Engineering, Carnegie Mellon, Pittsburgh, PA\nLines: 61\n\nVera Shanti Noyes writes;\n\n>this is what indicates to me that you may believe in predestination.\n>am i correct?  i do not believe in predestination -- i believe we all\n>choose whether or not we will accept God\'s gift of salvation to us.\n>again, fundamental difference which can\'t really be resolved.\n\nOf course I believe in Predestination.  It\'s a very biblical doctrine as\nRomans 8.28-30 shows (among other passages).  Furthermore, the Church\nhas always taught predestination, from the very beginning.  But to say\nthat I believe in Predestination does not mean I do not believe in free\nwill.  Men freely choose the course of their life, which is also\naffected by the grace of God.  However, unlike the Calvinists and\nJansenists, I hold that gr

In [8]:
corpus = list(map(lambda jsonDoc: jsonDoc["data"],corpusJson))
corpusTopics = list(map(lambda jsonDoc: jsonDoc["topic"],corpusJson))

In [9]:
import spacy
nlp = spacy.load("en")

In [10]:
my_stop_words = []
for stopword in my_stop_words:
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True

In [11]:
nlpCorpus = list(map(lambda doc: nlp(doc), corpus[:10]))

In [13]:
def filterWords(word):
    return word.is_alpha and not word.is_stop

def convertWords(word):
    return word.lemma_.lower()

def cleanDoc(doc):
    return list(map(convertWords, filter(filterWords, doc)))

cleanCorpus = list(map(lambda doc: cleanDoc(doc), nlpCorpus))

In [14]:
print(cleanCorpus[0])

['from', 'andrew', 'byler', 'subject', 're', 'serbian', 'genocide', 'work', 'god', 'organization', 'freshman', 'civil', 'engineering', 'carnegie', 'mellon', 'pittsburgh', 'pa', 'lines', 'vera', 'shanti', 'noyes', 'write', 'indicate', 'believe', 'predestination', 'correct', 'believe', 'predestination', 'believe', 'choose', 'accept', 'god', 'gift', 'salvation', 'fundamental', 'difference', 'resolve', 'of', 'course', '-pron-', 'believe', 'predestination', '-pron-', 'biblical', 'doctrine', 'romans', 'show', 'passage', 'furthermore', 'church', 'teach', 'predestination', 'beginning', 'but', '-pron-', 'believe', 'predestination', 'mean', '-pron-', 'believe', 'free', 'men', 'freely', 'choose', 'course', 'life', 'affect', 'grace', 'god', 'however', 'unlike', 'calvinists', 'jansenists', '-pron-', 'hold', 'grace', 'resistable', 'end', 'idiocy', 'deny', 'universal', 'saving', 'god', 'timothy', 'for', 'god', 'grace', 'save', 'but', 'elect', 'foreknew', 'predestine', 'receive', 'grace', 'final', 'pe

In [15]:
# We need to mess around with these
threshold = 1
minCount = 1
bigram = Phrases(cleanCorpus, min_count=minCount, threshold=threshold)

In [16]:
cleanBigramCorpus = [bigram[doc] for doc in cleanCorpus]

In [17]:
cleanBigramCorpus[0][:10]

['from_andrew',
 'byler_subject',
 're',
 'serbian',
 'genocide',
 'work',
 'god',
 'organization_freshman',
 'civil_engineering',
 'carnegie_mellon']

In [18]:
dictionary = Dictionary(cleanBigramCorpus)
bow = [dictionary.doc2bow(doc) for doc in cleanBigramCorpus]

In [19]:
bow[0][:10]

[(0, 7),
 (1, 2),
 (2, 2),
 (3, 2),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1)]

In [20]:
model = TfidfModel(bow)  # fit model
vector = model[bow[0]]

In [21]:
print(vector[:20])

iterator = iter(dictionary.token2id.items())
for i in range(20):
    print(next(iterator))

[(0, 0.019450470953619004), (1, 0.08489036814489959), (2, 0.12145065971112816), (3, 0.12145065971112816), (4, 0.03175198428161197), (5, 0.03175198428161197), (6, 0.04244518407244979), (7, 0.06072532985556408), (8, 0.06072532985556408), (9, 0.06072532985556408), (10, 0.06072532985556408), (11, 0.06072532985556408), (12, 0.08489036814489959), (13, 0.04244518407244979), (14, 0.06072532985556408), (15, 0.06072532985556408), (16, 0.06072532985556408), (17, 0.04244518407244979), (18, 0.18217598956669226), (19, 0.04244518407244979)]
('-pron-', 0)
('-pron-_believe', 1)
('-pron-_bless', 2)
('-pron-_hold', 3)
('-pron-_think', 4)
('accept', 5)
('action', 6)
('ad', 7)
('adulterous', 8)
('adultery', 9)
('adventure', 10)
('affect', 11)
('and_-pron-', 12)
('andy_byler', 13)
('annias', 14)
('bar', 15)
('beginning', 16)
('believe', 17)
('believe_predestination', 18)
('bible', 19)


In [22]:
ldamodel = LdaModel(bow, num_topics=10, id2word=dictionary)

In [23]:
ldamodel.show_topics()

[(0,
  '0.008*"god" + 0.007*"assumption" + 0.007*"mean" + 0.006*"heaven" + 0.005*"-pron-" + 0.005*"virgin_mary" + 0.004*"pittsburgh_pa" + 0.004*"jung" + 0.004*"lines" + 0.004*"civil_engineering"'),
 (1,
  '0.023*"-pron-" + 0.016*"response" + 0.011*"question" + 0.011*"sherlette" + 0.010*"post" + 0.008*"people" + 0.008*"from" + 0.008*"subject_re" + 0.007*"-pron-_know" + 0.007*"sin"'),
 (2,
  '0.016*"-pron-" + 0.010*"god" + 0.010*"if" + 0.010*"jesus" + 0.008*"the" + 0.008*"kingdom_heaven" + 0.008*"christians" + 0.007*"earth" + 0.007*"messiah" + 0.007*"write"'),
 (3,
  '0.028*"-pron-" + 0.011*"god" + 0.011*"judge" + 0.008*"activity" + 0.006*"sin" + 0.006*"repent" + 0.006*"judgement" + 0.005*"christians" + 0.005*"national" + 0.005*"folk"'),
 (4,
  '0.011*"-pron-" + 0.006*"god" + 0.005*"jesus" + 0.005*"the" + 0.004*"if" + 0.004*"but" + 0.004*"year" + 0.004*"write" + 0.004*"return" + 0.004*"jewish_people"'),
 (5,
  '0.028*"-pron-" + 0.020*"prayer" + 0.018*"sin" + 0.011*"think" + 0.011*"god" +