# Gensim Tutorial
Somewhat based on this: https://radimrehurek.com/gensim/tutorial.html

In [1]:
import os
from operator import itemgetter
from gensim import corpora, models, similarities
from gensim.utils import lemmatize

## Load all the files from the `data/wired` folder
Start by loading all of the articles and splitting them into arrays of words, removing stop words and any characters that are not alpha numeric

In [2]:
docs = []
stop_words = set()

with open('./data/lists/stopwords.txt') as f:
    stop_words = [line.rstrip().lower() for line in f]
    
illegal_chars = [word for word in stop_words if len(word) == 1 and not word.isalnum()]

for fname in os.listdir('./data/wired'):
    if fname.endswith('.txt'):
        with open('./data/wired/{}'.format(fname)) as f:
            # split words on spaces
            words = f.read().lower().split(' ')
            # ignore illegal terms
            words = [''.join([c for c in w if c not in illegal_chars]) for w in words] 
            # only take words with two or more chars that aren't stop words
            words = [w for w in words if len(w) > 1 and w not in stop_words]
            docs += [words]

## Create a dictionary and bag of words corpus

In [3]:
dictionary = corpora.Dictionary(docs) # dictionary for the entire corpus
bow_corpus = [dictionary.doc2bow(doc) for doc in docs] # word count vectors (bag of words)

## Only need to save the dictionary and model if the data changes

In [4]:
# saving dictionary and corpus to disk
#dictionary.save('./data/wired/models/dictionary.dict') 
#corpora.MmCorpus.serialize('./data/wired/models/corpus.mm', bow_corpus)

## Create the TFIDF model

In [5]:
# create tfidf representation
mm = corpora.MmCorpus('./data/wired/models/corpus.mm')
tfidf = models.TfidfModel(mm)
tfidf_corpus = tfidf[bow_corpus]

## Create an LSI (Latent Semantic Indexing) Model (uses fold-in method)

In [6]:
lsi = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
lsi_corpus = lsi[tfidf_corpus]

In [7]:
for doc in lsi_corpus:
    top_match = max(doc, key=itemgetter(1))
    print(top_match)
    print(lsi.show_topic(top_match[0], 10))

(0, 0.57753467744174392)
[('election', 0.21564844798358612), ('iot', 0.19825962077619444), ('hackers', 0.19684563213896544), ('hacking', 0.1627971215067246), ('work', 0.15850655860976157), ('putin', 0.15603826643982785), ('devices', 0.1559621128664444), ('cashatt', 0.15049586846400648), ('government', 0.15044906856260609), ('russian', 0.12252544387212985)]
(0, 0.31008489508030485)
[('election', 0.21564844798358612), ('iot', 0.19825962077619444), ('hackers', 0.19684563213896544), ('hacking', 0.1627971215067246), ('work', 0.15850655860976157), ('putin', 0.15603826643982785), ('devices', 0.1559621128664444), ('cashatt', 0.15049586846400648), ('government', 0.15044906856260609), ('russian', 0.12252544387212985)]
(6, 0.55270118709987459)
[('iot', 0.35669893366998379), ('devices', 0.27991478241583284), ('cashatt', 0.21223480749975665), ('hotel', 0.15638354236824156), ('standards', 0.14267957346799356), ('orwl', -0.13259546154311583), ('men', -0.12433444098364053), ('election', -0.10171147925

## Create an LDA (Latent Dirichlet Allocation) Model

In [8]:
lda = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=100)
lda_corpus = lda[bow_corpus]

In [9]:
for doc in lda_corpus:
    print(doc)
    print(lda.show_topic(doc[0][0], 10))

[(10, 0.99742857142857344)]
[('election', 0.011194782380322729), ('intercept', 0.0099475342618827126), ('russian', 0.0097870238167443759), ('hacking', 0.0094002726781685052), ('winner', 0.0082058178408981955), ('cashatt', 0.0075804363980225943), ('document', 0.0072448252656573241), ('hackers', 0.0066411739974194545), ('nsa', 0.0063848006079605998), ('security', 0.0063120555913331999)]
[(78, 0.88639561667327005), (82, 0.11031579272270381)]
[('social', 0.0084568429008106041), ('information', 0.0079890196262725598), ('cashatt', 0.0078712996231648461), ('news', 0.0069038042858163857), ('media', 0.0067068376859457943), ('fomo', 0.0061350287720187717), ('time', 0.0055435640775123897), ('sundar', 0.0051536789099410141), ('end', 0.0049002289894641057), ('kross', 0.0046443086522118589)]
[(10, 0.012383446827134407), (73, 0.12941071058651663), (98, 0.85258330544135308)]
[('election', 0.011194782380322729), ('intercept', 0.0099475342618827126), ('russian', 0.0097870238167443759), ('hacking', 0.009

## Create an HDP (Hierarchical Dirichlet process) Model

In [10]:
hdp = models.HdpModel(bow_corpus, id2word=dictionary)
hdp_corpus = hdp[bow_corpus]

In [11]:
for doc in hdp_corpus:
    print(doc)
    print(hdp.show_topic(doc[0][0], num_words=10))

[(3, 0.99767327567723307)]
[('intercept', 0.013400636917937442), ('election', 0.010706111926800913), ('russian', 0.010155845408113724), ('winner', 0.0096564256269573087), ('document', 0.0082750098866019844), ('nsa', 0.0081720667145457788), ('report', 0.0068801932235540361), ('classified', 0.0068659443930109889), ('complaint', 0.0067893334907725726), ('hacking', 0.0067383569308189974)]
[(5, 0.99678060327906504)]
[('social', 0.0081168386813532615), ('news', 0.0077859717592866223), ('information', 0.0075838873044937593), ('media', 0.006539111349089485), ('fomo', 0.0061403070619438531), ('sundar', 0.0047077399969778421), ('kross', 0.0046120245044174255), ('studies', 0.0035575139826622853), ('stems', 0.0035007190869300801), ('describing', 0.0034576059229343278)]
[(1, 0.9989382324081133)]
[('iot', 0.041059652327126502), ('devices', 0.031986866740795934), ('security', 0.024626573956499576), ('standards', 0.011036513591412717), ('data', 0.0076184812034459856), ('moyer', 0.007369621761888075), 