# Gensim Tutorial
Somewhat based on this: https://radimrehurek.com/gensim/tutorial.html

In [1]:
import os
from operator import itemgetter
from gensim import corpora, models, similarities
from gensim.utils import lemmatize

## Load all the files from the `data/wired` folder
Start by loading all of the articles and splitting them into arrays of words, removing stop words and any characters that are not alpha numeric

In [2]:
docs = []
stop_words = set()

with open('./data/lists/stopwords.txt') as f:
    stop_words = [line.rstrip().lower() for line in f]
    
illegal_chars = [word for word in stop_words if len(word) == 1 and not word.isalnum()]

for fname in os.listdir('./data/wired'):
    if fname.endswith('.txt'):
        with open('./data/wired/{}'.format(fname)) as f:
            # split words on spaces
            words = f.read().lower().split(' ')
            # ignore illegal terms
            words = [''.join([c for c in w if c not in illegal_chars]) for w in words] 
            # only take words with two or more chars that aren't stop words
            words = [w for w in words if len(w) > 1 and w not in stop_words]
            docs += [words]

## Create a dictionary and bag of words corpus

In [3]:
dictionary = corpora.Dictionary(docs) # dictionary for the entire corpus
bow_corpus = [dictionary.doc2bow(doc) for doc in docs] # word count vectors (bag of words)

## Only need to save the dictionary and model if the data changes

In [4]:
# saving dictionary and corpus to disk
#dictionary.save('./data/wired/models/dictionary.dict') 
#corpora.MmCorpus.serialize('./data/wired/models/corpus.mm', bow_corpus)

## Create the TFIDF model

In [5]:
# create tfidf representation
mm = corpora.MmCorpus('./data/wired/models/corpus.mm')
tfidf = models.TfidfModel(mm)
tfidf_corpus = tfidf[bow_corpus]

## Create an LSI (Latent Semantic Indexing) Model (uses fold-in method)

In [6]:
lsi = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
lsi_corpus = lsi[tfidf_corpus]

In [7]:
for doc in lsi_corpus:
    top_match = max(doc, key=itemgetter(1))
    print(top_match)
    print(lsi.show_topic(top_match[0], 10))

(0, 0.62667819875024466)
[('security', 0.3243300562915431), ('devices', 0.20674016899881523), ('hackers', 0.19091617569088229), ('iot', 0.17493656545226605), ('hacking', 0.1666091725931729), ('work', 0.15509231067982518), ('cashatt', 0.15090803035671491), ('election', 0.14853902974129921), ('putin', 0.13625624098176559), ('russia', 0.13625624098176559)]
(4, 0.45188759599824152)
[('men', -0.27756536368622081), ('work', -0.26769162417213321), ('blue', 0.23840877099568347), ('plants', 0.14900548187230206), ('information', 0.1282867941156102), ('news', 0.12726899803280117), ('cashatt', -0.12125214654702605), ('jobs', -0.12037473819773574), ('crops', 0.11920438549784174), ('weeds', 0.11920438549784174)]
(0, 0.51183405476670241)
[('security', 0.3243300562915431), ('devices', 0.20674016899881523), ('hackers', 0.19091617569088229), ('iot', 0.17493656545226605), ('hacking', 0.1666091725931729), ('work', 0.15509231067982518), ('cashatt', 0.15090803035671491), ('election', 0.14853902974129921), (

## Create an LDA (Latent Dirichlet Allocation) Model

In [8]:
lda = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=100)
lda_corpus = lda[bow_corpus]

In [9]:
for doc in lda_corpus:
    print(doc)
    print(lda.show_topic(doc[0][0], 10))

[(64, 0.034638572852052553), (77, 0.96281597260249341)]
[('intercept', 0.0097306287828162284), ('hacking', 0.0091077121234444357), ('election', 0.0086715252195577498), ('security', 0.0084208854065380753), ('russian', 0.0079518836635260066), ('winner', 0.0070191272657862949), ('cashatt', 0.0069448646907382299), ('document', 0.0065947558876756169), ('nsa', 0.0062013536766551986), ('hotel', 0.0058033186511500705)]
[(86, 0.9966778523489932)]
[('information', 0.010369054017427487), ('social', 0.0076013267751999054), ('news', 0.0063845212143228466), ('media', 0.0063455833678707418), ('kross', 0.0055668943951487403), ('fomo', 0.005555015361700992), ('cashatt', 0.0050483305993948048), ('work', 0.0049185067142384986), ('sundar', 0.004672859378210722), ('spoke', 0.0044740851625853046)]
[(0, 0.012288137543785061), (24, 0.031254233972190751), (34, 0.025631487559149495), (40, 0.49537903437055975), (51, 0.39205258143563837), (77, 0.020610432101242294)]
[('blue', 0.018024649015629311), ('technology',

## Create an HDP (Hierarchical Dirichlet process) Model

In [10]:
hdp = models.HdpModel(bow_corpus, id2word=dictionary)
hdp_corpus = hdp[bow_corpus]

In [11]:
for doc in hdp_corpus:
    print(doc)
    print(hdp.show_topic(doc[0][0], num_words=10))

[(3, 0.99767319478602967)]
[('intercept', 0.013438100200322871), ('election', 0.010769563067823543), ('russian', 0.0095713241512019875), ('winner', 0.0095164170097326844), ('nsa', 0.0083091376193723202), ('document', 0.0082167559911182856), ('hacking', 0.00733547941544901), ('complaint', 0.0069801407692424563), ('report', 0.0068785012805911991), ('security', 0.0067907494756502236)]
[(5, 0.99678097903289931)]
[('news', 0.0076446069322060736), ('social', 0.0076282419170066278), ('information', 0.0076186587428537979), ('media', 0.0065323474824670541), ('fomo', 0.0064392947868381666), ('sundar', 0.0048972778664197912), ('kross', 0.0047247049251415143), ('time', 0.0037079384436114341), ('hits', 0.003523171809125696), ('spoke', 0.0033909605568209094)]
[(1, 0.99893822057655901)]
[('iot', 0.041388125108707992), ('devices', 0.032092402055070995), ('security', 0.024826026242345797), ('standards', 0.011080846363557389), ('data', 0.0074366382090941966), ('industry', 0.0073943665423178541), ('moyer