# Gensim Tutorial

In [113]:
import os
from operator import itemgetter
from gensim import corpora, models, similarities
from gensim.utils import lemmatize

## Load all the files from the `data/wired` folder

In [46]:
docs = []
stop_words = set()

with open('./data/lists/stopwords.txt') as f:
    stop_words = [line.rstrip().lower() for line in f]
    
illegal_chars = [word for word in stop_words if len(word) == 1 and not word.isalnum()]

for fname in os.listdir('./data/wired'):
    if fname.endswith('.txt'):
        with open('./data/wired/{}'.format(fname)) as f:
            # split words on spaces
            words = f.read().lower().split(' ')
            # ignore illegal terms
            words = [''.join([c for c in w if c not in illegal_chars]) for w in words] 
            # only take words with two or more chars that aren't stop words
            words = [w for w in words if len(w) > 1 and w not in stop_words]
            docs += [words]

## Create a dictionary and bag of words corpus

In [47]:
dictionary = corpora.Dictionary(docs) # dictionary for the entire corpus
corpus = [dictionary.doc2bow(doc) for doc in docs] # word count vectors (bag of words)

# saving dictionary and corpus to disk
dictionary.save('./data/wired/models/dictionary.dict') 
corpora.MmCorpus.serialize('./data/wired/models/corpus.mm', corpus)

## Create the TFIDF model

In [48]:
# create tfidf representation
mm = corpora.MmCorpus('./data/wired/models/corpus.mm')
tfidf = models.TfidfModel(mm)
corpus_tfidf = tfidf[corpus]

## Create an LSI (Latent Semantic Indexing) Model (uses fold-in method)

In [149]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
lsi_corpus = lsi[corpus_tfidf]

In [164]:
for doc in lsi_corpus:
    top_match = max(doc, key=itemgetter(1))
    print(top_match)
    print(lsi.show_topic(top_match[0], 10))

(1, 0.39896670366275572)
[('men', -0.2546495414771705), ('russia', 0.18125058180085657), ('putin', 0.18125058180085657), ('blue', -0.16350456160840468), ('intercept', 0.16266997293091615), ('cashatt', -0.14785049158315747), ('russian', 0.12961653012356955), ('jobs', -0.11086476329152289), ('hotel', -0.10894246748232811), ('plants', -0.1021903510052531)]
(3, 0.55376455575008388)
[('blue', -0.29428489413668935), ('iot', -0.28597034811732663), ('plants', -0.18392805883543081), ('crops', -0.14714244706834467), ('weeds', -0.14714244706834467), ('devices', -0.14713603529043509), ('fomo', 0.12963960007867167), ('river', -0.12147381993470134), ('rivers', -0.11035683530125853), ('stone', -0.11035683530125853)]
(1, -0.017054552516255761)
[('men', -0.2546495414771705), ('russia', 0.18125058180085657), ('putin', 0.18125058180085657), ('blue', -0.16350456160840468), ('intercept', 0.16266997293091615), ('cashatt', -0.14785049158315747), ('russian', 0.12961653012356955), ('jobs', -0.11086476329152289

## Create an LDA (Latent Dirichlet Allocation) Model

In [151]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=100)
lda_corpus = lda[corpus]

In [163]:
for doc in lda_corpus:
    print(doc)
    print(lda.show_topic(doc[0][0], 10))

[(41, 0.99742857142857277)]
[('intercept', 0.016886292027933084), ('security', 0.015382264777401339), ('election', 0.014762716613021143), ('russian', 0.013063025195397008), ('nsa', 0.012094903749429516), ('winner', 0.01169764781165868), ('document', 0.0094289221584154207), ('report', 0.0090322655730869384), ('hacking', 0.0085002118037788652), ('intelligence', 0.0078956270088499109)]
[(81, 0.99667785234899386)]
[('news', 0.009617345227395992), ('information', 0.0090573461727165953), ('media', 0.0084382460893484574), ('social', 0.0064256078788696071), ('fomo', 0.0063461397910499666), ('sundar', 0.005861571143503475), ('time', 0.0053683028373453703), ('covfefe', 0.004655631871490669), ('post', 0.0045941368111045927), ('primitive', 0.0043745383636073143)]
[(31, 0.35374535574370325), (34, 0.021266444929492742), (41, 0.039408941133509408), (54, 0.42395341254988023), (65, 0.084192898210249598), (74, 0.037472825480813571), (93, 0.021454509581439955)]
[('iot', 0.023172238091545328), ('devices',

## Create an HDP (Hierarchical Dirichlet process) Model

In [54]:
hdp = models.HdpModel(corpus, id2word=dictionary)
hdp_corpus = hdp[corpus]

In [162]:
for doc in hdp_corpus:
    print(doc)
    print(hdp.show_topic(doc[0][0], num_words=10))

[(3, 0.99767297789960163)]
[('intercept', 0.013143543835143517), ('election', 0.011079615775221754), ('russian', 0.0094829081793543057), ('winner', 0.0094267346425895154), ('nsa', 0.0079487446188197843), ('document', 0.0079037260051970266), ('security', 0.0069102531355166282), ('report', 0.0068579600079323134), ('classified', 0.0067405413927190188), ('hacking', 0.0066115326360728982)]
[(5, 0.99678078877525922)]
[('news', 0.0078901302638375095), ('information', 0.0077918293635574794), ('social', 0.0075452027961061315), ('fomo', 0.0060413954864230977), ('media', 0.0060281325447939203), ('sundar', 0.0046609650546366082), ('kross', 0.0046191980186413139), ('post', 0.0036502815378919294), ('university', 0.0036015110184239468), ('brains', 0.003482831667659103)]
[(1, 0.99893816413484815)]
[('iot', 0.041059392047090873), ('devices', 0.031903726138331291), ('security', 0.02476244401667263), ('standards', 0.010934651589103124), ('moyer', 0.0075722320931523096), ('industry', 0.0074120329042859871