# Gensim Tutorial

In [3]:
import os
from gensim import corpora, models, similarities
from gensim.utils import lemmatize

## Load all the files from the `data/wired` folder

In [4]:
docs = []
stop_words = set()

with open('./data/lists/stopwords.txt') as f:
    stop_words = [line.rstrip().lower() for line in f]
    
illegal_chars = [word for word in stop_words if len(word) == 1 and not word.isalnum()]

for fname in os.listdir('./data/wired'):
    if fname.endswith('.txt'):
        with open('./data/wired/{}'.format(fname)) as f:
            # split words on spaces
            words = f.read().lower().split(' ')
            # ignore illegal terms
            words = [''.join([c for c in w if c not in illegal_chars]) for w in words] 
            # only take words with two or more chars that aren't stop words
            words = [w for w in words if len(w) > 1 and w not in stop_words]
            docs += [words]

## Create a dictionary and bag of words corpus

In [5]:
dictionary = corpora.Dictionary(docs) # dictionary for the entire corpus
corpus = [dictionary.doc2bow(doc) for doc in docs] # word count vectors (bag of words)

# saving dictionary and corpus to disk
dictionary.save('./data/wired/models/dictionary.dict') 
corpora.MmCorpus.serialize('./data/wired/models/corpus.mm', corpus)

## Create the TFIDF model

In [6]:
# create tfidf representation
mm = corpora.MmCorpus('./data/wired/models/corpus.mm')
tfidf = models.TfidfModel(mm)
corpus_tfidf = tfidf[corpus]

## Create an LSI (Latent Semantic Indexing) Model (uses fold-in method)

In [7]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
corpus_lsi = lsi[corpus_tfidf]
lsi.print_topics(100)

[(0,
  '-0.230*"iot" + -0.207*"intercept" + -0.182*"cashatt" + -0.182*"russia" + -0.182*"putin" + -0.150*"russian" + -0.136*"men" + -0.134*"hotel" + -0.124*"nsa" + -0.122*"devices"'),
 (1,
  '-0.255*"men" + 0.181*"russia" + 0.181*"putin" + -0.164*"blue" + 0.163*"intercept" + -0.148*"cashatt" + 0.130*"russian" + -0.111*"jobs" + -0.109*"hotel" + -0.102*"plants"'),
 (2,
  '-0.253*"men" + 0.211*"iot" + 0.183*"orwl" + 0.115*"cashatt" + 0.111*"devices" + -0.107*"jobs" + 0.092*"dedicated" + 0.092*"crafty" + 0.092*"os" + 0.092*"options"'),
 (3,
  '-0.294*"blue" + -0.286*"iot" + -0.184*"plants" + -0.147*"weeds" + -0.147*"crops" + -0.147*"devices" + 0.130*"fomo" + -0.121*"river" + -0.110*"rivers" + -0.110*"farmers"'),
 (4,
  '-0.628*"iot" + -0.325*"devices" + 0.169*"blue" + -0.167*"standards" + -0.112*"moyer" + 0.105*"plants" + 0.084*"weeds" + 0.084*"crops" + 0.081*"russia" + 0.081*"putin"'),
 (5,
  '0.364*"men" + -0.213*"cashatt" + -0.157*"hotel" + 0.154*"jobs" + 0.130*"automation" + 0.130*"lab

In [8]:
# Which topic does each document relate to the most?
for doc in corpus_lsi:
    print(doc)

[(0, -0.5552031501777358), (1, 0.3989667036627576), (2, -0.003291503346408224), (3, 0.016767955718257607), (4, 0.18776255635391356), (5, 0.099269751001368048), (6, -0.10099620921643927), (7, -0.690632708289595)]
[(0, -0.34409806533206955), (1, -0.1837499223892976), (2, -0.33707602300979284), (3, 0.5537645557499713), (4, -0.16373969952145509), (5, -0.33320793451961511), (6, 0.53798576208732019), (7, -0.017034140093194976)]
[(0, -0.33982407401618564), (1, -0.017054552516311106), (2, 0.28025766165927984), (3, -0.37614147118747315), (4, -0.80972739004038874), (5, -0.054509610438913191), (6, 0.072270269758792208), (7, -0.018688017253660011)]
[(0, -0.20775235326151448), (1, -0.38991815974892235), (2, -0.16931514242368922), (3, -0.68427142194994295), (4, 0.38478472139335346), (5, -0.27985567045638066), (6, 0.27929809940447259), (7, -0.05934547687618185)]
[(0, -0.2772748258090495), (1, -0.47498574366306928), (2, -0.46529580468077997), (3, 0.01298990656054071), (4, -0.11678256818346262), (5, 0.

## Create an LDA (Latent Dirichlet Allocation) Model

In [9]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=100)
lda_corpus = lda[corpus]

In [10]:
lda.print_topics()

[(81,
  '0.012*"hotel" + 0.010*"cashatt" + 0.006*"brocious" + 0.005*"hed" + 0.005*"lock" + 0.005*"cashatts" + 0.004*"room" + 0.004*"locks" + 0.004*"onity" + 0.003*"device"'),
 (87,
  '0.019*"blue" + 0.013*"technology" + 0.011*"plants" + 0.011*"river" + 0.009*"cashatt" + 0.008*"equipment" + 0.007*"weeds" + 0.007*"crops" + 0.006*"herbicide" + 0.006*"stone"'),
 (40,
  '0.011*"cashatt" + 0.009*"hotel" + 0.007*"lock" + 0.005*"brocious" + 0.005*"onity" + 0.005*"room" + 0.004*"device" + 0.004*"cashatts" + 0.004*"hed" + 0.004*"found"'),
 (3,
  '0.013*"men" + 0.011*"work" + 0.007*"jobs" + 0.005*"problem" + 0.005*"cashatt" + 0.005*"automation" + 0.004*"hotel" + 0.004*"robots" + 0.004*"pay" + 0.004*"unpaid"'),
 (85,
  '0.013*"information" + 0.010*"news" + 0.009*"media" + 0.009*"social" + 0.007*"sundar" + 0.007*"fomo" + 0.005*"anxiety" + 0.005*"kross" + 0.005*"hits" + 0.004*"tweet"'),
 (2,
  '0.010*"cashatt" + 0.008*"hotel" + 0.004*"room" + 0.004*"brocious" + 0.004*"lock" + 0.004*"found" + 0.004*"

In [43]:
for doc in lda_corpus:
    print(doc)
    print(lda.show_topic(doc[0][0], 3))

[(57, 0.99742857142857244)]
[('election', 0.014680058284148147), ('intercept', 0.014102742441108195), ('russian', 0.012775778624936516)]
[(82, 0.063892365814885346), (85, 0.93281904358108825)]
[('men', 0.019015486696988257), ('work', 0.017925408567381611), ('jobs', 0.011175940573869553)]
[(82, 0.010693882592706393), (83, 0.98184953154770427)]
[('men', 0.019015486696988257), ('work', 0.017925408567381611), ('jobs', 0.011175940573869553)]
[(87, 0.99561946902654919)]
[('blue', 0.018985245748879054), ('technology', 0.012859526059707202), ('plants', 0.010770000441417861)]
[(82, 0.99750629722921957)]
[('men', 0.019015486696988257), ('work', 0.017925408567381611), ('jobs', 0.011175940573869553)]
[(98, 0.99715517241379303)]
[('putin', 0.014766648311347006), ('russia', 0.01295154513233096), ('election', 0.0072804295684530179)]
[(7, 0.31616016003473535), (13, 0.19256707074671789), (21, 0.1516320927553334), (25, 0.20319214478593445), (57, 0.040046588546529656), (82, 0.047025516694510508), (83, 0.

## Create an HDP (Hierarchical Dirichlet process) Model

In [12]:
hdp = models.HdpModel(corpus, id2word=dictionary)

In [13]:
hdp_corpus = hdp[corpus]

In [16]:
len(hdp.print_topics())

20

In [59]:
for doc in hdp_corpus:
    print(doc)

[(3, 0.99767324789440082)]
[(5, 0.99678104064091966)]
[(1, 0.99893826492628346)]
[(6, 0.99567487712287894)]
[(2, 0.99787685871067133)]
[(4, 0.99731896388285901)]
[(0, 0.99975584069053314)]
[(7, 0.98603970056616019)]


In [44]:
hdp.show_topics(20, 5)

[(0,
  '0.018*cashatt + 0.013*hotel + 0.009*onity + 0.008*brocious + 0.007*lock'),
 (1,
  '0.041*iot + 0.032*devices + 0.025*security + 0.011*standards + 0.008*moyer'),
 (2, '0.018*men + 0.017*work + 0.012*jobs + 0.008*problem + 0.007*automation'),
 (3,
  '0.013*intercept + 0.011*election + 0.010*winner + 0.009*russian + 0.008*nsa'),
 (4,
  '0.013*russia + 0.013*putin + 0.006*hackers + 0.006*election + 0.006*russias'),
 (5,
  '0.008*news + 0.008*social + 0.008*information + 0.006*fomo + 0.006*media'),
 (6,
  '0.014*blue + 0.011*technology + 0.009*plants + 0.009*river + 0.007*weeds'),
 (7,
  '0.005*orwl + 0.005*data + 0.004*interior + 0.003*accelerometer + 0.003*password'),
 (8,
  '0.002*bulked + 0.002*florida + 0.002*badge + 0.002*loose + 0.002*dresser'),
 (9,
  '0.003*defeated + 0.003*wink + 0.003*toaster + 0.003*protection + 0.002*include'),
 (10,
  '0.002*commercial + 0.002*newsbreak + 0.002*bad + 0.002*vc + 0.002*devastated'),
 (11,
  '0.003*tore + 0.003*common + 0.003*image + 0.00