# Gensim Tutorial

In [4]:
import os
from gensim import corpora, models, similarities
from gensim.parsing.porter import PorterStemmer
from gensim.utils import lemmatize

## Load all the files from the `data/wired` folder

In [21]:
docs = []
stop_words = set()
stemmer = PorterStemmer()

with open('./data/lists/stopwords.txt') as f:
    stop_words = [line.rstrip().lower() for line in f]

illegal_chars = [word for word in stop_words if len(word) == 1 and not word.isalnum()]

for fname in os.listdir('./data/wired'):
    if fname.endswith('.txt'):
        with open('./data/wired/{}'.format(fname)) as f:
            words = f.read().lower().split(' ') # split words on spaces
            words = [w for w in words if w not in stop_words] # take only words not in stop words file
            words = [''.join([c for c in w if c not in illegal_chars]) for w in words] # ignore illegal terms
            words = [w for w in words if len(w) > 1] # only take words with two or more chars
            docs += [words]

## Create a dictionary and bag of words corpus

In [22]:
dictionary = corpora.Dictionary(docs) # dictionary for the entire corpus
corpus = [dictionary.doc2bow(doc) for doc in docs] # word count vectors (bag of words)

# saving dictionary and corpus to disk
dictionary.save('./data/wired/models/dictionary.dict') 
corpora.MmCorpus.serialize('./data/wired/models/corpus.mm', corpus)

## Create the TFIDF model

In [23]:
# create tfidf representation
mm = corpora.MmCorpus('./data/wired/models/corpus.mm')
tfidf = models.TfidfModel(mm)
corpus_tfidf = tfidf[corpus]

## Create an LSI (Latent Semantic Indexing) Model (uses fold-in method)

In [24]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
corpus_lsi = lsi[corpus_tfidf]
lsi.print_topics(100)

[(0,
  '0.223*"iot" + 0.193*"cashatt" + 0.165*"intercept" + 0.164*"russia" + 0.164*"putin" + 0.157*"men" + 0.142*"hotel" + 0.126*"russian" + 0.118*"devices" + 0.099*"nsa"'),
 (1,
  '0.238*"men" + -0.201*"intercept" + -0.168*"russia" + -0.168*"putin" + -0.144*"russian" + 0.144*"blue" + -0.121*"nsa" + 0.111*"cashatt" + 0.103*"jobs" + -0.101*"complaint"'),
 (2,
  '-0.207*"men" + 0.196*"orwl" + 0.145*"iot" + 0.132*"plus" + 0.098*"drill" + 0.098*"dedicated" + 0.098*"motherboard" + 0.098*"thanks" + 0.098*"impervious" + 0.098*"bluetooth"'),
 (3,
  '0.342*"iot" + 0.277*"blue" + 0.176*"devices" + 0.173*"plants" + 0.138*"weeds" + 0.138*"crops" + -0.122*"fomo" + 0.114*"river" + 0.104*"herbicide" + 0.104*"deere"'),
 (4,
  '-0.579*"iot" + -0.299*"devices" + 0.207*"blue" + -0.154*"standards" + 0.129*"plants" + -0.127*"men" + 0.103*"weeds" + 0.103*"crops" + -0.103*"moyer" + 0.087*"river"'),
 (5,
  '-0.381*"men" + -0.162*"jobs" + 0.150*"iot" + -0.136*"labor" + -0.136*"automation" + -0.132*"intercept" 

In [25]:
# Which topic does each document relate to the most?
for doc in corpus_lsi:
    print(doc)

[(0, 0.46405693080986837), (1, -0.4975223526943412), (2, 0.054809208300446131), (3, -0.097889236748156791), (4, 0.20471033493426063), (5, -0.30788549940249527), (6, 0.084769529301734731), (7, 0.61696891857037361)]
[(0, 0.3823846180616261), (1, 0.19641391716682063), (2, -0.26788404084227152), (3, -0.55304109527437872), (4, -0.1302077979394439), (5, 0.45887019528461526), (6, -0.42435470620029542), (7, 0.17318897532368149)]
[(0, 0.34732985151243745), (1, -0.078430771785807304), (2, 0.19528912841816742), (3, 0.45720100406162045), (4, -0.75829118166089426), (5, 0.1937249130383327), (6, 0.019304057891772912), (7, 0.11459704064536227)]
[(0, 0.26900319868311517), (1, 0.34487934077107757), (2, -0.23165803282949635), (3, 0.65036755017874248), (4, 0.47677980667598296), (5, 0.21621144570108231), (6, -0.15822961467530444), (7, 0.18151898149007936)]
[(0, 0.33868403972744382), (1, 0.45085439915079017), (2, -0.38749612480819379), (3, -0.044861004070406975), (4, -0.23038179415392107), (5, -0.6815818224

## Create an LDA (Latent Dirichlet Allocation) Model

In [30]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=100)
lda_corpus = lda[corpus]

In [31]:
lda.print_topics()

[(67,
  '0.007*"hotel" + 0.006*"cashatt" + 0.005*"one" + 0.005*"onity" + 0.003*"brocious" + 0.003*"lock" + 0.003*"says" + 0.003*"room" + 0.003*"first" + 0.003*"cashatts"'),
 (72,
  '0.012*"iot" + 0.012*"security" + 0.010*"devices" + 0.005*"standards" + 0.005*"said" + 0.005*"will" + 0.004*"industry" + 0.003*"hackers" + 0.003*"data" + 0.002*"system"'),
 (52,
  '0.008*"one" + 0.006*"social" + 0.006*"hotel" + 0.006*"cashatt" + 0.005*"it" + 0.005*"news" + 0.004*"information" + 0.004*"new" + 0.004*"says" + 0.004*"work"'),
 (63,
  '0.009*"cashatt" + 0.007*"hotel" + 0.006*"one" + 0.004*"lock" + 0.004*"brocious" + 0.004*"onity" + 0.003*"cashatts" + 0.003*"says" + 0.003*"first" + 0.003*"room"'),
 (21,
  '0.007*"one" + 0.007*"work" + 0.007*"cashatt" + 0.006*"security" + 0.005*"men" + 0.005*"iot" + 0.004*"hotel" + 0.004*"devices" + 0.004*"jobs" + 0.003*"will"'),
 (75,
  '0.007*"intercept" + 0.005*"hacking" + 0.004*"winner" + 0.004*"election" + 0.004*"says" + 0.004*"blue" + 0.004*"russian" + 0.004*

In [32]:
for doc in lda_corpus:
    print(doc)

[(78, 0.99771889400921709)]
[(52, 0.99737400530504094)]
[(62, 0.99889877641824287)]
[(31, 0.99623574144486926)]
[(14, 0.98472720677046877), (62, 0.013272793229532774)]
[(81, 0.99783842794759858)]
[(22, 0.20309028338479629), (31, 0.36674462850174222), (50, 0.01087922338610909), (52, 0.28083391818873826), (54, 0.015660999137945948), (62, 0.056011298372814808), (78, 0.03790224753998863), (81, 0.025965059178403704)]
[(50, 0.98835294117646766)]
