In [2]:
from nltk import word_tokenize
from nltk.corpus import stopwords
import re 

STOPWORDS = stopwords.words('english')


In [3]:
data = ['Romeo and Juliet',
        'Juliet: O happy dagger!',
        'Romeo died by dagger.',
        '“Live free or die”, that’s the New-Hampshire’s motto.',
        'Did you know, New-Hampshire is in New-England.']

In [4]:
def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text

# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = []
for text in data:
    tokenized_data.append(clean_text(text))

In [5]:
tokenized_data

[['romeo', 'juliet'],
 ['juliet', 'happy', 'dagger'],
 ['romeo', 'died', 'dagger'],
 ['live', 'free', 'die', 'new-hampshire', 'motto'],
 ['know', 'new-hampshire', 'new-england']]

In [8]:
from gensim import corpora, models

In [10]:
# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)
print(dictionary)

Dictionary(12 unique tokens: ['juliet', 'romeo', 'dagger', 'happy', 'died']...)


In [11]:
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

In [12]:
print(corpus)

[[(0, 1), (1, 1)], [(0, 1), (2, 1), (3, 1)], [(1, 1), (2, 1), (4, 1)], [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(9, 1), (10, 1), (11, 1)]]


In [21]:
[print(v) for v in dictionary.values()]

juliet
romeo
dagger
happy
died
die
free
live
motto
new-hampshire
know
new-england


[None, None, None, None, None, None, None, None, None, None, None, None]

In [13]:
NUM_TOPICS = 2
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)
print("LSI Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))

LSI Model:
Topic #0: 0.562*"new-hampshire" + 0.397*"free" + 0.397*"die" + 0.397*"motto" + 0.397*"live" + 0.164*"know" + 0.164*"new-england" + -0.000*"dagger" + -0.000*"happy" + 0.000*"juliet"
Topic #1: -0.577*"dagger" + -0.500*"romeo" + -0.500*"juliet" + -0.289*"died" + -0.289*"happy" + -0.000*"die" + -0.000*"live" + -0.000*"motto" + -0.000*"free" + -0.000*"new-hampshire"


In [15]:
lsi_model.show_topic(0)

[('new-hampshire', 0.561516668266344),
 ('free', 0.39705224388040894),
 ('die', 0.39705224388040894),
 ('motto', 0.3970522438804089),
 ('live', 0.3970522438804089),
 ('know', 0.164464424385935),
 ('new-england', 0.16446442438593498),
 ('dagger', -5.499073418846479e-16),
 ('happy', -4.0592529337857296e-16),
 ('juliet', 1.7000290064572714e-16)]