In [46]:
import gensim
from gensim import corpora
from pprint import pprint
from gensim import models
import numpy as np
%matplotlib inline

In [16]:
# First batch of raw text: four consecutive fragments of one news sentence,
# each fragment treated as its own "document"
documents = [
    "The Saudis are preparing a report that will acknowledge that",
    "Saudi journalist Jamal Khashoggi's death was the result of an",
    "interrogation that went wrong, one that was intended to lead",
    "to his abduction from Turkey, according to two sources.",
]

In [17]:
# Tokenize every sentence on whitespace
# The result is a list of word lists, one inner list per sentence,
# e.g. [['word1', 'word2'], ['word1', 'word2']]
texts = [sentence.split() for sentence in documents]

In [18]:
# Creates a GenSim dictionary from the tokenized sentences
# The dictionary maps each unique word (token) to an integer id
# (the bag-of-words counts themselves are built later with .doc2bow())
dictionary = corpora.Dictionary(texts)
# Printing the dictionary shows how many unique tokens it holds and a sample of them
print(dictionary)

Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


In [19]:
# In a GenSim dictionary, each unique word is assigned a unique integer id
# .token2id is the mapping from each word to its id
print(dictionary.token2id)

{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32}


In [20]:
# A second batch of documents, tokenized on whitespace like the first batch,
# prepared here so it can be added to the dictionary afterwards
documents2 = [
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]
texts2 = [sentence.split() for sentence in documents2]

In [30]:
# Show the tokenized second batch of documents
texts2

[['The', 'intersection', 'graph', 'of', 'paths', 'in', 'trees'],
 ['Graph',
  'minors',
  'IV',
  'Widths',
  'of',
  'trees',
  'and',
  'well',
  'quasi',
  'ordering'],
 ['Graph', 'minors', 'A', 'survey']]

In [26]:
# Add the second batch of tokenized documents to the existing dictionary
# (this mutates `dictionary` in place)
# NOTE(review): re-running this cell presumably feeds the same documents in again
# and inflates the dictionary's document counts — confirm against the gensim docs
dictionary.add_documents(texts2)
# The unique-token count printed here now covers both batches
print(dictionary)

Dictionary(48 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


In [31]:
# Converting the tokenized documents into a bag-of-words corpus

# Combining both batches of tokenized documents into a single list
combined_text = texts + texts2

# Creating a corpus
# .doc2bow() turns one tokenized document into a list of (token_id, count) pairs
# allow_update is left at its default (False): every token is already in the
# dictionary, and allow_update=True would count each document a second time
# in the dictionary's statistics (num_docs, dfs)
corpus = [dictionary.doc2bow(doc) for doc in combined_text]

In [33]:
# Interpreting the corpus
# Each inner list [...] is one sentence/document
# In each (n1, n2) pair, n1 is the unique numerical id of a word
# and n2 is the number of times that word appears in the sentence/document
pprint(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1)],
 [(9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1)],
 [(7, 2),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1)],
 [(23, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)],
 [(1, 1), (15, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1)],
 [(15, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1)],
 [(38, 1), (42, 1), (46, 1), (47, 1)]]


In [36]:
# Displaying the corpus with every numerical id replaced by its word,
# giving (word, occurrences) pairs for each sentence/document
count = [
    [(dictionary[token_id], n_occurrences) for token_id, n_occurrences in bow]
    for bow in corpus
]
pprint(count)

[[('Saudis', 1),
  ('The', 1),
  ('a', 1),
  ('acknowledge', 1),
  ('are', 1),
  ('preparing', 1),
  ('report', 1),
  ('that', 2),
  ('will', 1)],
 [('Jamal', 1),
  ("Khashoggi's", 1),
  ('Saudi', 1),
  ('an', 1),
  ('death', 1),
  ('journalist', 1),
  ('of', 1),
  ('result', 1),
  ('the', 1),
  ('was', 1)],
 [('that', 2),
  ('was', 1),
  ('intended', 1),
  ('interrogation', 1),
  ('lead', 1),
  ('one', 1),
  ('to', 1),
  ('went', 1),
  ('wrong,', 1)],
 [('to', 2),
  ('Turkey,', 1),
  ('abduction', 1),
  ('according', 1),
  ('from', 1),
  ('his', 1),
  ('sources.', 1),
  ('two', 1)],
 [('The', 1),
  ('of', 1),
  ('graph', 1),
  ('in', 1),
  ('intersection', 1),
  ('paths', 1),
  ('trees', 1)],
 [('of', 1),
  ('trees', 1),
  ('Graph', 1),
  ('IV', 1),
  ('Widths', 1),
  ('and', 1),
  ('minors', 1),
  ('ordering', 1),
  ('quasi', 1),
  ('well', 1)],
 [('Graph', 1), ('minors', 1), ('A', 1), ('survey', 1)]]


In [50]:
# Training the tfidf model on the bag-of-words corpus
# smartirs is a three-letter SMART code: term-frequency weighting,
# document-frequency weighting, then normalization
# 'n' = raw (natural) term frequency, 't' = idf-based document-frequency weighting,
# 'c' = cosine normalization
# NOTE(review): the exact idf variant behind 't' depends on the gensim version — confirm in the TfidfModel docs
tfidf = models.TfidfModel(corpus, smartirs='ntc')

In [52]:
# Print the tf-idf weight of every word, one sentence/document at a time
# The more sentences/documents a word appears in, the smaller its weight
# If a word appears in all sentences/documents, it's removed altogether
for sentence in tfidf[corpus]:
    pprint([
        [dictionary[token_id], np.around(weight, decimals=2)]
        for token_id, weight in sentence
    ])

[['Saudis', 0.33],
 ['The', 0.22],
 ['a', 0.33],
 ['acknowledge', 0.33],
 ['are', 0.33],
 ['preparing', 0.33],
 ['report', 0.33],
 ['that', 0.44],
 ['will', 0.33]]
[['Jamal', 0.34],
 ["Khashoggi's", 0.34],
 ['Saudi', 0.34],
 ['an', 0.34],
 ['death', 0.34],
 ['journalist', 0.34],
 ['of', 0.16],
 ['result', 0.34],
 ['the', 0.34],
 ['was', 0.23]]
[['that', 0.45],
 ['was', 0.23],
 ['intended', 0.34],
 ['interrogation', 0.34],
 ['lead', 0.34],
 ['one', 0.34],
 ['to', 0.23],
 ['went', 0.34],
 ['wrong,', 0.34]]
[['to', 0.45],
 ['Turkey,', 0.34],
 ['abduction', 0.34],
 ['according', 0.34],
 ['from', 0.34],
 ['his', 0.34],
 ['sources.', 0.34],
 ['two', 0.34]]
[['The', 0.29],
 ['of', 0.21],
 ['graph', 0.44],
 ['in', 0.44],
 ['intersection', 0.44],
 ['paths', 0.44],
 ['trees', 0.29]]
[['of', 0.17],
 ['trees', 0.24],
 ['Graph', 0.24],
 ['IV', 0.36],
 ['Widths', 0.36],
 ['and', 0.36],
 ['minors', 0.24],
 ['ordering', 0.36],
 ['quasi', 0.36],
 ['well', 0.36]]
[['Graph', 0.39], ['minors', 0.39], ['A'