### Creating a Dictionary and Corpus using Gensim

In [2]:
!pip install gensim

Collecting gensim
  Downloading https://files.pythonhosted.org/packages/09/ed/b59a2edde05b7f5755ea68648487c150c7c742361e9c8733c6d4ca005020/gensim-3.8.1-cp37-cp37m-win_amd64.whl (24.2MB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/37/c0/25d19badc495428dec6a4bf7782de617ee0246a9211af75b302a2681dea7/smart_open-1.8.4.tar.gz (63kB)
Collecting boto3 (from smart-open>=1.8.1->gensim)
  Downloading https://files.pythonhosted.org/packages/1d/84/d7c12ecefe2963706d984e47ee1b3f21929df289fc486c36ea17b6f5bdef/boto3-1.9.249-py2.py3-none-any.whl (128kB)
Collecting s3transfer<0.3.0,>=0.2.0 (from boto3->smart-open>=1.8.1->gensim)
  Downloading https://files.pythonhosted.org/packages/16/8a/1fc3dba0c4923c2a76e1ff0d52b305c44606da63f718d14d3231e21c51b0/s3transfer-0.2.1-py2.py3-none-any.whl (70kB)
Collecting jmespath<1.0.0,>=0.7.1 (from boto3->smart-open>=1.8.1->gensim)
  Downloading https://files.pythonhosted.org/packages/83/94/7179c3832a6d45b266ddb2aac329e

In [5]:
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize

In [6]:
# Toy corpus: five short sentences used to build the dictionary and
# the bag-of-words vectors in the cells that follow.
docs = [
    'Hello Everyone, I am creating Dictionary',
    'How are you all. Hows the work',
    'I am not liking my time mismanagement',
    'I have to work harder',
    'Its me, ME, ME, ME',
]

In [9]:
# Lowercase every document first, then split each one into tokens.
tokenized_docs = list(map(word_tokenize, map(str.lower, docs)))

In [10]:
# Inspect the tokenized documents: one list of lowercase tokens per document,
# with punctuation marks such as ',' and '.' kept as separate tokens.
tokenized_docs

[['hello', 'everyone', ',', 'i', 'am', 'creating', 'dictionary'],
 ['how', 'are', 'you', 'all', '.', 'hows', 'the', 'work'],
 ['i', 'am', 'not', 'liking', 'my', 'time', 'mismanagement'],
 ['i', 'have', 'to', 'work', 'harder'],
 ['its', 'me', ',', 'me', ',', 'me', ',', 'me']]

In [11]:
# Build the vocabulary: every distinct token across the tokenized documents
# is assigned an integer id.
dictionary = Dictionary(documents=tokenized_docs)

In [14]:
# Print a short summary of the dictionary (number of unique tokens and a
# sample of them).
print(dictionary)

Dictionary(25 unique tokens: [',', 'am', 'creating', 'dictionary', 'everyone']...)


In [15]:
# Show the token -> integer id mapping stored in the dictionary.
dictionary.token2id

{',': 0,
 'am': 1,
 'creating': 2,
 'dictionary': 3,
 'everyone': 4,
 'hello': 5,
 'i': 6,
 '.': 7,
 'all': 8,
 'are': 9,
 'how': 10,
 'hows': 11,
 'the': 12,
 'work': 13,
 'you': 14,
 'liking': 15,
 'mismanagement': 16,
 'my': 17,
 'not': 18,
 'time': 19,
 'harder': 20,
 'have': 21,
 'to': 22,
 'its': 23,
 'me': 24}

In [16]:
# Convert each tokenized document into its bag-of-words representation
# using the ids assigned by `dictionary`.
corpus = list(map(dictionary.doc2bow, tokenized_docs))

In [17]:
# Inspect the bag-of-words corpus: each document is a list of
# (token_id, count) pairs, e.g. (24, 4) means token id 24 occurs four times.
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)],
 [(1, 1), (6, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)],
 [(6, 1), (13, 1), (20, 1), (21, 1), (22, 1)],
 [(0, 3), (23, 1), (24, 4)]]

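### TF-IDF
Term frequency–inverse document frequency (TF-IDF) weights each token by how often it occurs in a document, scaled down by how many documents in the corpus contain it.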

In [18]:
from gensim.models.tfidfmodel import TfidfModel

In [19]:
# Fit a TF-IDF model on the bag-of-words corpus built above.
tfidf = TfidfModel(corpus=corpus)

In [22]:
# Apply the fitted model to the first document's bag-of-words vector; the
# result is a list of (token_id, weight) pairs for that document.
tfidf[corpus[0]]

[(0, 0.2612510893013631),
 (1, 0.2612510893013631),
 (2, 0.45887990913790905),
 (3, 0.45887990913790905),
 (4, 0.45887990913790905),
 (5, 0.45887990913790905),
 (6, 0.14564564063520627)]