In [1]:
import gensim
from gensim import corpora, models, similarities
import os
import numpy as np

#### Load some data

In [2]:
# Both files are loaded below, so require the pair up front: checking only
# 'vocab.dict' would let a missing 'corpus.mm' fail inside the MmCorpus load
# instead of producing the hint printed in the else branch.
required_files = ('vocab.dict', 'corpus.mm')
if all(os.path.isfile(path) for path in required_files):
    dictionary = corpora.Dictionary.load('vocab.dict')
    corpus = corpora.MmCorpus('corpus.mm')
    print("Used files generated from first tutorial")
else:
    # NOTE: `dictionary` and `corpus` stay undefined on this path, so the
    # cells below cannot run until the first tutorial has produced the files.
    print("Please run first tutorial to generate data set")

Used files generated from first tutorial


#### Tfidf model 

In [3]:
# Step 1 -- initialize a model from the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=corpus)

In [4]:
# Step 2 -- use the model to transform a single bag-of-words vector.
doc_bow = [(0, 1), (1, 1)]
doc_tfidf = tfidf[doc_bow]
print(doc_tfidf)

[(0, 0.7071067811865475), (1, 0.7071067811865475)]


Apply the model to the entire corpus

In [5]:
# Transform every document in the corpus, then display the first document.
corpus_tfidf = tfidf[corpus]
first_doc_tfidf = corpus_tfidf[0]
first_doc_tfidf

[(0, 0.39510679503439006),
 (1, 0.39510679503439006),
 (2, 0.270464478621662),
 (3, 0.39510679503439006),
 (4, 0.270464478621662),
 (5, 0.270464478621662),
 (6, 0.39510679503439006),
 (7, 0.39510679503439006)]

#### Compare gensim's TF-IDF with a custom implementation

In [6]:
from tf_idf_model import bow_extractor,display_features,build_df,build_idf,build_idf_diag_matrix
from tf_idf_model import calculate_tfidf

In [7]:
## corpus2dense returns one document per column; transpose the result so that
## rows are documents and columns are terms (the usual document-term layout).
tf = gensim.matutils.corpus2dense(
    corpus,
    num_terms=len(dictionary),
).T

In [8]:
tf[0]  ## row of the first document in the document-term matrix

array([1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [9]:
## If we use the same definition of tf-idf as gensim, we get the same results.
## Be very careful here: there are many variations of tf-idf and, especially
## when the data set is small, they can make a big difference.
## NOTE: this cell rebinds `tfidf` from the gensim model created earlier to a
## dense array, so cells that use the model must be run before this one.
shape = tf.shape
df = build_df(tf)
idf = build_idf(corpus,df)
## Expand idf into a matrix with the same shape as tf so the two can be
## multiplied element-wise.
idf_matrix = np.repeat(idf,shape[0],axis=0).reshape(shape[1],shape[0]).T
tfidf = np.multiply(tf,idf_matrix)
norms = np.linalg.norm(tfidf,axis=1) ## L2 norm of each row (one per document)
## A row whose weights are all zero has norm 0, and dividing by it would fill
## the row with NaN. Divide such rows by 1 instead so they stay all-zero.
safe_norms = np.where(norms > 0, norms, 1.0)
tfidf = tfidf/safe_norms[:,None]

In [10]:
tfidf[0]  ## normalized tf-idf row of the first document

array([0.3951068 , 0.3951068 , 0.27046448, 0.3951068 , 0.27046448,
       0.27046448, 0.3951068 , 0.3951068 , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])