In [9]:
#https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.XpsAFatKhQI
import pandas as pd
 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
 
docs=["the house had a tiny little mouse",
      "the cat saw the mouse",
      "the mouse ran away from the house",
      "the cat finally ate the mouse",
      "the end of the mouse story"
     ]

In [3]:
#instantiate CountVectorizer()
cv=CountVectorizer()
 
# this steps generates word counts for the words in your docs
word_count_vector=cv.fit_transform(docs)
word_count_vector.shape

(5, 16)

In [8]:
cv.get_feature_names()

['ate',
 'away',
 'cat',
 'end',
 'finally',
 'from',
 'had',
 'house',
 'little',
 'mouse',
 'of',
 'ran',
 'saw',
 'story',
 'the',
 'tiny']

In [5]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [7]:
# print idf values
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=["idf_weights"])
 
# sort ascending
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
mouse,1.0
the,1.0
cat,1.693147
house,1.693147
ate,2.098612
away,2.098612
end,2.098612
finally,2.098612
from,2.098612
had,2.098612


In [13]:
 # count matrix
count_vector=cv.transform(docs)
 
# tf-idf scores
tf_idf_vector=tfidf_transformer.transform(count_vector)
tf_idf_vector

<5x16 sparse matrix of type '<class 'numpy.float64'>'
	with 26 stored elements in Compressed Sparse Row format>

In [12]:
feature_names = cv.get_feature_names()
 
#get tfidf vector for first document
first_document_vector=tf_idf_vector[0]
 
#print the scores
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
had,0.493562
little,0.493562
tiny,0.493562
house,0.398203
mouse,0.235185
the,0.235185
ate,0.0
away,0.0
cat,0.0
end,0.0


In [14]:
#simpler method with this way
from sklearn.feature_extraction.text import TfidfVectorizer 
 
# settings that you use for count vectorizer will go here
tfidf_vectorizer=TfidfVectorizer(use_idf=True)
 
# just send in all your docs here
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)
#or two steps
#tfidf_vectorizer=TfidfVectorizer(use_idf=True)
 
# just send in all your docs here
#fitted_vectorizer=tfidf_vectorizer.fit(docs)
#tfidf_vectorizer_vectors=fitted_vectorizer.transform(docs)
 
    
# get the first vector out (for the first document)
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]
 
# place tf-idf values in a pandas data frame
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
had,0.493562
little,0.493562
tiny,0.493562
house,0.398203
mouse,0.235185
the,0.235185
ate,0.0
away,0.0
cat,0.0
end,0.0


In [17]:
first_vector_tfidfvectorizer.T.todense()

matrix([[0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.49356209],
        [0.39820278],
        [0.49356209],
        [0.23518498],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.23518498],
        [0.49356209]])

In [19]:
from sklearn.feature_extraction.text import HashingVectorizer

# create the transform
vectorizer = HashingVectorizer(n_features=20)
# encode document
vector = vectorizer.transform(docs)
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

(5, 20)
[[ 0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.5        -0.5         0.          0.
  -0.5         0.5       ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.37796447  0.          0.         -0.37796447  0.          0.
  -0.75592895  0.37796447]
 [ 0.          0.          0.57735027  0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.57735027  0.          0.          0.
  -0.57735027  0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.35355339  0.          0.35355339
   0.35355339  0.          0.          0.          0.          0.
  -0.70710678  0.35355339]
 [ 0.          0.          0.         -0.35355339  0.          0.35355339
   0.     

In [21]:
vector.T.todense()

matrix([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.57735027,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        , -0.35355339],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.35355339],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.35355339,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.35355339,  0.        ],
        [ 0.        ,  0.37796447,  0.        ,  0.35355339,  0.35355339],
        [ 0.        ,  0.

In [27]:
vectorizer.stop_words

In [32]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.decomposition import TruncatedSVD

from sklearn.cluster import KMeans

In [30]:
hasher = HashingVectorizer(n_features=20,
                                   stop_words='english', alternate_sign=False,
                                   norm=None)
vectorizer = make_pipeline(hasher, TfidfTransformer())

In [31]:
X = vectorizer.fit_transform(docs)

In [35]:
from time import time
t0 = time()
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(10)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X)

print("done in %fs" % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

done in 0.336343s
Explained variance of the SVD step: 100%


In [44]:
km = KMeans(n_clusters=4, init='k-means++', max_iter=100, n_init=1,verbose=True)

In [46]:
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

#print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
#print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
#print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
#print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
#print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=4, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=True)
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 0.5078668977190479
start iteration
done sorting
end inner loop
Iteration 1, inertia 0.5078668977190479
center shift 0.000000e+00 within tolerance 1.222305e-05
done in 0.007s


