## Libraries

In [1]:
import pandas as pd
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Data

In [2]:
# Toy corpus: five short sentences, one document per list entry.
# The third document reads "the mouse ran away from the house" so that the
# corpus agrees with the outputs recorded throughout this notebook (the printed
# list, the count matrix where `house` occurs once in that row, and the
# DataFrame row labels). The earlier text "the house mouse ran away ..." would
# give a `house` count of 2 and different TF-IDF weights on a fresh run.
dataset = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]
print(dataset)

['the house had a tiny little mouse', 'the cat saw the mouse', 'the mouse ran away from the house', 'the cat finally ate the mouse', 'the end of the mouse story']


## Count Vectorizer

In [3]:
# Bag-of-words model: learn the vocabulary from the corpus and count term
# occurrences per document, dropping scikit-learn's built-in English stop words.
cv_model = CountVectorizer(stop_words='english')

# fit_transform learns the vocabulary and returns the document-term matrix in
# one pass; printing the sparse result lists each stored (row, column) entry
# with its count.
X = cv_model.fit_transform(raw_documents=dataset)
print(X)

  (0, 5)	1
  (0, 11)	1
  (0, 6)	1
  (0, 7)	1
  (1, 7)	1
  (1, 2)	1
  (1, 9)	1
  (2, 5)	1
  (2, 7)	1
  (2, 8)	1
  (2, 1)	1
  (3, 7)	1
  (3, 2)	1
  (3, 4)	1
  (3, 0)	1
  (4, 7)	1
  (4, 3)	1
  (4, 10)	1


In [4]:
# Mapping from each learned term to its column index in the count matrix.
cv_vocabulary = cv_model.vocabulary_
print(cv_vocabulary)

{'house': 5, 'tiny': 11, 'little': 6, 'mouse': 7, 'cat': 2, 'saw': 9, 'ran': 8, 'away': 1, 'finally': 4, 'ate': 0, 'end': 3, 'story': 10}


In [5]:
# Expand the sparse count matrix into a dense array (rows = documents,
# columns = vocabulary terms) so the zeros are visible too.
dense_counts = X.toarray()
print(dense_counts)

[[0 0 0 0 0 1 1 1 0 0 0 1]
 [0 0 1 0 0 0 0 1 0 1 0 0]
 [0 1 0 0 0 1 0 1 1 0 0 0]
 [1 0 1 0 1 0 0 1 0 0 0 0]
 [0 0 0 1 0 0 0 1 0 0 1 0]]


In [6]:
# Show the count matrix as a labelled table: one row per document, one column
# per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the terms in column
# order.
pd.DataFrame(X.toarray(),
             index=dataset,
             columns=cv_model.get_feature_names_out())

Unnamed: 0,ate,away,cat,end,finally,house,little,mouse,ran,saw,story,tiny
the house had a tiny little mouse,0,0,0,0,0,1,1,1,0,0,0,1
the cat saw the mouse,0,0,1,0,0,0,0,1,0,1,0,0
the mouse ran away from the house,0,1,0,0,0,1,0,1,1,0,0,0
the cat finally ate the mouse,1,0,1,0,1,0,0,1,0,0,0,0
the end of the mouse story,0,0,0,1,0,0,0,1,0,0,1,0


## TF-IDF Vectorizer

In [7]:
# TF-IDF model over the same corpus, again without English stop words.
# max_features caps the vocabulary at the 20 most frequent terms; the
# vocabulary printed further down has only 12 entries, so the cap does not
# appear to drop anything for this corpus.
tfidf_model = TfidfVectorizer(stop_words='english', max_features=20)

# NOTE: this rebinds X, replacing the count matrix assigned earlier in the
# notebook, so cells below that read X see the TF-IDF weights.
X = tfidf_model.fit_transform(raw_documents=dataset)
print(X)

  (0, 7)	0.2808823162882302
  (0, 6)	0.5894630806320427
  (0, 11)	0.5894630806320427
  (0, 5)	0.47557510189256375
  (1, 9)	0.7297183669435993
  (1, 2)	0.5887321837696324
  (1, 7)	0.3477147117091919
  (2, 1)	0.5894630806320427
  (2, 8)	0.5894630806320427
  (2, 7)	0.2808823162882302
  (2, 5)	0.47557510189256375
  (3, 0)	0.5894630806320427
  (3, 4)	0.5894630806320427
  (3, 2)	0.47557510189256375
  (3, 7)	0.2808823162882302
  (4, 10)	0.6700917930430479
  (4, 3)	0.6700917930430479
  (4, 7)	0.3193023297639811


In [8]:
# Term -> column-index mapping learned by the TF-IDF vectorizer, pretty-printed
# one entry per line.
tfidf_vocabulary = tfidf_model.vocabulary_
pprint(tfidf_vocabulary)

{'ate': 0,
 'away': 1,
 'cat': 2,
 'end': 3,
 'finally': 4,
 'house': 5,
 'little': 6,
 'mouse': 7,
 'ran': 8,
 'saw': 9,
 'story': 10,
 'tiny': 11}


In [9]:
# Dense view of the current X matrix (rows = documents, columns = terms).
dense_weights = X.toarray()
pprint(dense_weights)

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.4755751 , 0.58946308, 0.28088232, 0.        , 0.        ,
        0.        , 0.58946308],
       [0.        , 0.        , 0.58873218, 0.        , 0.        ,
        0.        , 0.        , 0.34771471, 0.        , 0.72971837,
        0.        , 0.        ],
       [0.        , 0.58946308, 0.        , 0.        , 0.        ,
        0.4755751 , 0.        , 0.28088232, 0.58946308, 0.        ,
        0.        , 0.        ],
       [0.58946308, 0.        , 0.4755751 , 0.        , 0.58946308,
        0.        , 0.        , 0.28088232, 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.67009179, 0.        ,
        0.        , 0.        , 0.31930233, 0.        , 0.        ,
        0.67009179, 0.        ]])


In [16]:
# Show the TF-IDF matrix as a labelled table: one row per document, one column
# per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the terms in column
# order.
pd.DataFrame(X.toarray(),
             index=dataset,
             columns=tfidf_model.get_feature_names_out())

Unnamed: 0,ate,away,cat,end,finally,house,little,mouse,ran,saw,story,tiny
the house had a tiny little mouse,0.0,0.0,0.0,0.0,0.0,0.423186,0.611264,0.27134,0.0,0.0,0.0,0.611264
the cat saw the mouse,0.0,0.0,0.599739,0.0,0.0,0.0,0.0,0.324657,0.0,0.731376,0.0,0.0
the mouse ran away from the house,0.0,0.611264,0.0,0.0,0.0,0.423186,0.0,0.27134,0.611264,0.0,0.0,0.0
the cat finally ate the mouse,0.590336,0.0,0.484084,0.0,0.590336,0.0,0.0,0.26205,0.0,0.0,0.0,0.0
the end of the mouse story,0.0,0.0,0.0,0.674653,0.0,0.0,0.0,0.299478,0.0,0.0,0.674653,0.0
mouse in the house,0.0,0.0,0.0,0.0,0.0,0.841819,0.0,0.53976,0.0,0.0,0.0,0.0
