## Library

In [4]:
import pandas as pd
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Data

In [5]:
# Toy corpus: five short sentences about a mouse, used as the input
# documents for every vectorizer example in this notebook.
dataset = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the house mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]
print(dataset)

['the house had a tiny little mouse', 'the cat saw the mouse', 'the house mouse ran away from the house', 'the cat finally ate the mouse', 'the end of the mouse story']


## n-gram

### unigram

In [9]:
# Unigram TF-IDF: ngram_range=(1, 1) makes every feature a single token.
# stop_words='english' filters common English words, which is why words
# such as "the" and "a" never show up in the printed vocabulary.
tfidf_model = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words='english',
    max_features=20,
)
# Learn the vocabulary from the corpus and build its TF-IDF matrix.
X = tfidf_model.fit_transform(dataset)
# vocabulary_ maps each learned term to its column index.
pprint(tfidf_model.vocabulary_)

{'ate': 0,
 'away': 1,
 'cat': 2,
 'end': 3,
 'finally': 4,
 'house': 5,
 'little': 6,
 'mouse': 7,
 'ran': 8,
 'saw': 9,
 'story': 10,
 'tiny': 11}


### bigram

In [11]:
# Bigram TF-IDF: ngram_range=(2, 2) makes every feature a pair of
# adjacent tokens. Pairs are formed after stop words are removed, so a
# term like 'house tiny' bridges the dropped words in
# "the house had a tiny little mouse".
tfidf_model = TfidfVectorizer(
    ngram_range=(2, 2),
    stop_words='english',
    max_features=20,
)
# Learn the vocabulary from the corpus and build its TF-IDF matrix.
X = tfidf_model.fit_transform(dataset)
# vocabulary_ maps each learned bigram to its column index.
pprint(tfidf_model.vocabulary_)

{'ate mouse': 0,
 'away house': 1,
 'cat finally': 2,
 'cat saw': 3,
 'end mouse': 4,
 'finally ate': 5,
 'house mouse': 6,
 'house tiny': 7,
 'little mouse': 8,
 'mouse ran': 9,
 'mouse story': 10,
 'ran away': 11,
 'saw mouse': 12,
 'tiny little': 13}


### trigram

In [12]:
# Trigram TF-IDF: ngram_range=(3, 3) makes every feature a run of three
# adjacent tokens, again taken after stop-word removal (e.g.
# 'ran away house' comes from "ran away from the house").
tfidf_model = TfidfVectorizer(
    ngram_range=(3, 3),
    stop_words='english',
    max_features=20,
)
# Learn the vocabulary from the corpus and build its TF-IDF matrix.
X = tfidf_model.fit_transform(dataset)
# vocabulary_ maps each learned trigram to its column index.
pprint(tfidf_model.vocabulary_)

{'cat finally ate': 0,
 'cat saw mouse': 1,
 'end mouse story': 2,
 'finally ate mouse': 3,
 'house mouse ran': 4,
 'house tiny little': 5,
 'mouse ran away': 6,
 'ran away house': 7,
 'tiny little mouse': 8}


### n-gram range (unigrams + bigrams)

In [13]:
# Mixed range: ngram_range=(1, 2) extracts both single tokens and pairs
# of adjacent tokens into one vocabulary.
# NOTE: the max_features=20 cap is binding here. The printed vocabulary
# has exactly 20 entries and leaves out terms such as 'tiny' and
# 'little' that the unigram-only run includes.
tfidf_model = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=20,
)
# Learn the vocabulary from the corpus and build its TF-IDF matrix.
X = tfidf_model.fit_transform(dataset)
# vocabulary_ maps each retained term to its column index.
pprint(tfidf_model.vocabulary_)

{'ate': 0,
 'ate mouse': 1,
 'away': 2,
 'away house': 3,
 'cat': 4,
 'cat finally': 5,
 'cat saw': 6,
 'end': 7,
 'end mouse': 8,
 'finally': 9,
 'finally ate': 10,
 'house': 11,
 'little mouse': 12,
 'mouse': 13,
 'mouse ran': 14,
 'ran': 15,
 'ran away': 16,
 'saw': 17,
 'saw mouse': 18,
 'story': 19}
