## Count Vectorizer

In [1]:
# Install scikit-learn into the kernel's environment if it is missing, e.g.:
#   %pip install -q scikit-learn
# (the PyPI package is `scikit-learn`; `sklearn` is only the import name)
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Bag-of-words vectorizer created with the library's default settings.
count_vectorizer = CountVectorizer()

In [3]:
# Toy corpus: five short quotations, one document per list element.
text_list = [
    "Be yourself; everyone else is already taken.",
    "A room without books is like a body without a soul.",
    "Be the change that you wish to see in the world.",
    "If you tell the truth, you don't need to remember anything.",
    "Always forgive your enemies; nothing annoys them so much.",
]

In [4]:
# Learn the vocabulary from the corpus, then display the token -> column-index
# mapping. In the output below, tokens are lowercased, the one-letter word "a"
# is absent, and "don't" contributes only "don".
count_vectorizer.fit(text_list)
count_vectorizer.vocabulary_

{'be': 4,
 'yourself': 37,
 'everyone': 11,
 'else': 9,
 'is': 15,
 'already': 0,
 'taken': 25,
 'room': 21,
 'without': 33,
 'books': 6,
 'like': 16,
 'body': 5,
 'soul': 24,
 'the': 28,
 'change': 7,
 'that': 27,
 'you': 35,
 'wish': 32,
 'to': 30,
 'see': 22,
 'in': 14,
 'world': 34,
 'if': 13,
 'tell': 26,
 'truth': 31,
 'don': 8,
 'need': 18,
 'remember': 20,
 'anything': 3,
 'always': 1,
 'forgive': 12,
 'your': 36,
 'enemies': 10,
 'nothing': 19,
 'annoys': 2,
 'them': 29,
 'so': 23,
 'much': 17}

In [5]:
# Look up the column index assigned to a single token.
count_vectorizer.vocabulary_.get("soul")

24

In [6]:
# Encode the corpus with the fitted vocabulary; the shape is
# (number of documents, vocabulary size).
transformed_vector = count_vectorizer.transform(text_list)
transformed_vector.shape

(5, 38)

In [7]:
# Convert to a dense array for display: one row per sentence, one column per
# token, values are occurrence counts (a 2 marks a word used twice).
transformed_vector.toarray()

array([[1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 1, 2, 0, 1, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 2, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0]])

In [8]:
# Transform unseen text: only tokens present in the fitted vocabulary are
# counted ("already", "be", "yourself" here); the other words contribute nothing.
count_vectorizer.transform(["a new text containing already be yourself"]).toarray()

array([[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [9]:
# Map each row back to the tokens with non-zero counts. As the output shows,
# counts and original word order are not recovered.
count_vectorizer.inverse_transform(transformed_vector)

[array(['already', 'be', 'else', 'everyone', 'is', 'taken', 'yourself'],
       dtype='<U8'),
 array(['body', 'books', 'is', 'like', 'room', 'soul', 'without'],
       dtype='<U8'),
 array(['be', 'change', 'in', 'see', 'that', 'the', 'to', 'wish', 'world',
        'you'], dtype='<U8'),
 array(['anything', 'don', 'if', 'need', 'remember', 'tell', 'the', 'to',
        'truth', 'you'], dtype='<U8'),
 array(['always', 'annoys', 'enemies', 'forgive', 'much', 'nothing', 'so',
        'them', 'your'], dtype='<U8')]

## N-gram Vectorizer

In [10]:
# ngram_range=(2,2) restricts the features to bigrams (pairs of adjacent tokens).
# NOTE: this rebinds `transformed_vector`, replacing the unigram count matrix
# produced by `count_vectorizer` earlier in the notebook.
n_gram_vectorizer = CountVectorizer(ngram_range=(2,2))
transformed_vector = n_gram_vectorizer.fit_transform(text_list)
transformed_vector.shape

(5, 41)

In [11]:
# Bigram -> column-index mapping learned from the corpus.
n_gram_vectorizer.vocabulary_

{'be yourself': 4,
 'yourself everyone': 40,
 'everyone else': 11,
 'else is': 9,
 'is already': 15,
 'already taken': 0,
 'room without': 21,
 'without books': 34,
 'books is': 6,
 'is like': 16,
 'like body': 17,
 'body without': 5,
 'without soul': 35,
 'be the': 3,
 'the change': 26,
 'change that': 7,
 'that you': 25,
 'you wish': 38,
 'wish to': 33,
 'to see': 31,
 'see in': 22,
 'in the': 14,
 'the world': 28,
 'if you': 13,
 'you tell': 37,
 'tell the': 24,
 'the truth': 27,
 'truth you': 32,
 'you don': 36,
 'don need': 8,
 'need to': 18,
 'to remember': 30,
 'remember anything': 20,
 'always forgive': 1,
 'forgive your': 12,
 'your enemies': 39,
 'enemies nothing': 10,
 'nothing annoys': 19,
 'annoys them': 2,
 'them so': 29,
 'so much': 23}

In [12]:
# Dense view of the bigram counts, one row per sentence.
transformed_vector.toarray()

array([[1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]])

In [13]:
# Recover the bigrams present in each sentence from the count matrix.
n_gram_vectorizer.inverse_transform(transformed_vector)

[array(['be yourself', 'yourself everyone', 'everyone else', 'else is',
        'is already', 'already taken'], dtype='<U17'),
 array(['room without', 'without books', 'books is', 'is like',
        'like body', 'body without', 'without soul'], dtype='<U17'),
 array(['be the', 'the change', 'change that', 'that you', 'you wish',
        'wish to', 'to see', 'see in', 'in the', 'the world'], dtype='<U17'),
 array(['if you', 'you tell', 'tell the', 'the truth', 'truth you',
        'you don', 'don need', 'need to', 'to remember',
        'remember anything'], dtype='<U17'),
 array(['always forgive', 'forgive your', 'your enemies',
        'enemies nothing', 'nothing annoys', 'annoys them', 'them so',
        'so much'], dtype='<U17')]

## TF-IDF Vectorizer

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer created with the library's default settings.
tfidf_vectorizer = TfidfVectorizer()

In [19]:
# Fit on the corpus and transform it in one step; the shape matches the plain
# count matrix (5 documents x 38 tokens).
tfidf_vector = tfidf_vectorizer.fit_transform(text_list)
tfidf_vector.shape

(5, 38)

In [20]:
# Token -> column-index mapping; the output is identical to the
# CountVectorizer vocabulary shown earlier.
tfidf_vectorizer.vocabulary_

{'be': 4,
 'yourself': 37,
 'everyone': 11,
 'else': 9,
 'is': 15,
 'already': 0,
 'taken': 25,
 'room': 21,
 'without': 33,
 'books': 6,
 'like': 16,
 'body': 5,
 'soul': 24,
 'the': 28,
 'change': 7,
 'that': 27,
 'you': 35,
 'wish': 32,
 'to': 30,
 'see': 22,
 'in': 14,
 'world': 34,
 'if': 13,
 'tell': 26,
 'truth': 31,
 'don': 8,
 'need': 18,
 'remember': 20,
 'anything': 3,
 'always': 1,
 'forgive': 12,
 'your': 36,
 'enemies': 10,
 'nothing': 19,
 'annoys': 2,
 'them': 29,
 'so': 23,
 'much': 17}

In [21]:
# Dense TF-IDF weights, one row per sentence and one column per token.
tfidf_vector.toarray()

array([[0.39835162, 0.        , 0.        , 0.        , 0.32138758,
        0.        , 0.        , 0.        , 0.        , 0.39835162,
        0.        , 0.39835162, 0.        , 0.        , 0.        ,
        0.32138758, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.39835162, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.39835162],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.32189611, 0.32189611, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.25970376, 0.32189611, 0.        , 0.        , 0.        ,
        0.        , 0.32189611, 0.        , 0.        , 0.32189611,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.64379222, 0.        ,
   

In [22]:
# Learned inverse-document-frequency weight for each vocabulary column; tokens
# that occur in more sentences (e.g. "be", "is", "the") get the lower value.
tfidf_vectorizer.idf_

array([2.09861229, 2.09861229, 2.09861229, 2.09861229, 1.69314718,
       2.09861229, 2.09861229, 2.09861229, 2.09861229, 2.09861229,
       2.09861229, 2.09861229, 2.09861229, 2.09861229, 2.09861229,
       1.69314718, 2.09861229, 2.09861229, 2.09861229, 2.09861229,
       2.09861229, 2.09861229, 2.09861229, 2.09861229, 2.09861229,
       2.09861229, 2.09861229, 2.09861229, 1.69314718, 2.09861229,
       1.69314718, 2.09861229, 2.09861229, 2.09861229, 2.09861229,
       1.69314718, 2.09861229, 2.09861229])

In [23]:
# Pair every token with its IDF weight.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# .tolist() keeps the dictionary keys plain Python strings, as before.
dict(zip(tfidf_vectorizer.get_feature_names_out().tolist(), tfidf_vectorizer.idf_))



{'already': 2.09861228866811,
 'always': 2.09861228866811,
 'annoys': 2.09861228866811,
 'anything': 2.09861228866811,
 'be': 1.6931471805599454,
 'body': 2.09861228866811,
 'books': 2.09861228866811,
 'change': 2.09861228866811,
 'don': 2.09861228866811,
 'else': 2.09861228866811,
 'enemies': 2.09861228866811,
 'everyone': 2.09861228866811,
 'forgive': 2.09861228866811,
 'if': 2.09861228866811,
 'in': 2.09861228866811,
 'is': 1.6931471805599454,
 'like': 2.09861228866811,
 'much': 2.09861228866811,
 'need': 2.09861228866811,
 'nothing': 2.09861228866811,
 'remember': 2.09861228866811,
 'room': 2.09861228866811,
 'see': 2.09861228866811,
 'so': 2.09861228866811,
 'soul': 2.09861228866811,
 'taken': 2.09861228866811,
 'tell': 2.09861228866811,
 'that': 2.09861228866811,
 'the': 1.6931471805599454,
 'them': 2.09861228866811,
 'to': 1.6931471805599454,
 'truth': 2.09861228866811,
 'wish': 2.09861228866811,
 'without': 2.09861228866811,
 'world': 2.09861228866811,
 'you': 1.6931471805599454,
 'your': 2.09861228866811,
 'yourself': 2.09861228866811}

In [24]:
# Tokens with a non-zero TF-IDF weight in each sentence.
tfidf_vectorizer.inverse_transform(tfidf_vector)

[array(['taken', 'already', 'is', 'else', 'everyone', 'yourself', 'be'],
       dtype='<U8'),
 array(['soul', 'body', 'like', 'books', 'without', 'room', 'is'],
       dtype='<U8'),
 array(['world', 'in', 'see', 'to', 'wish', 'you', 'that', 'change', 'the',
        'be'], dtype='<U8'),
 array(['anything', 'remember', 'need', 'don', 'truth', 'tell', 'if', 'to',
        'you', 'the'], dtype='<U8'),
 array(['much', 'so', 'them', 'annoys', 'nothing', 'enemies', 'your',
        'forgive', 'always'], dtype='<U8')]

## Hashing Vectorizer

In [33]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hash tokens into a fixed 16-column feature space; norm=None leaves the raw
# values unnormalised. transform() is called directly, with no fit() step.
# NOTE(review): the negative entries in the output presumably come from the
# vectorizer's signed hashing (`alternate_sign`) — confirm in the scikit-learn docs.
hashing_vectorizer = HashingVectorizer(n_features=16, norm=None)
feature_vector = hashing_vectorizer.transform(text_list)

print('shape:', feature_vector.shape) # each of 5 vectors has been hashed to 16 features
feature_vector.toarray()

shape: (5, 16)


array([[ 0.,  0.,  0.,  1.,  1.,  0.,  0., -2.,  1.,  0.,  0.,  0.,  0.,
         1., -1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  2., -1.,  0.,  2.,  1.,  0.,  0.,
         1., -1.,  0.],
       [ 0.,  1.,  0.,  1.,  0., -1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,
         0., -3., -1.],
       [ 1.,  0.,  0., -1.,  0.,  0.,  2.,  0.,  0.,  1.,  0.,  1.,  2.,
         1.,  0.,  0.],
       [-1.,  0.,  0.,  0., -1.,  0.,  0., -2., -1.,  1.,  0.,  0.,  0.,
         0., -1.,  0.]])

In [34]:
# Same hashing as before, but with norm='l1': in the output each row's
# absolute values sum to 1.
# NOTE: rebinds `hashing_vectorizer` and `feature_vector` from the previous cell.
hashing_vectorizer = HashingVectorizer(n_features=16, norm='l1') # L1 norm
feature_vector = hashing_vectorizer.transform(text_list)

print('shape:', feature_vector.shape)
feature_vector.toarray()

shape: (5, 16)


array([[ 0.        ,  0.        ,  0.        ,  0.14285714,  0.14285714,
         0.        ,  0.        , -0.28571429,  0.14285714,  0.        ,
         0.        ,  0.        ,  0.        ,  0.14285714, -0.14285714,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.25      , -0.125     ,  0.        ,  0.25      ,
         0.125     ,  0.        ,  0.        ,  0.125     , -0.125     ,
         0.        ],
       [ 0.        ,  0.11111111,  0.        ,  0.11111111,  0.        ,
        -0.11111111,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.11111111,  0.11111111,  0.        , -0.33333333,
        -0.11111111],
       [ 0.11111111,  0.        ,  0.        , -0.11111111,  0.        ,
         0.        ,  0.22222222,  0.        ,  0.        ,  0.11111111,
         0.        ,  0.11111111,  0.22222222,  0.11111111,  0.        ,
         0.        ],
       [-0.14285714,  0.        ,  0

In [36]:
# Same hashing as before, but with norm='l2': in the output each row has unit
# Euclidean length (its squared values sum to 1).
# NOTE: rebinds `hashing_vectorizer` and `feature_vector` from the previous cell.
hashing_vectorizer = HashingVectorizer(n_features=16, norm='l2') # L2 norm
feature_vector = hashing_vectorizer.transform(text_list)

print('shape:', feature_vector.shape)
feature_vector.toarray()

shape: (5, 16)


array([[ 0.        ,  0.        ,  0.        ,  0.33333333,  0.33333333,
         0.        ,  0.        , -0.66666667,  0.33333333,  0.        ,
         0.        ,  0.        ,  0.        ,  0.33333333, -0.33333333,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.57735027, -0.28867513,  0.        ,  0.57735027,
         0.28867513,  0.        ,  0.        ,  0.28867513, -0.28867513,
         0.        ],
       [ 0.        ,  0.25819889,  0.        ,  0.25819889,  0.        ,
        -0.25819889,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.25819889,  0.25819889,  0.        , -0.77459667,
        -0.25819889],
       [ 0.2773501 ,  0.        ,  0.        , -0.2773501 ,  0.        ,
         0.        ,  0.5547002 ,  0.        ,  0.        ,  0.2773501 ,
         0.        ,  0.2773501 ,  0.5547002 ,  0.2773501 ,  0.        ,
         0.        ],
       [-0.33333333,  0.        ,  0

In [37]:
# Hashing is a one-way operation: HashingVectorizer has no inverse_transform,
# so this call is expected to fail. The AttributeError is the point of the
# demonstration, so catch it and print it instead of letting the cell raise —
# that keeps the notebook runnable under Restart Kernel -> Run All.
try:
    hashing_vectorizer.inverse_transform(feature_vector)
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'HashingVectorizer' object has no attribute 'inverse_transform'