In [1]:
import os
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Fetch the NLTK resources used further down: the 'punkt' tokenizer data and
# the 'stopwords' word lists. The value of the last call is shown as the cell result.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' rather than
# 'punkt' when tokenizing — confirm against the installed version.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pawankumarkc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pawankumarkc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Toy corpus: five short English sentences that the rest of the notebook vectorizes.
corpus = [
    'The quick brown fox jumps over the lazy dog.',
    'The lazy dog likes to sleep all day.',
    'The brown fox prefers to eat cheese.',
    'The red fox jumps over the brown fox.',
    'The brown dog chases the fox',
]

In [4]:
def text_preprocess(text):
    """Normalize a sentence before it is vectorized.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and tokenized, English stop words are dropped, and
    the surviving tokens are joined back together with single spaces.

    Args:
        text: Raw input string.

    Returns:
        The remaining lower-case tokens separated by single spaces; an empty
        string when no token survives the filtering.
    """
    # Load the stop-word list once per call and keep it in a set, so each
    # token costs one hash lookup instead of a fresh read and linear scan.
    stop_words = set(stopwords.words('english'))
    text = re.sub('[^a-zA-Z]', ' ', text)
    words = word_tokenize(text.lower())
    return ' '.join(word for word in words if word not in stop_words)

In [5]:
# Swap every sentence for its cleaned form produced by text_preprocess, then show the result.
corpus = list(map(text_preprocess, corpus))
print(corpus)

['quick brown fox jumps lazy dog', 'lazy dog likes sleep day', 'brown fox prefers eat cheese', 'red fox jumps brown fox', 'brown dog chases fox']


In [6]:
# Create a TF-IDF vectorizer with its default settings and fit it on the
# preprocessed corpus, which learns the vocabulary and the per-term IDF weights.
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

In [7]:
# Encode each document of the corpus with the fitted vectorizer; the result is kept for inspection.
tfidf_vector = vectorizer.transform(raw_documents=corpus)

In [8]:
# Convert the TF-IDF matrix to a dense array so the individual weights are visible.
tfidf_vector.toarray()

array([[0.30620672, 0.        , 0.        , 0.        , 0.36399815,
        0.        , 0.30620672, 0.43850426, 0.43850426, 0.        ,
        0.        , 0.54351473, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.49389914, 0.33077001,
        0.        , 0.        , 0.        , 0.39847472, 0.49389914,
        0.        , 0.        , 0.        , 0.49389914],
       [0.29550385, 0.        , 0.52451722, 0.        , 0.        ,
        0.52451722, 0.29550385, 0.        , 0.        , 0.        ,
        0.52451722, 0.        , 0.        , 0.        ],
       [0.31309104, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.62618207, 0.44836297, 0.        , 0.        ,
        0.        , 0.        , 0.55573434, 0.        ],
       [0.39032474, 0.69282362, 0.        , 0.        , 0.46399205,
        0.        , 0.39032474, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [9]:
# Vocabulary terms the vectorizer learned while fitting.
vectorizer.get_feature_names_out()

array(['brown', 'chases', 'cheese', 'day', 'dog', 'eat', 'fox', 'jumps',
       'lazy', 'likes', 'prefers', 'quick', 'red', 'sleep'], dtype=object)

In [10]:
# Inverse-document-frequency weights learned by the vectorizer (one value per vocabulary term).
vectorizer.idf_

array([1.18232156, 2.09861229, 2.09861229, 2.09861229, 1.40546511,
       2.09861229, 1.18232156, 1.69314718, 1.69314718, 2.09861229,
       2.09861229, 2.09861229, 2.09861229, 2.09861229])

In [11]:
import pandas as pd

# Tabulate the vocabulary next to its IDF weights. The keywords make the layout
# explicit: the terms fill the single (unnamed) column, the IDF values are the row index.
df = pd.DataFrame(
    data=vectorizer.get_feature_names_out(),
    index=vectorizer.idf_,
)
print(df)

                0
1.182322    brown
2.098612   chases
2.098612   cheese
2.098612      day
1.405465      dog
2.098612      eat
1.182322      fox
1.693147    jumps
1.693147     lazy
2.098612    likes
2.098612  prefers
2.098612    quick
2.098612      red
2.098612    sleep


In [12]:
# Show the TF-IDF matrix already stored in tfidf_vector instead of vectorizing
# the corpus a second time; the repr reports its shape, dtype and storage format.
tfidf_vector

<5x14 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [13]:
# Dimensions of the TF-IDF matrix, read from the stored result rather than
# running the transform again just to inspect its shape.
tfidf_vector.shape

(5, 14)