In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kumha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kumha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kumha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kumha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
with open("sample.txt","r") as file:
    sample_text = file.read()

In [4]:
sample_text

'Hi i m kartik'

In [5]:
tokens = word_tokenize(sample_text)
tokens

['Hi', 'i', 'm', 'kartik']

In [6]:
pos_tags = pos_tag(tokens)
pos_tags

[('Hi', 'NNP'), ('i', 'NN'), ('m', 'VBP'), ('kartik', 'NN')]

In [7]:
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words ]

In [8]:
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_tokens]
stemmed_words

['hi', 'kartik']

In [9]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
lemmatized_words

['Hi', 'kartik']

In [10]:
lowercase_words = [word.lower() for word in filtered_tokens]
lowercase_words

['hi', 'kartik']

In [11]:
tf_idf_vectorizer = TfidfVectorizer()
tf_idf_matrix = tf_idf_vectorizer.fit_transform([sample_text])
print(tf_idf_matrix)

  (0, 1)	0.7071067811865475
  (0, 0)	0.7071067811865475


In [12]:
feature_names = tf_idf_vectorizer.get_feature_names_out()
feature_names

array(['hi', 'kartik'], dtype=object)

In [13]:

# Step 1: Calculate TF score for each term
tf_vectorizer = TfidfVectorizer(use_idf=False)
tf_matrix = tf_vectorizer.fit_transform([sample_text])

# Get feature names and TF values
feature_names_tf = tf_vectorizer.get_feature_names_out()
tf_values = tf_matrix.toarray()[0]

# Step 2: Calculate IDF score for each term
idf_vectorizer = TfidfVectorizer(use_idf=True)
idf_matrix = idf_vectorizer.fit_transform([sample_text])

# Get IDF values
idf_values = idf_vectorizer.idf_

# Step 3: Calculate TF-IDF score for each term
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_matrix = tfidf_vectorizer.fit_transform([sample_text])

# Get feature names and TF-IDF values
feature_names_tfidf = tfidf_vectorizer.get_feature_names_out()
tfidf_values = tfidf_matrix.toarray()[0]

# Print TF scores for each term
print("Term Frequency (TF) scores:")
for term, value in zip(feature_names_tf, tf_values):
    print(term, ":", value)

# Print IDF scores for each term
print("\nInverse Document Frequency (IDF) scores:")
for term, value in zip(feature_names_tfidf, idf_values):
    print(term, ":", value)

Term Frequency (TF) scores:
hi : 0.7071067811865475
kartik : 0.7071067811865475

Inverse Document Frequency (IDF) scores:
hi : 1.0
kartik : 1.0
