In [25]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
# Text that every later cell operates on; replace with the document to analyse.
sample_document = "Your sample text goes here."

# Split the document at two granularities: whole sentences and individual
# word/punctuation tokens. `words` is reused by the tagging and filtering cells.
sentences = sent_tokenize(sample_document)
words = word_tokenize(sample_document)

# Each heading is followed by the list it describes on its own line.
print("Tokenized Words:", words, sep="\n")
print("\nTokenized Sentences:", sentences, sep="\n")

Tokenized Words:
['Your', 'sample', 'text', 'goes', 'here', '.']

Tokenized Sentences:
['Your sample text goes here.']


In [27]:
# Attach a part-of-speech tag to every token, producing (token, tag) pairs.
pos_tags = pos_tag(words)

print("\nPOS Tags:", pos_tags, sep="\n")


POS Tags:
[('Your', 'PRP$'), ('sample', 'NN'), ('text', 'NN'), ('goes', 'VBZ'), ('here', 'RB'), ('.', '.')]


In [28]:
# English stop-word list held as a set for fast membership checks.
stop_words = set(stopwords.words("english"))

# Keep a token only when its lower-cased form is not a stop word; the token's
# original casing is preserved in the result.
filtered_words = [
    token
    for token in words
    if token.lower() not in stop_words
]

print("\nFiltered Words (Stop Words Removed):", filtered_words, sep="\n")


Filtered Words (Stop Words Removed):
['sample', 'text', 'goes', '.']


In [29]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# WordNetLemmatizer.lemmatize() assumes a noun when no part of speech is
# given, so inflected verbs, adjectives and adverbs are generally not reduced
# to their base form. Map the first letter of each tag in `pos_tags` (Penn
# Treebank style: JJ*, VB*, RB*) to the matching WordNet POS code; any other
# tag falls back to noun ("n"), which is the lemmatizer's own default.
WORDNET_POS_BY_TAG_PREFIX = {"J": "a", "V": "v", "R": "r"}

# Stemming is purely rule-based and needs no part-of-speech information.
stemmed_words = [stemmer.stem(word) for word in filtered_words]

# Reuse the tags computed on the full token list (so the tagger saw the whole
# sentence) and apply the same stop-word test that produced `filtered_words`,
# which keeps the lemmatized list aligned with it token for token.
lemmatized_words = [
    lemmatizer.lemmatize(word, pos=WORDNET_POS_BY_TAG_PREFIX.get(tag[:1], "n"))
    for word, tag in pos_tags
    if word.lower() not in stop_words
]

print("\nStemmed Words:")
print(stemmed_words)
print("\nLemmatized Words:")
print(lemmatized_words)


Stemmed Words:
['sampl', 'text', 'goe', '.']

Lemmatized Words:
['sample', 'text', 'go', '.']


In [30]:
# The corpus is a list of documents; here it holds only the sample text.
documents = [sample_document]

# Fit the vectorizer on the corpus and transform it in one step, yielding a
# sparse document-by-term matrix of TF-IDF weights.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Densify the matrix and label each column with its vocabulary term so the
# weights can be read as a table.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

print("\nTF-IDF Representation:", tfidf_df, sep="\n")


TF-IDF Representation:
       goes      here    sample      text      your
0  0.447214  0.447214  0.447214  0.447214  0.447214
