<a href="https://colab.research.google.com/github/toche7/AI_ITM/blob/main/Lab10_ExampleNLTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
text = "It's been a long day at work, and now I'm finally home. Home, sweet home! The weather was quite bad: rainy and windy, but it didn't dampen my spirits."


In [11]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
import string

# First-time use: download necessary NLTK data
nltk.download('punkt') ## a pre-trained model used for tokenization.
nltk.download('averaged_perceptron_tagger') #for Part-of-Speech (POS) tagging
nltk.download('stopwords') #  a list of stop words.

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/natthawatboonchaiseree/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/natthawatboonchaiseree/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/natthawatboonchaiseree/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Cleaning
# Convert to lowercase and remove punctuation
text_clean = text.lower().translate(str.maketrans('', '', string.punctuation))
print("Original Text:", text)
print("Cleaned Text:", text_clean)

Original Text: It's been a long day at work, and now I'm finally home. Home, sweet home! The weather was quite bad: rainy and windy, but it didn't dampen my spirits.
Cleaned Text: its been a long day at work and now im finally home home sweet home the weather was quite bad rainy and windy but it didnt dampen my spirits


In [13]:
# Tokenization
# Split into sentences
sentences = sent_tokenize(text_clean)
# Split into words
words = [word_tokenize(sentence) for sentence in sentences]
print("Tokenized Sentences:", sentences)
print("Tokenized Words:", words)

Tokenized Sentences: ['its been a long day at work and now im finally home home sweet home the weather was quite bad rainy and windy but it didnt dampen my spirits']
Tokenized Words: [['its', 'been', 'a', 'long', 'day', 'at', 'work', 'and', 'now', 'im', 'finally', 'home', 'home', 'sweet', 'home', 'the', 'weather', 'was', 'quite', 'bad', 'rainy', 'and', 'windy', 'but', 'it', 'didnt', 'dampen', 'my', 'spirits']]


In [14]:
# Parsing (Part-of-Speech Tagging)
tagged_words = [pos_tag(word) for word in words]
print("POS Tagged Words:", tagged_words)

POS Tagged Words: [[('its', 'PRP$'), ('been', 'VBN'), ('a', 'DT'), ('long', 'JJ'), ('day', 'NN'), ('at', 'IN'), ('work', 'NN'), ('and', 'CC'), ('now', 'RB'), ('im', 'VBP'), ('finally', 'RB'), ('home', 'VBN'), ('home', 'NN'), ('sweet', 'NN'), ('home', 'VBD'), ('the', 'DT'), ('weather', 'NN'), ('was', 'VBD'), ('quite', 'RB'), ('bad', 'JJ'), ('rainy', 'NN'), ('and', 'CC'), ('windy', 'NN'), ('but', 'CC'), ('it', 'PRP'), ('didnt', 'VBZ'), ('dampen', 'JJ'), ('my', 'PRP$'), ('spirits', 'NNS')]]


In [15]:
# Stopwords Removal
stop_words = set(stopwords.words('english'))
filtered_words = [[word for word in word_list if word not in stop_words] for word_list in words]
print("Filtered Words:", filtered_words)

Filtered Words: [['long', 'day', 'work', 'im', 'finally', 'home', 'home', 'sweet', 'home', 'weather', 'quite', 'bad', 'rainy', 'windy', 'didnt', 'dampen', 'spirits']]


In [16]:
# Stemming
stemmer = PorterStemmer()
stemmed_words = [[stemmer.stem(word) for word in word_list] for word_list in filtered_words]
print("Stemmed Words:", stemmed_words)


Stemmed Words: [['long', 'day', 'work', 'im', 'final', 'home', 'home', 'sweet', 'home', 'weather', 'quit', 'bad', 'raini', 'windi', 'didnt', 'dampen', 'spirit']]
