# Tokenization

In [2]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
text = "I'm having fun learning NLP, starting with the basics of NLTK"

In [4]:
type(text)

str

In [5]:
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

word_tokenize(text)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


['I',
 "'m",
 'having',
 'fun',
 'learning',
 'NLP',
 ',',
 'starting',
 'with',
 'the',
 'basics',
 'of',
 'NLTK']

# Stop words removal

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords

In [8]:
# Sample text
txt = "Why is the sky blue? Why didn't you call me? Why didn't I think of that? "

In [9]:
# Tokenize
words = word_tokenize(txt)

In [10]:
stp = set(stopwords.words("english"))

In [11]:
filtered_words = [w for w in words if w not in stp]
print(filtered_words)

['Why', 'sky', 'blue', '?', 'Why', "n't", 'call', '?', 'Why', "n't", 'I', 'think', '?']


# Stemming and Lemmatization

**Stemming**: A rule-based or algorithmic process that chops off prefixes or suffixes from words, often resulting in non-words or invalid forms. <br>
**Example**: "running", "runner", and "runs" might all be stemmed to "run" or "runn". <br>
**Pros**: Faster and computationally less expensive than lemmatization. <br>
**Cons**: Can produce incorrect or meaningless stems and may not consider the context of the word. 

**Lemmatization**: A more sophisticated process that considers the word's context and part of speech (POS) to determine its lemma (dictionary form). <br>
**Example**:
"better" would be lemmatized to "good", and "studies" would be lemmatized to "study". <br>
**Pros**:
Produces valid words and considers context, leading to more accurate text analysis. <br>
**Cons**:
More computationally expensive and slower than stemming. <br>

In [12]:
from nltk.stem import PorterStemmer

In [15]:
ps = PorterStemmer()

sample_words = ["Program", "Programming", "Programs", "Programmer","Programmed"]

# Perform stemming
print("{0:20}{1:20}".format("Word", "Stem"))
for word in sample_words:
    print("{0:20}{1:20}".format(word, ps.stem(word)))

Word                Stem                
Program             program             
Programming         program             
Programs            program             
Programmer          programm            
Programmed          program             


In [30]:
from nltk.stem import WordNetLemmatizer

In [31]:
lemmatizer = WordNetLemmatizer()

sample_words = ["Program", "Programming", "Programs", "Programmer","Programmed"]

# Perform lemmatization
print("{0:20}{1:20}".format("Word", "Stem"))
for word in sample_words:
    print("{0:20}{1:20}".format(word, lemmatizer.lemmatize(word, pos="v")))

Word                Stem                
Program             Program             
Programming         Programming         
Programs            Programs            
Programmer          Programmer          
Programmed          Programmed          
