# Text Pre-processing Techniques

This notebook demonstrates essential text pre-processing techniques used in NLP pipelines.

In [None]:
# Import required libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

# Download required NLTK data.
# 'punkt_tab' is the tokenizer data that recent NLTK releases load instead of
# the pickled 'punkt' models; without it word_tokenize/sent_tokenize raise a
# LookupError on a fresh environment. 'omw-1.4' is needed by WordNet on some
# releases. Both older and newer resource names are requested so the notebook
# runs regardless of the installed NLTK version.
NLTK_RESOURCES = [
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
]
for resource in NLTK_RESOURCES:
    nltk.download(resource, quiet=True)

print("Libraries imported successfully!")

## Sample Text

Let's use a sample text to demonstrate all preprocessing techniques:

In [None]:
# The passage every later cell works on. It starts with a newline and the
# first line ends with a trailing space; both are part of the text.
sample_text = (
    "\n"
    "Natural Language Processing (NLP) is AMAZING! It enables computers to understand human language. \n"
    "The students are studying various NLP techniques including tokenization, stemming, and lemmatization.\n"
    "These preprocessing steps are crucial for building effective NLP applications.\n"
)

print("Original Text:", sample_text, sep="\n")

## 1. Tokenization

Tokenization is the process of breaking text into smaller units (tokens) such as words or sentences.

In [None]:
# Sentence tokenization: split the sample into sentences and list them, numbered from 1
sentences = sent_tokenize(sample_text)

print("Sentence Tokenization:")
for number, sentence in enumerate(sentences, start=1):
    stripped = sentence.strip()  # drop the newlines that surround each sentence
    print(f"{number}. {stripped}")

sentence_count = len(sentences)
print(f"\nTotal sentences: {sentence_count}")

In [None]:
# Word tokenization: split the raw (still mixed-case) sample into word and punctuation tokens
tokens = word_tokenize(sample_text)

print("Word Tokenization:", tokens, sep="\n")
print(f"\nTotal tokens: {len(tokens)}")

## 2. Normalization

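Normalization puts text into a consistent form so that superficially different strings (for example, "NLP" and "nlp") are treated as the same token. Here that means converting the text to lowercase and removing punctuation.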

In [None]:
# Lowercase the whole sample so that case differences do not produce distinct tokens
text_lower = sample_text.lower()

print("Lowercase Text:", text_lower, sep="\n")

In [None]:
# Remove punctuation
# A token is dropped only when every character in it is punctuation.
# `word in string.punctuation` would be a substring test against one long
# string, so multi-character tokens such as "..." or "--" would slip through.
punctuation_chars = set(string.punctuation)

tokens = word_tokenize(text_lower)
tokens_no_punct = [
    word for word in tokens
    if not all(char in punctuation_chars for char in word)
]

print("Tokens without punctuation:")
print(tokens_no_punct)

## 3. Stop-word Removal

Stop words are common words (like 'the', 'is', 'are') that don't carry much meaning and can be removed.

In [None]:
# Get English stop words (stored as a set for fast membership tests)
stop_words = set(stopwords.words('english'))

# A set has no stable iteration order, so sort before slicing; otherwise the
# sample shown here changes from one kernel restart to the next.
print("Sample stop words:")
print(sorted(stop_words)[:20])
print(f"\nTotal stop words: {len(stop_words)}")

In [None]:
# Keep only the tokens that are not in the stop-word set
tokens_no_stopwords = [token for token in tokens_no_punct if token not in stop_words]

before_count = len(tokens_no_punct)
after_count = len(tokens_no_stopwords)

print("Before removing stop words:")
print(tokens_no_punct)
print(f"\nAfter removing stop words:")
print(tokens_no_stopwords)
print(f"\nTokens reduced from {before_count} to {after_count}")

## 4. Stemming

Stemming reduces words to a root form by stripping suffixes with heuristic rules. The resulting stem is not always a valid dictionary word (for example, "studies" becomes "studi").

In [None]:
# Create the Porter stemmer once; later cells reuse this instance
stemmer = PorterStemmer()

# Stem every token that survived stop-word removal
stemmed_words = list(map(stemmer.stem, tokens_no_stopwords))

# Show only the tokens that stemming actually changed
print("Original vs Stemmed words:")
for token, stem in zip(tokens_no_stopwords, stemmed_words):
    if token == stem:
        continue
    print(f"{token:20} -> {stem}")

In [None]:
# A handful of related word forms to show what the stemmer does with each one
words_to_stem = ['running', 'runs', 'ran', 'runner', 'easily', 'fairly']

print("Stemming Examples:")
for word, stem in zip(words_to_stem, map(stemmer.stem, words_to_stem)):
    print(f"{word:15} -> {stem}")

## 5. Lemmatization

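Lemmatization reduces words to their base form (lemma) using vocabulary and morphological analysis. Unlike a stem, a lemma is always a valid word. NLTK's `WordNetLemmatizer` treats every word as a noun unless a part-of-speech tag is supplied, so verbs and adjectives are only reduced correctly when `pos` is passed.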

In [None]:
# Create the WordNet lemmatizer once; later cells reuse this instance
lemmatizer = WordNetLemmatizer()

# Lemmatize every token; no POS is passed, so the lemmatizer's default applies
lemmatized_words = [lemmatizer.lemmatize(token) for token in tokens_no_stopwords]

# Show only the tokens that lemmatization actually changed
print("Original vs Lemmatized words:")
for token, lemma in zip(tokens_no_stopwords, lemmatized_words):
    if token == lemma:
        continue
    print(f"{token:20} -> {lemma}")

In [None]:
# Lemmatization with an explicit POS tag: each word is looked up once as a verb and once as a noun
words_to_lemmatize = ['running', 'runs', 'ran', 'better', 'cacti', 'geese']

print("Lemmatization Examples:")
for word in words_to_lemmatize:
    # Same word, two readings: verb first, then noun
    lemma_v, lemma_n = (lemmatizer.lemmatize(word, pos=tag) for tag in ('v', 'n'))
    print(f"{word:15} -> verb: {lemma_v:12} noun: {lemma_n}")

## 6. Stemming vs Lemmatization

Let's compare both techniques side by side:

In [None]:
# Compare stemming and lemmatization
test_words = ['studies', 'studying', 'studied', 'better', 'running', 'feet']

# WordNet POS tag for each test word. The lemmatizer only finds an irregular
# lemma when it is given the matching part of speech; tagging everything as a
# verb leaves 'better' (adjective) and 'feet' (noun) unchanged and makes the
# lemmatizer look no better than the stemmer.
test_word_pos = {
    'studies': 'v',
    'studying': 'v',
    'studied': 'v',
    'better': 'a',
    'running': 'v',
    'feet': 'n',
}

print(f"{'Word':<15} {'Stemmed':<15} {'Lemmatized':<15}")
print("-" * 45)
for word in test_words:
    stemmed = stemmer.stem(word)
    # Words without an entry fall back to the verb reading
    lemmatized = lemmatizer.lemmatize(word, pos=test_word_pos.get(word, 'v'))
    print(f"{word:<15} {stemmed:<15} {lemmatized:<15}")

## 7. Complete Preprocessing Pipeline

Let's combine all steps into a single preprocessing function:

In [None]:
def preprocess_text(text: str, use_stemming: bool = False, use_lemmatization: bool = True) -> list:
    """
    Complete text preprocessing pipeline.

    Steps, in order: lowercase, word-tokenize, drop punctuation tokens,
    drop English stop words, then optionally stem or lemmatize.

    Args:
        text: Raw input string.
        use_stemming: If True, reduce each token with PorterStemmer. Takes
            precedence over use_lemmatization when both are True.
        use_lemmatization: If True (and use_stemming is False), reduce each
            token with WordNetLemmatizer, called without a POS tag.

    Returns:
        List of processed token strings (empty if nothing survives filtering).
    """
    # 1. Lowercase
    text = text.lower()

    # 2. Tokenization
    tokens = word_tokenize(text)

    # 3. Remove punctuation
    # Drop a token only when every character in it is punctuation.
    # `word in string.punctuation` would be a substring test against one long
    # string and would keep multi-character tokens such as "..." or "--".
    punctuation_chars = set(string.punctuation)
    tokens = [
        word for word in tokens
        if not all(char in punctuation_chars for char in word)
    ]

    # 4. Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 5. Stemming or Lemmatization (stemming wins if both flags are set)
    if use_stemming:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
    elif use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

# Run the pipeline on the sample passage with its default settings
print("Original text:", sample_text, sep="\n")

print("\nPreprocessed tokens:")
processed_tokens = preprocess_text(sample_text)
print(processed_tokens)

## 8. Practice Exercise

Try preprocessing your own text:

In [None]:
# Your turn! Replace the sentence below with any text you like
your_text = "The children are playing in the beautiful garden. They were running and laughing happily!"

print("Your text:", your_text, sep="\n")

# Send the same input through both reduction modes so the outputs can be compared
print("\nPreprocessed (with lemmatization):")
lemmatized_result = preprocess_text(your_text, use_lemmatization=True)
print(lemmatized_result)

print("\nPreprocessed (with stemming):")
stemmed_result = preprocess_text(your_text, use_stemming=True, use_lemmatization=False)
print(stemmed_result)