<a href="https://colab.research.google.com/github/krisdam/krisdam/blob/main/NLP_Words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, RegexpStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
# Sample text documents
documents = [
    "I saw Santa at the big building today along wiht a bunch of other people.",
    "There was a lot of traffic on the way to that place in the evening.",
    "Running helps to build stamina and strength.",
    "He ran swiftly and caught the ball."
]

# Initialize stemmers
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
snowball_stemmer = SnowballStemmer("english")

# Define a simple regex for stemming (this can be customized)
regex_pattern = r'(ing|ed|es|s)$'
regex_stemmer = RegexpStemmer(regex_pattern)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [2]:
# Function to apply different stemmers
def apply_stemmers(documents):
    results = {}

    for doc in documents:
        # Tokenize the document
        tokens = word_tokenize(doc.lower())

        # Apply different stemmers
        porter_stems = [porter_stemmer.stem(token) for token in tokens]
        lancaster_stems = [lancaster_stemmer.stem(token) for token in tokens]
        snowball_stems = [snowball_stemmer.stem(token) for token in tokens]
        regex_stems = [regex_stemmer.stem(token) for token in tokens]

        # Store results
        results[doc] = {
            'porter': porter_stems,
            'lancaster': lancaster_stems,
            'snowball': snowball_stems,
            'regex': regex_stems
        }

    return results

# Apply the stemmers to the sample documents
stemmed_results = apply_stemmers(documents)

# Print the results
for original_doc, stems in stemmed_results.items():
    print(f"\nOriginal Document: {original_doc}")
    for stemmer_name, stemmed_words in stems.items():
        print(f"{stemmer_name.capitalize()} Stems: {stemmed_words}")



Original Document: I saw Santa at the big building today along wiht a bunch of other people.
Porter Stems: ['i', 'saw', 'santa', 'at', 'the', 'big', 'build', 'today', 'along', 'wiht', 'a', 'bunch', 'of', 'other', 'peopl', '.']
Lancaster Stems: ['i', 'saw', 'sant', 'at', 'the', 'big', 'build', 'today', 'along', 'wiht', 'a', 'bunch', 'of', 'oth', 'peopl', '.']
Snowball Stems: ['i', 'saw', 'santa', 'at', 'the', 'big', 'build', 'today', 'along', 'wiht', 'a', 'bunch', 'of', 'other', 'peopl', '.']
Regex Stems: ['i', 'saw', 'santa', 'at', 'the', 'big', 'build', 'today', 'along', 'wiht', 'a', 'bunch', 'of', 'other', 'people', '.']

Original Document: There was a lot of traffic on the way to that place in the evening.
Porter Stems: ['there', 'wa', 'a', 'lot', 'of', 'traffic', 'on', 'the', 'way', 'to', 'that', 'place', 'in', 'the', 'even', '.']
Lancaster Stems: ['ther', 'was', 'a', 'lot', 'of', 'traff', 'on', 'the', 'way', 'to', 'that', 'plac', 'in', 'the', 'ev', '.']
Snowball Stems: ['there', 

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt_tab')
nltk.download('wordnet')


# Sample text documents
documents = [
    "The cats are running and jumping in the garden.",
    "She is a beautiful runner and loves to run fast.",
    "Running helps to build stamina and strength.",
    "He ran swiftly and caught the ball."
]

# Initialize the WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to apply lemmatization
def apply_lemmatization(doc):
    # Tokenize the document
    tokens = word_tokenize(doc)

    # Apply lemmatization to each token
    lemmatized_tokens = [lemmatizer.lemmatize(token,pos='v') for token in tokens]

    return {
        "original": tokens,
        "lemmatized": lemmatized_tokens
    }

# Process each document and print results
for doc in documents:
    results = apply_lemmatization(doc)
    print(f"Original Document: {results['original']}")
    print(f"Lemmatized: {results['lemmatized']}")
    print("-" * 40)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


Original Document: ['The', 'cats', 'are', 'running', 'and', 'jumping', 'in', 'the', 'garden', '.']
Lemmatized: ['The', 'cat', 'be', 'run', 'and', 'jump', 'in', 'the', 'garden', '.']
----------------------------------------
Original Document: ['She', 'is', 'a', 'beautiful', 'runner', 'and', 'loves', 'to', 'run', 'fast', '.']
Lemmatized: ['She', 'be', 'a', 'beautiful', 'runner', 'and', 'love', 'to', 'run', 'fast', '.']
----------------------------------------
Original Document: ['Running', 'helps', 'to', 'build', 'stamina', 'and', 'strength', '.']
Lemmatized: ['Running', 'help', 'to', 'build', 'stamina', 'and', 'strength', '.']
----------------------------------------
Original Document: ['He', 'ran', 'swiftly', 'and', 'caught', 'the', 'ball', '.']
Lemmatized: ['He', 'run', 'swiftly', 'and', 'catch', 'the', 'ball', '.']
----------------------------------------
