<a href="https://colab.research.google.com/github/harikagirigari/nlplab/blob/main/NLP_exp_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [None]:
# Fetch every NLTK resource this notebook uses in one place, so a fresh
# kernel can run top to bottom without a mid-notebook download.
# 'punkt_tab' is listed alongside 'punkt' because recent NLTK releases load
# the tokenizer tables from 'punkt_tab' when word_tokenize is called.
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Tiny in-memory corpus that every later preprocessing step works on
documents = [
    "Natural Language Processing is very interesting and powerful.",
    "Students are learning NLP techniques for text analysis.",
    "Text preprocessing includes tokenization, stemming and lemmatization.",
]

# One row per document, held in a single text column
df = pd.DataFrame({"Original_Text": documents})
df

Unnamed: 0,Original_Text
0,Natural Language Processing is very interestin...
1,Students are learning NLP techniques for text ...
2,"Text preprocessing includes tokenization, stem..."


In [None]:
# Tokenizer resource downloaded right before its first use
# (presumably required by word_tokenize on the installed NLTK version).
nltk.download('punkt_tab')

# Split each raw document into word and punctuation tokens
df["Tokens"] = df["Original_Text"].map(word_tokenize)
df

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,Original_Text,Tokens
0,Natural Language Processing is very interestin...,"[Natural, Language, Processing, is, very, inte..."
1,Students are learning NLP techniques for text ...,"[Students, are, learning, NLP, techniques, for..."
2,"Text preprocessing includes tokenization, stem...","[Text, preprocessing, includes, tokenization, ..."


In [None]:
def normalize_text(text):
    """Return ``text`` lowercased with ASCII punctuation stripped out.

    Every character listed in ``string.punctuation`` is dropped; all other
    characters (letters, digits, whitespace) are kept in their original order.
    """
    lowered = text.lower()
    kept_chars = [ch for ch in lowered if ch not in string.punctuation]
    return "".join(kept_chars)

# Lowercased, punctuation-free copy of each original document
df["Normalized_Text"] = df["Original_Text"].map(normalize_text)
df

Unnamed: 0,Original_Text,Tokens,Normalized_Text
0,Natural Language Processing is very interestin...,"[Natural, Language, Processing, is, very, inte...",natural language processing is very interestin...
1,Students are learning NLP techniques for text ...,"[Students, are, learning, NLP, techniques, for...",students are learning nlp techniques for text ...
2,"Text preprocessing includes tokenization, stem...","[Text, preprocessing, includes, tokenization, ...",text preprocessing includes tokenization stemm...


In [None]:
# Tokenize the normalized text (already lowercased, punctuation removed)
df["Normalized_Tokens"] = df["Normalized_Text"].map(word_tokenize)
df

Unnamed: 0,Original_Text,Tokens,Normalized_Text,Normalized_Tokens
0,Natural Language Processing is very interestin...,"[Natural, Language, Processing, is, very, inte...",natural language processing is very interestin...,"[natural, language, processing, is, very, inte..."
1,Students are learning NLP techniques for text ...,"[Students, are, learning, NLP, techniques, for...",students are learning nlp techniques for text ...,"[students, are, learning, nlp, techniques, for..."
2,"Text preprocessing includes tokenization, stem...","[Text, preprocessing, includes, tokenization, ...",text preprocessing includes tokenization stemm...,"[text, preprocessing, includes, tokenization, ..."


In [None]:
# English stopword list held as a set for fast membership checks
stop_words = {*stopwords.words('english')}

def remove_stopwords(tokens, stopword_set=None):
    """Return the tokens that are not stopwords, preserving their order.

    Args:
        tokens: Iterable of token strings. Matching is an exact membership
            test, so tokens should already be in the same case as the
            stopword entries.
        stopword_set: Optional collection of words to drop. When omitted
            (or ``None``), the notebook-level ``stop_words`` set is used, so
            existing one-argument calls behave exactly as before.

    Returns:
        A new list containing only the tokens absent from the stopword
        collection.
    """
    if stopword_set is None:
        # Fall back to the notebook-wide English stopword set.
        stopword_set = stop_words
    return [word for word in tokens if word not in stopword_set]

# Drop stopwords from the normalized token lists
df["Without_Stopwords"] = df["Normalized_Tokens"].map(remove_stopwords)
df

Unnamed: 0,Original_Text,Tokens,Normalized_Text,Normalized_Tokens,Without_Stopwords
0,Natural Language Processing is very interestin...,"[Natural, Language, Processing, is, very, inte...",natural language processing is very interestin...,"[natural, language, processing, is, very, inte...","[natural, language, processing, interesting, p..."
1,Students are learning NLP techniques for text ...,"[Students, are, learning, NLP, techniques, for...",students are learning nlp techniques for text ...,"[students, are, learning, nlp, techniques, for...","[students, learning, nlp, techniques, text, an..."
2,"Text preprocessing includes tokenization, stem...","[Text, preprocessing, includes, tokenization, ...",text preprocessing includes tokenization stemm...,"[text, preprocessing, includes, tokenization, ...","[text, preprocessing, includes, tokenization, ..."


In [None]:
# Single Porter stemmer instance shared by every call below
stemmer = PorterStemmer()


def apply_stemming(tokens):
    """Return a list holding the Porter stem of each token, in input order."""
    return list(map(stemmer.stem, tokens))

# Stem the stopword-free tokens
df["Stemmed_Text"] = df["Without_Stopwords"].map(apply_stemming)
df

Unnamed: 0,Original_Text,Tokens,Normalized_Text,Normalized_Tokens,Without_Stopwords,Stemmed_Text
0,Natural Language Processing is very interestin...,"[Natural, Language, Processing, is, very, inte...",natural language processing is very interestin...,"[natural, language, processing, is, very, inte...","[natural, language, processing, interesting, p...","[natur, languag, process, interest, power]"
1,Students are learning NLP techniques for text ...,"[Students, are, learning, NLP, techniques, for...",students are learning nlp techniques for text ...,"[students, are, learning, nlp, techniques, for...","[students, learning, nlp, techniques, text, an...","[student, learn, nlp, techniqu, text, analysi]"
2,"Text preprocessing includes tokenization, stem...","[Text, preprocessing, includes, tokenization, ...",text preprocessing includes tokenization stemm...,"[text, preprocessing, includes, tokenization, ...","[text, preprocessing, includes, tokenization, ...","[text, preprocess, includ, token, stem, lemmat]"


In [None]:
# Single WordNet lemmatizer instance shared by every call below
lemmatizer = WordNetLemmatizer()


def apply_lemmatization(tokens, pos="n"):
    """Lemmatize each token with the shared WordNet lemmatizer.

    Args:
        tokens: Iterable of word strings.
        pos: WordNet part-of-speech tag applied to every token, e.g. "n"
            (noun), "v" (verb), "a" (adjective), "r" (adverb). Defaults to
            "n", which matches the lemmatizer's own default, so existing
            one-argument calls give the same result as before. With the
            noun default, verb forms such as "learning" are returned
            unchanged; pass ``pos="v"`` to reduce them.

    Returns:
        A list of lemmas in the same order as the input tokens.
    """
    return [lemmatizer.lemmatize(word, pos=pos) for word in tokens]

# Lemmatize the stopword-free tokens (kept separate from the stemmed column)
df["Lemmatized_Text"] = df["Without_Stopwords"].map(apply_lemmatization)
df

Unnamed: 0,Original_Text,Tokens,Normalized_Text,Normalized_Tokens,Without_Stopwords,Stemmed_Text,Lemmatized_Text
0,Natural Language Processing is very interestin...,"[Natural, Language, Processing, is, very, inte...",natural language processing is very interestin...,"[natural, language, processing, is, very, inte...","[natural, language, processing, interesting, p...","[natur, languag, process, interest, power]","[natural, language, processing, interesting, p..."
1,Students are learning NLP techniques for text ...,"[Students, are, learning, NLP, techniques, for...",students are learning nlp techniques for text ...,"[students, are, learning, nlp, techniques, for...","[students, learning, nlp, techniques, text, an...","[student, learn, nlp, techniqu, text, analysi]","[student, learning, nlp, technique, text, anal..."
2,"Text preprocessing includes tokenization, stem...","[Text, preprocessing, includes, tokenization, ...",text preprocessing includes tokenization stemm...,"[text, preprocessing, includes, tokenization, ...","[text, preprocessing, includes, tokenization, ...","[text, preprocess, includ, token, stem, lemmat]","[text, preprocessing, includes, tokenization, ..."
