<a href="https://colab.research.google.com/github/ihabiba/NLP-Labs/blob/main/NLP_Pipeline_Builder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## TOKENIZATION

In [2]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# download the sentence & word tokenizer models
nltk.download('punkt_tab')

text = """Artificial Intelligence (AI) is transforming industries worldwide.
With its applications in healthcare, finance, and education, AI provides innovative solutions."""

# Sentence and Word Tokenization
def tokenize_text(text):
    print("--- Tokenization ---")
    print("Sentence Tokenization:")
    sentences = sent_tokenize(text)
    print(sentences)

    print("\nWord Tokenization:")
    words = word_tokenize(text)
    print(words)

tokenize_text(text)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


--- Tokenization ---
Sentence Tokenization:
['Artificial Intelligence (AI) is transforming industries worldwide.', 'With its applications in healthcare, finance, and education, AI provides innovative solutions.']

Word Tokenization:
['Artificial', 'Intelligence', '(', 'AI', ')', 'is', 'transforming', 'industries', 'worldwide', '.', 'With', 'its', 'applications', 'in', 'healthcare', ',', 'finance', ',', 'and', 'education', ',', 'AI', 'provides', 'innovative', 'solutions', '.']


## STEMMING

In [3]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

text = "Artificial Intelligence is transforming industries worldwide."
tokenlist = word_tokenize(text)

def stem_text(tokenlist):
    print("--- Stemming ---")
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in tokenlist]
    print("Stemmed Words:")
    print(stemmed_words)

stem_text(tokenlist)

--- Stemming ---
Stemmed Words:
['artifici', 'intellig', 'is', 'transform', 'industri', 'worldwid', '.']


## LEMMATIZATION

In [4]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

nltk.download('wordnet')

text = "Artificial Intelligence is transforming industries worldwide."
words = word_tokenize(text)

def lemmatize_text(words):
    print("--- Lemmatization ---")
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    print("Lemmatized Words:")
    print(lemmatized_words)

lemmatize_text(words)

[nltk_data] Downloading package wordnet to /root/nltk_data...


--- Lemmatization ---
Lemmatized Words:
['Artificial', 'Intelligence', 'is', 'transforming', 'industry', 'worldwide', '.']


## STOP WORD REMOVAL

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

nltk.download('stopwords')

text = "Artificial Intelligence is transforming industries worldwide."
words = word_tokenize(text)

def identify_stop_words(words):
    print("--- Stop Words ---")
    stop_words = set(stopwords.words("english"))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    print("Filtered Words (without stop words):")
    print(filtered_words)

identify_stop_words(words)

--- Stop Words ---
Filtered Words (without stop words):
['Artificial', 'Intelligence', 'transforming', 'industries', 'worldwide', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## POS (Part-of-Speech) TAGGING

In [6]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('averaged_perceptron_tagger_eng')

text = "Artificial Intelligence is transforming industries worldwide."
words = word_tokenize(text)

def pos_tagging(words):
    print("--- POS Tagging ---")
    pos_tags = nltk.pos_tag(words)
    print("POS Tags:")
    print(pos_tags)

pos_tagging(words)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


--- POS Tagging ---
POS Tags:
[('Artificial', 'JJ'), ('Intelligence', 'NNP'), ('is', 'VBZ'), ('transforming', 'VBG'), ('industries', 'NNS'), ('worldwide', 'RB'), ('.', '.')]


## DEPENDENCY PARSING (with spaCy)

In [7]:
import spacy

# Load English model
nlp = spacy.load("en_core_web_sm")

text = "Artificial Intelligence is transforming industries worldwide."

def dependency_parsing(text):
    print("--- Dependency Parsing ---")
    doc = nlp(text)
    for token in doc:
        print(f"Word: {token.text}, Dependency: {token.dep_}, Head: {token.head.text}, POS: {token.pos_}")

dependency_parsing(text)

# Visualize the dependency tree (works in Jupyter)
from spacy import displacy
displacy.render(nlp(text), style="dep", jupyter=True)

--- Dependency Parsing ---
Word: Artificial, Dependency: compound, Head: Intelligence, POS: PROPN
Word: Intelligence, Dependency: nsubj, Head: transforming, POS: PROPN
Word: is, Dependency: aux, Head: transforming, POS: AUX
Word: transforming, Dependency: ROOT, Head: transforming, POS: VERB
Word: industries, Dependency: dobj, Head: transforming, POS: NOUN
Word: worldwide, Dependency: advmod, Head: transforming, POS: ADV
Word: ., Dependency: punct, Head: transforming, POS: PUNCT


## NAMED ENTITY RECOGNITION (NER)

In [8]:
import spacy

nlp = spacy.load("en_core_web_sm")
text = "Artificial Intelligence is transforming industries worldwide."

def named_entity_recognition(text):
    print("--- Named Entity Recognition ---")
    doc = nlp(text)
    for ent in doc.ents:
        print(f"Entity: {ent.text}, Label: {ent.label_}")

named_entity_recognition(text)

--- Named Entity Recognition ---
Entity: Artificial Intelligence, Label: PERSON


## Task 3.1 — Why is Text Preprocessing Essential in Building an NLP Pipeline?

#### Text preprocessing is important because it cleans and standardizes raw text before applying NLP or machine learning models. It removes noise like punctuation and stop words, and uses steps such as tokenization, stemming, and lemmatization to make text simpler and more consistent. This helps models focus on meaningful patterns and improves overall accuracy.

## TASK 3.2 — GUTENBERG CORPUS ANALYSIS

In [9]:
import nltk
from nltk.corpus import gutenberg, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist

# Download required datasets
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')

# Step 1: Load and Explore the Dataset
print("--- Available Texts in Gutenberg Corpus ---")
print(gutenberg.fileids())

# Select a sample text
text = gutenberg.raw('austen-emma.txt')
print("\n--- First 500 Characters of the Text ---")
print(text[:500])

# Step 2: Tokenization
sentences = sent_tokenize(text)
words = word_tokenize(text)

print("\n--- Number of Sentences ---")
print(len(sentences))
print("--- Number of Words ---")
print(len(words))

# Step 3: Remove Stop Words
stop_words = set(stopwords.words('english'))
filtered_words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]

print("\n--- First 20 Non-Stop Words ---")
print(filtered_words[:20])

# Step 4: Word Frequency Analysis
freq_dist = FreqDist(filtered_words)
print("\n--- 10 Most Common Words ---")
print(freq_dist.most_common(10))

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


--- Available Texts in Gutenberg Corpus ---
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

--- First 500 Characters of the Text ---
[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had 

## TASK 3.3 — EFFECT OF STOP WORD REMOVAL

In [10]:
# ======== TASK 3.3 — EFFECT OF STOP WORD REMOVAL ========

all_words = [word.lower() for word in words if word.isalpha()]
freq_all = FreqDist(all_words)

print("\n--- 10 Most Common Words (WITHOUT Removing Stop Words) ---")
print(freq_all.most_common(10))


--- 10 Most Common Words (WITHOUT Removing Stop Words) ---
[('the', 5201), ('to', 5181), ('and', 4877), ('of', 4284), ('i', 3177), ('a', 3124), ('it', 2503), ('her', 2448), ('was', 2396), ('she', 2336)]


## TASK 3.4 — APPLY FULL NLP PIPELINE ON A BIGGER DATASET


In [13]:
import nltk
import spacy
from nltk.corpus import gutenberg, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.stem import PorterStemmer


# Download required resources
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

# Load a large text (Jane Austen’s "Emma")
text = gutenberg.raw('austen-emma.txt')

# Load spaCy model for parsing and NER
nlp = spacy.load("en_core_web_sm")

# First 5000 Characters of the Text
sample_text = text[:5000]

# Tokenization
tokens = word_tokenize(sample_text)
print("--- Number of Tokens ---", len(tokens))
print("Sample Tokens:", tokens[:20])

# Stemming
stemmer = PorterStemmer()
stems = [stemmer.stem(word.lower()) for word in tokens if word.isalpha()]
print("\nSample Stems:", stems[:20])

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(word.lower()) for word in tokens if word.isalpha()]
print("\nSample Lemmas:", lemmas[:20])

# Stop word removal
stop_words = set(stopwords.words("english"))
filtered = [w for w in lemmas if w not in stop_words]
print("\nFiltered Words (No Stop Words):", filtered[:20])
print("Remaining Word Count:", len(filtered))

# POS tagging
pos_tags = pos_tag(filtered)
print("\nPOS Tags (first 15):", pos_tags[:15])

# Dependency parsing
doc = nlp(" ".join(filtered[:30]))
print("\nDependency Relations:")
for token in doc:
    print(f"{token.text} → {token.dep_} → {token.head.text}")

# Named entity recognition
print("\nNamed Entities:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


--- Number of Tokens --- 992
Sample Tokens: ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER', 'I', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich']

Sample Stems: ['emma', 'by', 'jane', 'austen', 'volum', 'i', 'chapter', 'i', 'emma', 'woodhous', 'handsom', 'clever', 'and', 'rich', 'with', 'a', 'comfort', 'home', 'and', 'happi']

Sample Lemmas: ['emma', 'by', 'jane', 'austen', 'volume', 'i', 'chapter', 'i', 'emma', 'woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home', 'and', 'happy']

Filtered Words (No Stop Words): ['emma', 'jane', 'austen', 'volume', 'chapter', 'emma', 'woodhouse', 'handsome', 'clever', 'rich', 'comfortable', 'home', 'happy', 'disposition', 'seemed', 'unite', 'best', 'blessing', 'existence', 'lived']
Remaining Word Count: 417

POS Tags (first 15): [('emma', 'NN'), ('jane', 'NN'), ('austen', 'JJ'), ('volume', 'NN'), ('chapter', 'NN'), ('emma', 'NN'), ('woodhouse', 'IN'), ('handsome', 'JJ

In [None]:
#