## Module 2
## Processing and Understanding Text

In [1]:
# Uncomment the line below to install NLTK if it is not already installed
# %pip install nltk

In [2]:
# Text Tokenization (Word and Sentence Tokenization) page(11)
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Uncomment the line below to download the NLTK tokenizer data
# (only needed the first time the notebook is run in a given environment)
# nltk.download('punkt')

In [3]:

# Sample text: a single sentence, so sentence tokenization yields a one-element list
text = "The quick brown fox jumped over the lazy dog."
# Sentence tokenization: split the text into a list of sentence strings
print(".................")
sentences = sent_tokenize(text)
print("Sentence Tokenization:", sentences)
# Word tokenization: split the text into word and punctuation tokens
# (the final "." comes out as its own token)
print(".................")
words = word_tokenize(text)
print("Word Tokenization:", words)

.................
Sentence Tokenization: ['The quick brown fox jumped over the lazy dog.']
.................
Word Tokenization: ['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', '.']


In [4]:
# Stemming   page (13)
from nltk.stem import PorterStemmer

# Words to reduce to their stems
words_to_stem = ["running", "runs", "runner", "easily", "fairly"]

# A single Porter stemmer instance is reused for every word
ps = PorterStemmer()

# Apply the stemmer to each word in order; a stem is not necessarily
# a dictionary word
stemmed_words = list(map(ps.stem, words_to_stem))
print("Stemmed Words:", stemmed_words)

Stemmed Words: ['run', 'run', 'runner', 'easili', 'fairli']


In [5]:
# Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer

# First run only: uncomment to fetch the WordNet data used by the lemmatizer
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Words to be lemmatized
words_to_lemmatize = ["running", "runs", "runner", "easily", "fairly"]

# Build the lemmatizer
lemmatizer = WordNetLemmatizer()

# Every word is looked up as a verb (pos='v'), whatever its real part of speech
lemmatized_words = [
    lemmatizer.lemmatize(token, pos='v') for token in words_to_lemmatize
]
print("Lemmatized Words (verbs):", lemmatized_words)

Lemmatized Words (verbs): ['run', 'run', 'runner', 'easily', 'fairly']


In [6]:
# Further text-normalization steps (listed for reference; no code is given for them in this cell):
# ▪ Handle contractions (I’m → I am, can’t → cannot)
# ▪ Remove or replace specific words or phrases

In [7]:
# Part-of-speech (POS) tagging with NLTK
import nltk
from nltk.tokenize import word_tokenize
# Download the necessary NLTK datasets
#       (uncomment the following two lines the first time this is run)
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# NOTE(review): newer NLTK releases may ask for differently named resources;
# if a LookupError is raised, download the name it reports instead — TODO confirm

# Sample text (this rebinds the global `text`)
text = "The quick brown fox jumps over the lazy dog."
# Tokenize the text into word and punctuation tokens
tokens = word_tokenize(text)
# Perform POS tagging: the result is a list of (token, tag) pairs
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


In [8]:
# Example of shallow parsing in action (Python using NLTK): named-entity chunking
import nltk
from nltk import pos_tag
from nltk.chunk import ne_chunk
from nltk.tokenize import word_tokenize
# First run only: ne_chunk needs extra NLTK data on top of the tokenizer and
# tagger data; uncomment the following two lines to fetch it
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
# Sample sentence
sentence = "John Doe is working at IBM in New York."
# Tokenizing and POS tagging: ne_chunk is given the tagged tokens, not raw text
tokens = word_tokenize(sentence)
tagged_tokens = pos_tag(tokens)
# Shallow parsing using Named Entity Recognition (NER): tagged tokens are
# grouped into labelled chunks such as PERSON, ORGANIZATION and GPE
chunked = ne_chunk(tagged_tokens)
# Printing shows the bracketed tree. The labels are predictions and can be
# wrong: in the recorded output "Doe" is labelled ORGANIZATION.
print(chunked)

(S
  (PERSON John/NNP)
  (ORGANIZATION Doe/NNP)
  is/VBZ
  working/VBG
  at/IN
  (ORGANIZATION IBM/NNP)
  in/IN
  (GPE New/NNP York/NNP)
  ./.)


## SpaCy

In [9]:
# remove the comment below to install spacy if not already installed
#pip install spacy

In [10]:
# Dependency Parsing using spaCy   page (37)
import spacy

# English pipeline whose parse supplies the relations printed below
nlp = spacy.load("en_core_web_sm")


def dependency_parse(sentence):
    """Print one "token -> relation -> head" line for each token of `sentence`.

    The sentence is processed with the module-level `nlp` pipeline. A header
    line is printed first and an empty line last; nothing is returned.
    """
    parsed = nlp(sentence)
    print("Dependency Parsing Output:")
    for token in parsed:
        # token.dep_ is the relation label; token.head is the token it attaches to
        print(f"{token.text} -> {token.dep_} -> {token.head.text}")
    print()


# Run the parser on an example sentence
sentence = "The quick brown fox jumps over the lazy dog."
dependency_parse(sentence)

Dependency Parsing Output:
The -> det -> fox
quick -> amod -> fox
brown -> amod -> fox
fox -> nsubj -> jumps
jumps -> ROOT -> jumps
over -> prep -> jumps
the -> det -> dog
lazy -> amod -> dog
dog -> pobj -> over
. -> punct -> jumps



In [11]:
# Constituency Parsing using spaCy    page (38)
import spacy
from spacy import displacy

# English pipeline used to parse the sentence before it is drawn
nlp = spacy.load("en_core_web_sm")


def constituency_parse(sentence):
    """Print a caption, then draw `sentence` with displaCy's "dep" style.

    The "dep" style is displaCy's dependency visualization (arcs between
    tokens); it is used here as a tree-like stand-in, which is why the
    caption says "Constituency-like". Nothing is returned.
    """
    parsed = nlp(sentence)
    print("Constituency-like Tree Visualization:")
    displacy.render(parsed, style="dep")


# Draw the tree for an example sentence
sentence = "The quick brown fox jumps over the lazy dog."
constituency_parse(sentence)

Constituency-like Tree Visualization:
