In [1]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kittu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
#1 Word Analysis
from nltk.tokenize import word_tokenize as wt
from nltk.stem import PorterStemmer as ps
from nltk import *
text = "Natural Language Processing is fascinating!"
token = wt(text)
print("Tokens:",wt(text))

stemmer = ps()
words = [stemmer.stem(x) for x in token]
print("Stems:",words)

print("Pos tags:", pos_tag(token))

Tokens: ['Natural', 'Language', 'Processing', 'is', 'fascinating', '!']
Stems: ['natur', 'languag', 'process', 'is', 'fascin', '!']
Pos tags: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('is', 'VBZ'), ('fascinating', 'VBG'), ('!', '.')]


In [3]:
#Morphology
from nltk.tokenize import word_tokenize as wt
from nltk.stem import PorterStemmer as ps
from nltk.stem import WordNetLemmatizer as wn
from nltk import *
text = "Natural Language Processing is fascinating!"
token = wt(text)
print("Tokens:",wt(text))

stemmer = ps()
words = [stemmer.stem(x) for x in token]
print("Stems:",words)

lem = wn()
wl = [lem.lemmatize(x) for x in token]
print("lemma:",wl)

Tokens: ['Natural', 'Language', 'Processing', 'is', 'fascinating', '!']
Stems: ['natur', 'languag', 'process', 'is', 'fascin', '!']
lemma: ['Natural', 'Language', 'Processing', 'is', 'fascinating', '!']


In [4]:
#N-Grams
import nltk
from nltk import ngrams
from nltk.tokenize import word_tokenize


def gen(text, n):
    words = word_tokenize(text)
    n_grams = list(ngrams(words, n))
    return n_grams

text = "Natural Language Processing is fascinating and powerful."
n = 3
result = gen(text, n)
print(f"{n}-grams:", result)

3-grams: [('Natural', 'Language', 'Processing'), ('Language', 'Processing', 'is'), ('Processing', 'is', 'fascinating'), ('is', 'fascinating', 'and'), ('fascinating', 'and', 'powerful'), ('and', 'powerful', '.')]


In [5]:
#N-Grams Smoothing
import nltk
from nltk import ngrams
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

def gen(text, n):
    words = word_tokenize(text)
    n_grams = list(ngrams(words, n))
    freq_dist = FreqDist(n_grams)
    vocab_size = len(set(n_grams))
    smoothed_n_grams = [(gram, (freq_dist[gram] + 1) / (len(n_grams) + vocab_size)) for gram in n_grams]
    return smoothed_n_grams

text_to_analyze = "Natural Language Processing is fascinating and powerful."
n_value = 2
result_ngrams = gen(text_to_analyze, n_value)
print(f"{n_value}-grams with Add-One Smoothing:", result_ngrams)

2-grams with Add-One Smoothing: [(('Natural', 'Language'), 0.14285714285714285), (('Language', 'Processing'), 0.14285714285714285), (('Processing', 'is'), 0.14285714285714285), (('is', 'fascinating'), 0.14285714285714285), (('fascinating', 'and'), 0.14285714285714285), (('and', 'powerful'), 0.14285714285714285), (('powerful', '.'), 0.14285714285714285)]


In [6]:
#6. WORD TOKENIZER
from nltk.tokenize import word_tokenize as wt
text = "Natural Language Processing is fascinating!"
token = wt(text)
print("Tokens:",wt(text))

Tokens: ['Natural', 'Language', 'Processing', 'is', 'fascinating', '!']


In [7]:
#7 SENTENCE TOKENIZER
from nltk.tokenize import sent_tokenize as st
text = "Natural Language Processing is fascinating!. Show sensitivity to linguistic phenomena and an ability to model them with formal grammars."
token = st(text)
# print("Tokens:",st(text))

for i,x in enumerate(token,start = 1):
    print(f"{i}. {x}")

1. Natural Language Processing is fascinating!.
2. Show sensitivity to linguistic phenomena and an ability to model them with formal grammars.


In [8]:
#8 PARAGRAPH TOKENIZER

text = """ NLTK is a powerful library for natural language processing.It provides tools for tasks such as tokenization, stemming, and part-of-speech tagging.

The library is widely used in the field of artificial intelligence.It is a valuable resource for researchers and developers working on NLP projects.
"""

para = text.split('\n\n')

for i,x in enumerate(para,start = 1):
    print(f"{i}. {x.strip()}")

1. NLTK is a powerful library for natural language processing.It provides tools for tasks such as tokenization, stemming, and part-of-speech tagging.
2. The library is widely used in the field of artificial intelligence.It is a valuable resource for researchers and developers working on NLP projects.


In [14]:
#9 Corpora
import nltk
from nltk.corpus import reuters

nltk.download('reuters')

def load_reuters_corpus():
    corpus = reuters
    
    print("Number of Categories:", len(corpus.categories()))
    print("Categories:", corpus.categories()[:10])
    print("File IDs in 'crude' category:", corpus.fileids('crude')[:5])
    
    document_id = 'test/14826'
    document_text = corpus.raw(document_id)
    print("\nText of Document (ID:", document_id, "):\n", document_text[:500])

load_reuters_corpus()


Number of Categories: 90
Categories: ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee']
File IDs in 'crude' category: ['test/14829', 'test/15063', 'test/15200', 'test/15230', 'test/15238']

Text of Document (ID: test/14826 ):
 ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
  Mounting trade friction between the
  U.S. And Japan has raised fears among many of Asia's exporting
  nations that the row could inflict far-reaching economic
  damage, businessmen and officials said.
      They told Reuter correspondents in Asian capitals a U.S.
  Move against Japan might boost protectionist sentiment in the
  U.S. And lead to curbs on American imports of their products.
      But some exporters said that while the conflict wo


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\kittu\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [11]:
#10. .PROBABILISTIC PARSING
import nltk
from nltk import CFG
from nltk.parse import EarleyChartParser

pcfg_grammar = CFG.fromstring("""
    S -> NP VP
    NP -> Det N | NP PP | 'John'
    Det -> 'the' | 'a'
    N -> 'man' | 'dog' | 'cat'
    VP -> V NP | VP PP
    V -> 'chased' | 'saw'
    PP -> P NP
    P -> 'with' | 'in'
""")

def probabilistic_parsing(sentence):
    tokens = nltk.word_tokenize(sentence)
    parser = EarleyChartParser(pcfg_grammar)
    for tree in parser.parse(tokens):
        print("Parse Tree:")
        print(tree)
        break
        
example_sentence = "the man saw a cat with a dog"
probabilistic_parsing(example_sentence)


Parse Tree:
(S
  (NP (Det the) (N man))
  (VP
    (VP (V saw) (NP (Det a) (N cat)))
    (PP (P with) (NP (Det a) (N dog)))))


In [12]:
import nltk
from nltk import PCFG
from nltk.parse import EarleyChartParser

pcfg_grammar = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.4] | 'John' [0.1]
    Det -> 'the' [0.6] | 'a' [0.4]
    N -> 'man' [0.5] | 'dog' [0.3] | 'cat' [0.2]
    VP -> V NP [0.7] | VP PP [0.3]
    V -> 'chased' [0.4] | 'saw' [0.6]
    PP -> P NP [1.0]
    P -> 'with' [0.7] | 'in' [0.3]
""")

def probabilistic_parsing(sentence):
    tokens = nltk.word_tokenize(sentence)
    parser = EarleyChartParser(pcfg_grammar)
    for tree in parser.parse(tokens):
        print("Parse Tree with Probability:", tree)
        break  

example_sentence = "the man saw a cat with a dog"
probabilistic_parsing(example_sentence)


Parse Tree with Probability: (S
  (NP (Det the) (N man))
  (VP
    (VP (V saw) (NP (Det a) (N cat)))
    (PP (P with) (NP (Det a) (N dog)))))
