In [1]:
import nltk

# Sentence Tokenization

In [2]:
from nltk.tokenize import sent_tokenize
cor = "My name is Sandeep of CSE-10, currently living in BBSR. Dr. John has done ph.D at IIT Madras located in Tamil Nadu on 24/01/2023. And yay! we're going to meet soon"
sent = sent_tokenize(cor)
sent

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/home/codespace/nltk_data'
    - '/workspaces/ML-Basic/.conda/nltk_data'
    - '/workspaces/ML-Basic/.conda/share/nltk_data'
    - '/workspaces/ML-Basic/.conda/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [None]:
for i in sent:
    print(i)

My name is Sandeep of CSE-10, currently living in BBSR.
Dr. John has done ph.D at IIT Madras located in Tamil Nadu on 24/01/2023.
And yay!
we're going to meet soon


# Word Tokenization

In [None]:
from nltk.tokenize import word_tokenize
token = word_tokenize(sent[1])
token

['Dr.',
 'John',
 'has',
 'done',
 'ph.D',
 'at',
 'IIT',
 'Madras',
 'located',
 'in',
 'Tamil',
 'Nadu',
 'on',
 '24/01/2023',
 '.']

# Stemming using Porter Stemmer

In [None]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
for i in token:
    print(ps.stem(i))

dr.
john
ha
done
ph.d
at
iit
madra
locat
in
tamil
nadu
on
24/01/2023
.


# Stemming using Lancaster Stemmer

In [None]:
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()
for i in token:
    print(ls.stem(i))

dr.
john
has
don
ph.d
at
iit
madra
loc
in
tamil
nadu
on
24/01/2023
.


# Stemming using Snowball Stemmer

In [None]:
from nltk.stem import SnowballStemmer
ss = SnowballStemmer('english')
for i in token:
    print(ss.stem(i))

dr.
john
has
done
ph.d
at
iit
madra
locat
in
tamil
nadu
on
24/01/2023
.


# Lemmatization using WordNet Lemmatizer

In [None]:
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()
for i in token:
    print(wl.lemmatize(i))

Dr.
John
ha
done
ph.D
at
IIT
Madras
located
in
Tamil
Nadu
on
24/01/2023
.


# POS Tagging

In [None]:
from nltk import pos_tag
pos_tag(token)

[('Dr.', 'NNP'),
 ('John', 'NNP'),
 ('has', 'VBZ'),
 ('done', 'VBN'),
 ('ph.D', 'NN'),
 ('at', 'IN'),
 ('IIT', 'NNP'),
 ('Madras', 'NNP'),
 ('located', 'VBN'),
 ('in', 'IN'),
 ('Tamil', 'NNP'),
 ('Nadu', 'NNP'),
 ('on', 'IN'),
 ('24/01/2023', 'CD'),
 ('.', '.')]

# Spacy for lemmatisation and pos tagging

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(cor)

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.dep_)

My my PRON poss
name name NOUN nsubj
is be AUX ROOT
Sandeep Sandeep PROPN attr
of of ADP prep
CSE-10 CSE-10 PROPN pobj
, , PUNCT punct
currently currently ADV advmod
living live VERB advcl
in in ADP prep
BBSR BBSR PROPN pobj
. . PUNCT punct
Dr. Dr. PROPN compound
John John PROPN nsubj
has have AUX aux
done do VERB ROOT
ph ph NOUN compound
. . PUNCT compound
D D PROPN dobj
at at ADP prep
IIT IIT PROPN compound
Madras Madras PROPN pobj
located locate VERB acl
in in ADP prep
Tamil Tamil PROPN compound
Nadu Nadu PROPN pobj
on on ADP prep
24/01/2023 24/01/2023 NUM pobj
. . PUNCT punct
And and CCONJ cc
yay yay INTJ ROOT
! ! PUNCT punct
we we PRON nsubj
're be AUX aux
going go VERB ROOT
to to PART aux
meet meet VERB xcomp
soon soon ADV advmod


# Named Entity Recognition

In [None]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Sandeep GPE
John PERSON
IIT Madras ORG
Tamil Nadu PERSON
24/01/2023 DATE


# N-gram Models

In [None]:
from nltk.util import ngrams, bigrams, trigrams

token = nltk.word_tokenize("Hello my name is Sandeep and I am a student from BBSR.")

print(list(bigrams(token)))
print(list(trigrams(token)))
print(list(ngrams(token, 4)))

[('Hello', 'my'), ('my', 'name'), ('name', 'is'), ('is', 'Sandeep'), ('Sandeep', 'and'), ('and', 'I'), ('I', 'am'), ('am', 'a'), ('a', 'student'), ('student', 'from'), ('from', 'BBSR'), ('BBSR', '.')]
[('Hello', 'my', 'name'), ('my', 'name', 'is'), ('name', 'is', 'Sandeep'), ('is', 'Sandeep', 'and'), ('Sandeep', 'and', 'I'), ('and', 'I', 'am'), ('I', 'am', 'a'), ('am', 'a', 'student'), ('a', 'student', 'from'), ('student', 'from', 'BBSR'), ('from', 'BBSR', '.')]
[('Hello', 'my', 'name', 'is'), ('my', 'name', 'is', 'Sandeep'), ('name', 'is', 'Sandeep', 'and'), ('is', 'Sandeep', 'and', 'I'), ('Sandeep', 'and', 'I', 'am'), ('and', 'I', 'am', 'a'), ('I', 'am', 'a', 'student'), ('am', 'a', 'student', 'from'), ('a', 'student', 'from', 'BBSR'), ('student', 'from', 'BBSR', '.')]
