# NLTK (1: Tokenize)

In [17]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample paragraph reused by the tokenization examples.
text = 'Anaconda is a package manager, an environment manager, and Python distribution that contains a collection of many open source packages. This is advantageous as when you are working on a data science project, you will find that you need many different packages (numpy, scikit-learn, scipy, pandas to name a few), which an installation of Anaconda comes preinstalled with. If you need additional packages after installing Anaconda, you can use Anacondas package manager, conda, or pip to install those packages.'

# Split the paragraph into sentences; print each one followed by a blank line.
for sentence in sent_tokenize(text):
    print(sentence, end="\n\n")


Anaconda is a package manager, an environment manager, and Python distribution that contains a collection of many open source packages.

This is advantageous as when you are working on a data science project, you will find that you need many different packages (numpy, scikit-learn, scipy, pandas to name a few), which an installation of Anaconda comes preinstalled with.

If you need additional packages after installing Anaconda, you can use Anacondas package manager, conda, or pip to install those packages.



In [20]:
# Break `text` into word-level tokens (punctuation becomes its own token)
# and print one token per line.
tokens = word_tokenize(text)
for token in tokens:
    print(token)
   

Anaconda
is
a
package
manager
,
an
environment
manager
,
and
Python
distribution
that
contains
a
collection
of
many
open
source
packages
.
This
is
advantageous
as
when
you
are
working
on
a
data
science
project
,
you
will
find
that
you
need
many
different
packages
(
numpy
,
scikit-learn
,
scipy
,
pandas
to
name
a
few
)
,
which
an
installation
of
Anaconda
comes
preinstalled
with
.
If
you
need
additional
packages
after
installing
Anaconda
,
you
can
use
Anacondas
package
manager
,
conda
,
or
pip
to
install
those
packages
.


# NLTK (2: Stop Words)

In [22]:
from nltk.corpus import stopwords

# NLTK's default English stop-word list, stored as a set so that the
# membership tests in the filtering step are cheap.
english_stop_list = stopwords.words("english")
stop_words = set(english_stop_list)
print(stop_words)

{'most', 'she', 'myself', "couldn't", 've', 'under', 'will', 'weren', 'hers', "hadn't", 'didn', 'hadn', 'am', 'the', 'he', "shouldn't", 'just', 'been', 'so', 'should', 'my', 'to', 'isn', 'do', 'does', 'don', 'until', "aren't", 'wouldn', 'because', 'our', 'where', 'of', "won't", 'll', 'few', 'for', 'by', 'between', "didn't", 'but', "you'll", 'them', 'through', 'or', 'against', 'after', 'all', 'and', 'at', 'while', 'his', 'once', 'yourself', 'aren', 'with', 'some', 'too', 'yours', 'those', 'before', 'hasn', 'm', 'why', 'how', "shan't", 'when', "you've", 'yourselves', 'it', 'into', 'ma', 'themselves', "wouldn't", 'not', 'below', 'other', 'can', 'has', 'nor', 'whom', 'doing', "don't", 'now', "doesn't", 'only', 'couldn', 'have', 'did', 'shouldn', 'be', 'during', 'up', 'above', 'own', "you're", 'that', 'each', "haven't", "it's", 'are', 'both', 'down', 'then', "hasn't", 'there', 'such', "mightn't", 'out', 'itself', 'a', 'than', 'himself', 'you', "weren't", "you'd", 'these', "that'll", 'on', '

In [25]:
text = "You can upload and commit an existing file to a GitHub repository. Drag and drop a file to any directory in the file tree, or upload files from the repository's main page."
text_words = word_tokenize(text)

# Keep only the tokens that are not in the stop-word set.
# NOTE: the comparison is case-sensitive, so a capitalised token such as
# 'You' is kept even though 'you' is a stop word.
result_word = [token for token in text_words if token not in stop_words]
print(result_word)

# Optional sanity check: compare the token counts before and after filtering.
# print(len(text_words))
# print(len(result_word))

['You', 'upload', 'commit', 'existing', 'file', 'GitHub', 'repository', '.', 'Drag', 'drop', 'file', 'directory', 'file', 'tree', ',', 'upload', 'files', 'repository', "'s", 'main', 'page', '.']


# NLTK (3: Stemming)

In [31]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
text = "Ali plays the cricket today while it was playing from his baat the match has been lost that played by ali's team"
words = word_tokenize(text)

# Stem every token and print one stem per line.
# The inflected forms 'plays', 'playing' and 'played' all reduce to the same
# stem, 'play'. Stems are not always dictionary words: 'was' becomes 'wa'
# and 'his' becomes 'hi'.
for stem in map(ps.stem, words):
    print(stem)

ali
play
the
cricket
today
while
it
wa
play
from
hi
baat
the
match
ha
been
lost
that
play
by
ali
's
team


# NLTK (4: Tagging)

In [38]:
import nltk

text = 'Anaconda is a package manager, an environment manager, and Python distribution that contains a collection of many open source packages. This is advantageous as when you are working on a data science project, you will find that you need many different packages (numpy, scikit-learn, scipy, pandas to name a few), which an installation of Anaconda comes preinstalled with. If you need additional packages after installing Anaconda, you can use Anacondas package manager, conda, or pip to install those packages.'
words = nltk.word_tokenize(text)

# POS tagging takes a sequence of tokens and pairs each one with its
# part-of-speech tag, giving a list of (token, tag) tuples.
tagged_words = nltk.pos_tag(words)
print(tagged_words)

[('Anaconda', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('package', 'NN'), ('manager', 'NN'), (',', ','), ('an', 'DT'), ('environment', 'NN'), ('manager', 'NN'), (',', ','), ('and', 'CC'), ('Python', 'NNP'), ('distribution', 'NN'), ('that', 'WDT'), ('contains', 'VBZ'), ('a', 'DT'), ('collection', 'NN'), ('of', 'IN'), ('many', 'JJ'), ('open', 'JJ'), ('source', 'NN'), ('packages', 'NNS'), ('.', '.'), ('This', 'DT'), ('is', 'VBZ'), ('advantageous', 'JJ'), ('as', 'IN'), ('when', 'WRB'), ('you', 'PRP'), ('are', 'VBP'), ('working', 'VBG'), ('on', 'IN'), ('a', 'DT'), ('data', 'NN'), ('science', 'NN'), ('project', 'NN'), (',', ','), ('you', 'PRP'), ('will', 'MD'), ('find', 'VB'), ('that', 'IN'), ('you', 'PRP'), ('need', 'VBP'), ('many', 'JJ'), ('different', 'JJ'), ('packages', 'NNS'), ('(', '('), ('numpy', 'JJ'), (',', ','), ('scikit-learn', 'JJ'), (',', ','), ('scipy', 'JJ'), (',', ','), ('pandas', 'JJ'), ('to', 'TO'), ('name', 'VB'), ('a', 'DT'), ('few', 'JJ'), (')', ')'), (',', ','), ('which', 'W

In [39]:
import nltk

# Paragraph passed to the Punkt sentence tokenizer's constructor as training text.
train_text = 'Anaconda is a package manager, an environment manager, and Python distribution that contains a collection of many open source packages. This is advantageous as when you are working on a data science project, you will find that you need many different packages (numpy, scikit-learn, scipy, pandas to name a few), which an installation of Anaconda comes preinstalled with. If you need additional packages after installing Anaconda, you can use Anacondas package manager, conda, or pip to install those packages.'
# A second paragraph; it is not consumed below. Pass it to
# sent_tokenizer.tokenize(...) to split text the tokenizer was not trained on.
test_text = 'When you create a new environment, Navigator installs the same Python version you used when you downloaded and installed Anaconda. If you want to use a different version of Python, for example Python 3.5, simply create a new environment and specify the version of Python that you want in that environment.'

sent_tokenizer = nltk.PunktSentenceTokenizer(train_text)

# Use the trained tokenizer instance here: `sent_tokenize` is the function
# imported from nltk.tokenize and has no `.tokenize()` method.
tokenized = sent_tokenizer.tokenize(train_text)

# Tag each sentence separately and print its list of (token, tag) pairs.
for sent in tokenized:
    words = nltk.word_tokenize(sent)
    print(nltk.pos_tag(words))

[('Anaconda', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('package', 'NN'), ('manager', 'NN'), (',', ','), ('an', 'DT'), ('environment', 'NN'), ('manager', 'NN'), (',', ','), ('and', 'CC'), ('Python', 'NNP'), ('distribution', 'NN'), ('that', 'WDT'), ('contains', 'VBZ'), ('a', 'DT'), ('collection', 'NN'), ('of', 'IN'), ('many', 'JJ'), ('open', 'JJ'), ('source', 'NN'), ('packages', 'NNS'), ('.', '.')]
[('This', 'DT'), ('is', 'VBZ'), ('advantageous', 'JJ'), ('as', 'IN'), ('when', 'WRB'), ('you', 'PRP'), ('are', 'VBP'), ('working', 'VBG'), ('on', 'IN'), ('a', 'DT'), ('data', 'NN'), ('science', 'NN'), ('project', 'NN'), (',', ','), ('you', 'PRP'), ('will', 'MD'), ('find', 'VB'), ('that', 'IN'), ('you', 'PRP'), ('need', 'VBP'), ('many', 'JJ'), ('different', 'JJ'), ('packages', 'NNS'), ('(', '('), ('numpy', 'JJ'), (',', ','), ('scikit-learn', 'JJ'), (',', ','), ('scipy', 'JJ'), (',', ','), ('pandas', 'JJ'), ('to', 'TO'), ('name', 'VB'), ('a', 'DT'), ('few', 'JJ'), (')', ')'), (',', ','), ('which', '

In [45]:
# Print NLTK's built-in description, with example words, of the POS tag 'RB'.
nltk.help.upenn_tagset('RB') # use this to look up what a part-of-speech tag means

RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...


In [46]:
# Print the description and example words for the POS tag 'PRP'.
nltk.help.upenn_tagset('PRP')

PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us


In [47]:
# Print the description and example words for the POS tag 'NNS'.
nltk.help.upenn_tagset('NNS')

NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...


In [None]:
'''POS tag list:

CC	coordinating conjunction
CD	cardinal digit
DT	determiner
EX	existential there (like: "there is" ... think of it like "there exists")
FW	foreign word
IN	preposition/subordinating conjunction
JJ	adjective	'big'
JJR	adjective, comparative	'bigger'
JJS	adjective, superlative	'biggest'
LS	list marker	1)
MD	modal	could, will
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
PDT	predeterminer	'all the kids'
POS	possessive ending	parent\'s
PRP	personal pronoun	I, he, she
PRP$	possessive pronoun	my, his, hers
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
RP	particle	give up
TO	to	go 'to' the store.
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-adverb	where, when'''

# NLTK (5: Chunking)

In [21]:
import nltk

text = "I eat pasta yesterday"

# Step 1: split the sentence into tokens and show them.
words = nltk.word_tokenize(text)
print(words)

# Step 2: attach a part-of-speech tag to every token.
tagged = nltk.pos_tag(words)

# Step 3: run the tagged tokens through ne_chunk and print the resulting tree.
# For this sentence the tree is flat: no token is grouped into a sub-chunk.
chunks = nltk.ne_chunk(tagged)
print(chunks)

['I', 'eat', 'pasta', 'yesterday']
(S I/PRP eat/VBP pasta/NN yesterday/NN)


In [12]:
import nltk

# Chunking groups neighbouring words into phrases according to their
# part-of-speech tags.
sentence = "GreyAtom is committed to building an educational ecosystem for learners to upskill & help them make a career in data science."

# Grammar with one rule: a noun phrase (NP) is an optional determiner,
# any number of adjectives, then a singular noun.
grammar = '''                   
    NP: {<DT>?<JJ>*<NN>} # NP
    '''

tokens = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(tokens)

chunk_pars = nltk.RegexpParser(grammar)
tree = chunk_pars.parse(tagged)

# subtrees() yields the whole sentence tree first, then each NP chunk.
for node in tree.subtrees():
    print(node)

# Uncomment to show the tree graphically:
#tree.draw()


(S
  GreyAtom/NNP
  is/VBZ
  committed/VBN
  to/TO
  building/VBG
  (NP an/DT educational/JJ ecosystem/NN)
  for/IN
  learners/NNS
  to/TO
  upskill/VB
  &/CC
  help/VB
  them/PRP
  make/VB
  (NP a/DT career/NN)
  in/IN
  data/NNS
  (NP science/NN)
  ./.)
(NP an/DT educational/JJ ecosystem/NN)
(NP a/DT career/NN)
(NP science/NN)


# NLTK (6: Chinking)

In [11]:
import nltk

# Chinking is the counterpart of chunking: first build a chunk, then cut
# unwanted tokens (the "chink") back out of it, which splits the chunk.
sentence = 'GreyAtom is committed to building an educational ecosystem for learners to upskill & help them make a career in data science.'

grammar = r"""
    NP:
    {<.*>+}          # Chunk everything
    }<VBD|IN>+{      # Chink sequences of VBD and IN
  """

tokens = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(tokens)

cp = nltk.RegexpParser(grammar)
result = cp.parse(tagged)

# Iterating the tree directly walks its top-level children: the remaining
# NP chunks, and the chinked tokens as bare (word, tag) tuples.
for node in result:
    print(node)

# Uncomment to show the tree graphically:
# result.draw()

(NP
  GreyAtom/NNP
  is/VBZ
  committed/VBN
  to/TO
  building/VBG
  an/DT
  educational/JJ
  ecosystem/NN)
('for', 'IN')
(NP
  learners/NNS
  to/TO
  upskill/VB
  &/CC
  help/VB
  them/PRP
  make/VB
  a/DT
  career/NN)
('in', 'IN')
(NP data/NNS science/NN ./.)


# NLTK (7: Named Entity Recognition)

In [35]:
from nltk import *

sentence = "Mark and John are working at Google."

# Run tokenize -> POS-tag -> named-entity chunk once and reuse the tree
# for both printed views, instead of running the pipeline a second time.
tagged = ne_chunk(pos_tag(word_tokenize(sentence)))
print(tagged)

# Flatten the tree into (word, POS tag, entity tag) triples; tokens outside
# any entity are labelled 'O'.
ab = tree2conlltags(tagged)
print(ab)

(S
  (PERSON Mark/NNP)
  and/CC
  (PERSON John/NNP)
  are/VBP
  working/VBG
  at/IN
  (ORGANIZATION Google/NNP)
  ./.)
[('Mark', 'NNP', 'B-PERSON'), ('and', 'CC', 'O'), ('John', 'NNP', 'B-PERSON'), ('are', 'VBP', 'O'), ('working', 'VBG', 'O'), ('at', 'IN', 'O'), ('Google', 'NNP', 'B-ORGANIZATION'), ('.', '.', 'O')]
