## One time Installation

In [None]:
!pip install nltk
nltk.download('all')
!pip3 install spacy
!python -m spacy download en_core_web_sm

## **NLTK**

In [1]:
import nltk.corpus

#### **Tokenization**
Splitting up a text into meaning units 

In [2]:
import nltk
from nltk.tokenize import word_tokenize

In [3]:
sentence = 'Rohit Raj is the new Mayor of the town!!'
tokens = word_tokenize(sentence)
tokens

['Rohit', 'Raj', 'is', 'the', 'new', 'Mayor', 'of', 'the', 'town', '!', '!']

#### **Stopwords**

Removing not meaningful words like is of the an

In [4]:
from nltk.corpus import stopwords


sentence = 'Rohit Raj is the new Mayor of the town!!'

stop_words = set(stopwords.words('english'))  

word_tokens = word_tokenize(sentence)

filtered_sentence = [w for w in word_tokens if not w in stop_words]  
  
filtered_sentence = []  
  
for w in word_tokens:  
    if w not in stop_words:  
        filtered_sentence.append(w) 

filtered_sentence

['Rohit', 'Raj', 'new', 'Mayor', 'town', '!', '!']

#### **Stemming**
Stemming is the process of reducing a word to its stem by cutting it off at beginnnig or end.

**Note:** Not everytime stemming will give a result which will have a meaningfull meaning

In [5]:
from nltk.stem import PorterStemmer
stemming = PorterStemmer()

stemming.stem('Winning'),stemming.stem('studies'),stemming.stem('buys')


('win', 'studi', 'buy')

#### **Lemmatization**
Lemmatization is the process of reducing a word to its lemma or dictionary form
Note: Unlike stemming will give a meaning result

In [6]:
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

lemma.lemmatize('Winning'),lemma.lemmatize('studies'),lemma.lemmatize('buys')

('Winning', 'study', 'buy')

#### **Parts of Speech Tagging (POS Tagging)** 

Process of Marking up a word of in a text (corpus) as corresponding to ap every token according to Grammer

In [7]:
text = 'Process of Marking up a word of in a text (corpus) as corresponding to ap every token according to Grammer'

pos_tokenize  = word_tokenize(text)

# pos_tokenize

for token in pos_tokenize:
  print(nltk.pos_tag([token]))

[('Process', 'NN')]
[('of', 'IN')]
[('Marking', 'VBG')]
[('up', 'RB')]
[('a', 'DT')]
[('word', 'NN')]
[('of', 'IN')]
[('in', 'IN')]
[('a', 'DT')]
[('text', 'NN')]
[('(', '(')]
[('corpus', 'NN')]
[(')', ')')]
[('as', 'IN')]
[('corresponding', 'VBG')]
[('to', 'TO')]
[('ap', 'NN')]
[('every', 'DT')]
[('token', 'NN')]
[('according', 'VBG')]
[('to', 'TO')]
[('Grammer', 'NN')]


#### **Named Entity Recognition (NER)**
Process of taking a string as a text if input and identifying relevant nouns (people places and organizations) that are mentioned in that string. NER Entities:

1. Facility
2. Person
3. Location
4. Organization
5. Geo-Political Entity
6. Geo-Social Political Group

In [8]:
from nltk import ne_chunk

john  = ' John lives in New York'

john_tokens = word_tokenize(john)

for tok in john_tokens:
  pos_tag = nltk.pos_tag([tok])
  print("NE Tag: ", ne_chunk(pos_tag)) 


NE Tag:  (S (PERSON John/NNP))
NE Tag:  (S lives/NNS)
NE Tag:  (S in/IN)
NE Tag:  (S (GPE New/NNP))
NE Tag:  (S (GPE York/NNP))


#### **N Grams**
It is collection of N tokens at time

Most commonly used Grams are :

**Unigrams:** one word at a time

**Bigrams:** 2 words at a time

 **Trigrams:** 3 words at a time 


In [9]:
#bigrams, ngrams

novel = 'No way was he going to turn himself in, not in this life. A werewolf would not kill himself, only frame others. That was the only way to live.'

novel_token = word_tokenize(novel)

list(nltk.bigrams(novel_token))

[('No', 'way'),
 ('way', 'was'),
 ('was', 'he'),
 ('he', 'going'),
 ('going', 'to'),
 ('to', 'turn'),
 ('turn', 'himself'),
 ('himself', 'in'),
 ('in', ','),
 (',', 'not'),
 ('not', 'in'),
 ('in', 'this'),
 ('this', 'life'),
 ('life', '.'),
 ('.', 'A'),
 ('A', 'werewolf'),
 ('werewolf', 'would'),
 ('would', 'not'),
 ('not', 'kill'),
 ('kill', 'himself'),
 ('himself', ','),
 (',', 'only'),
 ('only', 'frame'),
 ('frame', 'others'),
 ('others', '.'),
 ('.', 'That'),
 ('That', 'was'),
 ('was', 'the'),
 ('the', 'only'),
 ('only', 'way'),
 ('way', 'to'),
 ('to', 'live'),
 ('live', '.')]

In [10]:
list(nltk.trigrams(novel_token))


[('No', 'way', 'was'),
 ('way', 'was', 'he'),
 ('was', 'he', 'going'),
 ('he', 'going', 'to'),
 ('going', 'to', 'turn'),
 ('to', 'turn', 'himself'),
 ('turn', 'himself', 'in'),
 ('himself', 'in', ','),
 ('in', ',', 'not'),
 (',', 'not', 'in'),
 ('not', 'in', 'this'),
 ('in', 'this', 'life'),
 ('this', 'life', '.'),
 ('life', '.', 'A'),
 ('.', 'A', 'werewolf'),
 ('A', 'werewolf', 'would'),
 ('werewolf', 'would', 'not'),
 ('would', 'not', 'kill'),
 ('not', 'kill', 'himself'),
 ('kill', 'himself', ','),
 ('himself', ',', 'only'),
 (',', 'only', 'frame'),
 ('only', 'frame', 'others'),
 ('frame', 'others', '.'),
 ('others', '.', 'That'),
 ('.', 'That', 'was'),
 ('That', 'was', 'the'),
 ('was', 'the', 'only'),
 ('the', 'only', 'way'),
 ('only', 'way', 'to'),
 ('way', 'to', 'live'),
 ('to', 'live', '.')]

In [11]:
list(nltk.ngrams(novel_token,4))


[('No', 'way', 'was', 'he'),
 ('way', 'was', 'he', 'going'),
 ('was', 'he', 'going', 'to'),
 ('he', 'going', 'to', 'turn'),
 ('going', 'to', 'turn', 'himself'),
 ('to', 'turn', 'himself', 'in'),
 ('turn', 'himself', 'in', ','),
 ('himself', 'in', ',', 'not'),
 ('in', ',', 'not', 'in'),
 (',', 'not', 'in', 'this'),
 ('not', 'in', 'this', 'life'),
 ('in', 'this', 'life', '.'),
 ('this', 'life', '.', 'A'),
 ('life', '.', 'A', 'werewolf'),
 ('.', 'A', 'werewolf', 'would'),
 ('A', 'werewolf', 'would', 'not'),
 ('werewolf', 'would', 'not', 'kill'),
 ('would', 'not', 'kill', 'himself'),
 ('not', 'kill', 'himself', ','),
 ('kill', 'himself', ',', 'only'),
 ('himself', ',', 'only', 'frame'),
 (',', 'only', 'frame', 'others'),
 ('only', 'frame', 'others', '.'),
 ('frame', 'others', '.', 'That'),
 ('others', '.', 'That', 'was'),
 ('.', 'That', 'was', 'the'),
 ('That', 'was', 'the', 'only'),
 ('was', 'the', 'only', 'way'),
 ('the', 'only', 'way', 'to'),
 ('only', 'way', 'to', 'live'),
 ('way', 'to

#### **Finding Most Frequently occuring tokens**

In [12]:
# string
text = "Seeing that the outside world was having more and more doubts, with the mounting pressure from the upper echelons,Sherlock had no choice but to reluctantly pick out some of the forged evidence and form an evidence chain using his deductive skills."

In [13]:
paragrah_tokens = word_tokenize(text)

#checking type and number of tokens 
type(paragrah_tokens), len(paragrah_tokens)

(list, 45)

In [14]:
from nltk.probability import FreqDist
fdlist = FreqDist(paragrah_tokens)
fdlist

FreqDist({'the': 4, 'more': 2, 'and': 2, ',': 2, 'evidence': 2, 'Seeing': 1, 'that': 1, 'outside': 1, 'world': 1, 'was': 1, ...})

In [15]:
#Finding N topmost commonly occuring words
top_10 = fdlist.most_common(10)
top_10 

[('the', 4),
 ('more', 2),
 ('and', 2),
 (',', 2),
 ('evidence', 2),
 ('Seeing', 1),
 ('that', 1),
 ('outside', 1),
 ('world', 1),
 ('was', 1)]

#### **Chunking**

Technique used to group words or tokens into phrases or order to analyze the structure of a sentence. This grouping includes POS tags as well as phrases from a sentence

##### Chunking Noun Phrases

In [16]:
sample = 'The crazy brown dog went running through the mud.'

tokens= word_tokenize(sample)

pos_text = nltk.pos_tag(tokens)
pos_text

[('The', 'DT'),
 ('crazy', 'JJ'),
 ('brown', 'NN'),
 ('dog', 'NN'),
 ('went', 'VBD'),
 ('running', 'VBG'),
 ('through', 'IN'),
 ('the', 'DT'),
 ('mud', 'NN'),
 ('.', '.')]

In [17]:
grammer = (r''' 
            NP: {<DT>?<JJ>*<NN>*}  // first word determiner,? for 0 or 1 occurence,JJ for adjective, * for 0 or any number of occurence, NN for Noun
        ''')

chunkParser = nltk.RegexpParser(grammer)
tree = chunkParser.parse(pos_text)

print(tree)

(S
  The/DT
  crazy/JJ
  brown/NN
  dog/NN
  went/VBD
  running/VBG
  through/IN
  the/DT
  mud/NN
  ./.)


In [18]:
# tree.draw()

##### Chunking Verb Phrases

In [19]:
sample = 'He should wait before going swimming'

pos_tag = nltk.pos_tag(word_tokenize(sample))

grammer = (r'''
          VP: {<PRP>?<VB|VBD|VBZ|VBG>*<RB|RBR>}
          ''')

chunkParser = nltk.RegexpParser(grammer)

tree = chunkParser.parse(pos_text)

print(tree)



(S
  The/DT
  crazy/JJ
  brown/NN
  dog/NN
  went/VBD
  running/VBG
  through/IN
  the/DT
  mud/NN
  ./.)


#### **Chinking**

used to exclude a specific chunk from the whole chunk

In [20]:
sample = 'The crazy brown dog went running through the mud.'

tokens= word_tokenize(sample)

pos_text = nltk.pos_tag(tokens)
pos_text


grammer = (r""" Chunk:  {<.*>+}
          }<VB.?|IN|DT|TO>+{
          """)


chunkParser = nltk.RegexpParser(grammer)
tree = chunkParser.parse(pos_text)

print(tree)


(S
  The/DT
  (Chunk crazy/JJ brown/NN dog/NN)
  went/VBD
  running/VBG
  through/IN
  the/DT
  (Chunk mud/NN ./.))


## **spaCy**

In [21]:
import spacy

# You need to load spacy models before using any of the functionalities 
nlp = spacy.load('en_core_web_sm')

#### **Tokenization**
Splitting up a text into meaning units 

In [22]:
#creating document using nlp object. 
doc = nlp('Rohit Raj is the new Mayor of the town!!')

for token in doc:
  print('Index: ',token.i,' Token:',token.text)

Index:  0  Token: Rohit
Index:  1  Token: Raj
Index:  2  Token: is
Index:  3  Token: the
Index:  4  Token: new
Index:  5  Token: Mayor
Index:  6  Token: of
Index:  7  Token: the
Index:  8  Token: town
Index:  9  Token: !
Index:  10  Token: !


In [23]:
span = doc[2:5]
span

is the new

#### **Parts of Speech Tagging (POS Tagging)** 

Process of Marking up a word of in a text (corpus) as corresponding to ap every token according to Grammer

In [24]:
doc = nlp('Process of Marking up a word of in a text (corpus) as corresponding to ap every token according to Grammer')
for token in doc:
  print('Index:',token.i,' Token:' ,token.text,' POS Tag:' ,token.pos_)

Index: 0  Token: Process  POS Tag: NOUN
Index: 1  Token: of  POS Tag: ADP
Index: 2  Token: Marking  POS Tag: PROPN
Index: 3  Token: up  POS Tag: ADP
Index: 4  Token: a  POS Tag: DET
Index: 5  Token: word  POS Tag: NOUN
Index: 6  Token: of  POS Tag: ADP
Index: 7  Token: in  POS Tag: ADP
Index: 8  Token: a  POS Tag: DET
Index: 9  Token: text  POS Tag: NOUN
Index: 10  Token: (  POS Tag: PUNCT
Index: 11  Token: corpus  POS Tag: X
Index: 12  Token: )  POS Tag: PUNCT
Index: 13  Token: as  POS Tag: ADP
Index: 14  Token: corresponding  POS Tag: VERB
Index: 15  Token: to  POS Tag: PART
Index: 16  Token: ap  POS Tag: VERB
Index: 17  Token: every  POS Tag: DET
Index: 18  Token: token  POS Tag: ADJ
Index: 19  Token: according  POS Tag: VERB
Index: 20  Token: to  POS Tag: ADP
Index: 21  Token: Grammer  POS Tag: PROPN


#### **Named Entity Recognition (NER)**
Process of taking a string as a text if input and identifying relevant nouns (people places and organizations) that are mentioned in that string.   

In [25]:
doc = nlp(' John lives in New York')

for entity in doc.ents:
  print(entity.text, entity.label_)


doc = nlp('Barack Obama, former president will be vacating White House today')  
for entity in doc.ents:
  print(entity.text, entity.label_)

New York GPE
Barack Obama PERSON
White House ORG
today DATE


#### **Matcher**

Helps us to find patterns in a string with different criteria. Ex: Helping us understand if the word is lemma or punctuation or not

Link: https://spacy.io/usage/rule-based-matching

In [26]:
from spacy.matcher import Matcher
doc = nlp('Barack Obama the former president of United States will be vacating white house today')  

pattern = [{'LEMMA': 'vacate'}, {'ORTH': 'white'}]
matcher = Matcher(nlp.vocab)
matcher.add('white_Pattern',[pattern])
matches = matcher(doc)
matches

[(4191279314630736679, 10, 12)]

In [27]:
for match_id,start,end in matches:
  matched_span  = doc[start:end]
  print(matched_span.text)

vacating white


In [28]:
doc = nlp('2018 FIFA World Cup: Frace won!!')

pattern = [{'IS_DIGIT': True}, {'LOWER': 'fifa'}, {'LOWER': 'world'}]
matcher = Matcher(nlp.vocab)
matcher.add('fifa_Pattern',[pattern])
matches = matcher(doc)
matches

[(10959151708183927808, 0, 3)]

In [29]:
for match_id,start,end in matches:
  matched_span  = doc[start:end]
  print(matched_span.text)

2018 FIFA World
