In [None]:
#NLP UNIT 1 HANDS ON SESSION(COURSE CODE:UE20CS334)

In [None]:
# Environment setup. The commented-out lines list packages that later cells import
# (nltk, spacy, pattern); uncomment them on a machine where they are not installed.
# NOTE(review): the TextBlob cell also needs `textblob`, and the spacy.load(...) cells
# need the `en_core_web_sm` model; neither is installed here — confirm they are preinstalled.
#!pip install nltk
#!pip install spacy
#!pip install pattern
# NOTE(review): gensim is not imported by any cell visible in this notebook — confirm it is needed.
!pip install gensim



In [None]:
#IMPORT NECESSARY LIBRARIES
# nltk and spacy are used by name in later cells (nltk.word_tokenize, spacy.load, ...).
import nltk
# Fetch the 'punkt' tokenizer data that the sent_tokenize / word_tokenize cells rely on.
nltk.download('punkt')
import spacy


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**Sentence Tokenisation**

*Sentence boundary detection*, as its name suggests, addresses the problem of finding sentence boundaries. The concept of sentence is central in several natural language processing tasks, since sentences are standard textual units which confine a variety of linguistic phenomena such as collocations and variable binding.

In [None]:
# Sentence tokenisation with NLTK: print every detected sentence on its own line.

from nltk.tokenize import sent_tokenize

text = "Awesome! I am learning NLP."
# Unpack the sentence list so print() emits one sentence per line
print(*sent_tokenize(text), sep="\n")

Awesome!
I am learning NLP.


In [None]:
# Sentence tokenisation with spaCy: a blank English pipeline plus the 'sentencizer' component.

from spacy.lang.en import English

nlp = English()
nlp.add_pipe('sentencizer')  # adds sentence boundaries so that doc.sents can be iterated
doc = nlp('Hello, world. Here are two sentences.')
# Each item of doc.sents is one sentence span; print them one per line
print(*doc.sents, sep="\n")

Hello, world.
Here are two sentences.


**Chunking**

Chunking is generally an optional step in the preprocessing of text. It tries to split sentences further into more meaningful pieces. For example, the sentence "My name is Michael, I live in Bangalore." can be split into two parts: one that provides information on identity and the other on habitat. This is generally achieved through Semantic Segmentation, using POS tagging and building syntactic trees that can then be split on the basis of need.

In [None]:
# Chunking with spaCy: parse the sentence and draw its dependency tree inline,
# so the two clauses (identity / habitat) can be read off the arcs.

from spacy import displacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('My name is Michael, I live in Bangalore.')

# 'distance' is passed through to displaCy's dependency renderer
render_options = {'distance': 140}
displacy.render(doc, style='dep', jupyter=True, options=render_options)

**Word Tokenisation**

In [None]:
# Word tokenisation with NLTK (not spaCy): print every token on its own line.

from nltk.tokenize import word_tokenize

text = "God is Great! I won a lottery."
# Punctuation marks come back as separate tokens
print(*word_tokenize(text), sep="\n")

God
is
Great
!
I
won
a
lottery
.


In [None]:
# Word tokenisation with spaCy: a blank English pipeline is enough to tokenize.

from spacy.lang.en import English

nlp = English()
# Calling the pipeline on a string returns a Doc, which is iterable token by token
doc = nlp("God is Great! I won a lottery.")

token_texts = [token.text for token in doc]
print(*token_texts, sep="\n")

God
is
Great
!
I
won
a
lottery
.


**Morphological Analysis and Part-of-Speech Tagging**

Having texts separated in tokens, the next step is usually morphosyntactic analysis, in order to identify characteristics as word lemma and parts of speech.

It is important to distinguish two concepts: *lexeme and word form.*

The process of determining the word lemma is called *lemmatization.*

Another method, called **word stemming**, is common due to its simplicity. Word stemming reduces words to their base form by removing suffixes. The remaining form is not necessarily a valid root, but it is usually sufficient that related words map to the same stem, or to a reduced set of stems if the words are irregular. For example, the words “mice” and “mouse” have the lemma “mouse”, but some stemmers produce “mic” and “mous”, respectively.

**NLTK LEMMATIZER**

In [None]:
# NLTK lemmatization with the WordNet lemmatizer (needs the 'wordnet' corpus).

import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

# One shared lemmatizer instance; later cells reuse this name
lemmatizer = WordNetLemmatizer()

# Single words first. No POS is passed, so the lemmatizer's default is used
for sample in ("bats", "are", "feet"):
    print(lemmatizer.lemmatize(sample))

# Now a whole sentence: tokenize, lemmatize token by token, then re-join with spaces
sentence = "The striped bats are hanging on their feet for best support"

word_list = nltk.word_tokenize(sentence)
print(word_list)

lemmatized_output = ' '.join(lemmatizer.lemmatize(token) for token in word_list)
print(lemmatized_output)

[nltk_data] Downloading package wordnet to /root/nltk_data...


bat
are
foot
['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best', 'support']
The striped bat are hanging on their foot for best support


In [None]:
# The lemma depends on the part of speech supplied: try "stripes" as a verb, then as a noun.
# Relies on `lemmatizer` created in the NLTK lemmatization cell.

for pos in ('v', 'n'):
    print(lemmatizer.lemmatize("stripes", pos))

strip
stripe


**SPACY LEMMATIZER**

In [None]:
# spaCy lemmatization

import spacy

# Load the small English model with the parser and NER components switched off;
# they are not needed to read token.lemma_
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

sentence = "The striped bats are hanging on their feet for best support"
doc = nlp(sentence)

# Collect one lemma per token; the joined string is the cell's displayed value
lemmas = [token.lemma_ for token in doc]
" ".join(lemmas)

'the stripe bat be hang on their foot for good support'

**TEXTBLOB LEMMATIZER**

In [None]:
# TextBlob lemmatization of a single word
from textblob import TextBlob, Word

word = 'stripes'
# Wrap the plain string in a TextBlob Word and lemmatize it; the result is the cell's output
Word(word).lemmatize()

'stripe'

**PATTERN LEMMATIZER**

In [None]:
# Pattern lemmatization
import pattern
from pattern.en import lemma, lexeme

# The assignment and the join must be two separate statements. When they were fused on
# one line, the new text was concatenated with " " and used as the *separator* for the
# lemmas of the previous cell's `sentence`, producing tokens such as 'theThe'.
sentence = "The striped bats were hanging on their feet and ate best fishes"

# Lemmatize each whitespace-separated word and re-join; the string is the cell's output
" ".join([lemma(wd) for wd in sentence.split()])

In [None]:
# Lexemes of every whitespace-separated word in `sentence`, one list per word
list(map(lexeme, sentence.split()))

[['thethe', 'thethes', 'thething', 'thethed'],
 ['stripe', 'stripes', 'striping', 'striped'],
 ['bat', 'bats', 'batting', 'batted'],
 ['be',
  'am',
  'are',
  'is',
  'being',
  'was',
  'were',
  'been',
  'am not',
  "aren't",
  "isn't",
  "wasn't",
  "weren't"],
 ['hang', 'hangs', 'hanging', 'hung'],
 ['on', 'ons', 'oning', 'oned'],
 ['their', 'theirs', 'theiring', 'theired'],
 ['feet', 'feets', 'feeting', 'feeted'],
 ['and', 'ands', 'anding', 'anded'],
 ['eat', 'eats', 'eating', 'ate', 'eaten'],
 ['best', 'bests', 'besting', 'bested'],
 ['fishes', 'fishing', 'fishesed'],
 ['stripethe', 'stripethes', 'stripething', 'stripethed'],
 ['stripe', 'stripes', 'striping', 'striped'],
 ['bat', 'bats', 'batting', 'batted'],
 ['be',
  'am',
  'are',
  'is',
  'being',
  'was',
  'were',
  'been',
  'am not',
  "aren't",
  "isn't",
  "wasn't",
  "weren't"],
 ['hang', 'hangs', 'hanging', 'hung'],
 ['on', 'ons', 'oning', 'oned'],
 ['their', 'theirs', 'theiring', 'theired'],
 ['feet', 'feets', 'f

**Steps in NLP for preprocessing**

1)Tokenization
2)Stemming
3)Lemmatization
4)Part-of-speech (POS) tagging


In [None]:
#Code to implement all the NLP preprocessing steps
# (tokenization -> stemming -> lemmatization -> POS tagging)

import nltk
# NOTE(review): 'all-nltk' downloads a large collection of packages (see the log below);
# the steps in this cell look like they only need tokenizer, WordNet and tagger data — confirm.
nltk.download('all-nltk')
print("\n")

# Creating token of words
print("Creating token of words:")
from nltk.tokenize import word_tokenize
text="My name is Adithya Challa I wrote this shot!"
tokenize_word=word_tokenize(text)
print(tokenize_word)
print("\n")

# Stemming: strip suffixes so related word forms share one stem
print("Stemming:")
from nltk.stem import PorterStemmer
words=["light","lighting","lights"]
ps=PorterStemmer()
for w in words:
    rootword=ps.stem(w)
    print(rootword)
print("\n")

#Lemmatization: converts all verb forms into root word
print("Lemmatiztion:Converts allverb forms into root word:")
from nltk.stem import WordNetLemmatizer
lem=WordNetLemmatizer()
# Without a POS the lemmatizer returned "playing" unchanged (as the recorded output shows);
# pos="v" asks for the verb lemma so the printed value matches the description above.
print(lem.lemmatize("playing", pos="v"))
print("\n")

#POS Tag: label every token with its part-of-speech tag
print("POS Tag:")
from nltk import word_tokenize,pos_tag
text="My name is Adithya Challa I wrote this shot!"
print(pos_tag(word_tokenize(text)))


[nltk_data] Downloading collection 'all-nltk'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    



Creating token of words:
['My', 'name', 'is', 'Adithya', 'Challa', 'I', 'wrote', 'this', 'shot', '!']


Stemming:
light
light
light


Lemmatiztion:Converts allverb forms into root word:
playing


POS Tag:
[('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Adithya', 'NNP'), ('Challa', 'NNP'), ('I', 'PRP'), ('wrote', 'VBD'), ('this', 'DT'), ('shot', 'NN'), ('!', '.')]


***N-GRAMS***

In the fields of Computational Linguistics an **n-gram** is a contiguous sequence of n items from a given sample of text or speech.

The items can be letters, words, base pairs, phonemes or syllables, according to the application.

Eg Consider a sentence:

She was laughing at him.

“she was” or “was laughing” etc are bigram(2-gram)
“She was laughing” or “ laughing at him” etc are trigrams( 3-grams)


In [None]:
# Print the n-grams of a sentence with NLTK (n = 2, i.e. bigrams)

from nltk import ngrams

sentence = 'I reside in Bengaluru.'
n = 2
# Plain whitespace split: the trailing full stop stays attached to the last word
tokens = sentence.split()
for gram in ngrams(tokens, n):
  print(gram)

('I', 'reside')
('reside', 'in')
('in', 'Bengaluru.')


**PORTER STEMMER**

One of the most popular stemming algorithms is the Porter stemmer, which has been around since 1979.

In [None]:
# Porter stemming of a handful of related words

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# `ps` and `word_tokenize` are reused by the sentence-stemming cell that follows
ps = PorterStemmer()

# word forms sharing the root "program"
words = ["program", "programs", "programmer", "programming", "programmers"]

for term in words:
    # same layout as print(term, " : ", stem): two spaces either side of the colon
    print(f"{term}  :  {ps.stem(term)}")

program  :  program
programs  :  program
programmer  :  programm
programming  :  program
programmers  :  programm


In [None]:
# Porter stemming of every token in a sentence.
# Relies on `ps` and `word_tokenize` from the previous cell.

new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

words = word_tokenize(new_text)

# Stem token by token and print one stem per line
for stem in map(ps.stem, words):
    print(stem)

it
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


**SNOWBALL STEMMER**

It is a stemming algorithm which is also known as the Porter2 stemming algorithm as it is a better version of the Porter Stemmer since some issues of it were fixed in this stemmer.

Difference Between Porter Stemmer and Snowball Stemmer:

1)Snowball Stemmer is more aggressive than Porter Stemmer.

2)Some issues in Porter Stemmer were fixed in Snowball Stemmer.

3)There is only a little difference in the working of these two.

4)Words like ‘fairly‘ and ‘sportingly‘ were stemmed to ‘fair’ and ‘sport’ in the snowball stemmer but when you use the porter stemmer they are stemmed to ‘fairli‘ and ‘sportingli‘.

5)The difference between the two algorithms can be clearly seen in the way the word ‘Sportingly’ is stemmed by both. Clearly, Snowball Stemmer stems it to a more accurate stem.

In [None]:
from nltk.stem.snowball import SnowballStemmer

# SnowballStemmer must be told which language's rules to apply
snow_stemmer = SnowballStemmer(language='english')

# already-tokenized sample words
words = ['cared','university','fairly','easily','singing',
       'sings','sung','singer','sportingly']

# one stem per input word, in the same order
stem_words = [snow_stemmer.stem(token) for token in words]

# show each word next to its stem
for original, stemmed in zip(words, stem_words):
    print(f"{original} ----> {stemmed}")

cared ----> care
university ----> univers
fairly ----> fair
easily ----> easili
singing ----> sing
sings ----> sing
sung ----> sung
singer ----> singer
sportingly ----> sport


**Minimum Edit Distance Algorithm Code with example** (Dynamic programming)

In [None]:
#Code to demonstrate the Minimum Edit Distance Algorithm
def edit_distance(str1, str2, a=None, b=None):
    """Return the minimum edit (Levenshtein) distance between two strings.

    The distance is the smallest number of single-character insertions,
    deletions and substitutions that turn ``str1[:a]`` into ``str2[:b]``,
    computed bottom-up with dynamic programming.

    Args:
        str1: Source string.
        str2: Target string.
        a: Number of leading characters of ``str1`` to consider. Defaults to
            ``len(str1)``. A value larger than ``len(str1)`` raises IndexError.
        b: Number of leading characters of ``str2`` to consider. Defaults to
            ``len(str2)``. A value larger than ``len(str2)`` raises IndexError.

    Returns:
        The edit distance as a non-negative int.
    """
    # The lengths used to be mandatory even though they can be derived from the strings.
    if a is None:
        a = len(str1)
    if b is None:
        b = len(str2)

    # string_matrix[i][j] holds the distance between str1[:i] and str2[:j]
    string_matrix = [[0] * (b + 1) for _ in range(a + 1)]

    for i in range(a + 1):
        for j in range(b + 1):

            if i == 0:
                # Empty source prefix: insert all j characters of the target prefix.
                string_matrix[i][j] = j

            elif j == 0:
                # Empty target prefix: delete all i characters of the source prefix.
                string_matrix[i][j] = i

            elif str1[i-1] == str2[j-1]:
                # Last characters match: no edit needed, reuse the cost of the shorter prefixes.
                string_matrix[i][j] = string_matrix[i-1][j-1]

            else:
                string_matrix[i][j] = 1 + min(string_matrix[i][j-1],      # insert operation
                                              string_matrix[i-1][j],      # remove operation
                                              string_matrix[i-1][j-1])    # replace operation

    return string_matrix[a][b]


if __name__ == '__main__':
    # (source, target) pairs to compare; one result line is printed per pair
    for source, target in (('Cats', 'Rats'), ('Saturday', 'Sunday')):
        operations = edit_distance(source, target, len(source), len(target))
        print('No. of Operations required :', operations)

No. of Operations required : 1
No. of Operations required : 3


**Explanation for the code above:**

In Case-1, str1 =’Cats’ and str2 = ‘Rats’.  To change ‘Cats’ into ‘Rats’, only one update operation is required. That means letter ‘C’ is replaced by letter ‘R’.


In Case-2, str3 = ‘Saturday’ and str4 = ‘Sunday’. To change ‘Saturday’ to ‘Sunday’, three operations are required: the letters ‘a’ and ‘t’ are deleted and ‘r’ is replaced by ‘n’.

**Minimum Edit Distance to correct  misspelled words**

Edit Distance measures dissimilarity between two strings by finding the minimum number of operations needed to transform one string into the other.


In [None]:
# Spelling correction by minimum edit distance against the NLTK word list.
# Note: this import rebinds the name `edit_distance` to NLTK's implementation,
# and `words` to the corpus reader, for the rest of the session.
from nltk.metrics.distance  import edit_distance
nltk.download('words')
from nltk.corpus import words
correct_words = words.words()

incorrect_words=['happpy', 'azmaing', 'intelliengt']

for word in incorrect_words:
    # Only dictionary words sharing the first letter are scored, which keeps the search small
    candidates = [(edit_distance(word, w),w) for w in correct_words if w[0]==word[0]]
    # min() returns the first candidate with the lowest distance, the same pick as sorting
    # by distance and taking element 0
    best_distance, best_word = min(candidates, key = lambda val:val[0])
    print(best_word)

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


happy
aiming
intelligent


**N GRAM LANGUAGE MODEL**

An N-gram language model predicts the probability of a given N-gram within any sequence of words in the language.

If we have a good N-gram model, we can predict p(w | h) – what is the probability of seeing the word w given a history of previous words h – where the history contains n-1 words.


*We compute this probability in two steps:*

1)Apply the chain rule of probability.

2)We then apply a very strong simplification assumption to allow us to compute p(w1…ws) in an easy manner


Now that we understand what an N-gram is, let’s build a basic language model using trigrams of the Reuters corpus. Reuters corpus is a collection of 10,788 news documents totaling 1.3 million words.


We first split our text into trigrams with the help of NLTK and then calculate the frequency in which each combination of the trigrams occurs in the dataset.

We then use it to calculate probabilities of a word, given the previous two words. That’s essentially what gives us our Language Model!

In [None]:
# code courtesy of https://nlpforhackers.io/language-models/

from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
import nltk
nltk.download('reuters')
nltk.download('punkt')

# model[(w1, w2)][w3] -> how often w3 follows the pair (w1, w2); missing entries start at 0
model = defaultdict(lambda: defaultdict(int))

# Pass 1: count every trigram. Padding adds None at both ends of each sentence,
# so sentence starts and ends are counted as well.
for sentence in reuters.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1

# Pass 2: divide each count by its context total, turning counts into probabilities
for context, followers in model.items():
    total_count = float(sum(followers.values()))
    for w3 in followers:
        followers[w3] /= total_count

print(dict(model['today', 'the']))

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'public': 0.05555555555555555, 'European': 0.05555555555555555, 'Bank': 0.05555555555555555, 'price': 0.1111111111111111, 'emirate': 0.05555555555555555, 'overseas': 0.05555555555555555, 'newspaper': 0.05555555555555555, 'company': 0.16666666666666666, 'Turkish': 0.05555555555555555, 'increase': 0.05555555555555555, 'options': 0.05555555555555555, 'Higher': 0.05555555555555555, 'pound': 0.05555555555555555, 'Italian': 0.05555555555555555, 'time': 0.05555555555555555}


If we keep following this process iteratively, we will soon have a coherent sentence! Here is a script to play around with generating a random piece of text using our n-gram model:

This is the same underlying principle which the likes of Google, Alexa, and Apple use for language modeling.

In [None]:
# Generate a random piece of text by sampling from the trigram model.
# Relies on `model` built in the previous cell.

import random

# seed context: the first two words of the generated text
text = ["today", "the"]

# Two consecutive None tokens (the right padding) mark the end of a sentence
while text[-2:] != [None, None]:
  # draw a random probability threshold
  r = random.random()
  accumulator = .0

  # distribution over next words for the current two-word context
  followers = model[tuple(text[-2:])]
  for word, probability in followers.items():
      accumulator += probability
      # take the first word whose cumulative probability reaches the threshold
      if accumulator >= r:
          text.append(word)
          break

# drop the None padding tokens before printing
print (' '.join(filter(None, text)))

today the newspaper article .
