# ***Practical - 5***

***a) word tokenization in Hindi ***

In [1]:
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize
# Example Hindi sentence
sentence = 'सुबह उठकर समय पर खाना खाना स्वस्थ जीवन जीने के लिए बहुत जरूरी है।'
# Perform word tokenization on the sentence
tokens = word_tokenize(sentence)
# Print the output tokens
print(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['सुबह', 'उठकर', 'समय', 'पर', 'खाना', 'खाना', 'स्वस्थ', 'जीवन', 'जीने', 'के', 'लिए', 'बहुत', 'जरूरी', 'है।']


***c) Identify the Indian language of a text ***

In [None]:
from langdetect import detect

# example text
text = "हिन्दी मेरी मातृभाषा है।"

# detect language
lang = detect(text)

# print detected language
print("Detected language:", lang)

Detected language: hi


In [None]:
#############################################################################################################

# ***Practical - 10***

***a. Speech Tagging:*** 

***i. Speech tagging using spacy ***

In [None]:
import spacy
sp = spacy.load('en_core_web_sm')
sen = sp(u"I like to play football. I hated it in my childhood though")
print(sen.text)
print(sen[7].pos_)
print(sen[7].tag_)
print(spacy.explain(sen[7].tag_))
for word in sen:
  print(f'{word.text:{12}} {word.pos_:{10}} {word.tag_:{8}} {spacy.explain(word.tag_)}')
sen = sp(u'Can you google it?')
word = sen[2]

print(f'{word.text:{12}} {word.pos_:{10}} {word.tag_:{8}} {spacy.explain(word.tag_)}')
sen = sp(u'Can you search it on google?')
word = sen[5]
print(f'{word.text:{12}} {word.pos_:{10}} {word.tag_:{8}} {spacy.explain(word.tag_)}')

#Finding the Number of POS Tags
sen = sp(u"I like to play football. I hated it in my childhood though")
num_pos = sen.count_by(spacy.attrs.POS)
num_pos

for k,v in sorted(num_pos.items()):
  print(f'{k}. {sen.vocab[k].text:{8}}: {v}')
#Visualizing Parts of Speech Tags
from spacy import displacy

sen = sp(u"I like to play football. I hated it in my childhood though")
displacy.serve(sen, style='dep', options={'distance': 120})


**ii. Speech tagging using NLTK**

In [None]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

#create our training and testing data:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

#train the Punkt tokenizer like:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

# tokenize:
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
  try:
    for i in tokenized[:2]:
      words = nltk.word_tokenize(i)
      tagged = nltk.pos_tag(words)
      print(tagged)

  except Exception as e:
    print(str(e))

process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
import nltk
nltk.download('state_union')

[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.


True

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

***b. Statistical parsing:*** 

***i. Usage of Give and Gave in the Penn Treebank sample ***

In [None]:
import nltk
import nltk.parse.viterbi
import nltk.parse.pchart

def give(t):
  return t.label() == 'VP' and len(t) > 2 and t[1].label() == 'NP'\
  and (t[2].label() == 'PP-DTV' or t[2].label() == 'NP')\
  and ('give' in t[0].leaves() or 'gave' in t[0].leaves())

def sent(t):
  return ' '.join(token for token in t.leaves() if token[0] not in '*-0')

def print_node(t, width):
  output = "%s %s: %s / %s: %s" %\
  (sent(t[0]), t[1].label(), sent(t[1]), t[2].label(), sent(t[2]))
  if len(output) > width:
   output = output[:width] + "..."
  print (output)

for tree in nltk.corpus.treebank.parsed_sents():
  for t in tree.subtrees(give):
    print_node(t, 72)




SyntaxError: ignored

***ii. probabilistic parser ***

In [None]:
import nltk
from nltk import PCFG

grammar = PCFG.fromstring('''
NP -> NNS [0.5] | JJ NNS [0.3] | NP CC NP [0.2]
NNS -> "men" [0.1] | "women" [0.2] | "children" [0.3] | NNS CC NNS [0.4]
JJ -> "old" [0.4] | "young" [0.6]
CC -> "and" [0.9] | "or" [0.1]
''')

print(grammar)
viterbi_parser = nltk.ViterbiParser(grammar)
token = "old men and women".split()
obj = viterbi_parser.parse(token)

print("Output: ")
for x in obj:
  print(x)


Grammar with 11 productions (start state = NP)
    NP -> NNS [0.5]
    NP -> JJ NNS [0.3]
    NP -> NP CC NP [0.2]
    NNS -> 'men' [0.1]
    NNS -> 'women' [0.2]
    NNS -> 'children' [0.3]
    NNS -> NNS CC NNS [0.4]
    JJ -> 'old' [0.4]
    JJ -> 'young' [0.6]
    CC -> 'and' [0.9]
    CC -> 'or' [0.1]
Output: 
(NP (JJ old) (NNS (NNS men) (CC and) (NNS women))) (p=0.000864)


In [None]:
import nltk
nltk.download('PCFG')

[nltk_data] Error loading PCFG: Package 'PCFG' not found in index


False

In [None]:
######################################################################################################################

# ***Practical 11***


***a) Multiword Expressions in NLP**

In [None]:
# Multiword Expressions in NLP

from nltk.tokenize import MWETokenizer
from nltk import sent_tokenize, word_tokenize
s = '''Good cake cost Rs.1500\kg in Mumbai.  Please buy me one of them.\n\nThanks.'''
mwe = MWETokenizer([('New', 'York'), ('Hong', 'Kong')], separator='_')
for sent in sent_tokenize(s):
  print(mwe.tokenize(word_tokenize(sent)))

['Good', 'cake', 'cost', 'Rs.1500\\kg', 'in', 'Mumbai', '.']
['Please', 'buy', 'me', 'one', 'of', 'them', '.']
['Thanks', '.']


In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

***b) Normalized Web Distance and Word Similarity ***

In [None]:
import numpy as np
import re
import textdistance
# pip install textdistance # we will need scikit-learn>=0.21
import sklearn #pip install sklearn
from sklearn.cluster import AgglomerativeClustering

texts = [   'Reliance supermarket', 'Reliance hypermarket', 'Reliance', 'Reliance', 'Reliance downtown', 'Relianc market',    'Mumbai', 'Mumbai Hyper', 'Mumbai dxb', 'mumbai airport',     'k.m trading', 'KM Trading', 'KM trade', 'K.M.  Trading', 'KM.Trading' ]

def normalize(text):
  """ Keep only lower-cased text and numbers"""
  return re.sub('[^a-z0-9]+', ' ', text.lower())

def group_texts(texts, threshold=0.4):
  """ Replace each text with the representative of its cluster"""
  normalized_texts = np.array([normalize(text) for text in texts])
  distances = 1 - np.array([
      [textdistance.jaro_winkler(one, another) for one in normalized_texts]
      for another in normalized_texts   ])
  clustering = AgglomerativeClustering(
      distance_threshold=threshold, # this parameter needs to be tuned carefully
      affinity="precomputed", linkage="complete", n_clusters=None   ).fit(distances)
  centers = dict()

  for cluster_id in set(clustering.labels_):
    index = clustering.labels_ == cluster_id
    centrality = distances[:, index][index].sum(axis=1)
    centers[cluster_id] = normalized_texts[index][centrality.argmin()]
  return [centers[i] for i in clustering.labels_]

print(group_texts(texts))



['reliance', 'reliance', 'reliance', 'reliance', 'reliance', 'reliance', 'mumbai', 'mumbai', 'mumbai', 'mumbai', 'km trading', 'km trading', 'km trading', 'km trading', 'km trading']




In [None]:
!pip install textdistance

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting textdistance
  Downloading textdistance-4.5.0-py3-none-any.whl (31 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.5.0


***c) Word Sense Disambiguation ***

In [None]:
from nltk.corpus import wordnet as wn

def get_first_sense(word, pos=None):
  if pos:
    synsets = wn.synsets(word,pos)
  else:
    synsets = wn.synsets(word)
  return synsets[0]

best_synset = get_first_sense('bank')
print ('%s: %s' % (best_synset.name, best_synset.definition))
best_synset = get_first_sense('set','n')
print ('%s: %s' % (best_synset.name, best_synset.definition))
best_synset = get_first_sense('set','v')
print ('%s: %s' % (best_synset.name, best_synset.definition))

<bound method Synset.name of Synset('bank.n.01')>: <bound method Synset.definition of Synset('bank.n.01')>
<bound method Synset.name of Synset('set.n.01')>: <bound method Synset.definition of Synset('set.n.01')>
<bound method Synset.name of Synset('put.v.01')>: <bound method Synset.definition of Synset('put.v.01')>


In [None]:
 Generate similar sentences from a given Hindi text input

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True