Name : Arsh Pratap

Course : Natural Language Processing

Roll No. : 2018IMT-021

In [None]:
!pip install -U nltk

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 23.9 MB/s 
[?25hCollecting regex>=2021.8.3
  Downloading regex-2022.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (749 kB)
[K     |████████████████████████████████| 749 kB 66.3 MB/s 
Installing collected packages: regex, nltk
  Attempting uninstall: regex
    Found existing installation: regex 2019.12.20
    Uninstalling regex-2019.12.20:
      Successfully uninstalled regex-2019.12.20
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.7 regex-2022.3.2


In [None]:
import nltk
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline
import requests
import io
from nltk import word_tokenize, sent_tokenize 
from nltk.lm import MLE
from nltk.tokenize.treebank import TreebankWordDetokenizer


#   Basic N-gram Language Modelling

In [None]:
# Toy corpus for the demos below: two sentences, already lower-cased and
# split into word tokens.
text = [
    "how are you".split(),
    "i am fine and so is everything".split(),
]

In [None]:
# Bigrams of the first toy sentence followed by trigrams of the second,
# one tuple per output line.
print("---Bigrams and trigrams generated---")
print("\n".join(map(str, bigrams(text[0]))))
print("\n".join(map(str, ngrams(text[1], n=3))))
print()

---Bigrams and trigrams generated---
('how', 'are')
('are', 'you')
('i', 'am', 'fine')
('am', 'fine', 'and')
('fine', 'and', 'so')
('and', 'so', 'is')
('so', 'is', 'everything')



In [None]:
# Pad the first sentence on both sides with sentence-boundary symbols, then
# list the trigrams of the padded sequence (one per line).
print("---Trigrams with padding symbols---")
padded_sent = list(
    pad_sequence(
        text[0],
        n=3,
        pad_left=True,
        left_pad_symbol="<s>",
        pad_right=True,
        right_pad_symbol="</s>",
    )
)
print("\n".join(map(str, ngrams(padded_sent, n=3))))
print()

---Trigrams with padding symbols---
('<s>', '<s>', 'how')
('<s>', 'how', 'are')
('how', 'are', 'you')
('are', 'you', '</s>')
('you', '</s>', '</s>')



In [None]:
# `padded_bigrams` holds the padded *tokens* of the first sentence (padding
# sized for a bigram model); everygrams then yields every n-gram up to
# length 2 over it.
print("---Generated sample N-grams of max length = 2---")
padded_bigrams = list(pad_both_ends(text[0], n=2))
print("\n".join(map(str, everygrams(padded_bigrams, max_len=2))))
print()

---Generated sample N-grams of max length = 2---
('<s>',)
('<s>', 'how')
('how',)
('how', 'are')
('are',)
('are', 'you')
('you',)
('you', '</s>')
('</s>',)



In [None]:
# Pad every sentence for a bigram model and chain the results into a single
# token stream, printed one token per line.
print("---Flattened sentences with padding symbols---")
print("\n".join(map(str, flatten(pad_both_ends(sent, n=2) for sent in text))))
print()

---Flattened sentences with padding symbols---
<s>
how
are
you
</s>
<s>
i
am
fine
and
so
is
everything
</s>



In [None]:
# padded_everygram_pipeline hands back two lazy iterators: per-sentence
# n-grams for training and a flat padded token stream for the vocabulary.
# Materialising them here (via list) is only for display.
print("---Value of lazy iterators - train and vocab---")
training_ngrams, padded_sentences = padded_everygram_pipeline(2, text)
print("Unigram and bigram training iterators:")
for sentence_ngrams in training_ngrams:
    # One list of unigrams + bigrams per sentence, followed by a blank line.
    print(list(sentence_ngrams), end="\n\n")
print('#############')
print("Vocabulary iterator:")
print(list(padded_sentences), end="\n\n")

---Value of lazy iterators - train and vocab---
Unigram and bigram training iterators:
[('<s>',), ('<s>', 'how'), ('how',), ('how', 'are'), ('are',), ('are', 'you'), ('you',), ('you', '</s>'), ('</s>',)]

[('<s>',), ('<s>', 'i'), ('i',), ('i', 'am'), ('am',), ('am', 'fine'), ('fine',), ('fine', 'and'), ('and',), ('and', 'so'), ('so',), ('so', 'is'), ('is',), ('is', 'everything'), ('everything',), ('everything', '</s>'), ('</s>',)]

#############
Vocabulary iterator:
['<s>', 'how', 'are', 'you', '</s>', '<s>', 'i', 'am', 'fine', 'and', 'so', 'is', 'everything', '</s>']



#   Training an N-gram model

In [None]:
# Sentence-tokenizer data. NLTK >= 3.9 loads the 'punkt_tab' resource instead
# of the pickled 'punkt' models, and the install cell pulls the latest NLTK,
# so fetch both to keep sent_tokenize working across versions.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

url = "https://gist.githubusercontent.com/alvations/53b01e4076573fea47c6057120bb017a/raw/b01ff96a5f76848450e648f35da6497ca9454e4a/language-never-random.txt"

# Bound the request and fail loudly on an HTTP error; otherwise an error page
# would be silently saved and tokenized as if it were the corpus.
response = requests.get(url, timeout=30)
response.raise_for_status()

# NOTE: `text` is rebound here from the toy token lists to the raw corpus string.
text = response.content.decode('utf8')

# Keep a local copy of the downloaded corpus next to the notebook.
with open('language-never-random.txt', 'w', encoding='utf8') as fout:
    fout.write(text)

# One list of lower-cased word tokens per sentence.
tokenized_text = [
    [token.lower() for token in word_tokenize(sent)]
    for sent in sent_tokenize(text)
]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Show the first 500 characters of the raw corpus.
print("---Preview of training corpus---")
# A single print: wrapping it in a second print() also emitted the inner
# call's return value, i.e. a stray "None" line after the preview.
print(text[:500])
print()

---Preview of training corpus---
                       Language is never, ever, ever, random

                                                               ADAM KILGARRIFF




Abstract
Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in cor-
pora, the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish 
None



In [None]:
# Highest n-gram order of the model (3 = trigram model).
n = 3
# Lazy training n-grams and lazy padded token stream built from the corpus;
# both are handed to fit() below.
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

model = MLE(n)
print("------ Initializing Model ------")
# The vocabulary is still empty before fitting.
print("Length of vocabulary: ", len(model.vocab))
print("------ Fitting Model ------")
model.fit(train_data, padded_sents)
print(model.vocab)
print("Length of vocabulary: ", len(model.vocab), end="\n\n")

# Pass the first tokenized sentence through the fitted vocabulary.
print("---Preview of training corpus---")
first_sentence = tokenized_text[0]
print(model.vocab.lookup(first_sentence), end="\n\n")

# 'lah' does not occur in the corpus, so the lookup is expected to replace it
# with the vocabulary's unknown label.
print("---Model output with unseen data---")
unseen_tokens = ['language', 'is', 'never', 'random', 'lah', '.']
print(model.vocab.lookup(unseen_tokens), end="\n\n")

# Using the N-gram language model

print("---Trained model with count of N-grams---")
print(model.counts, end="\n\n")

# Raw counts: the unigram, then a word following a one-word and a two-word
# context (the context is passed as a list of tokens).
for label, count in (
    ("count('language') = ", model.counts['language']),
    ("count('language is') = ", model.counts[['language']]['is']),
    ("count('language is never') = ", model.counts[['language', 'is']]['never']),
):
    print(label, count)
print()

# Probabilities of the same three events.
for label, probability in (
    ("P('language') = ", model.score('language')),
    ("P('is' | 'language') = ", model.score('is', ['language'])),
    ("P('never' | 'language is') = ", model.score('never', ['language', 'is'])),
):
    print(label, probability)
print()

# Log-probabilities of the same events (the recorded values match log2 of the
# scores above).
for label, log_probability in (
    ("P_log('language') = ", model.logscore('language')),
    ("P_log('is' | 'language') = ", model.logscore('is', ['language'])),
    ("P_log('never' | 'language is') = ", model.logscore('never', ['language', 'is'])),
):
    print(label, log_probability)
print()

# Sentence generation using N-gram model

print("---Sentence generated using N-gram---")
# Fixed seed so the sampled token list is the same on every run.
generated_tokens = model.generate(20, random_seed=7)
print(generated_tokens, end="\n\n")

# Compare the score of the unknown label with the score of a word that is
# not in the corpus ('lah').
unk_score = model.score("<UNK>")
unseen_word_score = model.score("lah")
print(unk_score == unseen_word_score)


------ Initializing Model ------
Length of vocabulary:  0
------ Fitting Model ------
<Vocabulary with cutoff=1 unk_label='<UNK>' and 1391 items>
Length of vocabulary:  1391

---Preview of training corpus---
('language', 'is', 'never', ',', 'ever', ',', 'ever', ',', 'random', 'adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.')

---Model output with unseen data---
('language', 'is', 'never', 'random', '<UNK>', '.')

---Trained model with count of N-grams---
<NgramCounter with 3 ngram orders and 19611 ngrams>

count('language') =  25
count('language is') =  11
count('language is never') =  7

P('language') =  0.003691671588895452
P('is' | 'language') =  0.44
P('never' | 'language is') =  0.6363636363636364

P_log('language') =  -8.081510068120917
P_log('is' | 'language') =  -1.1844245711374275
P_log('never' | 'language is') =  -0.6520766965796932

---Sentence generated using N-gram-

In [None]:
# Module-level helper used by generate_sent: the bound `detokenize` method of
# a single shared TreebankWordDetokenizer instance.
_treebank_detokenizer = TreebankWordDetokenizer()
detokenize = _treebank_detokenizer.detokenize

def generate_sent(model, num_words, random_seed=42):
    """
    Generate words from the model and return them as one detokenized string.

    '<s>' padding tokens are skipped and the output is cut at the first
    '</s>', so the result may contain fewer than `num_words` words.

    :param model: An ngram language model from `nltk.lm.model`.
    :param num_words: Max no. of words to generate.
    :param random_seed: Seed value for random.
    :return: The kept words joined by the module-level `detokenize`.
    """
    tokens = model.generate(num_words, random_seed=random_seed)
    # generate() returns a bare string rather than a list when asked for a
    # single word; wrap it so the loop walks words, not individual characters.
    if isinstance(tokens, str):
        tokens = [tokens]

    content = []
    for token in tokens:
        if token == '<s>':
            # Start-of-sentence padding carries no text.
            continue
        if token == '</s>':
            # End-of-sentence marker: stop at the first complete sentence.
            break
        content.append(token)
    return detokenize(content)

# Same seed as the raw generate() call earlier, now rendered as a sentence.
print("---Generated sentence converted to human-readable form---")
readable_sentence = generate_sent(model, 20, random_seed=7)
print(readable_sentence, end="\n\n")

# First tokenized sentence as seen through the model's vocabulary.
first_sentence_lookup = model.vocab.lookup(tokenized_text[0])
print(first_sentence_lookup)

---Generated sentence converted to human-readable form---
and carroll used hypothesis testing has been used, and a half.

('language', 'is', 'never', ',', 'ever', ',', 'ever', ',', 'random', 'adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.')
