### Task 0. 
Take an arbitrary text from NLTK corpora (e.g. text3) and implement a Bag-of-Words tagger for it.

In [1]:
import nltk

In [5]:
from nltk.book import *
def get_vocabulary(corpus):
    vocabulary = set()
    for text in corpus:
        words = gutenberg.words(text)
        cleaned_words = {word.lower() for word in words if word.isalpha()}
        vocabulary.update(cleaned_words)
    return sorted(vocabulary)

vocabulary = get_vocabulary(['melville-moby_dick.txt', 'Austen-Emma.txt'])

In [11]:
from nltk.tokenize import word_tokenize
from collections import Counter
def count(word, document):
    return Counter(gutenberg.words(document)).get(word, 0)
count("a", "melville-moby_dick.txt")

4569

In [10]:
import numpy as np
def bow_tagger(document, corpus):
    vocabulary = get_vocabulary(corpus)
    bow_representation = np.zeros(len(vocabulary))
    word_count = Counter(gutenberg.words(document))
    for (i,word) in enumerate(vocabulary):
        bow_representation[i] = word_count.get(word, 0)
    return bow_representation
bow_md = bow_tagger('melville-moby_dick.txt', ['melville-moby_dick.txt', 'Austen-Emma.txt']).astype(int)
bow_md

array([4569,    2,    2, ...,    3,    1,    0])

### Task 1 
 Enhance the tagger so that it will use N-grams instead of words

In [51]:
from nltk.util import ngrams
def get_ngram_vocabulary(corpus, N):
    voc = []
    for document in corpus:
        tokens = gutenberg.words(document)
        tokens = [word for word in tokens if word.isalnum()]
        voc.extend(list(ngrams(tokens, N)))
    return list(set(voc))
get_ngram_vocabulary(['melville-moby_dick.txt', 'Austen-Emma.txt'], 3)

[('much', 'more', 'philosophic'),
 ('supplemental', 'casks', 'had'),
 ('CHAPTER', '2', 'The'),
 ('crown', 'of', 'geographical'),
 ('what', 'drugged', 'whales'),
 ('around', 'her', 'eager'),
 ('on', 'with', 'the'),
 ('could', 'command', 'that'),
 ('mounted', 'to', 'the'),
 ('for', 'any', 'consideration'),
 ('takes', 'a', 'fancy'),
 ('refusing', 'from', 'conscientious'),
 ('laughing', 'we', 'shall'),
 ('gore', 'capsizing', 'Flask'),
 ('some', 'king', 's'),
 ('man', 'has', 'a'),
 ('and', 'model', 'thyself'),
 ('brave', 'hearts', 'snap'),
 ('to', 'pause', 'in'),
 ('My', 'dear', 'dear'),
 ('feel', 'that', 'any'),
 ('high', 'A', 'whole'),
 ('not', 'in', 'use'),
 ('there', 'strangely', 'eyeing'),
 ('Commend', 'the', 'murderous'),
 ('just', 'as', 'I'),
 ('There', 'is', 'one'),
 ('part', 'of', 'his'),
 ('that', 'you', 'might'),
 ('and', 'sit', 'down'),
 ('again', 'while', 'the'),
 ('exquisite', 'feelings', 'of'),
 ('seriously', 'of', 'Miss'),
 ('But', 'butchers', 'also'),
 ('it', 'And', 'I'),
 

In [49]:
def bon_tagger(document, corpus, N):
    vocabulary = get_ngram_vocabulary(corpus, N)
    bon_representation = np.zeros(len(vocabulary))
    tokens = gutenberg.words(document)
    tokens = [word for word in tokens if word.isalnum()]
    ngrams_count = Counter(list(ngrams(tokens, N)))
    for (i,ngram) in enumerate(vocabulary):
        bon_representation[i] = ngrams_count.get(ngram, 0)
    return bon_representation
    

In [50]:
bon_md = bon_tagger('melville-moby_dick.txt', ['melville-moby_dick.txt', 'Austen-Emma.txt'], 3).astype(int)
bon_md

array([0, 1, 1, ..., 0, 0, 0])

### Task 2
 Implement PPMI weighting with co-occurrence based on the presence within the same paragraph.

In [14]:
def get_paragraphs(corpus):
    paragraphs = []
    for text in corpus:
        paragraphs += [p.strip().lower() for p in gutenberg.raw(text).split('\r\n\r\n') if p.strip()]
    return paragraphs


def co_parag(word, context, corpus):
    paragraphs = get_paragraphs(corpus)
    w_occurance, c_occurance, co_occurance = 0, 0, 0
    for paragraph in paragraphs:
        words = word_tokenize(paragraph)
        co_occurance += word.lower() in words and context.lower() in words
        w_occurance += word.lower() in words
        c_occurance += context.lower() in words
    return np.array([w_occurance, c_occurance, co_occurance])


moby_dick = 'melville-moby_dick.txt'
caesar = 'shakespeare-caesar.txt'
co_parag("the", "a", [moby_dick, caesar])

array([2176, 1566, 1438])

In [15]:
def ppmi(word, context, corpus):
    occurances = co_parag(word, context, corpus) / len(get_paragraphs(corpus))
    ppmi = np.log2(occurances[2] / (occurances[0]*occurances[1])) 
    ppmi = max([ppmi, 0])
    return ppmi
ppmi("the", "a", [moby_dick, caesar])

0.23763292730738073

In [17]:
ppmi("Moby", "dick", [moby_dick, caesar])

5.298369185904781

### Task 3
 Implement PPMI weighting with co-occurrence based on a sliding window of neighboring words. Pick some number between 2-10.


In [18]:
def get_words(corpus):
    words = []
    for text in corpus:
        words += list(gutenberg.words(text))
    return words

def co_window(word, context, corpus):
    words = get_words(corpus)
    w_occurance, c_occurance, co_occurance = 0, 0, 0
    for idx, w in enumerate(list(words)):
        if w.lower() == context.lower():
            c_occurance += 1
        elif w.lower() == word.lower():
            w_occurance += 1
            co_occurance += context.lower() in list(map(lambda x: x.lower(), words[idx-5: idx+5]))
    return np.array([w_occurance, c_occurance, co_occurance])
co_window("moby", "dick", [moby_dick, caesar])

array([84, 84, 83])

In [19]:
def ppmi2(word, context, corpus):
    occurances = co_window(word, context, corpus) / len(get_words(corpus))
    ppmi = np.log2((occurances[2]) / (occurances[0]*occurances[1])) 
    ppmi = max([ppmi, 0])
    return ppmi 
ppmi2("moby", "dick", [moby_dick, caesar])

11.719345405127092

### Task 4

To solve the proble we can use the `Add-one smoothing` method. The problem arises because we have very small occurance values for the infrequent words. So if we added some small positive constant to the numerator and denominator in PPMMI formula, the value for the infrequent words would normilize, as adding even a very small constant will decrease the value, while value for the frequent words will not change a lot with adding a tiny constant. Let's check it

In [61]:
def ppmi2_upd(word, context, corpus):
    occurances = co_window(word, context, corpus) / len(get_words(corpus))
    ppmi = np.log2((occurances[2] + 1e-5) / (occurances[0]*occurances[1] + 1e-5)) 
    ppmi = max([ppmi, 0])
    return ppmi 

In [62]:
print("-" * 20 + "For frequent words" + "-"*20)
print(f'Before Add-one smoothing PPMI for "the" and "a": {ppmi2("the", "a", [moby_dick, emma])}')
print(f'After Add-one smoothing PPMI for "the" and "a": {ppmi2_upd("the", "a", [moby_dick, emma])}')
print("-" * 20 + "For infrequent words" + "-"*20)
print(f'Before Add-one smoothing PPMI for "Moby" and "Dick": {ppmi2("Moby", "Dick", [moby_dick, emma])}')
print(f'After Add-one smoothing PPMI for "Moby" and "Dick": {ppmi2_upd("Moby", "Dick", [moby_dick, emma])}')

--------------------For frequent words--------------------
Before Add-one smoothing PPMI for "the" and "a": 2.226220092372014
After Add-one smoothing PPMI for "the" and "a": 2.2112484636462186
--------------------For infrequent words--------------------
Before Add-one smoothing PPMI for "Moby" and "Dick": 12.380339348136902
After Add-one smoothing PPMI for "Moby" and "Dick": 4.266505168587376


### Task 5
 Check how algorithm works using English thesaurus. Pick some 10 words, find synonyms for these, e.g. using https://www.merriam-webster.com/thesaurus. Note that semantic similarity is represented in different shades of orange. Does it match the output of PPMI weighting function? Would be nice if you could also draw a table with shaded cells matching closeness given by PPMI.

In [26]:
syns1 = ["large", "great","huge", "bulky", "grand", "colossal", "high", "noble"]
word1 = "big"
ppmis1 = [ppmi(word, syn, [moby_dick, caesar]) for syn in syns1]
ppmis1

  ppmi = np.log2(occurances[2] / (occurances[0]*occurances[1]))


[1.5382232216394212, 0.7589919004962051, 0, 0, 0, 0, 0, 0]