In [1]:
import os
from argparse import Namespace
import collections
import nltk.data
import numpy as np
import pandas as pd
import re
import string
from tqdm import tqdm_notebook
import random
import warnings
warnings.filterwarnings("ignore")

#nltk.download('punkt')

In [2]:
args = Namespace(
    raw_dataset_txt   = "frankenstein.txt",
    window_size       = 3,
    train_proportion  = 0.7,
    val_proportion    = 0.15,
    test_proportion   = 0.15,
    output_csv        = "frankenstein_with_splits.csv",
)

# 1. Utilizes the sentence tokenizer from NLTK  to segment English text into sentences.
### - [nltk.data.load](https://www.nltk.org/api/nltk.data.html): Load a given resource from the NLTK data package. Use the NLTK Downloader to obtain the resource before loading:  nltk.download('punkt').
### - [punkt.PunktSentenceTokenizer](https://www.nltk.org/api/nltk.tokenize.PunktSentenceTokenizer.html): A sentence tokenizer which uses an unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences; and then uses that model to find sentence boundaries. This approach has been shown to work well for many European languages.
### - PunktSentenceTokenizer.tokenize(): Given a text, returns a list of the sentences in that text.

In [3]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [4]:
text_example = "Pluto was discovered in 1930 by Clyde W. Tombaugh, making it by far the first known object in the Kuiper belt. It was immediately hailed as the ninth planet, but it was always the odd object out, and its planetary status was questioned when it was found to be much smaller than expected. These doubts increased following the discovery of additional objects in the Kuiper belt starting in the 1990s, and particularly the more massive scattered disk object Eris in 2005. In 2006, the International Astronomical Union (IAU) formally redefined the term planet to exclude dwarf planets such as Pluto. Many planetary astronomers, however, continue to consider Pluto and other dwarf planets to be planets."
sentences_example = tokenizer.tokenize(text_example)

In [5]:
text_example

'Pluto was discovered in 1930 by Clyde W. Tombaugh, making it by far the first known object in the Kuiper belt. It was immediately hailed as the ninth planet, but it was always the odd object out, and its planetary status was questioned when it was found to be much smaller than expected. These doubts increased following the discovery of additional objects in the Kuiper belt starting in the 1990s, and particularly the more massive scattered disk object Eris in 2005. In 2006, the International Astronomical Union (IAU) formally redefined the term planet to exclude dwarf planets such as Pluto. Many planetary astronomers, however, continue to consider Pluto and other dwarf planets to be planets.'

In [6]:
i = 0
for s in sentences_example:
    print(i)
    i+=1
    print(s)
    print('-'*60)


0
Pluto was discovered in 1930 by Clyde W. Tombaugh, making it by far the first known object in the Kuiper belt.
------------------------------------------------------------
1
It was immediately hailed as the ninth planet, but it was always the odd object out, and its planetary status was questioned when it was found to be much smaller than expected.
------------------------------------------------------------
2
These doubts increased following the discovery of additional objects in the Kuiper belt starting in the 1990s, and particularly the more massive scattered disk object Eris in 2005.
------------------------------------------------------------
3
In 2006, the International Astronomical Union (IAU) formally redefined the term planet to exclude dwarf planets such as Pluto.
------------------------------------------------------------
4
Many planetary astronomers, however, continue to consider Pluto and other dwarf planets to be planets.
-----------------------------------------------

In [7]:
with open(args.raw_dataset_txt) as fp:
    book = fp.read()
sentences = tokenizer.tokenize(book)
print (len(sentences), "sentences")

a = random.randint(0, len(sentences))
print("Sample ({}):".format(a))
print(sentences[a])

3427 sentences
Sample (1378):
Cursed be the day, abhorred devil, in which you first saw
light!


# 2. Utilizes the sentence tokenizer from NLTK to segment English text into sentences.

In [8]:
def preprocess_text(text):
    ### converts all words in the text to lowercase
    text = ' '.join(word.lower() for word in text.split(" "))
    ### substitute [.,!?] with spaces before and after matched punctuation marks
    text = re.sub(r"([.,!?])", r" \1 ", text)
    ### replace [^a-zA-Z.,!?] with a single space
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

In [9]:
cleaned_sentences = [preprocess_text(sentence) for sentence in sentences]

a = random.randint(0, len(sentences))
print("Sample ({}):".format(a))
print("Before cleaning")
print(sentences[a])
print("After cleaning")
print(cleaned_sentences[a])

Sample (3273):
Before cleaning
Nay, these are virtuous
and immaculate beings!
After cleaning
nay , these are virtuous and immaculate beings ! 


# 3. Create windows
### - The window size used is a hyperparameter, and one that is fairly critical to CBOW. Too large of a window, and the model might fail to capture regularities; too small of a window, and the window might miss out on interesting dependencies.

In [10]:
MASK_TOKEN = "<MASK>"

### - [nltk.ngrams(sequence, n)](https://tedboy.github.io/nlps/generated/generated/nltk.ngrams.html): Return the ngrams generated from a sequence of items, as an iterator. 

In [11]:
### return an iterator
nltk.ngrams([1,2,3,4,5],3)

<zip at 0x7f8ba459ed00>

In [12]:
### return a list
list(nltk.ngrams([1,2,3,4,5], 3))

[(1, 2, 3), (2, 3, 4), (3, 4, 5)]

In [13]:
sentence_now = 'this is a ball'
sentence_now

'this is a ball'

In [14]:
window_size_now = 3
sequence_now = [MASK_TOKEN] * window_size_now + sentence_now.split(' ')\
               + [MASK_TOKEN] * window_size_now
sequence_now

['<MASK>',
 '<MASK>',
 '<MASK>',
 'this',
 'is',
 'a',
 'ball',
 '<MASK>',
 '<MASK>',
 '<MASK>']

In [15]:
ngrams_now = list(nltk.ngrams(sequence_now, window_size_now * 2 + 1))
ngrams_now

[('<MASK>', '<MASK>', '<MASK>', 'this', 'is', 'a', 'ball'),
 ('<MASK>', '<MASK>', 'this', 'is', 'a', 'ball', '<MASK>'),
 ('<MASK>', 'this', 'is', 'a', 'ball', '<MASK>', '<MASK>'),
 ('this', 'is', 'a', 'ball', '<MASK>', '<MASK>', '<MASK>')]

### - flatten(list): A lambda function that flattens the nested list outer_list into a single-layered list and return this resulting list.

In [16]:
flatten = lambda outer_list: [item for inner_list in outer_list for item in inner_list]

In [17]:
### The input needs to be a nested list
try: 
    flatten([1,2,3])
except Exception as e:
    print(e)

'int' object is not iterable


In [18]:
nested_list = [[1,2,3],['a','b','c','d']]
print("nested list")
print(nested_list)
print('-'*60)
print("flattened nested list")
print(flatten(nested_list))

nested list
[[1, 2, 3], ['a', 'b', 'c', 'd']]
------------------------------------------------------------
flattened nested list
[1, 2, 3, 'a', 'b', 'c', 'd']


In [19]:
nested_list = [(1,2,3),('a','b','c','d')]
print("nested list")
print(nested_list)
print('-'*60)
print("flattened nested list")
print(flatten(nested_list))

nested list
[(1, 2, 3), ('a', 'b', 'c', 'd')]
------------------------------------------------------------
flattened nested list
[1, 2, 3, 'a', 'b', 'c', 'd']


In [20]:
print("ngrams_now")
print('Shape: '+str(len(ngrams_now)))
print(ngrams_now)
print('-'*60)
print("flattened ngrams_now")
print('Shape: '+str(len(flatten(ngrams_now))))
print(flatten(ngrams_now))

ngrams_now
Shape: 4
[('<MASK>', '<MASK>', '<MASK>', 'this', 'is', 'a', 'ball'), ('<MASK>', '<MASK>', 'this', 'is', 'a', 'ball', '<MASK>'), ('<MASK>', 'this', 'is', 'a', 'ball', '<MASK>', '<MASK>'), ('this', 'is', 'a', 'ball', '<MASK>', '<MASK>', '<MASK>')]
------------------------------------------------------------
flattened ngrams_now
Shape: 28
['<MASK>', '<MASK>', '<MASK>', 'this', 'is', 'a', 'ball', '<MASK>', '<MASK>', 'this', 'is', 'a', 'ball', '<MASK>', '<MASK>', 'this', 'is', 'a', 'ball', '<MASK>', '<MASK>', 'this', 'is', 'a', 'ball', '<MASK>', '<MASK>', '<MASK>']


### Create the windows

In [21]:
# Create windows
flatten  = lambda outer_list: [item for inner_list in outer_list for item in inner_list]
_windows = [list(
                nltk.ngrams(
                    [MASK_TOKEN] * args.window_size + \
                    sentence.split(' ') + \
                    [MASK_TOKEN] * args.window_size, 
                    args.window_size * 2 + 1)) \
            for sentence in tqdm_notebook(cleaned_sentences)]
windows = flatten(_windows)

  0%|          | 0/3427 [00:00<?, ?it/s]

In [22]:
print("Number of windows (before flattening): {}".format(len(_windows)))
print("Number of windows (after flattening): {}".format(len(windows)))

Number of windows (before flattening): 3427
Number of windows (after flattening): 90698


In [23]:
print('Sample')
window_now = windows[0]
target_token_now = window_now[args.window_size]
print("window:")
print(window_now)
print("target_token: "+ target_token_now)


Sample
window:
('<MASK>', '<MASK>', '<MASK>', 'frankenstein', ',', 'or', 'the')
target_token: frankenstein


In [24]:
print('Sample')
window_now = windows[0]
target_token_now = window_now[args.window_size]
print("window:")
print(window_now)
print("target_token: "+ target_token_now)


Sample
window:
('<MASK>', '<MASK>', '<MASK>', 'frankenstein', ',', 'or', 'the')
target_token: frankenstein


### Create the cbow_data

In [25]:
data = []
for window in tqdm_notebook(windows):
    target_token = window[args.window_size]
    context = []
    for i, token in enumerate(window):
        if token == MASK_TOKEN or i == args.window_size:
            continue
        else:
            context.append(token)
    data.append([' '.join(token for token in context), target_token])
cbow_data = pd.DataFrame(data, columns=["context", "target"])  

  0%|          | 0/90698 [00:00<?, ?it/s]

In [26]:
print('Sample\n')
i = 0
window_now = windows[i]
print("window:")
print(window_now)
print("\ntarget:")
print(window_now[args.window_size])
print("\ndata: ")
print(cbow_data.loc[[i],])

Sample

window:
('<MASK>', '<MASK>', '<MASK>', 'frankenstein', ',', 'or', 'the')

target:
frankenstein

data: 
    context        target
0  , or the  frankenstein


### Create the split

In [27]:
# Create split data
n = len(cbow_data)
def get_split(row_num):
    if row_num <= n*args.train_proportion:
        return 'train'
    elif (row_num > n*args.train_proportion) and (row_num <= n*args.train_proportion + n*args.val_proportion):
        return 'val'
    else:
        return 'test'
cbow_data['split']= cbow_data.apply(lambda row: get_split(row.name), axis=1)

In [29]:
cbow_data.head()

Unnamed: 0,context,target,split
0,", or the",frankenstein,train
1,frankenstein or the modern,",",train
2,"frankenstein , the modern prometheus",or,train
3,"frankenstein , or modern prometheus by",the,train
4,", or the prometheus by mary",modern,train


# 4. Class frankenstein_munging

In [None]:
class frankenstein_munging():
    def __init__(self, txt_file, MASK_TOKEN):
        self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        
        with open(txt_file) as fp:
            self.book  = fp.read()
        
        self.sentences         = self.tokenizer.tokenize(self.book)

        self.MASK_TOKEN        = MASK_TOKEN
    

    
    
    def create_windows(self):
        def preprocess_text(text):
            text = ' '.join(word.lower() for word in text.split(" "))
            text = re.sub(r"([.,!?])", r" \1 ", text)
            text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
        return text
        
        flatten = lambda outer_list: [item for inner_list in outer_list for item in inner_list]
        
        _windows = [list(
                        nltk.ngrams(
                            [MASK_TOKEN] * args.window_size + \
                            sentence.split(' ') + \
                            [MASK_TOKEN] * args.window_size, 
                            args.window_size * 2 + 1)
                        ) \
                    for sentence in tqdm_notebook(cleaned_sentences)]
        
        windows = flatten(_windows)
    


In [31]:
def create_cbow_data(raw_dataset_txt,window_size):
    def preprocess_text(text):
        text = ' '.join(word.lower() for word in text.split(" "))
        text = re.sub(r"([.,!?])", r" \1 ", text)
        text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
        return text
    
    train_proportion  = 0.7
    val_proportion    = 0.15
    test_proportion   = 0.15
    
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    
    with open(raw_dataset_txt) as fp:
        book = fp.read()
    
    sentences = tokenizer.tokenize(book)
    
    cleaned_sentences = [preprocess_text(sentence) for sentence in sentences]
    
    MASK_TOKEN = "<MASK>"
    
    ##### Create windows
    flatten  = lambda outer_list: [item for inner_list in outer_list for item in inner_list]
    _windows = [list(
                    nltk.ngrams(
                        [MASK_TOKEN] * window_size + \
                        sentence.split(' ') + \
                        [MASK_TOKEN] * window_size, 
                        window_size * 2 + 1)) \
                for sentence in tqdm_notebook(cleaned_sentences)]
    windows = flatten(_windows)
    
    ##### Create cbow data
    data = []
    for window in tqdm_notebook(windows):
        target_token = window[window_size]
        context = []
        for i, token in enumerate(window):
            if token == MASK_TOKEN or i == window_size:
                continue
            else:
                context.append(token)
        data.append([' '.join(token for token in context), target_token])
    cbow_data = pd.DataFrame(data, columns=["context", "target"])  
    
    ##### Create split data
    n = len(cbow_data)
    def get_split(row_num):
        if row_num <= n*train_proportion:
            return 'train'
        elif (row_num > n*train_proportion) and (row_num <= n*train_proportion + n*val_proportion):
            return 'val'
        else:
            return 'test'
    cbow_data['split']= cbow_data.apply(lambda row: get_split(row.name), axis=1)
    
    return cbow_data

In [36]:
frankenstein_cbow_3 = create_cbow_data(raw_dataset_txt = raw_dataset_txt,
                                       window_size=3)

  0%|          | 0/3427 [00:00<?, ?it/s]

  0%|          | 0/90698 [00:00<?, ?it/s]

In [38]:
frankenstein_cbow_5 = create_cbow_data(raw_dataset_txt = raw_dataset_txt,
                                       window_size = 5)

  0%|          | 0/3427 [00:00<?, ?it/s]

  0%|          | 0/90698 [00:00<?, ?it/s]

In [39]:
frankenstein_cbow_3.head()

Unnamed: 0,context,target,split
0,", or the",frankenstein,train
1,frankenstein or the modern,",",train
2,"frankenstein , the modern prometheus",or,train
3,"frankenstein , or modern prometheus by",the,train
4,", or the prometheus by mary",modern,train


In [40]:
frankenstein_cbow_5.head()

Unnamed: 0,context,target,split
0,", or the modern prometheus",frankenstein,train
1,frankenstein or the modern prometheus by,",",train
2,"frankenstein , the modern prometheus by mary",or,train
3,"frankenstein , or modern prometheus by mary wo...",the,train
4,"frankenstein , or the prometheus by mary wolls...",modern,train
