In [1]:
import pandas as pd
import numpy as np
import nltk
import re

In [2]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
def normalize_document(doc):
    doc = re.sub(r'[^a-zA-Z\s]','',doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc
normalize_corpus = np.vectorize(normalize_document)

## word2Vec

In [3]:
from nltk.corpus import gutenberg
from string import punctuation


In [4]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [7]:
bible  =  gutenberg.sents('bible-kjv.txt')
print(len(bible))
print(bible[10])
print(' '.join(bible[10]))

30103
['1', ':', '6', 'And', 'God', 'said', ',', 'Let', 'there', 'be', 'a', 'firmament', 'in', 'the', 'midst', 'of', 'the', 'waters', ',', 'and', 'let', 'it', 'divide', 'the', 'waters', 'from', 'the', 'waters', '.']
1 : 6 And God said , Let there be a firmament in the midst of the waters , and let it divide the waters from the waters .


In [11]:
remove_term = punctuation + '0123456789'
norm_bible = [[word.lower() for word in sent if word not in remove_term] for sent in bible]
norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible]
norm_bible = filter(None,normalize_corpus(norm_bible))
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split())>2]

In [14]:
print('Total Line:', len(bible))
print('Sample Line:', bible[10])
print('Normalized:', norm_bible[10])

Total Line: 30103
Sample Line: ['1', ':', '6', 'And', 'God', 'said', ',', 'Let', 'there', 'be', 'a', 'firmament', 'in', 'the', 'midst', 'of', 'the', 'waters', ',', 'and', 'let', 'it', 'divide', 'the', 'waters', 'from', 'the', 'waters', '.']
Normalized: god said let firmament midst waters let divide waters waters


# CBoW
- vocabulary
- pairwose context words
- CBOW
- Train
- Test

In [18]:
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import skipgrams

tokenizer =  text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
word2id= tokenizer.word_index
word2id['PAD'] = 0

id2word = {v:k for k,v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)]for doc in norm_bible]

print('Samples', list(word2id.items())[:10])
print('Vocabulary Size', len(word2id))

Samples [('shall', 1), ('unto', 2), ('lord', 3), ('thou', 4), ('thy', 5), ('god', 6), ('ye', 7), ('said', 8), ('thee', 9), ('upon', 10)]
Vocabulary Size 12425


In [23]:
vocab_size = len(word2id)
embed_size = 100
window_size = 2

In [None]:
word2id.items()

In [32]:
def word_pairs(corpus,window_size,vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words =[]
            label_word = []
            start = index - window_size
            end = index + window_size+1
            
            context_words.append([words[i] for i in range(start, end) if 0<=i<sentence_length and i!=index])
            label_word.append(word)
            
            x = sequence.pad_sequences(context_words, maxlen=context_length)
            y = to_categorical(label_word, vocab_size)
            yield (x,y)

In [33]:
i = 0
for x, y in word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1  

Context (X): ['old', 'testament', 'james', 'bible'] -> Target (Y): king
Context (X): ['first', 'book', 'called', 'genesis'] -> Target (Y): moses
Context (X): ['beginning', 'god', 'heaven', 'earth'] -> Target (Y): created
Context (X): ['earth', 'without', 'void', 'darkness'] -> Target (Y): form
Context (X): ['without', 'form', 'darkness', 'upon'] -> Target (Y): void
Context (X): ['form', 'void', 'upon', 'face'] -> Target (Y): darkness
Context (X): ['void', 'darkness', 'face', 'deep'] -> Target (Y): upon
Context (X): ['spirit', 'god', 'upon', 'face'] -> Target (Y): moved
Context (X): ['god', 'moved', 'face', 'waters'] -> Target (Y): upon
Context (X): ['god', 'said', 'light', 'light'] -> Target (Y): let
Context (X): ['god', 'saw', 'good', 'god'] -> Target (Y): light


In [28]:
word_pairs(wids,window_size,vocab_size)

<generator object word_pairs at 0x00000206BFD7B430>

# Build CBOW Deep Learning Model

In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Lambda, Dense
import tensorflow.keras.backend as K

cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x,axis=1), output_shape=(embed_size,1)))
cbow.add(Dense(vocab_size,activation='softmax'))

cbow.compile(loss='categorical_crossentropy', optimizer='adam')
print(cbow.summary())



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            1242500   
_________________________________________________________________
lambda_1 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 12425)             1254925   
Total params: 2,497,425
Trainable params: 2,497,425
Non-trainable params: 0
_________________________________________________________________
None


# Train model

In [None]:
for epoch in range(1,51):
    loss = 0.0
    i=0
    for x,y in word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        loss+= cbow.train_on_batch(x,y)
        i+=1
        if i%100==0:
            print('Processed {}'.format(i))
    print('Epoch: ', epoch, '\t', loss)
    print()


In [41]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)
pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(12424, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
unto,0.005532,-0.028494,0.027624,-0.046013,0.034911,-0.044818,0.037544,0.041854,-0.061618,-0.011193,...,-0.027403,0.006258,0.065401,-0.020276,-0.024361,-0.003698,-0.015867,0.130519,-0.062761,0.081934
lord,-0.012903,-0.008829,-0.113023,0.086098,-0.054459,-0.047707,-0.016049,0.023504,0.036188,0.109216,...,-0.023334,0.036802,0.035012,0.030573,0.050417,0.018698,-0.022541,0.123939,0.03076,0.069052
thou,-0.07865,0.047485,-0.047135,0.071397,-0.081151,-0.102222,0.087451,-0.062817,0.099307,0.132839,...,0.086158,0.017178,-0.006755,0.014293,0.030317,0.00262,0.057206,-0.052498,0.022344,-0.071545
thy,-0.090057,0.025937,-0.124368,0.030982,0.029679,-0.147876,0.07511,-0.033713,0.064994,0.14551,...,-0.020966,0.020684,0.001814,-0.018096,0.018874,-0.070777,-0.042339,0.02342,-0.041602,0.064784
god,-0.06604,0.006885,-0.074482,0.08284,-0.008219,-0.065309,0.027086,-0.012326,-0.011355,0.105838,...,0.0205,0.002825,-0.009807,-0.031183,0.082439,-0.04606,0.041017,0.028875,0.085337,0.045473


In [42]:
# find the similar of words or cluster of word
from sklearn.metrics.pairwise import euclidean_distances
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

(12424, 12424)


In [44]:
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['god', 'lord', 'thou', 'thy', 'john', 'gospel', 'moses','famine']}

similar_words

{'god': ['tents', 'gods', 'bird', 'children', 'sort'],
 'lord': ['return', 'adam', 'therein', 'high', 'james'],
 'thou': ['saying', 'began', 'unto', 'canaan', 'male'],
 'thy': ['went', 'canaan', 'nakedness', 'two', 'sort'],
 'john': ['imagineth', 'ether', 'benhanan', 'priest', 'riding'],
 'gospel': ['undersetters', 'herewith', 'foameth', 'presented', 'beryl'],
 'moses': ['moving', 'dwelt', 'havilah', 'hills', 'given'],
 'famine': ['reviving', 'behave', 'bestead', 'rolled', 'bethshittah']}