In [1]:
import bs4 as bs
import urllib.request
import re
import nltk
import string
import numpy as np
import gensim

In [2]:
# PURPOSE: load Google's pre-trained Word2Vec model.
pretrained_model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

In [46]:
# PURPOSE: get data and preprocess raw text

scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scrapped_data.read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')


article_text = ""

for p in paragraphs[:10]:
    article_text += p.text

# with open('./article.txt', 'r') as file:
#     article_text = file.read().replace('\n', '')
    
article_text = re.sub('\[.*?\]', '', article_text)  # strip '[*]'
article_text = re.sub('\(|\)|"', '', article_text)  # strip '()' and '"'
raw_sentences = article_text.split('.')  # split into sentences

corpus = set()
text = []
x = []

# construct corpus and text
for raw_sentence in raw_sentences:
    raw_sentence = raw_sentence.translate(str.maketrans('', '', string.punctuation))  # strip all punctuation
    raw_sentence = raw_sentence.lower()
    x.append(raw_sentence)
    temp = raw_sentence.split()
    if len(temp) < 6:  # skip sentences with less than 6 words
        continue
    text.append(temp)
    for i in temp:
        corpus.add(i.lower())

print(x)
# construct word_to_index and index_to_word mapping
word_to_index = {}
index_to_word = {}
index = 0
for i in corpus:
    word_to_index[i] = index
    index_to_word[index] = i
    index += 1
    
corpus_length = len(corpus)

# softmax function
def softmax(array):
    ex = np.exp(array - np.max(array))
    return ex / ex.sum()

# sigmoid function for negative sampling
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

['\n\nartificial intelligence ai is intelligence demonstrated by machines as opposed to the natural intelligence displayed by humans or animals', ' \nleading ai textbooks define the field as the study of intelligent agents any system that perceives its environment and takes actions that maximize its chance of achieving its goals', ' \nsome popular accounts use the term artificial intelligence to describe machines that mimic cognitive functions that humans associate with the human mind such as learning and problem solving however this definition is rejected by major ai researchers', '\nai applications include advanced web search engines i', 'e', ' google recommendation systems used by youtube amazon and netflix understanding human speech such as siri or alexa selfdriving cars e', 'g', ' tesla and competing at the highest level in strategic game systems such as chess and go\nas machines become increasingly capable tasks considered to require intelligence are often removed from the defini

In [90]:
print("length of corpus: {}".format(corpus_length))
print("number of sentences: {}".format(len(text)))
print(list(word_to_index.items())[:10])
#print(corpus)
print(text)

length of corpus: 289
number of sentences: 23
[('natural', 0), ('things', 1), ('representation', 2), ('potential', 3), ('environment', 4), ('understanding', 5), ('from', 6), ('behavior', 7), ('successful', 8), ('mind', 9)]
[['artificial', 'intelligence', 'ai', 'is', 'intelligence', 'demonstrated', 'by', 'machines', 'as', 'opposed', 'to', 'the', 'natural', 'intelligence', 'displayed', 'by', 'humans', 'or', 'animals'], ['leading', 'ai', 'textbooks', 'define', 'the', 'field', 'as', 'the', 'study', 'of', 'intelligent', 'agents', 'any', 'system', 'that', 'perceives', 'its', 'environment', 'and', 'takes', 'actions', 'that', 'maximize', 'its', 'chance', 'of', 'achieving', 'its', 'goals'], ['some', 'popular', 'accounts', 'use', 'the', 'term', 'artificial', 'intelligence', 'to', 'describe', 'machines', 'that', 'mimic', 'cognitive', 'functions', 'that', 'humans', 'associate', 'with', 'the', 'human', 'mind', 'such', 'as', 'learning', 'and', 'problem', 'solving', 'however', 'this', 'definition', '

In [105]:
# PURPOSE: construct CBOW model class

class CBOW_model:
    def __init__(self):
        self.x_train = []
        self.y_train = []
        self.y_predict = []
        self.hidden = []
        self.output = []
        self.W1 = []
        self.W2 = []
        self.embedding_dimension = 0
        self.window_size = 0
        self.alpha = 0
        self.epoch = 0
        self.negative_sampling = False
    
    # inialize the model eg. train & window_size & alpha, etc.  
    def initialize(self, window_size=1, negative_sampling=False, embedding_dimension=300, alpha=1, epoch=1000, google_news=False):
        # initialize hyperparameters
        self.window_size = window_size
        self.negative_sampling = negative_sampling
        self.embedding_dimension = embedding_dimension
        self.alpha = alpha
        self.epoch = epoch
        
        # generate one-hot vectors for target word (y) and context word (x)
        for sentence in text:
            for index in range(len(sentence)):
                target_word = sentence[index]
                # target one-hot vector
                target_vector = np.zeros(corpus_length, dtype=int)
                target_vector[word_to_index[target_word]] = 1
                self.y_train.append(target_vector)
                # context one-hot vector
                context_word = set(sentence[index-window_size:index] + sentence[index+1:index+window_size+1])
                context_vector = np.zeros(corpus_length, dtype=int)
                for i in context_word:
                    context_vector[word_to_index[i]] = 1
                # early averaging the context vector due to nature of multiple word CBOW model
                context_vector = context_vector / len(context_word)
                self.x_train.append(context_vector)
        
        # initialize weight metrics
        self.W1 = np.random.uniform(-0.8, 0.8, (corpus_length, self.embedding_dimension))
        self.W2 = np.random.uniform(-0.8, 0.8, (self.embedding_dimension, corpus_length))
        
        # use Google-news pretrained data
        if google_news:
            for word in corpus:
                index = word_to_index[word]
                if word in pretrained_model:
                    self.W1[index] = pretrained_model[word]
        
    
    # forward propagation
    def forward_propagation(self, x):
        X = self.x_train[x]
        self.hidden = np.dot(self.W1.T, X)
        self.output = np.dot(self.W2.T, self.hidden)
        self.y_predict = softmax(self.output)
    
    # backward propagation
    def backward_propagation(self, x):
        e = self.y_predict - self.y_train[x]
        dW2 = np.outer(self.hidden, e)
        dW1 = np.outer(self.x_train[x], np.dot(self.W2, e))
        self.W2 = self.W2 - self.alpha * dW2
        self.W1 = self.W1 - self.alpha * dW1
        
    # calculate loss
    def calculate_loss(self, x):
#         sum1 = 0
#         target_word_one_hot_vector = self.y_train[x]
#         for i in range(len(target_word_one_hot_vector)):
#             if target_word_one_hot_vector[i] == 1:
#                 sum1 = -self.output[i]
#                 break
#         sum2 = np.log(np.sum(np.exp(self.output)))
#         return sum1 + sum2
        target_word = ''
        output = 0
        target_word_one_hot_vector = self.y_train[x]
        for i in range(len(target_word_one_hot_vector)):
            if target_word_one_hot_vector[i] == 1:
                # target_word = index_to_word[i]
                output = -np.log(self.y_predict[i])
                # print('target word {}, loss = {}'.format(target_word, output))
                return output, i
        
    
    # training the model
    def train(self):
        for i in range(self.epoch):
            epoch_loss = 0
            correct = 0
            for x in range(len(self.x_train)):
                self.forward_propagation(x)
                self.backward_propagation(x)
                current_loss, index = self.calculate_loss(x)
                epoch_loss += current_loss
                if np.argmax(self.y_predict) == index:
                    correct += 1
            epoch_loss /= len(self.x_train)
            if i % 100 == 0 or i == self.epoch - 1:
                print('epoch{} loss: {} accuracy: {}'.format(i, epoch_loss, correct / len(self.x_train)))    

In [116]:
cbow_model = CBOW_model()
cbow_model.initialize(alpha=0.01, epoch=500, google_news=False, window_size=3)
cbow_model.train()

epoch0 loss: 6.985438459959627 accuracy: 0.0018484288354898336
epoch100 loss: 0.169901391136197 accuracy: 0.9926062846580407
epoch200 loss: 0.07027234811183239 accuracy: 0.9926062846580407
epoch300 loss: 0.04560652987982047 accuracy: 0.9926062846580407
epoch400 loss: 0.034569372398902926 accuracy: 0.9926062846580407
epoch499 loss: 0.02838439404600512 accuracy: 0.9926062846580407


In [50]:
print(cbow_model.x_train[57])

[0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [52]:
print(predict(cbow_model, ['maximize', 'chance']))

0.9999688851741054
0.9999999999999998
its


In [114]:
# PURPOSE: construct CBOW model class with negative sampling

class CBOW_model_negative_sampling:
    def __init__(self):
        self.x_train_one_hot = []
        self.x_train_context_words = []
        self.y_train_positive = []
        self.y_train_negative = []
        self.y_predict = []
        self.hidden = []
        self.output = []
        self.cost = 0
        self.W1 = []
        self.W2 = []
        self.embedding_dimension = 0
        self.window_size = 0
        self.alpha = 0
        self.epoch = 0
        self.Pnw = {}
        self.k = 0
    
    # inialize the model eg. train & window_size & alpha, etc.  
    def initialize(self, window_size=1, embedding_dimension=300, alpha=0.01, epoch=1000, google_news=False, k=10):
        # initialize hyperparameters
        self.window_size = window_size
        self.embedding_dimension = embedding_dimension
        self.alpha = alpha
        self.epoch = epoch
        self.k = k
        
        # compute Pnw
        power = 3 / 4
        frequency = {}
        raw_text = []
        for sentence in text:
            for word in sentence:
                raw_text.append(word)

        for word in raw_text:
            if word in frequency:
                frequency[word] += 1
            else:
                frequency[word] = 1
        for key in frequency:
            frequency[key] /= len(raw_text)
        temp = {key: val ** power for key, val in frequency.items()}
        Z = sum(temp.values())
        self.Pnw = {key: val / Z for key, val in temp.items()}
        print(sum(self.Pnw.values()))
        
        # generate one-hot vectors for target word (y_train_positive & y_train_negative) and context word (x_train)
        for sentence in text:
            for index in range(len(sentence)):
                target_word = sentence[index]
                # context one-hot vector x
                context_word = set(sentence[index-window_size:index] + sentence[index+1:index+window_size+1])
                context_vector = np.zeros(corpus_length, dtype=int)
                for i in context_word:
                    context_vector[word_to_index[i]] = 1
                # early averaging the context vector due to nature of multiple word CBOW model
                context_vector = context_vector / len(context_word)
                self.x_train_one_hot.append(context_vector)
                self.x_train_context_words.append(context_word)
                # y_train_positive
                self.y_train_positive.append(target_word)
                # y_train_negative
                number_of_negative_words = self.k
                negative_words = []
                while number_of_negative_words > 0: 
                    choice = np.random.choice(list(self.Pnw.keys()), p=list(self.Pnw.values()))
                    if choice == target_word or choice in context_word:
                        continue
                    else:
                        negative_words.append(choice)
                        number_of_negative_words -= 1
                self.y_train_negative.append(negative_words)
                
        
        # initialize weight metrics
        self.W1 = np.random.uniform(-0.8, 0.8, (self.embedding_dimension, corpus_length))
        self.W2 = np.random.uniform(-0.8, 0.8, (corpus_length, self.embedding_dimension))
        
        # use Google-news pretrained data
        if google_news:
            for word in corpus:
                index = word_to_index[word]
                if word in pretrained_model:
                    self.W1[:,index] = pretrained_model[word]
                       
                    
    # forward propagation
    def forward_propagation(self, x):
        X = self.x_train_one_hot[x]
        self.hidden = np.dot(self.W1, X)
        c_positive = self.W2[word_to_index[self.y_train_positive[x]]]
        pos_cost = -np.log(sigmoid(np.dot(c_positive, self.hidden)))
        neg_cost = 0
        for negative_word in self.y_train_negative[x]:
            c_negative = self.W2[word_to_index[negative_word]]
            neg_cost += np.log(sigmoid(np.dot(-c_negative, self.hidden)))
        self.cost = pos_cost - neg_cost

    # backward propagation
    def backward_propagation(self, x):
        c_positive_index = word_to_index[self.y_train_positive[x]]
        # compute W1
        dW1 = (sigmoid(np.dot(self.W2[c_positive_index], self.hidden)) - 1) * self.W2[c_positive_index]
        # compute W2
        self.W2[c_positive_index] = self.W2[c_positive_index] - self.alpha * ((sigmoid(np.dot(self.W2[c_positive_index], self.hidden))) - 1) * self.hidden
        for negative_word in self.y_train_negative[x]:
            c_negative_index = word_to_index[negative_word]
            # compute W1
            dW1 += sigmoid(np.dot(self.W2[c_negative_index], self.hidden)) * self.W2[c_negative_index]
            # compute W2
            self.W2[c_negative_index] = self.W2[c_negative_index] - self.alpha * sigmoid(np.dot(self.W2[c_negative_index], self.hidden)) * self.hidden

        # update W1
        for word in self.x_train_context_words[x]:
            context_word_index = word_to_index[word]
            self.W1[:,context_word_index] = self.W1[:,context_word_index] - 1 / len(self.x_train_context_words[x]) * self.alpha * dW1


    # make prediction
    def predict(self, x):
        X = self.x_train_one_hot[x]
        hidden = np.dot(self.W1, X)
        output = np.dot(self.W2, hidden)
        y_predict = softmax(output)
        ans = index_to_word[np.argmax(y_predict)]
        return ans == self.y_train_positive[x]



    # training the model
    def train(self):
        for i in range(self.epoch):
            for x in range(len(self.x_train_one_hot)):
                self.forward_propagation(x)
                self.backward_propagation(x)
            if i % 100 == 0 or i == self.epoch - 1:
                correct = 0
                for index in range(len(self.x_train_one_hot)):
                    if self.predict(index):
                        correct += 1
                print('epoch{} accuracy: {}'.format(i, correct / len(self.x_train_one_hot)))
                
    def debug(self, x):
        X = self.x_train_one_hot[x]
        hidden = np.dot(self.W1, X)
        output = np.dot(self.W2, hidden)
        y_predict = softmax(output)
        ans = index_to_word[np.argmax(y_predict)]
        print('{} --- {}'.format(ans, self.y_train_positive[x]))

In [117]:
cbow_model_negative_sampling = CBOW_model_negative_sampling()
cbow_model_negative_sampling.initialize(alpha=0.01, epoch=5000, google_news=False, window_size=2, embedding_dimension=300)
#print(cbow_model_negative_sampling.y_train_negative)
cbow_model_negative_sampling.train()

0.9999999999999933
epoch0 accuracy: 0.0018484288354898336
epoch100 accuracy: 0.059149722735674676
epoch200 accuracy: 0.1756007393715342
epoch300 accuracy: 0.27911275415896486
epoch400 accuracy: 0.34935304990757854
epoch500 accuracy: 0.38817005545286504
epoch600 accuracy: 0.41589648798521256
epoch700 accuracy: 0.4658040665434381
epoch800 accuracy: 0.4953789279112754
epoch900 accuracy: 0.5138632162661737
epoch1000 accuracy: 0.5268022181146026
epoch1100 accuracy: 0.532347504621072
epoch1200 accuracy: 0.5378927911275416
epoch1300 accuracy: 0.5471349353049908
epoch1400 accuracy: 0.55637707948244
epoch1500 accuracy: 0.5619223659889094
epoch1600 accuracy: 0.5693160813308688
epoch1700 accuracy: 0.5767097966728281
epoch1800 accuracy: 0.5804066543438078
epoch1900 accuracy: 0.5841035120147874
epoch2000 accuracy: 0.5859519408502772
epoch2100 accuracy: 0.589648798521257
epoch2200 accuracy: 0.5878003696857671
epoch2300 accuracy: 0.589648798521257
epoch2400 accuracy: 0.5914972273567468
epoch2500 accu

In [109]:
print(cbow_model_negative_sampling.x_train_context_words[53])
print(cbow_model_negative_sampling.y_train_positive[53])
print(cbow_model_negative_sampling.y_train_negative[53])

{'intelligence', 'use', 'the', 'artificial'}
term
['centered', 'such', 'go', 'competing', 'and', 'reasoning', 'web', 'tasks', 'imitating', 'machines']
