In [1]:
#import library: bs4, urllib.request, numpy, string, stopwords
import bs4 as bs
import urllib.request
import numpy as np 
import string 
from nltk.corpus import stopwords 

# with help of library urllib.request we can open website and do some operations like read
data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = data.read()
# with help of library bs4 we can take lxml format. lxml - the most feature-rich and easy-to-use library for processing XML and HTML in the Python language.
parsedArticle = bs.BeautifulSoup(article,'lxml')
# from tag "p" we take all information
paragraphs = parsedArticle.find_all('p')

articleText = ""
# we combine all text
for p in paragraphs:
    articleText += p.text

In [3]:
# we calculate probability of word
def probabilityOfWord(x): 
    e_x = np.exp(x - np.max(x)) 
    return e_x / e_x.sum() 
# main class where we can train and predict data   
class word2vec(object): 
    #set parameters
    def __init__(self): 
        self.N = 10
        self.X_train = [] 
        self.y_train = [] 
        self.window_size = 2
        self.alpha = 0.001
        self.words = [] 
        self.word_index = {} 
   # V=Number of unique words in our corpus of text
# W=Weights between input layer and hidden layer
# W1=Weights between hidden layer and output layer
# N=Number of neurons in the hidden layer of neural network
    def initialize(self,V,data): 
        self.V = V 
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N)) 
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V)) 
           
        self.words = data 
        for i in range(len(data)): 
            self.word_index[data[i]] = i 
   
    # we calculate Forward Propagation   
    def forwardPropagation(self,X): 
        self.h = np.dot(self.W.T,X).reshape(self.N,1) 
        self.u = np.dot(self.W1.T,self.h) 
        self.y = probabilityOfWord(self.u)   
        return self.y 
    # we calculate Back Propagation       
    def backPropagation(self,x,t): 
        e = self.y - np.asarray(t).reshape(self.V,1) 
        # e.shape is V x 1 
        dLdW1 = np.dot(self.h,e.T) 
        X = np.array(x).reshape(self.V,1) 
        dLdW = np.dot(X, np.dot(self.W1,e).T) 
        self.W1 = self.W1 - self.alpha*dLdW1 
        self.W = self.W - self.alpha*dLdW 
    # in this function we train our data       
    def train(self,epochs): 
        for x in range(1,epochs):         
            self.loss = 0
            for j in range(len(self.X_train)): 
                self.forwardPropagation(self.X_train[j]) 
                self.backPropagation(self.X_train[j],self.y_train[j]) 
                C = 0
                for m in range(self.V): 
                    if(self.y_train[j][m]): 
                        self.loss += -1*self.u[m][0] 
                        C += 1
                self.loss += C*np.log(np.sum(np.exp(self.u))) 
            print("epoch ",x, " loss = ",self.loss) 
            self.alpha *= 1/( (1+self.alpha*x) ) 
     # in this function we predict words which similar and words often used together         
    def predict(self,word,numberOfPredictions): 
        if word in self.words: 
            index = self.word_index[word] 
            X = [0 for i in range(self.V)] 
            X[index] = 1
            prediction = self.forwardPropagation(X) 
            output = {} 
            for i in range(self.V): 
                output[prediction[i][0]] = i 
               
            topContextWords = [] 
            for k in sorted(output,reverse=True): 
                topContextWords.append(self.words[output[k]]) 
                if(len(topContextWords)>=numberOfPredictions): 
                    break
       
            return topContextWords 
        else: 
            print("Word not found in dicitonary") 
# we clean our data, we delete stopwords, space, punctuation, do lower words            
def preprocessing(corpus): 
    stopWords = set(stopwords.words('english'))     
    trainingData = [] 
    sentences = corpus.split(".") 
    for i in range(len(sentences)): 
        sentences[i] = sentences[i].strip() 
        sentence = sentences[i].split() 
        x = [word.strip(string.punctuation) for word in sentence 
                                     if word not in stopWords] 
        x = [word.lower() for word in x] 
        trainingData.append(x) 
    return trainingData 
       
# this function help us to take our data and train it, we divide words from sentences, sort, count  
def dataForTraining(sentences,w2v): 
    data = {} 
    for sentence in sentences: 
        for word in sentence: 
            if word not in data: 
                data[word] = 1
            else: 
                data[word] += 1
    V = len(data) 
    data = sorted(list(data.keys())) 
    vocab = {} 
    for i in range(len(data)): 
        vocab[data[i]] = i 
       
    for sentence in sentences: 
        for i in range(len(sentence)): 
            centerWord = [0 for x in range(V)] 
            centerWord[vocab[sentence[i]]] = 1
            context = [0 for x in range(V)] 
              
            for j in range(i-w2v.window_size,i+w2v.window_size): 
                if i!=j and j>=0 and j<len(sentence): 
                    context[vocab[sentence[j]]] += 1
            w2v.X_train.append(centerWord) 
            w2v.y_train.append(context) 
    w2v.initialize(V,data) 
   
    return w2v.X_train,w2v.y_train 

# we give a text from website
corpus = articleText
# quantity of how many we should train our data
epochs = 100
# our training data  
trainingData = preprocessing(corpus) 
# our method
w2v = word2vec() 
# our data and method  
dataForTraining(trainingData,w2v) 
#quantity of how many we should train our data
w2v.train(epochs)  
#find 3 close words for our "artificial" word  
print(w2v.predict("artificial",3)) 

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\kaisa/nltk_data'
    - 'D:\\panamera2\\env\\nltk_data'
    - 'D:\\panamera2\\env\\share\\nltk_data'
    - 'D:\\panamera2\\env\\lib\\nltk_data'
    - 'C:\\Users\\kaisa\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
