In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
df= pd.read_excel(r'news_headlines.xlsx')

In [3]:
headlines = pd.DataFrame(df, columns=['SENTENCES'])

In [4]:
headlines

Unnamed: 0,SENTENCES
0,"Moeller's student-run newspaper, The Crusader,..."
1,"In 2008, The Crusader won First Place, the sec..."
2,The Squire is a student literary journal that ...
3,Paul Keels - play-by-play announcer for Ohio S...
4,Joe Uecker - Ohio State Senator (R-66) .
...,...
46112,Vancouver's characteristic approach to urban p...
46113,Vancouver is also considered to have the worst...
46114,The Vancouver Art Gallery is housed downtown i...
46115,A prominent addition to the city's landscape i...


In [5]:
stopwords = [",", "(", ")" ,"." ,"-" ,"i" , "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
print(len(stopwords))

132


In [6]:
class word2vec():
    def __init__(self):
        self.n = settings['n']  #dim of word embeddings
        self.lr = settings['learning_rate']
        self.epochs = settings['epochs']    
        self.window = settings['window_size']   #context window +- center word

    
    def generate_training_data(self, settings, corpus):
        word_count = defaultdict(int)  #finds unq word counts using dictionary
        
        for row in corpus:
            for word in row:
                word_count[word] += 1

        self.v_count = len(word_count.keys())   #how many unq words in the dict
        print(self.v_count)
        self.word_list = list(word_count.keys())    #generatings look-up dict(vocab)
        
        #generate word:index
        self.word_index = dict((word, i) for i, word in enumerate(self.word_list))

        #generate index:word
        self.index_word = dict((i, word) for i, word in enumerate(self.word_list))


        training_data = []

        #cycling through each sentence in corpus
        for sentence in corpus:
            sent_len = len(sentence)
            
            #cyle through each word in sentence
            for i, word in enumerate(sentence):

                #converting target word into one-hot enc
                w_target = self.word2onehot(sentence[i])

                w_context= []

                #cycle through context window
                for j in range(i-self.window, i+ self.window+1):
                    #criteria for context word
                    #1. target word != context word
                    #2. index must be >= 0 (j >= 0)
                    #3. index must be <= len(sentence)

                    if j != i and j<=sent_len-1 and j>=0:
                        
                        #append the one-hot representation of word to w_context
                        w_context.append(self.word2onehot(sentence[j]))

                
                training_data.append([w_target, w_context])

        return np.array(training_data)

    def word2onehot(self, word):

        #initialize a blank vector __ word_vec
        word_vec = [0 for i in range(0, self.v_count)]

        #get the ID of the word from word_index
        word_index = self.word_index[word]
        
        #change value to 1 acc to ID of the word
        word_vec[word_index] = 1

        return word_vec

    def train(self, training_data):
        
        #initialize weight matrix
        self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))      #9x10
        self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))      #10x9


        #cycle through each epoch
        for i in range(self.epochs):
            self.loss = 0       #initialize loss to 0

            #cycle through each training example
            for w_t, w_c in training_data:      #w_t is the target vector & w_c is the context vector

                #forward pass
                y_pred, h, u = self.forward_pass(w_t)
                
                #calculate error
                #for a target word, cal diff b/w y_pred & each of the context words
                #sum up the diffreneces for each target word
                EI = np.sum([np.subtract(y_pred,word) for word in w_c],axis=0)

                #backpropagation
                #we use SGD to backpropagate errors - cal loss on the output layer
                self.backprop(EI, h, w_t)
                
                #calculate loss
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))

            print('\nEpoch:', i, 'Loss: ', self.loss)

    def forward_pass(self, x) :
        #hidden layer activation
        h = np.dot(self.w1.T, x )
        
        #output layer before softmax 
        u = np.dot(self.w2.T, h)

        #run output layer through softmax
        y_c = self.softmax(u)

        return y_c, h, u

    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x/e_x.sum(axis=0)

    def backprop(self, e, h, x):
        dl_dw2 = np.outer(h, e)     #d1_dw2 : (9x1) X (10x1)
        dl_dw1 = np.outer(x, np.dot(self.w2, e))      #x --- 9x1 ; self.w2 --- 9x10, e.T --- 1x9, e --- 9x1, self.w2.T --- 10x9
        #d1_dw1 : 9x10
        #update weights
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)

    #get vector from the word
    def word_vec(self, word):
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
        return v_w

    #input vector retunr nearest word(s)
    def vec_sim(self, word, top_n):
        v_w1 = self.word_vec(word)
        word_sim = {}

        for i in range(self.v_count):
            #find the similarity score for each word in vocab
            v_w2 = self.w1[i]
            theta_sum = np.dot(v_w1, v_w2)
            theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
            theta = theta_sum / theta_den

            word = self.index_word[i]
            word_sim[word] = theta


        word_sorted = sorted(word_sim.items(), key=lambda kv : kv[1], reverse=True)

        for word, sim in word_sorted[:top_n]:
            print('\n')
            print(word, sim)

In [7]:
for ind in df.index:
    if ind == 0:
        text = df['SENTENCES'][100]
print(text)
print(len(text))

Operation mode of Image forming of the AFM are generally classified into two groups from the viewpoint whether it uses z-Feedback loop (not shown) to maintain the tip-sample distance to keep signal intensity exported by the detector.
233


In [None]:
corpus = [[word.lower() for word in text.split()]]
print(len(corpus[0]))
print(corpus[0])
i =0
for word in stopwords:
    while i < len(corpus[0]):
        if corpus[0][i] == word:
            corpus[0].pop(i)
print()
print(corpus)
print(len(corpus[0]))

37
['operation', 'mode', 'of', 'image', 'forming', 'of', 'the', 'afm', 'are', 'generally', 'classified', 'into', 'two', 'groups', 'from', 'the', 'viewpoint', 'whether', 'it', 'uses', 'z-feedback', 'loop', '(not', 'shown)', 'to', 'maintain', 'the', 'tip-sample', 'distance', 'to', 'keep', 'signal', 'intensity', 'exported', 'by', 'the', 'detector.']


In [None]:
corpus
corpus[0][4]

In [18]:
settings = {
    'window_size':2,
    'n': len(corpus[0]),
    'epochs':100,
    'learning_rate':0.07
}
settings.items()

dict_items([('window_size', 2), ('n', 31), ('epochs', 100), ('learning_rate', 0.07)])

In [19]:
w2v = word2vec()

training_data = w2v.generate_training_data(settings, corpus)

w2v.train(training_data)

29

Epoch: 0 Loss:  599.2618029950265

Epoch: 1 Loss:  381.5696754215314

Epoch: 2 Loss:  295.17108915821746

Epoch: 3 Loss:  247.68348752114815

Epoch: 4 Loss:  218.72874053445074

Epoch: 5 Loss:  201.60588676269913

Epoch: 6 Loss:  191.82900880184744

Epoch: 7 Loss:  185.9962282018811

Epoch: 8 Loss:  182.24187469921793

Epoch: 9 Loss:  179.68066935412315

Epoch: 10 Loss:  177.84278646092284

Epoch: 11 Loss:  176.45969387182802

Epoch: 12 Loss:  175.390921272997

Epoch: 13 Loss:  174.53463608645868

Epoch: 14 Loss:  173.85099266257313

Epoch: 15 Loss:  173.28433556141636

Epoch: 16 Loss:  172.83958896198078

Epoch: 17 Loss:  172.45904917696345

Epoch: 18 Loss:  172.1768525472945

Epoch: 19 Loss:  171.8970158775967

Epoch: 20 Loss:  171.69713413850857

Epoch: 21 Loss:  171.42764478733775

Epoch: 22 Loss:  171.25090528321223

Epoch: 23 Loss:  170.95872312173267

Epoch: 24 Loss:  170.79684331457196

Epoch: 25 Loss:  170.49747717668123

Epoch: 26 Loss:  170.35818411312223

Epoch: 27 Loss