In [33]:
!pip install h5py



In [34]:
# Load the file and pull out words and embeddings
import h5py

with h5py.File('datasets/mini.h5', 'r') as f:
    all_words = [word.decode('utf-8') for word in f['mat']['axis1'][:]]
    all_embeddings = f['mat']['block0_values'][:]
    
print("all_words dimensions: {0}".format(len(all_words)))
print("all_embeddings dimensions: {0}".format(all_embeddings.shape))

print(all_words[1337])

all_words dimensions: 371369
all_embeddings dimensions: (371369, 300)
/c/de/aufträgen


In [35]:
# restrict our vocablury to just english words
english_words=[word[6:] for word in all_words if word.startswith('/c/en/')]
english_word_indices=[i for i,word in enumerate(all_words) if word.startswith('/c/en/')]
english_embeddings=all_embeddings[english_word_indices]
print("all_words dimensions: {0}".format(len(english_words)))
print("all_embeddings dimensions: {0}".format(english_embeddings.shape))

print(english_words[1337])

all_words dimensions: 166967
all_embeddings dimensions: (166967, 300)
acoustical


In [36]:
import numpy as np
norms=np.linalg.norm(english_embeddings,axis=1)
normalized_embeddings=english_embeddings.astype('float32')/norms.astype('float32').reshape([-1,1])

In [37]:
index={word: i for i,word in enumerate(english_words)}

In [38]:
#measuring similarity between words
def similarity_score(w1,w2):
  score = np.dot(normalized_embeddings[index[w1],:],normalized_embeddings[index[w2],:])
  return score  

# A word is as similar with itself as possible:
print('cat\tcat\t', similarity_score('cat', 'cat'))

# Closely related words still get high scores:
print('cat\tfeline\t', similarity_score('cat', 'feline'))
print('cat\tdog\t', similarity_score('cat', 'dog'))

# Unrelated words, not so much
print('cat\tkitty\t', similarity_score('cat', 'kitty'))
print('mouse\tmouse\t', similarity_score('mouse', 'mouse'))

# Antonyms are still considered related, sometimes more so than synonyms
print('antonyms\topposites\t', similarity_score('antonym', 'opposite'))
print('antonyms\tsynonyms\t', similarity_score('antonym', 'synonym'))                                                                       
print('love\thate\t', similarity_score('love', 'love'))                                                                                                                                              
                                                                       

cat	cat	 1.0
cat	feline	 0.81091654
cat	dog	 0.58677226
cat	kitty	 0.819126
mouse	mouse	 0.99999994
antonyms	opposites	 0.31266534
antonyms	synonyms	 0.48834023
love	hate	 1.0


In [40]:
#finding most similar words to a given words
def closest_to_vector(v,n):
  all_scores = np.dot(normalized_embeddings,v)
  best_words = map(lambda i:english_words[i], reversed(np.argsort(all_scores)))
  return [next(best_words) for _ in range(n)]

def most_similar(w,n):
   return closest_to_vector(normalized_embeddings[index[w],:],n)

In [42]:
print(most_similar('cat', 10))
print(most_similar('dog', 10))
print(most_similar('mahal', 10))

['cat', 'humane_society', 'kitten', 'kitty', 'cats', 'feline', 'colocolo', 'housecat', 'maine_coon', 'moggie']
['dog', 'dogs', 'doggy_paddle', 'good_friend', 'lhasa_apso', 'wire_haired_dachshund', 'cadaver_dog', 'woof_woof', 'golden_retrievers', 'scenthound']
['mahal', 'taj', 'taj_mahal', 'dhivehi', 'mumtaz', 'udaipur', 'jahan', 'serai', 'rajasthan', 'raj']


In [43]:
# solve_analogy
def solve_analogy(a1,b1,a2):
    b2=normalized_embeddings[index[b1], :]-normalized_embeddings[index[a1],:]+normalized_embeddings[index[a2], : ]
    return closest_to_vector(b2,1)

print(solve_analogy("man","brother","woman"))
print(solve_analogy("man", "husband", "woman"))
print(solve_analogy("britain", "london", "france"))    

['sister']
['wife']
['paris']


In [44]:
# Using WOrd EMbeddings in deep models and sentimental analysis approach

import string

# This function converts a line of our data file into
# a tuple (x, y), where x is 300-dimensional representation
# of the words in a review, and y is its label.
def convert_line_to_example(line):
    # Pull out the first character: that's our label (0 or 1)
    y=int(line[0])
    
     # Split the line into words using Python's split() function
    words=line[2:].translate(remove_punct).lower().split()
    
    # Look up the embeddings of each word, ignoring words not
    # in our pretrained vocabulary.
    embeddings = [normalized_embeddings[index[w]] for w in words
                  if w in index]
    
 

   # Take the mean of the embeddings
    x = np.mean(np.vstack(embeddings), axis=0)
    return {'x': x, 'y': y}

# Apply the function to each line in the file.
with open("movie-simple.txt","r",encoding="utf-8", errors="ignore") as f:
    dataset=[convert_line_to_example(l) for l in f.readlines()]

In [45]:
len(dataset)

1411

In [47]:
import random
random.shuffle(dataset)

batch_size=100
total_batches=len(dataset)//batch_size
train_batches=3*total_batches//4
train,test=dataset[:train_batches*batch_size],dataset[train_batches*batch_size:]

In [55]:
#building mlp 
import tensorflow as tf

#Supress warning
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

#placeholder for input
x=tf.placeholder(tf.float32,[None,300])
y=tf.placeholder(tf.float32,[None,1])

#three layer MLP
h1=tf.layers.dense(x,100,tf.nn.relu)
h2=tf.layers.dense(h1,20,tf.nn.relu)
logits=tf.layers.dense(h2,1)
probabilities=tf.sigmoid(logits)

# Loss and metrics
loss=tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,labels=y))
accuracy=tf.reduce_mean(tf.cast(tf.equal(tf.round(probabilities),y),tf.float32))


# Training
train_step = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

# Initialization of variables
init_op= tf.global_variables_initializer()



In [59]:
init_op= tf.global_variables_initializer()
#Train
sess=tf.Session()
sess.run(init_op)

for epoch in range(250):
    for batch in range(train_batches):
        data=train[batch*batch_size:(batch+1)*batch_size]
        reviews=[sample['x'] for sample in data]
        labels  =[sample['y'] for sample in data]
        labels = np.array(labels).reshape([-1,1])
        _, l, acc=sess.run([train_step,loss,accuracy],feed_dict={x:reviews,y:labels})
    if epoch%10 ==0:
        print("Epoch: {0} \t Loss: {1} \t Acc: {2}".format(epoch, l, acc))
        
        
    random.shuffle(train)
    
# Evaluate on test set
test_reviews = [sample['x'] for sample in test]
test_labels  = [sample['y'] for sample in test]
test_labels  = np.array(test_labels).reshape([-1, 1])

acc = sess.run(accuracy, feed_dict={x: test_reviews, y: test_labels})
print("Final accuracy: {0}".format(acc))    
    
    

Epoch: 0 	 Loss: 0.6899133920669556 	 Acc: 0.5600000023841858
Epoch: 10 	 Loss: 0.6865106821060181 	 Acc: 0.5199999809265137
Epoch: 20 	 Loss: 0.6806683540344238 	 Acc: 0.5099999904632568
Epoch: 30 	 Loss: 0.6754550337791443 	 Acc: 0.5299999713897705
Epoch: 40 	 Loss: 0.6513552665710449 	 Acc: 0.6000000238418579
Epoch: 50 	 Loss: 0.6276317834854126 	 Acc: 0.6499999761581421
Epoch: 60 	 Loss: 0.5802499055862427 	 Acc: 0.8100000023841858
Epoch: 70 	 Loss: 0.542470395565033 	 Acc: 0.8100000023841858
Epoch: 80 	 Loss: 0.4524689018726349 	 Acc: 0.8799999952316284
Epoch: 90 	 Loss: 0.42233946919441223 	 Acc: 0.8999999761581421
Epoch: 100 	 Loss: 0.32607659697532654 	 Acc: 0.949999988079071
Epoch: 110 	 Loss: 0.27644532918930054 	 Acc: 0.9200000166893005
Epoch: 120 	 Loss: 0.20447510480880737 	 Acc: 0.9599999785423279
Epoch: 130 	 Loss: 0.25389957427978516 	 Acc: 0.9300000071525574
Epoch: 140 	 Loss: 0.15890401601791382 	 Acc: 0.9800000190734863
Epoch: 150 	 Loss: 0.2250901758670807 	 Acc: 0.

In [64]:
#check our sentiment analysis model
words_to_test=["exciting","hated","boring","loved","enjoyable","disgusting"]


for word in words_to_test:
    print(word,sess.run(probabilities,feed_dict={x:normalized_embeddings[index[word]].reshape(1,300)}))

exciting [[0.9998894]]
hated [[0.]]
boring [[8.817332e-08]]
loved [[0.99999917]]
enjoyable [[0.9990632]]
disgusting [[9.870593e-08]]


In [65]:
sess.close()
tf.reset_default_graph()