In [61]:
import os
import urllib
import zipfile
import nltk
import numpy as np
import tensorflow as tf
import random

In [2]:
# Width of the pre-trained GloVe vectors to load.
EMBEDDING_DIMENSION = 100  # Available dimensions for 6B data are 50, 100, 200, 300
# Directory that holds the unpacked GloVe text files.
data_directory = '/home/johannes/thesis_code/word_embeddings/'

# Built from the two settings above so that changing EMBEDDING_DIMENSION or
# data_directory cannot leave a stale, hand-typed path behind.
glove_weights_file_path = os.path.join(
    data_directory, 'glove.6B.{}d.txt'.format(EMBEDDING_DIMENSION))



In [3]:
PAD_TOKEN = 0
# Only the first MAX_VOCAB_SIZE lines of the GloVe file are loaded.
MAX_VOCAB_SIZE = 40000

# Maps a token string to its integer index, so text can later be turned into
# a sequence of integers. Index 0 is reserved for padding.
word2idx = {'PAD': PAD_TOKEN}

# weights[i] is the vector of the word whose index is i + 1; the row for
# index 0 (PAD) is inserted by a later cell.
weights = []
# The encoding is given explicitly (GloVe text files are UTF-8) so reading
# does not depend on the platform's default encoding.
with open(glove_weights_file_path, 'r', encoding='utf-8') as file:
    for index, line in enumerate(file):
        # Each line is "<word> <v1> <v2> ... <vN>".
        values = line.split()
        word = values[0]
        word_weights = np.asarray(values[1:], dtype=np.float32)
        word2idx[word] = index + 1
        weights.append(word_weights)
        if index + 1 == MAX_VOCAB_SIZE:
            # Limit vocabulary to top 40k terms
            break


In [4]:
# Take the dimensionality from the vectors that were actually loaded, so it
# always agrees with the file regardless of the configured value.
EMBEDDING_DIMENSION = weights[0].shape[0]

In [5]:
# Add the PAD and UNK rows exactly once. Without the guard, re-running this
# cell inserts another row at position 0 every time, which shifts every word
# vector away from the index stored for it in word2idx (and fails outright
# once `weights` has been converted to an ndarray, which has no insert()).
# After a successful run, word2idx['UNK'] is the index of the last row.
if word2idx.get('UNK') != len(weights) - 1:
    # Row 0 lines up with word2idx['PAD'].
    weights.insert(0, np.random.randn(EMBEDDING_DIMENSION))
    # UNK takes the first index after the PAD row and the loaded words.
    UNKNOWN_TOKEN = len(weights)
    word2idx['UNK'] = UNKNOWN_TOKEN
    weights.append(np.random.randn(EMBEDDING_DIMENSION))

In [6]:
# Stack the per-index vectors into a single float32 matrix, one row per index.
weights = np.asarray(weights, dtype=np.float32)
# Number of rows in the embedding matrix.
VOCAB_SIZE = len(weights)

In [7]:

# Small demo: tokenise a sentence and map each token to its vocabulary index,
# falling back to UNKNOWN_TOKEN for tokens that are not in word2idx.
features = {
    'word_indices': [
        word2idx.get(token, UNKNOWN_TOKEN)
        for token in nltk.word_tokenize('hello world')  # ['hello', 'world']
    ],
}


In [7]:
# Initializer that fills the variable below with the GloVe matrix.
glove_weights_initializer = tf.constant_initializer(weights)
# Embedding table with one row per vocabulary index; trainable=False marks it
# as a non-trainable variable.
embedding_weights = tf.get_variable(
    'embedding_weights',
    shape=[VOCAB_SIZE, EMBEDDING_DIMENSION],
    trainable=False,
    initializer=glove_weights_initializer)
#embedding = tf.nn.embedding_lookup(embedding_weights, features['word_indices'])

In [8]:
# Display the variable handle (name, shape and dtype); no values are evaluated here.
embedding_weights

<tf.Variable 'embedding_weights:0' shape=(40002, 100) dtype=float32_ref>

In [9]:
"""
******************
DONE WITH
WORD EMBEDDINGS??
******************
"""

'\n******************\nDONE WITH\nWORD EMBEDDINGS??\n******************\n'

In [None]:
"""
******************
READING THE TRAINING DATA
******************
"""

In [30]:
train_data_path = "/home/johannes/thesis_code/ml_experimentation/data/training/1A.english.training.data.txt"
train_gold_path = "/home/johannes/thesis_code/ml_experimentation/data/training/1A.english.training.gold.txt"

# Context managers close both files even if parsing raises. The encoding is
# explicit so reading does not depend on the platform default
# (assumes the files are UTF-8 - TODO confirm).
with open(train_data_path, 'r', encoding='utf-8') as data_file, \
        open(train_gold_path, 'r', encoding='utf-8') as gold_file:
    # Data file: keep the first tab-separated field of every line. The newline
    # is stripped first so a line without any tab does not keep its '\n'.
    train_data = [line.rstrip('\n').split('\t')[0] for line in data_file]
    # Gold file: keep every tab-separated field of every line.
    gold_data = [line.rstrip('\n').split('\t') for line in gold_file]

# The two files are parallel: line i of the gold file belongs to line i of the
# data file.
assert len(train_data) == len(gold_data)

In [82]:
def remove_words_without_embeddings(train, gold, vocab=None):
    """Drop training entries that cannot be looked up in the vocabulary.

    An entry is kept only if its query is in the vocabulary and at least one
    of its gold words is too. Gold words outside the vocabulary are removed
    from the entries that are kept.

    Args:
        train: list of query strings.
        gold: list parallel to ``train``; ``gold[i]`` is the list of gold
            words belonging to ``train[i]``.
        vocab: container that supports ``word in vocab``. Defaults to the
            module-level ``word2idx``.

    Returns:
        A tuple ``(kept_queries, kept_gold)`` of two equally long lists.
    """
    if vocab is None:
        vocab = word2idx

    train_embeddings = []
    gold_embeddings = []
    for i, query in enumerate(train):
        if query not in vocab:
            continue
        # Filter once and reuse the result both as the "any known gold word?"
        # check and as the value that is stored.
        known_gold = [w for w in gold[i] if w in vocab]
        if not known_gold:
            continue
        train_embeddings.append(query)
        gold_embeddings.append(known_gold)
    assert len(train_embeddings) == len(gold_embeddings)
    print("kept ", len(train_embeddings), " words with embeddings")
    return train_embeddings, gold_embeddings
        
def _sample_negatives(pool, forbidden, count, rng):
    """Draw ``count`` indices (with replacement) from ``pool``, skipping ``forbidden``."""
    # Without at least one eligible index the rejection loop could never end.
    if not any(idx not in forbidden for idx in pool):
        return []
    negatives = []
    while len(negatives) < count:
        pick = rng.choice(pool)
        if pick not in forbidden:
            negatives.append(pick)
    return negatives


def create_training_dict(train, gold, add_neg=0, vocab=None, rng=None):
    '''
    Build flat, aligned (query, candidate, target) lists.

    Every gold candidate of a query becomes one positive example (target 1).
    If add_neg is positive, add_neg negatives (target 0) are sampled per
    positive from the vocabulary indices. A negative is never one of the
    query's own gold candidates and never the 'PAD' or 'UNK' entry, so the
    same (query, candidate) pair cannot be labelled both 1 and 0.

    Args:
        train: list of query indices.
        gold: list parallel to train; gold[i] is the list of candidate
            indices for train[i].
        add_neg: number of negatives per positive; 0 disables sampling.
        vocab: word -> index mapping to draw negatives from. Defaults to the
            module-level word2idx. Only consulted when add_neg is positive.
        rng: object with a choice(seq) method. Defaults to the random module;
            pass random.Random(seed) for a reproducible sample.

    Returns:
        dict with the keys "queries", "candidates" and "targets", each a list
        of the same length. For every query its positives come first,
        followed by its negatives. A query gets no negatives when no eligible
        index exists.
    '''
    special_tokens = ('PAD', 'UNK')

    negative_pool = []
    if add_neg > 0:
        if vocab is None:
            vocab = word2idx
        if rng is None:
            rng = random
        # Sorted so that a seeded rng always sees the pool in the same order.
        negative_pool = sorted(
            idx for word, idx in vocab.items() if word not in special_tokens)

    train_dict = {"queries": [], "candidates": [], "targets": []}

    for i, current_query in enumerate(train):
        current_candidates = list(gold[i])
        num_pos = len(current_candidates)

        train_dict["queries"] += [current_query] * num_pos
        train_dict["candidates"] += current_candidates
        train_dict["targets"] += [1] * num_pos

        if add_neg > 0 and num_pos:
            current_negatives = _sample_negatives(
                negative_pool, set(current_candidates), add_neg * num_pos, rng)
            num_neg = len(current_negatives)
            train_dict["queries"] += [current_query] * num_neg
            train_dict["candidates"] += current_negatives
            train_dict["targets"] += [0] * num_neg
    assert len(train_dict["queries"]) == len(train_dict["candidates"]) == len(train_dict["targets"])
    return train_dict

In [86]:
# Keep only the queries, and the gold words, that are present in word2idx.
train_data, gold_data = remove_words_without_embeddings(train_data, gold_data)

# Translate the remaining words into vocabulary indices.
train_indices = [word2idx.get(query, UNKNOWN_TOKEN) for query in train_data]
gold_indices = [
    [word2idx.get(candidate, UNKNOWN_TOKEN) for candidate in candidates]
    for candidates in gold_data
]
assert len(train_indices) == len(gold_indices)

# Three sampled negatives for every positive pair.
train_dict = create_training_dict(train_indices, gold_indices, add_neg=3)

"""
Now have a train_dict with
"queries","candidates" and "targets" (word_idx, word_idx, 1/0)
"""


kept  412  words with embeddings
[4400, 1508, 1294, 12323, 22635, 2814, 13929, 34347, 5252, 13769]


In [26]:
"""
******************
PERFORMING THE TRAINING
AND STUFF
******************
"""

'\n******************\nPERFORMING THE TRAINING\n******************\n'

In [10]:
# Training hyperparameters.
learning_rate = 0.01  # step size passed to GradientDescentOptimizer
# NOTE(review): the next three are referenced only inside the disabled
# training-loop string in the session cell, as far as the visible cells show.
training_epochs = 25
batch_size = 100
display_step = 1

# NOTE(review): k is not used in any visible cell - purpose unclear, confirm.
k=24

In [11]:
# The old comments here were left over from an MNIST example and did not
# describe this model. `(EMBEDDING_DIMENSION)` was also just a parenthesised
# int; the trailing comma makes the intended one-element shape explicit.

# One embedding vector of length EMBEDDING_DIMENSION; it is projected through
# W below (presumably the query word - confirm).
x = tf.placeholder(tf.float32, shape=(EMBEDDING_DIMENSION,))
# A second embedding vector of the same length; the projection of x is
# compared against it (presumably the candidate word - confirm).
h = tf.placeholder(tf.float32, shape=(EMBEDDING_DIMENSION,))
# Target value used in the cost; its shape is left unspecified
# (presumably the 1/0 entries of train_dict["targets"] - confirm).
y = tf.placeholder(tf.float32)

In [12]:
# Square projection matrix, initialised to the identity plus Gaussian noise
# (mean 0, standard deviation 0.2).
W = tf.Variable(
    tf.eye(EMBEDDING_DIMENSION)
    + tf.random.normal(
        [EMBEDDING_DIMENSION, EMBEDDING_DIMENSION], mean=0, stddev=0.2)
)

In [18]:
# Treat x as a single-row matrix and project it through W.
proj = tf.matmul(tf.expand_dims(x, axis=0), W)
# Dot product of the projected vector with h.
sim = tf.tensordot(proj, h, axes=1)
# Squash the similarity score into (0, 1).
pred = tf.sigmoid(sim)

In [19]:
# Display the prediction tensor's handle (name, shape and dtype).
pred

<tf.Tensor 'Sigmoid_1:0' shape=(1,) dtype=float32>

In [20]:
# Binary cross-entropy computed from the raw similarity score `sim`
# (pred == sigmoid(sim)). The previous expression, -y * log(pred), is zero
# whenever y == 0, so pairs with target 0 never contributed to the loss; it
# also reduced over axis 1 although pred has shape (1,). Working from the
# logit avoids log(0) when the sigmoid saturates.
cost = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(
        # Give the label the same shape as the logit, whether y is fed as a
        # scalar or as a one-element array.
        labels=tf.reshape(y, tf.shape(sim)),
        logits=sim))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
init = tf.global_variables_initializer()

In [25]:
# Smoke test: initialise the variables and print the embedding rows stored at
# indices 1, 2 and 3. No optimisation step is run in this cell.
with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    #x = tf.get_variable('x', shape=[VOCAB_SIZE, EMBEDDING_DIMENSION], initializer=glove_weights_initializer)
    #x.initializer.run()
    # Look up three rows of the embedding table and print their values.
    embedding = tf.nn.embedding_lookup(embedding_weights, [1,2,3])
    print(embedding.eval())
    # Embeddings
    #sess.run(embedding_init, feed_dict={embedding_placeholder: list(em.items())})

    # Training cycle
    # NOTE(review): everything from here to the end of the cell is a single
    # string literal and is never executed. It is a training/evaluation loop
    # that refers to `mnist`, which is not defined in the visible cells, and
    # it feeds only x and y.
    """
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(mnist.train.num_examples/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_xs,
                                                          y: batch_ys})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))

    print("Optimization Finished!")

    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))
"""

[[-0.038194  -0.24487    0.72812   -0.39961    0.083172   0.043953
  -0.39141    0.3344    -0.57545    0.087459   0.28787   -0.06731
   0.30906   -0.26384   -0.13231   -0.20757    0.33395   -0.33848
  -0.31743   -0.48336    0.1464    -0.37304    0.34577    0.052041
   0.44946   -0.46971    0.02628   -0.54155   -0.15518   -0.14107
  -0.039722   0.28277    0.14393    0.23464   -0.31021    0.086173
   0.20397    0.52624    0.17164   -0.082378  -0.71787   -0.41531
   0.20335   -0.12763    0.41367    0.55187    0.57908   -0.33477
  -0.36559   -0.54857   -0.062892   0.26584    0.30205    0.99775
  -0.80481   -3.0243     0.01254   -0.36942    2.2167     0.72201
  -0.24978    0.92136    0.034514   0.46745    1.1079    -0.19358
  -0.074575   0.23353   -0.052062  -0.22044    0.057162  -0.15806
  -0.30798   -0.41625    0.37972    0.15006   -0.53212   -0.2055
  -1.2526     0.071624   0.70565    0.49744   -0.42063    0.26148
  -1.538     -0.30223   -0.073438  -0.28312    0.37104   -0.25217
   0.016