In [2]:
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
import pandas as pd
from bs4 import BeautifulSoup

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/data/khuangaf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [3]:
df= pd.read_csv('data/ASSISTmentsProblems.csv')
df.body = df.body.astype('string')
df['parsed'] = df.apply(lambda(row): BeautifulSoup(row['body'], 'html.parser').get_text() , axis=1)
df

Unnamed: 0,problem_id,body,parsed
0,134,"But remember, we need to find the area of the ...","But remember, we need to find the area of the ..."
1,135,"By the way, did you notice that side AC belong...","By the way, did you notice that side AC belong..."
2,171,What is the length of one side of a square in ...,What is the length of one side of a square in ...
3,183,<p>It is not always possible to make a triangl...,It is not always possible to make a triangle u...
4,184,<p>Now match up the lengths to see if they wil...,Now match up the lengths to see if they will f...
5,185,<p>Which set of line segments cannot be used t...,Which set of line segments cannot be used to f...
6,359,"Good, now that we know how to convert from squ...","Good, now that we know how to convert from squ..."
7,395,"The measurement of the room is in square feet,...","The measurement of the room is in square feet,..."
8,396,<p>The area of the square in the picture is 1 ...,The area of the square in the picture is 1 squ...
9,397,<p>What is the area of the library in square f...,What is the area of the library in square feet...


In [4]:
#collect all the words into one list
filtered_words=[]
def count_words(row):
    global filtered_words
    filtered_words += [word for word in row['parsed'].split() if word not in stopwords.words('english')]
df.apply(count_words, axis=1)

0         None
1         None
2         None
3         None
4         None
5         None
6         None
7         None
8         None
9         None
10        None
11        None
12        None
13        None
14        None
15        None
16        None
17        None
18        None
19        None
20        None
21        None
22        None
23        None
24        None
25        None
26        None
27        None
28        None
29        None
          ... 
128942    None
128943    None
128944    None
128945    None
128946    None
128947    None
128948    None
128949    None
128950    None
128951    None
128952    None
128953    None
128954    None
128955    None
128956    None
128957    None
128958    None
128959    None
128960    None
128961    None
128962    None
128963    None
128964    None
128965    None
128966    None
128967    None
128968    None
128969    None
128970    None
128971    None
Length: 128972, dtype: object

In [5]:
vocabulary_size = 10000

def build_dataset(words):
    # UNK token is used to denote words that are not in the dictionary
    count = [['UNK', -1]]
    # returns set of tuples (word,count) with most common 50000 words
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    # set word count for all the words to the current number of keys in the dictionary
    # in other words values act as indices for each word
    # first word is 'UNK' representing unknown words we encounter
    for word, _ in count:
        dictionary[word] = len(dictionary)
    # this contains the words replaced by assigned indices
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(filtered_words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])

Most common words (+UNK) [['UNK', 84465], (u'Page', 28673), (u'What', 15021), (u'answer', 11573), (u'Do', 10752)]
Sample data [2389, 8046, 102, 55, 133, 3230, 391, 6800, 167, 101]


In [18]:
data_index = 0
def generate_batch(batch_size, skip_window):
    # skip window is the amount of words we're looking at from each side of a given word
    # creates a single batch
    global data_index
    assert skip_window%2==1

    span = 2 * skip_window + 1 # [ skip_window target skip_window ]

    batch = np.ndarray(shape=(batch_size,span-1), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # e.g if skip_window = 2 then span = 5
    # span is the length of the whole frame we are considering for a single word (left + word + right)
    # skip_window is the length of one side

    # queue which add and pop at the end
    buffer = collections.deque(maxlen=span)

    #get words starting from index 0 to span
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # num_skips => # of times we select a random word within the span?
    # batch_size (8) and num_skips (2) (4 times)
    # batch_size (8) and num_skips (1) (8 times)
    for i in range(batch_size):
        target = skip_window  # target label at the center of the buffer
        target_to_avoid = [ skip_window ] # we only need to know the words around a given word, not the word itself

        # do this num_skips (2 times)
        # do this (1 time)

        # add selected target to avoid_list for next time
        col_idx = 0
        for j in range(span):
            if j==span//2:
                continue
            # e.g. i=0, j=0 => 0; i=0,j=1 => 1; i=1,j=0 => 2
            batch[i,col_idx] = buffer[j] # [skip_window] => middle element
            col_idx += 1
        labels[i, 0] = buffer[target]

        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    assert batch.shape[0]==batch_size and batch.shape[1]== span-1
    return batch, labels

print('data:', [reverse_dictionary[di] for di in data[:8]])


data: [u'But', u'remember,', u'need', u'find', u'area', u'square.', u'So,', u"square's"]


In [14]:
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
# pick 16 samples from 100
valid_examples = np.array(random.sample(range(valid_window), valid_size//2))
valid_examples = np.append(valid_examples,random.sample(range(1000,1000+valid_window), valid_size//2))
num_sampled = 64 # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):

    # Input data.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size,2*skip_window])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Variables.
    # embedding, vector for each word in the vocabulary
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                     stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Model.
    # Look up embeddings for inputs.
    # this might efficiently find the embeddings for given ids (traind dataset)
    # manually doing this might not be efficient given there are 50000 entries in embeddings
    embeds = None
    for i in range(2*skip_window):
        embedding_i = tf.nn.embedding_lookup(embeddings, train_dataset[:,i])
        print('embedding %d shape: %s'%(i,embedding_i.get_shape().as_list()))
        emb_x,emb_y = embedding_i.get_shape().as_list()
        if embeds is None:
            embeds = tf.reshape(embedding_i,[emb_x,emb_y,1])
        else:
            print(embedding_i)
            print(emb_x)
            print(emb_y)
            embeds = tf.concat([embeds,tf.reshape(embedding_i,[emb_x,emb_y,1])],2)

    assert embeds.get_shape().as_list()[2]==2*skip_window
    print("Concat embedding size: %s"%embeds.get_shape().as_list())
    avg_embed =  tf.reduce_mean(embeds,2,keep_dims=False)
    print("Avg embedding size: %s"%avg_embed.get_shape().as_list())

    '''embedding_0 = tf.nn.embedding_lookup(embeddings, train_dataset[:,0])
    embedding_1 = tf.nn.embedding_lookup(embeddings, train_dataset[:,1])
    avg_embed =  (embedding_0+embedding_1)/2.0
    print("Avg embedding size: %s"%avg_embed.get_shape().as_list())'''

    # Compute the softmax loss, using a sample of the negative labels each time.
    # inputs are embeddings of the train words
    # with this loss we optimize weights, biases, embeddings

    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, train_labels, avg_embed, num_sampled, vocabulary_size))

    # Optimizer.
    # Note: The optimizer will optimize the softmax_weights AND the embeddings.
    # This is because the embeddings are defined as a variable quantity and the
    # optimizer's `minimize` method will by default modify all variable quantities
    # that contribute to the tensor it is passed.
    # See docs on `tf.train.Optimizer.minimize()` for more details.
    # Adagrad is required because there are too many things to optimize
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # Compute the similarity between minibatch examples and all embeddings.
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

embedding 0 shape: [128, 128]
embedding 1 shape: [128, 128]
Tensor("embedding_lookup_1:0", shape=(128, 128), dtype=float32, device=/device:CPU:0)
128
128
Concat embedding size: [128, 128, 2]
Avg embedding size: [128, 128]


In [19]:
num_steps = 100001
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps):
        batch_data, batch_labels = generate_batch(batch_size, skip_window)
        feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
        # note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8 # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step 0: 6.043101
Nearest to 7: 85°, 96,, 370, _______, estimation, P7, must, 12x6,
Nearest to For: rules, coordinate?, room,, Paul's, false(You, Thus, 42x, scaled,
Nearest to What: #8c, models, figures., positive.(You, does., big, $13, Meg's,
Nearest to #6: 2/x=12/6., 2d, 4a, demand, cost, Appalachian, 0.8,, characteristic,
Nearest to Sorry,: New, hypothesis?, values?, spending, baseboards, Good!, √72, model,
Nearest to write: Leaf, ___cm2, meeting?, (8x, Example, cubic, substance, histogram.,
Nearest to 'Break: 638, P32, #37(You, #66, crops, Subtract, 368, -7.,
Nearest to #2: 90-92, (SSS,, shore, (-4,, paint, appears., feature, 1/2,
Nearest to Ms.: false(You, 0.82,, chart, said., ∏), -9, entering, (No,
Nearest to value?: a*10^b.), rest, outcomes, Price, Ben, Japan, Released, (Don't,
Nearest to area.: Negative, values., 46, basic, confidence, plate, 376, comparing,
Nearest to e: 566, $2.00, stars, judged, -use, P25, prime, Jane,
Nearest to -7: B)H0:, within,

Average loss at step 62000: 0.839483
Average loss at step 64000: 0.624272
Average loss at step 66000: 0.918466
Average loss at step 68000: 0.804024
Average loss at step 70000: 0.940906
Nearest to 7: 5, 6, 63, 141, 9, 361, 12, 377,
Nearest to For: 1:2, city, Thus, .2, With, (For, sets, (Assume,
Nearest to What: Find, Enter, Which, The, Predict, (Do, According, How,
Nearest to #6: #8, #4, #5, #28, #3, #27, #22, #24,
Nearest to Sorry,: Q, Good!, √72, An, press, New, 50x, Mode,
Nearest to write: enter, dismayed, include, find, leave, round, meeting?, greatest,,
Nearest to 'Break: +4), sphere., 638, reasonable, hadn’t, crops, P32, Katie,
Nearest to #2: #4, #3, #26, #1, #14, #76, #7, #6,
Nearest to Ms.: (No, signs., entire, false(You, 17(You, Some, 65?, school,
Nearest to value?: weekly, models, value., (Capitalize, a?, answer:, number?, 45s,
Nearest to area.: dimensions, x:, plate, sum., exact, 376, below:, absolute,
Nearest to e: $2.00, Harrison, resources, -7(You, 0,, -use, √4?, rewrite,
