In [1]:
import tensorflow as tf

In [162]:
import numpy as np
import os
from random import shuffle
import re
import pdb
import random

In [177]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import sklearn.cluster

In [4]:
import urllib
import zipfile
import lxml.etree

In [7]:
with zipfile.ZipFile('../ted_en-20160408.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))

In [19]:
keyword_strings = doc.xpath('//keywords/text()')

In [27]:
# list of keywords for each talk
keywords = []
for string in keyword_strings:
    keywords.append(map(lambda x: x.lower(), string.split(', ')))

In [30]:
content_strings = doc.xpath('//content/text()')

In [31]:
content_strings_noparens = map(lambda x: re.sub(r'\([^)]*\)', '', x), content_strings)

In [35]:
# sentence-by-sentence tokens for each sentence in each talk
content_tokens = []
for content_str in content_strings_noparens:
    sentences = content_str.split('.')
    content_sentences = []
    for sentence in sentences:
        tokens = map(str,re.sub(r"[^a-z0-9]+", " ", sentence.lower()).split())
        content_sentences.append(tokens)
    content_tokens.append(content_sentences)

In [40]:
# replace tokens which only appear once in the training set with 'UNK'
word_counts = {}
for talk in content_tokens[:1585]:
    for sentence in talk:
        for token in sentence:
            if token in word_counts:
                word_counts[token] += 1
            else:
                word_counts[token] = 1
                

In [45]:
len(word_counts)

47407

In [49]:
rare_words = set(map(lambda (k,v): k, filter(lambda (k,v): v==1, word_counts.items())))

In [57]:
content_tokens_common = []
for talk in content_tokens:
    content_sentences = []
    for sentence in talk:
        tokens = []
        for token in sentence:
            if token in rare_words:
                tokens.append('UNK')
            else:
                tokens.append(token)
        content_sentences.append(tokens)
    content_tokens_common.append(content_sentences)

In [58]:
len(content_tokens_common)

2085

In [78]:
talk_labels = []
for talk in keywords:
    label = 0
    if 'technology' in talk:
        label += 1
    if 'entertainment' in talk:
        label += 2
    if 'design' in talk:
        label += 4
    talk_labels.append(label)

In [64]:
content_sentences = [sentence for talk in content_tokens_common for sentence in talk]

In [69]:
model = Word2Vec(content_sentences, size=100, window=5, min_count=1, workers=4)

In [70]:
'UNK' in model

True

In [83]:
def doc_embedding(talk, model):
    embedding = np.zeros(100)
    words = 0
    for sentence in talk:
        words += len(sentence)
        for token in sentence:
            if token in model:
                embedding += model[token]
            else:
                embedding += model['UNK']
    return embedding/words

In [188]:
def label_talks():
    x = tf.placeholder(tf.float32, [None,100])
    sy_labels = tf.placeholder(tf.uint8, [None])
    W = tf.Variable(tf.truncated_normal([100,20],stddev=0.1))
    b = tf.Variable(tf.zeros(20))
    h = tf.nn.relu(tf.matmul(x,W)+b)
    V = tf.Variable(tf.truncated_normal([20,2],stddev=0.1))
    c = tf.Variable(tf.zeros(2))
    u = tf.matmul(h,V)+c
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=u, labels=tf.one_hot(sy_labels,2)))
    
    preds = tf.argmax(u,1)
    train_step = tf.train.AdamOptimizer().minimize(loss)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.cast(sy_labels,tf.int32),tf.cast(preds,tf.int32)),tf.float32))

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(2):
            for i in range(1550/50):
                embedding = [doc_embedding(content_tokens[50*i+j], model) for j in range(50)]
                labels = talk_labels[50*i:50*(i+1)]
                labels_t = [label%2 for label in labels]
                labels_e = [(label/2)%2 for label in labels]
                labels_e = [((label/2)/2)%2 for label in labels]
                _, acc, fin, predictions, _V = sess.run([train_step, accuracy, u, preds, V], feed_dict={x: embedding, sy_labels: labels})
                print acc
                print predictions

In [183]:
tsne = TSNE(n_components=2, random_state=0)
labels_tsne = tsne.fit_transform(np.transpose(_V))
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="T-SNE for TED talk labels")

source = ColumnDataSource(data=dict(x1=labels_tsne[:,0],
                                    x2=labels_tsne[:,1],
                                    names=range(8)))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

In [334]:
def process_batch_rnn(batch,model):
    max_tokens = 0
    embedding_size = 100
    for talk in batch:
        if sum([len(sentence) for sentence in talk]) > max_tokens:
            max_tokens = sum([len(sentence) for sentence in talk])
    new_batch = []
    sequence_lengths = []
    for talk in batch:
        sequence_lengths.append(sum([len(sentence) for sentence in talk]))
        talk_tokens = []
        talk_words = [word for word in sentence for sentence in talk]
        for i in range(max_tokens):
            if i < len(talk_words):
                if talk_words[i] in model:
                    talk_tokens.append(model[token])
                else:
                    talk_tokens.append(model['UNK'])
            else:
                talk_tokens.append(np.zeros(embedding_size))
        talk_tokens = np.array(talk_tokens)
        new_batch.append(talk_tokens)
    return sequence_lengths, new_batch

In [339]:
def label_talks_rnn():
    tf.reset_default_graph()
    inputs = tf.placeholder(tf.float32, [None,None,100])
    sy_labels = tf.placeholder(tf.uint8, [None])
    s_length = tf.placeholder(tf.int32, [None])
    #cell = tf.contrib.rnn.BasicRNNCell(10)
    cell = tf.contrib.rnn.BasicLSTMCell(10)
    output, state = tf.nn.dynamic_rnn(cell, inputs, sequence_length=s_length, dtype=tf.float32)
    W = tf.Variable(tf.truncated_normal([10,8],stddev=0.1))
    b = tf.Variable(tf.zeros(8))
    #logits = tf.matmul(state,W)+b
    logits = tf.matmul(state[-1],W)+b
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf.one_hot(sy_labels,8)))
    
    preds = tf.argmax(logits,1)
    train_step = tf.train.AdamOptimizer().minimize(loss)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.cast(sy_labels,tf.int32),tf.cast(preds,tf.int32)),tf.float32))

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(2):
            for i in range(1550/50):
                sequence_lengths, embedding = process_batch_rnn([content_tokens[50*i+j] for j in range(50)],model)
                labels = talk_labels[50*i:50*(i+1)]
                labels_t = [label%2 for label in labels]
                labels_e = [(label/2)%2 for label in labels]
                labels_e = [((label/2)/2)%2 for label in labels]
                _, acc, predictions = sess.run([train_step, accuracy, preds], feed_dict={inputs: embedding, sy_labels: labels, s_length: sequence_lengths})
                print acc
                print predictions

In [340]:
label_talks_rnn()

0.64
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.62
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.86
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.78
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.6
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.82
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.8
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.76
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.78
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
0.8
[0 0 0 0 0 0 0 0 0 0 0 0 0