In [1]:
import numpy as np
import os
from random import shuffle
import zipfile
import lxml.etree
import re
from gensim.models import Word2Vec



In [2]:
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")

In [3]:
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))
input_text = ' \t '.join(doc.xpath('//content/text()'))
keywords = ' \n '.join(doc.xpath('//keywords/text()'))
del doc

In [4]:
labels = list(keywords.split('\n'))

In [5]:
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)

In [6]:
input_docs = list(input_text_noparens.split('\t'))
# Uncomment if you need to save some RAM: these strings are about 50MB.
del input_text, input_text_noparens

In [7]:
len(input_docs)

2085

In [8]:
len(labels)

2085

In [9]:
train_docs = input_docs[0:1585]
validation_docs = input_docs[1585:1835]
test_docs = input_docs[1835:2085]

<strong>Removing speaker's name </strong>

In [10]:
def text_clean(doc):
    docs = []
    #for doc in docs:
    for line in doc.split('\n'):
        m = re.sub(r"\\n.*:", '', line)
        docs.append(m)
    return docs

<Strong>We convert the training documents to sentences and then tokenize them in order to train the word2vec model </strong>

In [11]:
train_docs_clean = []
for doc in train_docs:
    train_docs_clean.append(text_clean(doc))
#example
#train_docs_clean[0]

In [12]:
len(train_docs_clean)

1585

In [13]:
def tokenize (docs):
    sentences_tokenized = []
    docs_tokenized = []
    for doc in docs:
        temp = []
        for sent_str in doc:
            tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
            sentences_tokenized.append(tokens)
            temp.append(tokens)
        docs_tokenized.append(temp)
    return sentences_tokenized, docs_tokenized

In [14]:
sentences_train, train_docs_tokenized = tokenize(train_docs_clean)
#example
#sentences_train[0]

In [15]:
len(sentences_train)

38815

In [16]:
len(train_docs_tokenized)

1585

In [17]:
validation_docs_clean = []
for doc in validation_docs:
    validation_docs_clean.append(text_clean(doc))
    
_, validation_docs_tokenized = tokenize(validation_docs_clean)

In [18]:
len(validation_docs_tokenized)

250

In [19]:
test_docs_clean = []
for doc in test_docs:
    test_docs_clean.append(text_clean(doc))
    
_, test_docs_tokenized = tokenize(test_docs_clean)

In [20]:
len(test_docs_tokenized)

250

In [21]:
v = set()
for sentence in sentences_train:
    v.update(sentence)
        

In [22]:
vocab = list(v)
len(vocab)

47407

In [23]:
ted_w2v = Word2Vec(sentences_train, size=50)

In [33]:
import gensim
ted_glove = Word2Vec.load_word2vec_format('glove.txt', binary=False)

<strong>Preparing the labels list</strong>

In [35]:
labels_new = []
for label_set in labels:
    if 'technology' and 'entertainment' and 'design' in label_set:
        labels_new.append('ted')
    elif 'technology' and 'entertainment'in label_set:
        labels_new.append('teo')
    elif 'technology' and 'design' in label_set:
        labels_new.append('tod')
    elif 'entertainment' and 'design'in label_set:
        labels_new.append('oed')
    elif 'technology' in label_set:
        labels_new.append('too')
    elif 'entertainment'in label_set:
        labels_new.append('oeo')
    elif 'design' in label_set:
        labels_new.append('ood')
    else:
        labels_new.append('ooo')

In [36]:
index_to_label = ['ted', 'teo', 'tod', 'oed', 'too', 'oeo', 'ood', 'ooo']
label_to_index = {label: index for index, label in enumerate(index_to_label)}

In [37]:
label_to_index

{'oed': 3,
 'oeo': 5,
 'ood': 6,
 'ooo': 7,
 'ted': 0,
 'teo': 1,
 'tod': 2,
 'too': 4}

In [38]:
NUMBER_OF_TRAINING_SAMPLES = len(train_docs)

In [45]:
def text_embedding(docs, w2v_model):
    x = []
    
    for doc in docs:
        c = 1 #to avoid dividing by zero
        tokens = np.zeros(50)
        for sentence in doc:
            for token in sentence:
                if token in w2v_model.vocab:
                    tokens += w2v_model.wv[token]
                    c = c+1
        x.append(tokens/c)
    return x

In [46]:
x_train = text_embedding(train_docs_tokenized, ted_w2v)

In [47]:
np.array(x_train).shape

(1585, 50)

In [48]:
x_validation = text_embedding(validation_docs_tokenized, ted_w2v)

In [49]:
X_validation = np.array(x_validation)

In [50]:
X_validation.shape

(250, 50)

In [51]:
x_test = text_embedding(test_docs_tokenized, ted_w2v)

In [52]:
X_test = np.array(x_test)

In [53]:
X_test.shape

(250, 50)

In [54]:
X_train = np.array(x_train)
Y = np.zeros((len(labels), 8), dtype=np.bool)

In [55]:
for i, label in enumerate(labels_new):
    Y[i,[label_to_index[label]]] = 1 

In [56]:
Y_train = Y[0:1585]
Y_validation = Y[1585:1835]
Y_test = Y[1835:2085]

In [57]:
Y_train.shape

(1585, 8)

<strong>Training the model</strong>

In [58]:
from keras.models import Sequential
from keras.layers.core import Activation, Dense, Dropout

Using Theano backend.


In [59]:
model = Sequential()

In [60]:
model.add(Dense(32, activation ='tanh', input_dim=50))
model.add(Dense(8, activation='softmax'))

In [61]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

In [62]:
model.fit(X_train, Y_train, nb_epoch=50, batch_size=50, validation_data=(X_validation, Y_validation))

Train on 1585 samples, validate on 250 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0xb1f298d710>

In [63]:
model.evaluate(X_test, Y_test, batch_size=50)

 50/250 [=====>........................] - ETA: 0s

[1.2788642168045044, 0.41999999880790712]

In [64]:
model.metrics_names

['loss', 'categorical_accuracy']

<strong>Using GloVe embeddings</strong>

In [65]:
x_train_glove = text_embedding(train_docs_tokenized, ted_glove)

In [67]:
X_train_g = np.array(x_train_glove)

In [68]:
X_train_g.shape

(1585, 50)

In [69]:
x_validation_glove = text_embedding(validation_docs_tokenized, ted_glove)

In [70]:
X_validation_g = np.array(x_validation_glove)

In [71]:
x_test_glove = text_embedding(test_docs_tokenized, ted_glove)

In [72]:
X_test_g = np.array(x_test_glove)

In [73]:
model_2 = Sequential()

In [74]:
model_2.add(Dense(32, activation ='tanh', input_dim=50))
model_2.add(Dense(8, activation='softmax'))

In [75]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

In [78]:
model.fit(X_train_g, Y_train, nb_epoch=50, batch_size=50, validation_data=(X_validation_g, Y_validation))

Train on 1585 samples, validate on 250 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0xb1f4f06940>

In [79]:
model.evaluate(X_test_g, Y_test, batch_size=50)

 50/250 [=====>........................] - ETA: 0s

[1.1764106750488281, 0.46800000071525572]

<strong>Training the model with Relu instead of Tanh</strong>

In [80]:
model_3 = Sequential()

In [81]:
model_3.add(Dense(32, activation ='relu', input_dim=50))
model_3.add(Dense(8, activation='softmax'))

In [82]:
model_3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

In [83]:
model_3.fit(X_train, Y_train, nb_epoch=50, batch_size=50, validation_data=(X_validation, Y_validation))

Train on 1585 samples, validate on 250 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0xb1f547bd68>

In [84]:
model_3.evaluate(X_test, Y_test, batch_size=50)

 50/250 [=====>........................] - ETA: 0s

[1.2613336801528932, 0.42400000095367429]

<strong>Adding Dropout</strong>

In [85]:
model_4 = Sequential()

In [86]:
model_4.add(Dense(32, activation ='tanh', input_dim=50))
model_4.add(Dropout(0.3))
model_4.add(Dense(8, activation='softmax'))

In [87]:
model_4.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

In [88]:
model_4.fit(X_train, Y_train, nb_epoch=50, batch_size=50, validation_data=(X_validation, Y_validation))

Train on 1585 samples, validate on 250 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0xb1f58dff28>

In [89]:
model_4.evaluate(X_test, Y_test, batch_size=50)

 50/250 [=====>........................] - ETA: 0s

[1.2630130052566528, 0.42400000095367429]

<strong>Visualizing the lables</strong>

In [90]:
from bokeh.models import ColumnDataSource ,LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [91]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
ted_labels_tsne = tsne.fit_transform(Y)

In [92]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Ted labels T-SNE")

source = ColumnDataSource(data=dict(x1=ted_labels_tsne[:,0],
                                    x2=ted_labels_tsne[:,1],
                                    names=labels_new))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)