# Import

### Basic

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import time
import params

# Load dataset

In [2]:
# Load the dataset array stored at data/dataset.npy (path relative to the working directory).
# Later cells slice it as DATASET[0:B] and pass the slice to batch2onehot / batch2tweet.
DATASET = np.load("data/dataset.npy")

In [3]:
# Show the array dimensions; batch2onehot documents its input as (batch_size, tweet_length, 1).
print "Shape :", DATASET.shape

Shape : (6126219, 161, 1)


# Preprocessing function

In [4]:
# Character vocabulary taken from the params module; the decoding helpers below
# index into it, so a character's list position is its integer code.
ACCEPTED_CHARS = params.ACCEPTED_CHARS

In [5]:
# Display the vocabulary for reference.
print ACCEPTED_CHARS

['\n', ' ', '!', '"', '#', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\x80', '\x92', '\x98', '\x9f', '\xa6', '\xe2', '\xf0']


In [6]:
def batch2onehot(batch, D): # see preprocessing.py
    ''' One-hot encode a batch of integer character codes (used during training).
    batch : integer array of size (batch_size, tweet_length, 1).
    D : number of one-hot dimensions.
    Returns a float array of size (batch_size, tweet_length, D).'''
    n_tweets, n_steps = batch.shape[:2]
    n_rows = n_tweets * n_steps
    # Work on a flat (n_rows, D) table : one row per character position
    encoded = np.zeros((n_rows, D))
    # Put a single 1 in each row, at the column given by that position's code
    encoded[np.arange(n_rows), batch.reshape(-1)] = 1
    return encoded.reshape((n_tweets, n_steps, D))

def batch2tweet(batch, accepted_caracters, special_char=""): # see preprocessing.py
    '''Decode a batch of integer-encoded tweets into strings.

    Not optimized. But not used during the training : no need to be fast.

    batch : iterable of tweets; each tweet is an iterable of positions whose
        first element is the character code (e.g. an array of size
        (batch_size, tweet_length, 1)).
    accepted_caracters : indexable vocabulary mapping a code to its character.
    special_char : string emitted for every code that has no entry in
        accepted_caracters.

    Returns a list with one decoded string per tweet.'''
    tweets = []
    for t in batch:
        chars = []
        for char in t:
            try:
                decoded = accepted_caracters[char[0]]
            except (LookupError, TypeError):
                # Code outside the vocabulary : special marker indicating the end of the tweet.
                # Only lookup failures are handled here, so KeyboardInterrupt and
                # unrelated errors are no longer silently swallowed.
                decoded = special_char
            chars.append(decoded)
        tweets.append("".join(chars))
    return tweets

def onehot2tweet(batch, accepted_caracters, special_char=""): # see preprocessing.py
    '''Decode a batch of one-hot encoded tweets into strings.

    Not optimized. But not used during the training : no need to be fast.

    batch : iterable of tweets; each tweet is an iterable of one-hot vectors
        (e.g. an array of size (batch_size, tweet_length, D)).
    accepted_caracters : indexable vocabulary mapping a code to its character.
    special_char : string emitted for every position that has no 1 in it, or
        whose first 1 sits at an index with no entry in accepted_caracters.

    Returns a list with one decoded string per tweet.'''
    tweets = []
    for t in batch:
        chars = []
        for char in t:
            try:
                # Index of the first component equal to 1 (IndexError if there is none)
                decoded = accepted_caracters[np.where(char==1)[0][0]]
            except (LookupError, TypeError):
                # No usable code : special marker indicating the end of the tweet.
                # Only lookup failures are handled here, so KeyboardInterrupt and
                # unrelated errors are no longer silently swallowed.
                decoded = special_char
            chars.append(decoded)
        tweets.append("".join(chars))
    return tweets

### Test preprocessing functions

In [8]:
# Demo of batch2onehot on the first B tweets of the dataset.
# D, B, T, batch and one_hot_batch stay in the notebook namespace and are reused by later cells.
D = params.D  # dimension of one-hot vectors
B = params.B  # batch size for the demo
T = params.T  # max length of a tweet
batch = DATASET[0:B] 
# Dataset > One-hot
one_hot_batch = batch2onehot(batch, D)

# No random seed is set in the visible cells, so the tweet / char picked below changes between runs.
# Pick a tweet
t = np.random.randint(B)
# Pick a char
c = np.random.randint(T)
# Print the one-hot vector of the chosen character
print "Char %d from tweet %d :"%(c,t), one_hot_batch[t][c]

Char 50 from tweet 24 : [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


#### If needed, encoded batches can be converted back to strings

In [9]:
# Using 'batch2tweet' :
t = np.random.randint(B) # Pick a tweet
print batch2tweet(batch, ACCEPTED_CHARS)[t]
# or 'onehot2tweet' if the tweet has already been converted to onehot :
print onehot2tweet(one_hot_batch, ACCEPTED_CHARS)[t]

#nude naked boys with girls video tool prison sex lyrics meaning https://t.co/ujgrdiedap
#nude naked boys with girls video tool prison sex lyrics meaning https://t.co/ujgrdiedap


# Model definition

In [11]:
from keras.models import Model
from keras.layers import Input, LSTM, Masking, Dropout, TimeDistributed, Dense, Activation
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

Using Theano backend.


In [13]:
def get_model(T, D, lr, nhidden, drop_rate, nunits=64): # see models.py
    '''Build and compile the character-level recurrent model.

    T : number of timesteps of the input sequences.
    D : size of the input vectors; also the size of the output at each timestep.
    lr : learning rate given to the Adam optimizer.
    nhidden : number of stacked LSTM + Dropout blocks. With 0, the output
        layer is applied directly to the masked inputs.
    drop_rate : rate of the Dropout layer that follows each LSTM.
    nunits : number of units of each LSTM layer (default 64).

    Returns the compiled Keras Model (loss : categorical cross-entropy).'''
    # Input layer
    inputs = Input((T, D))
    # Masking "only-0" input features
    masked = Masking(mask_value=0.0)(inputs)
    # Hidden layers : each block consumes the output of the previous one.
    # Starting from the masked inputs keeps 'features' defined when nhidden == 0.
    features = masked
    for _ in range(nhidden):
        hidden = LSTM(nunits, return_sequences=True)(features)
        features = Dropout(drop_rate)(hidden)
    # Output layer : linear TimeDistributedDense + softmax
    decoder = TimeDistributed(Dense(D))(features) # Apply the same dense layer on each timestep
    outputs = Activation("softmax")(decoder)

    model = Model(input=inputs, output=outputs)

    model.compile(optimizer=Adam(lr=lr), loss="categorical_crossentropy")

    return model

### Model instantiation

In [24]:
# T and D are hard-coded here and overwrite the values read from params in the
# preprocessing demo cell. 161 matches the second dimension of DATASET printed above.
T = 161
D = 64
LR = params.LR # learning rate
# One hidden LSTM block, dropout rate 0.1.
# NOTE(review): T-1 presumably because the model sees the first T-1 characters and
# predicts the next one at each step - TODO confirm against the training script.
model = get_model(T-1, D-1, LR, 1, 0.1) # D-1 because params.D accounts for the padding dimension

In [25]:
# Print the layer-by-layer description of the model (output shapes and parameter counts).
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_4 (InputLayer)             (None, 160, 63)       0                                            
____________________________________________________________________________________________________
masking_4 (Masking)              (None, 160, 63)       0           input_4[0][0]                    
____________________________________________________________________________________________________
lstm_6 (LSTM)                    (None, 160, 64)       32768       masking_4[0][0]                  
____________________________________________________________________________________________________
dropout_6 (Dropout)              (None, 160, 64)       0           lstm_6[0][0]                     
___________________________________________________________________________________________

# Test

TODO: this section still needs to be completed with sampling functions (including a "temperature" parameter).

In [112]:
# load the weights
weights = np.load('results/exp_001/weights/best_model.npy')

model.layers[2].set_weights(weights[0:12])
model.layers[4].set_weights(weights[12:14])

In [423]:
# One-hot encoding
one_hot_batch = batch2onehot(batch, D)
# Remove the padding dimension
one_hot_batch = one_hot_batch[:,0:160,0:(D-1)]

# exemple 21
t = 21
tweet = batch2tweet(batch, ACCEPTED_CHARS)[t]
one_hot_batch = one_hot_batch[t]
one_hot_batch = one_hot_batch[None,:,:]

out = model.predict(one_hot_batch)

In [425]:
# For each timestep, collect the nb_predictions most probable characters and their scores.
# predictions[i] is the string made of the (i+1)-th most probable character at every
# timestep; confidences[i][c] is the model score of predictions[i][c].
predictions = []
confidences = []
nb_predictions = 5

# Rank the classes once per timestep (ascending scores) instead of re-sorting for every rank.
ranked_codes = np.asarray([np.argsort(o) for o in out[0]])
timesteps = np.arange(ranked_codes.shape[0])
# Use the model's own output width rather than a hard-coded 64.
nb_classes = out.shape[-1]

for i in range(nb_predictions):
    codes = ranked_codes[:, -(i + 1)] # (i+1)-th best class at every timestep
    onehot = np.eye(nb_classes)[codes]
    # A class without an entry in ACCEPTED_CHARS decodes to the special char. The default ""
    # would shorten the string and shift every later character away from its confidence,
    # so a one-character placeholder that is not in the vocabulary is used instead.
    predictions.append(onehot2tweet(onehot[None,:,:], ACCEPTED_CHARS, special_char="~")[0])
    confidences.append(list(out[0][timesteps, codes]))

In [427]:
# Build an HTML view : the original tweet on the first line, then one line per prediction
# rank, each predicted character shaded according to the model's confidence.
from IPython.display import HTML
try:
    from html import escape as html_escape # Python 3
except ImportError:
    from cgi import escape as html_escape # Python 2

# Tweets are external text and the vocabulary contains '&' and '"' : escape everything
# that is interpolated into the markup.
rows = [html_escape(tweet)]
for i in range(nb_predictions):
    # Never index past the shortest of the three aligned sequences
    n_shown = min(len(tweet), len(predictions[i]), len(confidences[i]))
    # Each prediction line starts with '_', which shifts it one column to the right of the tweet
    spans = ['_']
    for c in range(n_shown):
        # Lightness goes from 100% (confidence 0, white) down to 50% (confidence 1, pure blue)
        lightness = 100 - confidences[i][c] * 50
        shown_char = html_escape(predictions[i][c].replace('\n',' '))
        spans.append('<span style=\"background-color:hsl(240,100%,' + str(lightness) + '%);\">' + shown_char + '</span>')
    rows.append(''.join(spans))

output = '<p style="font-family:monospace; font-size:16px">' + '<br>'.join(rows) + '<br>' + '</p>'

In [428]:
# Python 2 only ('unicode' builtin) : decode the byte string as UTF-8, replacing any
# undecodable byte with the Unicode replacement character.
output = unicode(output, 'utf-8', errors='replace')
# Re-encode as pure ASCII, writing every non-ASCII character as a numeric XML character reference.
output = output.encode('ascii', 'xmlcharrefreplace')

# Last expression of the cell : rendered as HTML in the cell output.
HTML(output)