In [1]:
import keras
import pickle
import gensim
import csv
import numpy as np
from keras.layers import Input, Dense, LSTM, RepeatVector, Dropout
from keras.models import Model

Using Theano backend.


In [2]:
def transform_sent(sent, vectors=None, dim=300):
    '''
    Takes as input a tokenized sentence (list of strings) and outputs a numpy array
    where each row is the word vector for the corresponding word in the sentence.
    Final matrix has dimensionality of len(sent) x dimensionality of word vectors.

    Parameters
    ----------
    sent : list of str
        Tokens of one sentence. The token '<PAD>' and any token that is not in
        the vocabulary are both mapped to an all-ones row.
    vectors : optional
        Word-vector lookup supporting ``vectors[word]``. If it exposes a
        ``vocab`` attribute, membership is tested against that; otherwise
        against the object itself (so a plain dict works). Defaults to the
        notebook-global ``model``.
    dim : int, optional
        Width of the all-ones placeholder rows; must match the dimensionality
        of the vectors returned by ``vectors``.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(sent), dim); shape (0, dim) for an empty sentence.
    '''
    if vectors is None:
        # Backward-compatible default: the word2vec model loaded in the notebook.
        vectors = model
    known = getattr(vectors, 'vocab', vectors)

    # One shared placeholder is safe: np.array() below copies every row.
    placeholder = np.ones((dim,))
    rows = [
        vectors[w] if (w != '<PAD>' and w in known) else placeholder
        for w in sent
    ]
    if not rows:
        # Keep the documented 2-D shape even when there are no tokens.
        return np.empty((0, dim))

    return np.array(rows)

def pad_sentences(text, pad_token='<PAD>'):
    '''
    Pad every sentence in `text` with `pad_token` so that all sentences have
    the length of the longest one.

    The sentence lists are extended IN PLACE and the same outer list object is
    returned, so the caller's original data is modified as well.

    Parameters
    ----------
    text : list of list of str
        Tokenized sentences.
    pad_token : str, optional
        Token appended to short sentences (default '<PAD>').

    Returns
    -------
    list of list of str
        `text` itself, with every inner list padded to equal length.
        An empty `text` is returned unchanged.
    '''
    if not text:
        # max() of an empty sequence would raise ValueError.
        return text

    maxlen = max(len(sent) for sent in text)
    for sent in text:
        sent.extend([pad_token] * (maxlen - len(sent)))

    return text

In [3]:
# Load the pre-tokenized sentences. The file is opened in a `with` block so the
# handle is closed as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('../pickles/sarcasm/sarcasm_tokenized.pkl', 'rb') as pkl_file:
    tokenized = pickle.load(pkl_file)
# Load the word2vec vectors from the gzipped binary file; `model` is the
# notebook-global lookup used by transform_sent.
model = gensim.models.Word2Vec.load_word2vec_format('../GoogleNews-vectors-negative300.bin.gz', binary=True)

In [4]:
datafile = "../data/sarcasm_v2.csv"

def load_data(path=None):
    """
    Read a CSV file and return all of its rows.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the notebook-level `datafile`.

    Returns
    -------
    list of list of str
        One inner list per CSV record, in file order.
    """
    if path is None:
        path = datafile
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields that contain line breaks are parsed correctly.
    with open(path, newline='') as f:
        return list(csv.reader(f))

# All rows of the CSV file.
data = load_data()
# One word-vector matrix per sentence. Each element of `tokenized` is a single
# sentence (a list of token strings), so transform_sent is applied to it
# directly; iterating one level deeper would pass individual token strings and
# look up their characters instead of the words.
features = [transform_sent(sent) for sent in tokenized]
# Second column of every row whose first column is "GEN".
labels = [line[1] for line in data if line[0]=="GEN"]

In [27]:
# Build the design matrix: one flattened (tokens * vector_dim) row per sentence.
# pad_sentences is called here rather than relying on `new_tokenized`, which is
# only assigned in a later cell; it pads `tokenized` in place and returns it, so
# the result is the same data and the cell works on a fresh kernel.
X = np.array([transform_sent(s).flatten() for s in pad_sentences(tokenized)])

In [5]:
# pad_sentences appends '<PAD>' to the sentence lists inside `tokenized` in
# place and returns that same list, so `new_tokenized` and `tokenized` refer to
# the same (now padded) data.
new_tokenized = pad_sentences(tokenized)

In [28]:
# Fixed positional split: the first 2860 rows of X are used for training and
# every remaining row is held out for validation.
x_train, x_test = X[:2860], X[2860:]

In [18]:
# Number of units in the bottleneck (encoded) layer of the dense autoencoder.
encoding_dim = 64

In [23]:
# Dense autoencoder over flattened sentences: each sample is a vector of
# 198*300 values (presumably 198 padded tokens x 300-d word vectors -- TODO confirm).
input_sequence = Input(shape=(198*300,))

# Bottleneck: compress the flattened sentence down to `encoding_dim` units.
encoded = Dense(encoding_dim, activation='relu')(input_sequence)

# Reconstruction layer back to the input width.
# NOTE(review): sigmoid restricts outputs to (0, 1); if the word-vector inputs
# contain values outside that range they cannot be reconstructed -- confirm.
decoded = Dense(198*300, activation='sigmoid')(encoded)

# `autoencoder` maps input -> reconstruction; `encoder` shares the same input
# and bottleneck layer and exposes only the encoded representation.
autoencoder = Model(input=input_sequence, output=decoded)
encoder = Model(input=input_sequence, output=encoded)
# Load weights into the encoder from the file 'encoded' (presumably written by
# an earlier run -- the file must exist for this cell to succeed).
encoder.load_weights('encoded')

In [24]:
# create a placeholder for an encoded input of width encoding_dim (64 in this notebook)
encoded_input = Input(shape=(encoding_dim,))
# retrieve the last layer of the autoencoder model (the reconstruction Dense layer)
decoder_layer = autoencoder.layers[-1]
# create the decoder model: encoded vector -> reconstruction, reusing that layer
decoder = Model(input=encoded_input, output=decoder_layer(encoded_input))

In [196]:
# LSTM sequence autoencoder over un-flattened input of shape (198, 300).
# NOTE(review): this cell rebinds `input_sequence`, `encoded`, `decoded` and
# `encoder`, replacing the dense-autoencoder objects of the same names.
input_sequence = Input(shape=(198,300))

# Encode the whole sequence into a single 64-unit LSTM output.
encoded = LSTM(64)(input_sequence)

# Repeat the encoding 198 times, then decode with an LSTM that emits a
# 300-unit output at every step (return_sequences=True).
decoded = RepeatVector(198)(encoded)
decoded = LSTM(300, return_sequences=True)(decoded)

# NOTE(review): no compile()/fit() call for `sequence_autoencoder` is visible
# in this notebook -- confirm it is trained before `encoder` is used.
sequence_autoencoder = Model(input_sequence, decoded)
encoder = Model(input_sequence, encoded)

In [25]:
# Configure training of the dense autoencoder: Adadelta optimizer with a
# binary cross-entropy reconstruction loss.
# NOTE(review): binary cross-entropy assumes targets in [0, 1]; confirm the
# word-vector inputs satisfy that, otherwise consider a regression loss.
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

In [29]:
# Train the autoencoder to reconstruct its own input (x_train is both input
# and target); x_test is used the same way as validation data.
# NOTE(review): the recorded run below was interrupted (KeyboardInterrupt)
# during epoch 44 of 50.
autoencoder.fit(x_train, x_train,
                nb_epoch=50,
                batch_size=256,
                shuffle=True,
                validation_data=(x_test, x_test))

Train on 2860 samples, validate on 400 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50

KeyboardInterrupt: 

In [202]:
# At this point `encoder` is the LSTM encoder, whose input is (samples, 198, 300),
# while X is built as flattened rows of 198*300 values. Restore the sequence
# axis before predicting; the reshape is a no-op if X is already 3-D.
# NOTE(review): make sure the LSTM autoencoder has been trained first.
encoded_sents = encoder.predict(X.reshape((-1, 198, 300)))

In [203]:
# Write the encoded sentences to 'LSTM_encoded'. The `with` block guarantees
# the file is flushed and closed once the dump completes.
with open('LSTM_encoded', 'wb') as out_file:
    pickle.dump(encoded_sents, out_file)

In [204]:
# Persist the current `encoder` model's weights to the file 'LSTM_encoder_weights'.
encoder.save_weights('LSTM_encoder_weights')

In [11]:
# Print a layer-by-layer summary of the dense autoencoder.
# NOTE(review): the recorded output below lists Dense layers of 15000, 1000 and
# 300 units, which does not match the single encoding_dim bottleneck defined in
# this notebook -- it appears to come from an earlier version; re-run to refresh.
autoencoder.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 59400)         0                                            
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 15000)         891015000   input_1[0][0]                    
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 1000)          15001000    dense_1[0][0]                    
____________________________________________________________________________________________________
dense_3 (Dense)                  (None, 300)           300300      dense_2[0][0]                    
___________________________________________________________________________________________