In [1]:
%matplotlib inline
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, LSTM, Input
from keras.callbacks import ReduceLROnPlateau

import tensorflow as tf

import collections
import gensim
from gensim.models import Word2Vec
import numpy as np
import re
import string
from sklearn.preprocessing import normalize

# Report the library versions this notebook was run against.
version_lines = [
    "Tensorflow version: {}".format(tf.__version__),
    "Keras version: {}".format(keras.__version__),
]
print("\n".join(version_lines))

Using TensorFlow backend.


Tensorflow version: 1.1.0
Keras version: 2.0.8


## Helper function

In [2]:
def remove_pos_tag(word):
    """Strip the trailing part-of-speech tag from a ``word/TAG`` token.

    Only the last ``/`` is treated as the separator, so a word that itself
    contains slashes (e.g. ``1/2/CD``) keeps them instead of being cut at the
    first one. A token without any ``/`` is returned unchanged.

    NOTE(review): assumes exactly one tag per token (``word/TAG``) -- confirm
    against the format of the training file.
    """
    return word.rsplit('/', 1)[0]

#regex = re.compile('[-+]?([0-9]+,)?[0-9]+.?[0-9]*')
#punctuation = re.compile('['+string.punctuation+']')

## Load training data & create Lookup Tables

In [3]:
# Read the whole corpus and lowercase it. The context manager guarantees the
# file handle is closed even if reading fails.
with open("./data/train.txt") as corpus_file:
    raw_text = corpus_file.read().lower()

# One sentence per line; each whitespace-separated token has its POS tag
# stripped, leaving a list of token lists.
sentences = [
    [remove_pos_tag(token) for token in line.split()]
    for line in raw_text.splitlines()
]
print(sentences[:5])

[['in', 'an', 'oct.', '19', 'review', 'of', '``', 'the', 'misanthrope', "''", 'at', 'chicago', "'s", 'goodman', 'theatre', '(', '``', 'revitalized', 'classics', 'take', 'the', 'stage', 'in', 'windy', 'city', ',', "''", 'leisure', '&', 'arts', ')', ',', 'the', 'role', 'of', 'celimene', ',', 'played', 'by', 'kim', 'cattrall', ',', 'was', 'mistakenly', 'attributed', 'to', 'christina', 'haag', '.'], ['ms.', 'haag', 'plays', 'elianti', '.'], ['rolls-royce', 'motor', 'cars', 'inc.', 'said', 'it', 'expects', 'its', 'u.s.', 'sales', 'to', 'remain', 'steady', 'at', 'about', '1,200', 'cars', 'in', '1990', '.'], ['the', 'luxury', 'auto', 'maker', 'last', 'year', 'sold', '1,214', 'cars', 'in', 'the', 'u.s.'], ['howard', 'mosher', ',', 'president', 'and', 'chief', 'executive', 'officer', ',', 'said', 'he', 'anticipates', 'growth', 'for', 'the', 'luxury', 'auto', 'maker', 'in', 'britain', 'and', 'europe', ',', 'and', 'in', 'far', 'eastern', 'markets', '.']]


In [4]:
# Dimensionality of every learned word vector; reused later for array shapes.
embedding_size = 100

# min_count=1 keeps every token in the vocabulary, so any word of the corpus
# can be looked up afterwards.
model = Word2Vec(
    sentences,
    size=embedding_size,
    min_count=1,
    workers=4,
)
print(model)

Word2Vec(vocab=39428, size=100, alpha=0.025)


In [5]:
# Peek at a few vocabulary entries and at one embedding vector.
words = list(model.wv.vocab)
print(words[:5])
# Look the vector up through the KeyedVectors (`model.wv`), like the vocab
# access above; indexing the Word2Vec model directly is deprecated.
print(model.wv['motivated'])

['disbursed', 'asia-pacific', 'lady', 'permanently', 'essence']
[-0.07749306 -0.01745089  0.01845957  0.0399294  -0.07870384 -0.04518595
  0.06944592 -0.06728533 -0.01565855  0.01094825  0.05835632 -0.06537049
  0.03464763 -0.03954552  0.01479189  0.00681547 -0.00546093  0.01643654
 -0.00593737  0.04881624 -0.03542344  0.03172927  0.0189415   0.09304802
 -0.08370103 -0.04490628  0.01748649 -0.02752062 -0.00998102 -0.02635884
 -0.01203324 -0.06892681  0.00340204 -0.00280302 -0.05070677  0.02227013
 -0.01356892 -0.01441826 -0.04427019  0.07332291  0.00551603 -0.03409106
 -0.02135161  0.06513803  0.03087234 -0.06501083  0.00810544  0.00127457
 -0.00891277 -0.03836843  0.00802573  0.03356352  0.04131276 -0.02176528
  0.01620766 -0.00201646 -0.00997739 -0.00179663 -0.02330303  0.02618125
  0.00066245  0.00201343  0.00206599 -0.0135767  -0.01176804  0.01491551
 -0.01086763 -0.08314821  0.0270504  -0.02356321 -0.02976044  0.04282209
 -0.09636673  0.03542997 -0.05935106 -0.02013326  0.02984549

In [6]:
# Flatten the corpus into one continuous token stream.
tokens = [token for sentence in sentences for token in sentence]

# Prepare the dataset of input -> output pairs. Every word is represented by
# its Word2Vec embedding vector: the input is a window of `seq_length`
# consecutive word vectors, the output is the vector of the word that follows.
seq_length = 15

# Look each token up once instead of once per window it appears in. Lookups go
# through `model.wv`; indexing the Word2Vec model directly is deprecated.
token_vectors = [model.wv[token] for token in tokens]

dataX = []
dataY = []
for i in range(len(tokens) - seq_length):
    dataX.append(token_vectors[i:i + seq_length])
    dataY.append(token_vectors[i + seq_length])

print("Total number of patterns: ", len(dataX))

Total number of patterns:  950013


In [7]:
# Stack the nested Python lists into dense arrays and show their shapes,
# one per line.
X, y = np.asarray(dataX), np.asarray(dataY)

print(X.shape, y.shape, sep="\n")

(950013, 15, 100)
(950013, 100)


In [26]:
# Sanity check: map the first training window and its target back to words by
# nearest-neighbour lookup in embedding space. Only the single best match is
# used, so ask for topn=1 rather than ranking the default ten neighbours.
print(' '.join(model.wv.similar_by_vector(value, topn=1)[0][0] for value in dataX[0]))
print(model.wv.similar_by_vector(dataY[0], topn=1)[0][0])

in an oct. 19 review of `` the misanthrope '' at chicago 's goodman theatre
(


In [27]:
# X becomes [samples, time steps, features]; y becomes [samples, features].
n_patterns = len(dataX)
X = np.asarray(dataX).reshape((n_patterns, seq_length, embedding_size))
y = np.asarray(dataY).reshape((len(dataY), embedding_size))

# Shapes first, then the first input window and its target vector.
print(X.shape)
print(y.shape)

print(X[0])
print(y[0])

(950013, 15, 100)
(950013, 100)
[[-1.94135904  1.18107879 -0.72279423 ...,  1.11281264 -1.27893138
   0.77426594]
 [ 1.25889397  0.96553886  0.84843987 ...,  1.04828274 -0.60708982
   1.27152824]
 [ 1.12125552  0.34008792 -0.35216582 ...,  0.78047252  0.32879883
   0.98237455]
 ..., 
 [-1.80498862  2.31865454  1.19851863 ...,  0.19837078 -1.05653787
   2.36574316]
 [-0.01220683  0.06991564 -0.04197785 ...,  0.06399094 -0.07228622
   0.10123127]
 [-0.02247826  0.04222622 -0.00274354 ...,  0.07011089 -0.02106756
   0.10049959]]
[-1.05211306  1.29784453  0.46849054  2.89595246 -1.35392296 -0.18713841
  0.8473556  -1.123649   -0.28777435 -1.28485274 -1.28018641  1.45036888
 -0.74645275  0.13564591 -0.67898202  0.54699993 -2.23401189  0.00465996
  1.30263579 -0.23561403  1.13053334  2.61760926  0.27111971 -0.78277457
  1.0724988   1.25912011  0.63904119  1.84457207 -0.63772315 -1.22739208
 -1.28988767 -1.93595517 -1.5741235   2.16449213  0.27845252  0.02901214
 -1.48615122 -1.11524868 -0.27

## Define the Model

In [None]:
# The network maps a window of word embeddings (time steps x features) to the
# embedding of the next word.
inp = Input(shape=(X.shape[1], X.shape[2]))
x = LSTM(256)(inp)
x = Dropout(0.2)(x)
# The targets in `y` are real-valued Word2Vec vectors (with negative
# components), not class probabilities. A softmax output with categorical
# cross-entropy cannot fit them, so this is treated as a regression: linear
# output layer and a mean-squared-error loss.
output = Dense(y.shape[1], activation='linear')(x)

generative_model = Model(inputs=inp, outputs=output)

# Previously an optimizer object was created here but a different optimizer
# name was passed to compile(). Build the optimizer that is actually used
# (Adam with its default settings) and pass that object.
optimizer = keras.optimizers.Adam()
generative_model.compile(loss='mean_squared_error', optimizer=optimizer)

generative_model.summary()

## Train the model

In [None]:
# Fit on the full set of (window, next word) pairs.
generative_model.fit(
    x=X,
    y=y,
    batch_size=64,
    epochs=20,
)

## Make some predictions

In [None]:
# Embedding vectors of the generated words, in generation order.
generated_text = []
# Copy the seed window so that appending to it never touches dataX.
pattern = list(dataX[1000])

# Show the seed window as words rather than dumping 15 raw embedding vectors.
print(' '.join(model.wv.similar_by_vector(vec, topn=1)[0][0] for vec in pattern))

# generate words
for i in range(15):
    # The model expects [samples, time steps, features]; every time step is a
    # full embedding vector, not a single scalar.
    x = np.reshape(pattern, (1, seq_length, embedding_size))
    prediction = generative_model.predict(x, verbose=0)[0]
    # Greedy decoding: take the vocabulary word closest to the predicted vector.
    result = model.wv.similar_by_vector(prediction, topn=1)[0][0]
    #print(result)
    # Feed the chosen word's own embedding back in (not the raw prediction) so
    # the next input is a vector that belongs to a real word.
    word_vector = model.wv[result]
    pattern.append(word_vector)
    generated_text.append(word_vector)
    # Slide the window: drop the oldest time step.
    pattern = pattern[1:]
print("\nDone.")

In [None]:
# Every entry of `pattern` and `generated_text` is treated as an embedding
# vector and decoded to the nearest vocabulary word. Print the window's shape
# instead of the raw vectors to keep the output readable.
print(np.asarray(pattern).shape)
print(' '.join(model.wv.similar_by_vector(value, topn=1)[0][0] for value in pattern))
print(' '.join(model.wv.similar_by_vector(value, topn=1)[0][0] for value in generated_text))

## Attempt to implement temperature

In [None]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    The probabilities are re-weighted as ``softmax(log(preds) / temperature)``
    and a single index is drawn from the resulting distribution.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it. ``0`` is treated as the greedy limit and returns the
            arg-max of ``preds`` without any random draw.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of the scaling as temperature -> 0; avoids dividing by zero.
        return np.argmax(preds)
    # Zero probabilities legitimately map to -inf (weight 0 after exp), so
    # silence the divide-by-zero warning that np.log would emit for them.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0. Without this, a small temperature makes
    # every exp() underflow to 0 and the normalisation becomes 0/0 = NaN.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Embedding vectors of the generated words, in generation order.
generated_text = []
# Copy the seed window so that appending to it never touches dataX.
pattern = list(dataX[2000])

# generate words
for i in range(15):
    # The model expects [samples, time steps, features]; every time step is a
    # full embedding vector, not a single scalar.
    x = np.reshape(pattern, (1, seq_length, embedding_size))
    prediction = generative_model.predict(x, verbose=0)[0]
    # The network outputs an embedding, not a distribution over words. Take
    # the closest vocabulary words as candidates and turn their cosine
    # similarities into probabilities (softmax) so that sample() can draw one.
    candidates = model.wv.similar_by_vector(prediction, topn=10)
    similarities = np.array([score for _word, score in candidates])
    weights = np.exp(similarities - similarities.max())
    # NOTE(review): similarities lie in a narrow range, so at the default
    # temperature the draw is close to uniform over the candidates; pass a
    # smaller temperature to sample() to favour the nearest word.
    index = sample(weights / weights.sum())
    result = candidates[index][0]
    # Feed the chosen word's own embedding back in so the next input is a
    # vector that belongs to a real word.
    word_vector = model.wv[result]
    pattern.append(word_vector)
    generated_text.append(word_vector)
    # Slide the window: drop the oldest time step.
    pattern = pattern[1:]

# Decode the final window and the generated vectors back to words; print the
# window's shape instead of the raw vectors to keep the output readable.
print(np.asarray(pattern).shape)
print(' '.join(model.wv.similar_by_vector(value, topn=1)[0][0] for value in pattern))
print(' '.join(model.wv.similar_by_vector(value, topn=1)[0][0] for value in generated_text))