In [1]:
import pandas as pd
import numpy as np
import re
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


# Read Input

In [4]:
def readInputFile(fileName):
    df = pd.read_csv(fileName, encoding = "ISO-8859-1", header=None).iloc[:, 5]
    
    return df

In [5]:
df = readInputFile("training_data.csv")
print(df.head())

0    @switchfoot http://twitpic.com/2y1zl - Awww, t...
1    is upset that he can't update his Facebook by ...
2    @Kenichan I dived many times for the ball. Man...
3      my whole body feels itchy and like its on fire 
4    @nationwideclass no, it's not behaving at all....
Name: 5, dtype: object


# Preprocessing Data

In [6]:
# process a single tweet
def preprocess(tweet):
    
    # lowercase all the tweets
    tweet = tweet.lower()
    
    # remove urls
    tweet = re.sub('((www\.\w+\.\w+) | (https?://\w+\.\w+))', '', tweet)
    
    #remove emails
    tweet = re.sub('(\w+)\s*(?:@|&#x40\.|\s+[aA][tT]\s+|\s*\(\s*[aA][tT]\s*\)\s*)\s*([\w\s\.]+)\s*\.\s*([eE][dD][uU]|[cC][oO][mM]|[gG][oO][vV]|[oO][rR][gG])', '', tweet)
    
    # remove hashtag from the front of the topic
    tweet = re.sub('#(\w+)', r'\1', tweet)
    
    # remove @users
    tweet = re.sub('\s*@\w+\s*', '', tweet)
    
    # remove multiple spaces with only one space
    tweet = re.sub('\s+', ' ', tweet)
    
    return tweet 

In [7]:
preprocessed_tweets = df.apply(preprocess).values

In [8]:
preprocessed_tweets[:5]

array([ "/2y1zl - awww, that's a bummer. you shoulda got david carr of third day to do it. ;d",
       "is upset that he can't update his facebook by texting it... and might cry as a result school today also. blah!",
       'i dived many times for the ball. managed to save 50% the rest go out of bounds',
       'my whole body feels itchy and like its on fire ',
       "no, it's not behaving at all. i'm mad. why am i here? because i can't see you all over there. "], dtype=object)

In [36]:
# # max number of words in the vocabulary
# max_features = 450000

# # max length for the tweets
# maxlen = 70

# tokenizer = Tokenizer(num_words=max_features)

# # fit on the tweets vocabulary
# tokenizer.fit_on_texts(preprocessed_tweets)

# # convert from string to tokens
# tokenized_tweets = tokenizer.texts_to_sequences(preprocessed_tweets)

# # keep all the tweets of the same length
# tweets_train = pad_sequences(tokenized_tweets, maxlen=maxlen)

# filter = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'

# Grab unique characters from the input data

In [9]:
def getChars():

    # change the input data into one big string and
    # count the total number of characters 
    chars = ' '.join(preprocessed_tweets).lower()
    
    # get the unique characters from the text
    unique_chars = sorted(list(set(' '.join(preprocessed_tweets).lower())))
    unique_chars[:10]
    
    no_chars = len(chars)
    no_unique_chars = len(unique_chars)
    print("Total number of characters: ", no_chars)
    print("Total number of unique characters: ", no_unique_chars)
    
    
    # Assign a token to each character
    token_dict_char_int = {}
    token_dict_int_char = {}
    for index, char in enumerate(unique_chars):
        token_dict_char_int[char] = index    
        token_dict_int_char[index] = char
    
    return chars, unique_chars, no_chars, no_unique_chars, token_dict_char_int, token_dict_int_char

In [10]:
chars, unique_chars, no_chars, no_unique_chars, token_dict_char_int, token_dict_int_char = getChars()

Total number of characters:  108642445
Total number of unique characters:  150


# Prepare input and output data

In [13]:
def prepareInputData(no_chars, token_dict, chars):

    window_size = 100
    
    train_x = []
    train_y = []
    
    for i in range(0, no_chars - window_size, 1):
        x = chars[i:i+window_size]
        y = chars[i+window_size]
        
        input_sequence = []
        for character in x:
            input_sequence.append(token_dict[character])
        train_x.append(input_sequence)
        train_y.append(token_dict[y])
        
    # reshape training data (samples, time steps, features)
    x = np.reshape(train_x, (len(train_x), window_size, 1))

    # normalize the training data
    x = x/float(no_unique_chars)

    # tranform the output using one hot encoding
    y = np_utils.to_categorical(train_y)
        
        
    return train_x, train_y, window_size, x, y

In [14]:
# train_x, train_y, window_size, x, y = prepareInputData(no_chars, token_dict_char_int, chars)
train_x, train_y, window_size, x, y = prepareInputData(200000, token_dict_char_int, chars[:200000])
print(len(train_x), len(train_y))

199900 199900


# Define LSTM Model

In [15]:
def getModel():
    
    file_name = 'weights.hdf5'
    
    model = Sequential()
    # add LSTM layer
    model.add(LSTM(256, input_shape=(x.shape[1], x.shape[2])))
    model.add(Dropout(0.3))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    
    # record all of the network weights each time loss is improved at the end of the epoch
    checkpoint = ModelCheckpoint(file_name, monitor='loss', verbose='1', save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    
    return model, callbacks_list, file_name

# Train LSTM Model

In [20]:
def fitModel(x, y):
    epochs = 20
    batch_size = 128
    
    # get the model
    model,callbacks_list, file_name = getModel()
    
    # fit the model
    model.fit(x, y, epochs=epochs, batch_size=batch_size, callbacks=callbacks_list)

    return model, file_name

In [21]:
fitted_model, file_name = fitModel(x, y)

Epoch 1/2


ResourceExhaustedError: OOM when allocating tensor with shape[64,100,1024]
	 [[Node: lstm_3/concat_1 = ConcatV2[N=4, T=DT_FLOAT, Tidx=DT_INT32, _device="/job:localhost/replica:0/task:0/gpu:0"](lstm_3/Reshape_1, lstm_3/Reshape_3, lstm_3/Reshape_5, lstm_3/Reshape_7, lstm_3/concat_1/axis)]]
	 [[Node: mul_34/_63 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_1481_mul_34", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op 'lstm_3/concat_1', defined at:
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2802, in run_ast_nodes
    if self.run_code(code, result):
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-21-c1b8b576fb30>", line 1, in <module>
    fitted_model, file_name = fitModel(x, y)
  File "<ipython-input-20-2f097aba9dd0>", line 6, in fitModel
    model,callbacks_list, file_name = getModel()
  File "<ipython-input-15-349d3a079436>", line 7, in getModel
    model.add(LSTM(256, input_shape=(x.shape[1], x.shape[2])))
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/keras/models.py", line 443, in add
    layer(x)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/keras/layers/recurrent.py", line 262, in __call__
    return super(Recurrent, self).__call__(inputs, **kwargs)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/keras/engine/topology.py", line 596, in __call__
    output = self.call(inputs, **kwargs)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/keras/layers/recurrent.py", line 333, in call
    preprocessed_input = self.preprocess_input(inputs, training=None)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/keras/layers/recurrent.py", line 1087, in preprocess_input
    return K.concatenate([x_i, x_f, x_c, x_o], axis=2)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 1721, in concatenate
    return tf.concat([to_dense(x) for x in tensors], axis)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 1034, in concat
    name=name)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 519, in _concat_v2
    name=name)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
    op_def=op_def)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2327, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/sw/hprc/sw/Anaconda/3-5.0.0.1-new/envs/keras-gpu-2.0.5/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1226, in __init__
    self._traceback = _extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[64,100,1024]
	 [[Node: lstm_3/concat_1 = ConcatV2[N=4, T=DT_FLOAT, Tidx=DT_INT32, _device="/job:localhost/replica:0/task:0/gpu:0"](lstm_3/Reshape_1, lstm_3/Reshape_3, lstm_3/Reshape_5, lstm_3/Reshape_7, lstm_3/concat_1/axis)]]
	 [[Node: mul_34/_63 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_1481_mul_34", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]


# Genertae Text with LSTM Model

In [None]:
def generateText(file_name, model, train_x, sequence_length):
    model.load_weights(filename)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    rand_start = np.random.randint(0, len(train_x) - 1)
    starting_seq = train_x[rand_start]
    
    print("Seed: ")
    print("\"", ''.join([token_dict_int_char[val] for cal in starting_seq]))
    
    
    # generate characters
    for i in range(sequence_length):
        x = np.reshape(starting_seq, (1, len(starting_seq), 1))
        x = x/float(no_unique_chars)
        
        pred = model.predict(x, verbose=0)
        index = np.argmax(pred)
        final_output = token_dict_int_char[index]
        x_in = [token_dict_int_char[val] for val in starting_seq]
        sys.stdout.write(final_output)
        starting_seq.append(index)
        starting_seq = starting_seq[1:len(starting_seq)]
        
    print("done")

In [None]:
generateText(file_name, fitted_model, train_x, 300)