# Preparing the Data

In [76]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

In [3]:
input_df = pd.read_csv('DATA/train_from.txt', sep="\n", header=None)
target_df = pd.read_csv('DATA/train_to.txt', sep="\n", header=None)

In [4]:
input_df.head()

Unnamed: 0,0
0,That's the most retarded logic I've ever heard.
1,What's wrong with the send off he had last yea...
2,'Maybe next time...'
3,Depends are you in Minneapolis? :-)
4,"yeah I would do the same, Apple stuff can get ..."


In [5]:
target_df.head()

Unnamed: 0,0
0,/r/facepalm material
1,would you want your last moment to be a subpar...
2,'Damn right boy.'
3,I'm in Brooklyn Park. Good enough?
4,iTunes is one of the biggest reasons. It's jus...


In [6]:
df = pd.concat([input_df, target_df], axis=1)

In [7]:
df.columns = ['input', 'target']

In [8]:
df.head()

Unnamed: 0,input,target
0,That's the most retarded logic I've ever heard.,/r/facepalm material
1,What's wrong with the send off he had last yea...,would you want your last moment to be a subpar...
2,'Maybe next time...','Damn right boy.'
3,Depends are you in Minneapolis? :-),I'm in Brooklyn Park. Good enough?
4,"yeah I would do the same, Apple stuff can get ...",iTunes is one of the biggest reasons. It's jus...


In [9]:
df.sample(6)

Unnamed: 0,input,target
2908112,MOAR BOOTS MEANS MOAR SPEED RIGHT ?!,thank mr.skeltal
1318745,We have that already. Or do you mean a 'rankin...,"He means a top 4 listing system, so it can wor..."
1233983,Hectors anus is real,Can confirm.
1830795,this is confusing can we go back to goofy css ...,no
2079471,[](/dumbfabric)What about the guys in the Leth...,Is important to me! newlinechar newlinechar W...
664531,b-but the kittens BibleThump,ill share with HeavenAndHellD2arg under the so...


In [10]:
# change everything to string and lowercase
df.input=df.input.apply(lambda x: str(x).lower())
df.target=df.target.apply(lambda x: str(x).lower())

In [11]:
#remove all quotes
df.input=df.input.apply(lambda x: re.sub("'", '', x))
df.target=df.target.apply(lambda x: re.sub("'", '', x))

In [12]:
# create a set of all punctuation using the string import
special_chars = set(string.punctuation)
print(special_chars)

{'!', '_', '$', '>', '#', '"', '~', '%', ')', '{', '+', '|', ']', '.', '&', ':', '-', '*', '[', '`', '\\', '}', '@', '(', ',', '?', '^', ';', '<', '/', "'", '='}


In [13]:
# remove links before special chars
df.input = df.input.apply(lambda x: re.sub(r'^https?:\/\/.*[\r\n]*', '', x))
df.target = df.target.apply(lambda x: re.sub(r'^https?:\/\/.*[\r\n]*', '', x))

In [14]:
# remove the special characters
df.input = df.input.apply(lambda x: ''.join(char for char in x if char not in special_chars))
df.target = df.target.apply(lambda x: ''.join(char for char in x if char not in special_chars))

In [15]:
# remove digits
num_digits = str.maketrans('','', digits)
df.input = df.input.apply(lambda x: x.translate(num_digits))
df.target = df.target.apply(lambda x: x.translate(num_digits))

In [16]:
# remove newlinechar word
df.input = df.input.apply(lambda x: x.replace("newlinechar", " "))
df.target = df.target.apply(lambda x: x.replace("newlinechar", " "))

In [17]:
# remove extra spaces
df.input = df.input.apply(lambda x: x.strip())
df.target = df.target.apply(lambda x: x.strip())

In [18]:
# add start and end tags to the beginning of the target "translation"
df.target = df.target.apply(lambda x: 'START_ ' + x + ' _END')

In [78]:
df.head()

Unnamed: 0,input,target
0,thats the most retarded logic ive ever heard,START_ rfacepalm material _END
1,whats wrong with the send off he had last year...,START_ would you want your last moment to be a...
2,maybe next time,START_ damn right boy _END
3,depends are you in minneapolis,START_ im in brooklyn park good enough _END
4,yeah i would do the same apple stuff can get r...,START_ itunes is one of the biggest reasons it...


In [75]:
# Create vocab
all_input_words = set()
for line in df.input:
    for word in line.split():
        if word not in all_input_words:
            all_input_words.add(word)

all_target_words = set()
for line in df.target:
    for word in line.split():
        if word not in all_target_words:
            all_target_words.add(word)

# sort unique words
input_words = sorted(list(all_input_words))
target_words = sorted(list(all_target_words))

In [77]:
len(input_words)

921984

In [80]:
# find the max sentence length
input_length_list=[]
for l in df.input:
    input_length_list.append(len(l.split(' ')))
max_input_length = max(input_length_list)
print(f'Max length of the input sentence: {max_input_length}')

target_length_list = []
for l in df.target:
    target_length_list.append(len(l.split(' ')))
max_target_length = max(target_length_list)

print(f'Max length of the target sentence: {max_target_length}')

Max length of the input sentence: 73
Max length of the target sentence: 75


In [82]:
# Create word to index
input_to_ind = dict([(word, i+1) for i, word in enumerate(input_words)])
target_to_ind = dict([(word, i+1) for i, word in enumerate(target_words)])

In [83]:
# Create index to word
input_to_word = dict([(i, word) for word, i in input_to_ind.items()])

target_to_word = dict([(i, word) for word, i in target_to_ind.items()])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [85]:
df = shuffle(df)

In [86]:
# Create training and test split
X, y = df.input, df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [87]:
X_train.shape

(3340312,)

In [88]:
X_test.shape

(371146,)

In [89]:
num_encoder_tokens = len(input_words)
num_decoder_tokens = len(target_words) + 1

In [90]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_input_length),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_target_length),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_target_length, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_to_ind[word] 
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_to_ind[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        #print(word)
                        decoder_target_data[i, t - 1, target_to_ind[word]] = 1.
                    
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [91]:
# Set our necessary parameters
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 50
latent_dim=256

In [92]:
# Define an input sequence and process it
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(num_encoder_tokens, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)

encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# discard 'encoder outputs' and only keep the states
encoder_states = [state_h, state_c]

In [93]:
# Decoder
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [94]:
# Define the model that takes encoder and decoder input 
# to output decoder_outputs
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [97]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [102]:
#setup params to fit the model
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 100

In [None]:
# fit the model
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)

Epoch 1/100
