<a href="https://colab.research.google.com/github/henouji/hnj/blob/master/South%20Park%20Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and Libraries 

In [1]:
import tensorflow as tf 
import numpy as np
from keras.models import Model
from keras.layers import CuDNNLSTM, Input, Dense, LSTM, Embedding, TimeDistributed, Flatten, Dropout
from keras.optimizers import RMSprop
import pandas as pd 
from google.colab import files 
import io 

Using TensorFlow backend.


# File upload and data reading

In [2]:
uploaded = files.upload()

Saving sPark.csv to sPark.csv


In [0]:
df = pd.read_csv(io.BytesIO(uploaded['sPark.csv']))

# Data Preparation and Input Data Initialization for Character Level Processing


> Turn the sentences into 3 Numpy arrays, encoder_input_data, decoder_input_data, decoder_target_data:

> * encoder_input_data is a 3D array of shape (num_pairs, max_english_sentence_length, num_english_characters) containing a one-hot vectorization of the English sentences
* decoder_input_data is a 3D array of shape (num_pairs, max_french_sentence_length, num_french_characters) containing a one-hot vectorization of the French sentences
* decoder_target_data is the same as decoder_input_data but offset by one timestep. decoder_target_data[:, t, :] will be the same as decoder_input_data[:, t + 1, :]



In [4]:
df.head()

Unnamed: 0,Season,Episode,Character,Line
0,10,1,Stan,"You guys, you guys! Chef is going away. \n"
1,10,1,Kyle,Going away? For how long?\n
2,10,1,Stan,Forever.\n
3,10,1,Chef,I'm sorry boys.\n
4,10,1,Stan,"Chef said he's been bored, so he joining a gro..."


In [0]:
dataset_array = df.Line.values 

In [7]:
# Limiting Dataset
dataset_array = dataset_array[:len(dataset_array)//10]
print("Total sentences: ",len(dataset_array))

Total sentences:  7089


In [0]:
en_sample = dataset_array[:len(dataset_array)-1]
de_sample = dataset_array[1 :len(dataset_array)]
for i, sent in enumerate(de_sample):
  de_sample[i] = '\t ' + sent 

In [9]:
en_num_pairs = len(en_sample)
de_num_pairs = len(de_sample)

print("Number of Pairs : [",en_num_pairs,",",de_num_pairs,"]")

Number of Pairs : [ 7088 , 7088 ]


In [10]:
max_en_sentence = max([len(x) for x in en_sample])
max_de_sentence = max([len(x) for x in de_sample])

print("Max Sentence: [",max_en_sentence,",",max_de_sentence,"]")

Max Sentence: [ 1059 , 1059 ]


In [0]:
en_char = set()
de_char = set()

for i in en_sample:
  for x in i:
    if x not in en_char:
      en_char.add(x)
for i in de_sample:
  for x in i:
    if x not in de_char:
      de_char.add(x)

In [12]:
num_en_char = len(en_char)
num_de_char = len(de_char)

print("Number of Characters: [",num_en_char,":",num_de_char,"]")

Number of Characters: [ 89 : 89 ]


In [13]:
# Array Declaration
en_data_input = np.zeros((en_num_pairs, max_en_sentence, num_en_char), dtype='float32')
de_data_input = np.zeros((de_num_pairs, max_de_sentence, num_de_char), dtype='float32')
decode_target = np.zeros((de_num_pairs, max_de_sentence, num_de_char), dtype='float32')
print("Shapes: [",en_data_input.shape,":",en_data_input.shape,"] Target [",decode_target.shape,"]")

Shapes: [ (7088, 1059, 89) : (7088, 1059, 89) ] Target [ (7088, 1059, 89) ]


In [0]:
# For Vectorizing 
en_char_int = dict()
en_int_char = dict()
de_char_int = dict()
de_int_char = dict()

for i, char in enumerate(en_char):
  en_char_int[char] = i
  en_int_char[i] = char
for i, char in enumerate(de_char):
  de_char_int[char] = i
  de_int_char[i] = char

In [0]:
# One Hot Vectorizing
for i, (en_, de_) in enumerate(zip(en_sample, de_sample)):
  for char, char_en in enumerate(en_):
    en_data_input[i, char, en_char_int[char_en]] = 1 
  for char, char_de in enumerate(de_):
    de_data_input[i, char, de_char_int[char_de]] = 1
    if char > 0:
      decode_target[i, char-1, de_char_int[char_de]] = 1 

# Model Building

* Input : encoded data, decoded data
* Output : decode target

In [0]:
# Training Variables 
batch_size = 64  
vec_len = 300
epochs = 50
dropout_rate = 0.2 
latent_dim = 128
num_samples = 4000 

In [18]:

en_inputs = Input(shape=(None, num_en_char))

encoder = CuDNNLSTM(latent_dim, return_sequences=True)(en_inputs)

encoder2 = CuDNNLSTM(latent_dim, return_state=True)

encoder_outputs, state_h, state_c = encoder2(encoder)

encoder_states = [state_h, state_c]
de_inputs = Input(shape=(None, num_de_char))

decoder_lstm = CuDNNLSTM(latent_dim, return_sequences=True)

decoder_lstm2 = CuDNNLSTM(latent_dim, return_sequences=True, return_state=True)

decoder_layer1 = decoder_lstm(de_inputs, initial_state=encoder_states)

decoder_outputs, _, _ = decoder_lstm2(decoder_layer1)

decoder_dense = Dense(num_de_char, activation='softmax')

decoder_outputs = decoder_dense(decoder_outputs)

# Combine Encoder and Decoder Model
model = Model([en_inputs, de_inputs], decoder_outputs)


Instructions for updating:
Colocations handled automatically by placer.


In [20]:
# Training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([en_data_input, de_data_input], decode_target, batch_size=batch_size,
         epochs=epochs, validation_split=0.2)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 5670 samples, validate on 1418 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fab6ba962e8>

In [21]:
# Save model
model.save('s2s.h5')

  '. They will not be included '


In [22]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None, 89)     0                                            
__________________________________________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)        (None, None, 128)    112128      input_2[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, None, 89)     0                                            
__________________________________________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)        [(None, 128), (None, 132096      cu_dnnlstm_1[0][0]               
__________________________________________________________________________________________________
cu_dnnlstm

In [0]:
# Sampling model = rebuilding model using only the necessary parts 
encoder_models = Model(en_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_output = decoder_lstm(de_inputs, initial_state=decoder_states_inputs)
decoder_outputs, state_h, state_c = decoder_lstm2(decoder_output)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([de_inputs] + decoder_states_inputs,
                     [decoder_outputs] + decoder_states)

# Model Prediction Testing 

In [0]:
def decode_sequence(input_seq):
  states_value = encoder_models.predict(input_seq)
  
  target_seq = np.zeros((1, 1, num_de_char))
  target_seq[0, 0, de_char_int['\t']] = 1
  
  stop_condition = False 
  decoded_sentence = ''
  
  while not stop_condition:
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    
    sampled_char_index = np.argmax(output_tokens[0, -1, :])
    sampled_char = de_int_char[sampled_char_index]
    decoded_sentence += sampled_char
    
    if(sampled_char == '\n' or len(decoded_sentence) > max_de_sentence):
      stop_condition = True
      
    target_seq = np.zeros((1, 1, num_de_char))
    target_seq[0, 0, sampled_char_index] = 1
    
    states_value = [h, c]
    
  return decoded_sentence

In [0]:
def sent_seq(sentence):
  input_seq = np.zeros((1, max_en_sentence, num_en_char))
  for i, letter in enumerate(sentence):
    input_seq[0, i, en_char_int[letter]]
  return input_seq

In [26]:
for seq_index in range(10):
  input_seq = en_data_input[seq_index : seq_index + 1]
  decoded_sentence = decode_sequence(input_seq)
  
  print('-')
  print('Input:', en_sample[seq_index])
  print('Decode: ', decoded_sentence)

-
Input: You guys, you guys! Chef is going away. 

Decode:   X H!

-
Input: 	 Going away? For how long?

Decode:   X H!

-
Input: 	 Forever.

Decode:   X H!

-
Input: 	 I'm sorry boys.

Decode:   X H!

-
Input: 	 Chef said he's been bored, so he joining a group called the Super Adventure Club. 

Decode:   X H!

-
Input: 	 Wow!

Decode:   X H!

-
Input: 	 Chef?? What kind of questions do you think adventuring around the world is gonna answer?!

Decode:   X H!

-
Input: 	 What's the meaning of life? Why are we here?

Decode:   X H!

-
Input: 	 I hope you're making the right choice.

Decode:   X H!

-
Input: 	 I'm gonna miss him.  I'm gonna miss Chef and I...and I don't know how to tell him! 

Decode:   X H!



In [27]:
query = input(": ")
input_seq = sent_seq(query)
print(">", decode_sequence(input_seq))

: Hello
>  X H!

