In [1]:
import numpy as np
import os
import tensorflow as tf
import re
print(tf.__version__)
assert(tf.__version__.startswith("2."))

from tensorflow import keras
from tensorflow.keras import layers, backend as K
from tensorflow.keras.models import Model
from tensorflow.keras import backend
assert(tf.__version__.startswith("2."))
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras import Input
from tensorflow.keras import Model, layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

# Tensorboard
from tensorflow.python.keras.callbacks import TensorBoard
import datetime

# Helper libraries
# from w266_common import utils, vocabulary, tf_embed_viz

# From sklearn 
from sklearn.model_selection import train_test_split

import time

2.4.0


In [35]:
path = '../GYAFC_Corpus/Entertainment_Music/' ##training data only

def read_file(path, file):
    sentences = []
    f = open(path+file, "r")
    for line in f:
        sentences.append(re.findall(r"[\w']+|[.,!?;-]", line.split("\n")[0]))
    # print(f.read())
    f.close()
    return sentences

informal = read_file(path, file='train/informal')
formal = read_file(path, file='train/formal')

i_dev = read_file(path, file='tune/informal')
f_dev = read_file(path, file='tune/formal.ref0')

In [22]:
informal[:3]

[['the',
  'movie',
  'The',
  'In',
  '-',
  'Laws',
  'not',
  'exactly',
  'a',
  'holiday',
  'movie',
  'but',
  'funny',
  'and',
  'good',
  '!'],
 ['that', 'page', 'did', 'not', 'give', 'me', 'viroses', 'i', 'think'],
 ['of',
  'corse',
  'i',
  'be',
  'wachin',
  'it',
  'evry',
  'day',
  ',',
  'my',
  'fav',
  'charachter',
  'is',
  'Inuasha']]

In [24]:
formal[:3]

[['The',
  'In',
  '-',
  'Laws',
  'movie',
  "isn't",
  'a',
  'holiday',
  'movie',
  ',',
  'but',
  "it's",
  'okay',
  '.'],
 ['I', "don't", 'think', 'that', 'page', 'gave', 'me', 'viruses', '.'],
 ['I',
  'watch',
  'it',
  'everyday',
  ',',
  'my',
  'favorite',
  'charachter',
  'is',
  'Inuasha',
  '.']]

In [14]:
def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    #word = word.lower()
    if digits:
        if (wordset != None) and (word in wordset): return word
        word = canonicalize_digits(word) # try to canonicalize numbers
    if (wordset == None) or (word in wordset):
        return word
    else:
        return constants.UNK_TOKEN

def canonicalize_words(words, **kw):
    return [canonicalize_word(word, **kw) for word in words]

In [25]:
canoninformal = np.array([['<s>'] + [canonicalize_word(word) for word in sentence] + ['<s>'] for sentence in informal ])
print('An example of pre-standardized informal sentence:\n  {}'.format(informal[0]))
print('\n\nand after standardization:\n  {}'.format(canoninformal[0]))

An example of pre-standardized informal sentence:
  ['the', 'movie', 'The', 'In', '-', 'Laws', 'not', 'exactly', 'a', 'holiday', 'movie', 'but', 'funny', 'and', 'good', '!']


and after standardization:
  ['<s>', 'the', 'movie', 'The', 'In', '-', 'Laws', 'not', 'exactly', 'a', 'holiday', 'movie', 'but', 'funny', 'and', 'good', '!', '<s>']


  """Entry point for launching an IPython kernel.


In [26]:
canonformal = np.array([['<s>'] + [canonicalize_word(word) for word in sentence] + ['<s>'] for sentence in formal ])
print('An example of pre-standardized informal sentence:\n  {}'.format(formal[0]))
print('\n\nand after standardization:\n  {}'.format(canonformal[0]))

An example of pre-standardized informal sentence:
  ['The', 'In', '-', 'Laws', 'movie', "isn't", 'a', 'holiday', 'movie', ',', 'but', "it's", 'okay', '.']


and after standardization:
  ['<s>', 'The', 'In', '-', 'Laws', 'movie', "isn't", 'a', 'holiday', 'movie', ',', 'but', "it's", 'okay', '.', '<s>']


  """Entry point for launching an IPython kernel.


In [36]:
canon_idev = np.array([['<s>'] + [canonicalize_word(word) for word in sentence] + ['<s>'] for sentence in i_dev ])
canon_fdev = np.array([['<s>'] + [canonicalize_word(word) for word in sentence] + ['<s>'] for sentence in f_dev ])

  """Entry point for launching an IPython kernel.
  


In [47]:
len(canon_idev), len(canon_fdev)

(2877, 2877)

In [41]:
# Size of corpus
corpus_i = np.concatenate([canoninformal ,canon_idev])
print('Length of corpus is {} sentences'.format(len(corpus_i)))

# Convert to single dimension of words 
canonwords = [ word for sentence in canoninformal for word in sentence]
canonwords_f = [ word for sentence in canonformal for word in sentence]
canon_i = [ word for sentence in canon_idev for word in sentence]
canon_f = [ word for sentence in canon_fdev for word in sentence]

print('Length of words in informal corpus is {}'.format(len(corpus_i)))

Length of corpus is 55472 sentences
Length of words in informal corpus is 55472


In [46]:
len(canon_i), len(canon_f)

(40697, 40960)

In [42]:
# Create the string lookup object using the 10000 most-popular words
words_to_ids = StringLookup(max_tokens = 10000)

# Process the input corpus words, creating a vocabulary / id lookup:
words_to_ids.adapt(canonwords + canonwords_f + canon_i + canon_f)

# Get vocabulary size
V = len(words_to_ids.get_vocabulary())
print('Extracted vocabulary length is {}'.format(V))

# Also create an object to convert from ids back to words from the same vocabulary:
ids_to_words = StringLookup(vocabulary=words_to_ids.get_vocabulary(), invert=True)

Extracted vocabulary length is 10000


In [43]:
# Create training / test sets of word ids 
#corpus_ids = words_to_ids(canonwords).numpy()

# Split into train (80%) dev (10%) test (10%)
#train_ids, dev_test_ids = train_test_split(corpus_ids, train_size=0.8, random_state=42, shuffle=False)
#dev_ids, test_ids = train_test_split(dev_test_ids, train_size=0.5, random_state=42, shuffle=False)


x_ids_train = words_to_ids(canonwords).numpy()
y_ids_train = words_to_ids(canonwords_f).numpy()

# inputs of length max_time words
max_time = 25   # length of words per sequence
buffer_size = 100
batch_size = 100

ids_labels_dataset = tf.data.Dataset.from_tensor_slices((x_ids_train, y_ids_train))
# examples_per_epoch = len(corpus_ids)//(max_time+1)

# Create a train sequence dimension for words.  
sequences_train = ids_labels_dataset.batch(max_time, drop_remainder=True).shuffle(buffer_size).batch(
    batch_size, drop_remainder=True)

# Create a dataset for validating during fit
x_dev = words_to_ids(canon_i).numpy()
y_dev = words_to_ids(canon_f).numpy()
ids_labels_validation = tf.data.Dataset.from_tensor_slices((x_dev, y_dev))
sequences_val = ids_labels_validation.batch(max_time, drop_remainder=True).shuffle(buffer_size).batch(
    batch_size, drop_remainder=True)

ValueError: Dimensions 747361 and 757081 are not compatible