In [29]:
import numpy as np
import re
from unidecode import unidecode
import inflect
from tensorflow.keras.layers import RNN, Dense, Dropout, Conv1D, MaxPooling1D, GRUCell, Embedding, GRU, BatchNormalization, Bidirectional, Activation


In [14]:
_pad        = '_'
_EOS        = '~'
_punctuation = '!\'(),.:;? '
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

In [30]:
symbols = [_pad] + list(_EOS) + list(_special) + list(_punctuation) + list(_letters)

In [16]:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')

In [85]:
text = 'Hello~'
text = unidecode(text)
text = text.lower()

In [86]:
def _expand_dollars(m):
    match = m.group(1) #72.05
    parts = match.split('.')
    if len(parts) > 2:
        return match + ' dollars'
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
        cent_unit = 'cent' if cents == 1 else 'cents'
        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
        return '%s %s' % (dollars, dollar_unit)
    elif cents:
        cent_unit = 'cent' if cents == 1 else 'cents'
        return '%s %s' % (cents, cent_unit)
    else:
        return 'zero dollars'

In [87]:
dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
text = re.sub(dollars_re, _expand_dollars, text)

In [88]:
def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return 'two thousand'
        elif num > 2000 and num < 2010:
            return 'two thousand ' + infl.number_to_words(num % 100)
        elif num % 100 == 0:
            return infl.number_to_words(num // 100) + ' hundred'
        else:
            return infl.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
    else:
        return infl.number_to_words(num, andword='')

In [89]:
infl= inflect.engine()
number_re = re.compile(r'[0-9]+')
text = re.sub(number_re, _expand_number, text)

In [90]:
abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
  ('mrs', 'misess'),
  ('mr', 'mister'),
  ('dr', 'doctor'),
  ('st', 'saint'),
  ('co', 'company'),
  ('jr', 'junior'),
  ('maj', 'major'),
  ('gen', 'general'),
  ('drs', 'doctors'),
  ('rev', 'reverend'),
  ('lt', 'lieutenant'),
  ('hon', 'honorable'),
  ('sgt', 'sergeant'),
  ('capt', 'captain'),
  ('esq', 'esquire'),
  ('ltd', 'limited'),
  ('col', 'colonel'),
  ('ft', 'fort'),
]]

In [91]:
for regex, replacement in abbreviations:
    text = re.sub(regex, replacement, text)

In [92]:
text = re.sub(r'\s+', ' ', text)

In [93]:
sequence = []

for s in text:
    if s in symbols:
        sequence.append(_symbol_to_id[s])
len(sequence)

6

In [94]:
s = Embedding(70, 256)
s(np.asarray(sequence))

<tf.Tensor: shape=(6, 256), dtype=float32, numpy=
array([[ 0.04841233,  0.02438724,  0.00844621, ..., -0.02529392,
         0.03294437,  0.02162023],
       [ 0.00074171, -0.04167994, -0.01908375, ...,  0.00050094,
         0.02054746,  0.02941695],
       [-0.04032999,  0.02544551, -0.01891573, ...,  0.02133231,
        -0.01078285, -0.0046187 ],
       [-0.04032999,  0.02544551, -0.01891573, ...,  0.02133231,
        -0.01078285, -0.0046187 ],
       [ 0.03848548,  0.01612208, -0.04374418, ..., -0.0365021 ,
         0.03082151, -0.02531766],
       [-0.04648019, -0.0239135 , -0.01042945, ...,  0.02486279,
         0.00795569, -0.02529978]], dtype=float32)>

In [95]:
x = s(np.asarray(sequence))
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x, training = True)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x, training=True)

In [99]:
x

<tf.Tensor: shape=(6, 128), dtype=float32, numpy=
array([[1.76236797e-02, 9.04265046e-02, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 2.21114419e-02, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.39021292e-01, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.11758998e-02,
        1.09862596e-01, 0.00000000e+00, 6.74820095e-02, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        5.07188737e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 5.59933111e-02, 6.20079739e-03,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.81570144e-04,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.88247319e-02, 1.36824474e-02, 0.00000000e+00,
        0.0000

In [101]:
Conv1D(128, kernel_size=16, padding='same', activation='relu')(x)

ValueError: Input 0 of layer conv1d_2 is incompatible with the layer: : expected min_ndim=3, found ndim=2. Full shape received: [6, 128]