In [1]:
from faker import Faker
from babel.dates import format_date
import random
fake = Faker()
Faker.seed(123)
random.seed(321)

# 各种日期格式
FORMATS = ['short',
           'medium',
           'long',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'd MMM YYY',
           'd MMMM YYY',
           'dd MMM YYY',
           'd MMM, YYY',
           'd MMMM, YYY',
           'dd, MMM YYY',
           'd MM YY',
           'd MMMM YYY',
           'MMMM d YYY',
           'MMMM d, YYY',
           'dd.MM.YY']

# LOCALES = ['en_US']

In [2]:
def load_date():
    # 加载一些日期数据
    dt = fake.date_object()
    human_readable = format_date(dt, format=random.choice(FORMATS),
                                 locale='en_US')
    # 使用随机选取的格式，生成日期
    human_readable = human_readable.lower().replace(',','')
    machine_readable = dt.isoformat()
    return human_readable, machine_readable, dt

test_date = load_date()

In [3]:
from tqdm import tqdm # 显示进度条
def load_dateset(num_of_data):
    human_vocab = set()
    machine_vocab = set()
    dataset = []
    Tx = 30 # 日期最大长度
    for i in tqdm(range(num_of_data)):
        h, m, _ = load_date()
        if h is not None:
            dataset.append((h, m))
            human_vocab.update(tuple(h))
            machine_vocab.update(tuple(m))
    human = dict(zip(sorted(human_vocab)+['<unk>', '<pad>'],
                     list(range(len(human_vocab)+2))))
    # x 字符：idx 的映射
    inv_machine = dict(enumerate(sorted(machine_vocab)))
    # idx ： y 字符
    machine = {v : k for k, v in inv_machine.items()}
    # y 字符 ： idx
    return dataset, human, machine, inv_machine

m = 10000
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dateset(m)

100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 16683.11it/s]


In [4]:
import numpy as np
from keras.utils import to_categorical

def string_to_int(string, length, vocab):
    string = string.lower().replace(',','')
    if len(string) > length: # 长了，截断
        string = string[:length]
    rep = list(map(lambda x : vocab.get(x, '<unk>'), string))
    # 对string里每个char 使用 匿名函数 获取映射的id，没有的话，使用unk的id，map返回迭代器，转成list
    if len(string) < length:
        rep += [vocab['<pad>']]*(length-len(string))
        # 长度不够，加上 pad 的 id
    return rep # 返回 [ids,...]

def process_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    X,Y = zip(*dataset)
    print("处理前 X：{}".format(X))
    print("处理前 Y：{}".format(Y))
    X = np.array([string_to_int(date, Tx, human_vocab) for date in X])
    Y = [string_to_int(date, Ty, machine_vocab) for date in Y]
    print("处理后 X的shape：{}".format(X.shape))
    print("处理后 Y: {}".format(Y))

    Xoh = np.array(list(map(lambda x : to_categorical(x, num_classes=len(human_vocab)), X)))
    Yoh = np.array(list(map(lambda x : to_categorical(x, num_classes=len(machine_vocab)), Y)))
    return X, np.array(Y), Xoh, Yoh
Tx = 30
Ty = 10
X, Y, Xoh, Yoh = process_data(dataset, human_vocab, machine_vocab, Tx, Ty)

处理前 X：('sunday march 20 1988', 'monday december 8 1975', 'thursday september 18 1997', '21 february 1988', '30 april 1977', 'sunday august 6 1972', '20.10.95', 'wednesday june 28 2006', '7 04 08', '9 august 1992', '11 mar 1993', 'july 14 1973', '12 nov 1980', '11 march 1979', 'saturday december 12 1992', '3/4/08', '18 sep 1992', 'sunday october 1 2017', '7 sep 1986', 'feb 22 1981', 'wednesday february 11 1970', '09 sep 1999', '18 dec 1975', 'wednesday september 1 2010', 'september 12 1995', 'thursday october 3 1974', 'saturday june 13 1970', 'thursday june 20 1991', 'october 25 2019', '5 jul 2000', '11 december 1976', '26 dec 1972', 'saturday april 24 1976', 'may 26 2015', 'tuesday september 4 1979', '4 august 1978', '12.06.71', 'thursday november 9 1989', 'april 22 1999', 'thursday january 8 2009', '15.06.02', '21 january 1988', 'saturday december 8 2001', 'sunday july 2 1972', '6 october 1990', 'may 16 1993', 'saturday july 2 2005', 'wednesday november 13 2002', '29 january 1984', 't

In [5]:
print(X.shape)
print(Y.shape)
print(Xoh.shape)
print(Yoh.shape)

(10000, 30)
(10000, 10)
(10000, 30, 37)
(10000, 10, 11)


In [9]:
from keras import backend as K
def softmax(x, axis=1):
    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    elif ndim > 2:
        e = K.exp(x - K.max(x, axis=axis, keepdims=True))
        s = K.sum(e, axis=axis, keepdims=True)
        return e/s
    else:
        raise ValueError('维度不对，不能是1维')

# prob = softmax(np.array([[1]]))

In [10]:
from keras.layers import RepeatVector, Concatenate, Dense, Activation, Dot
repeator = RepeatVector(Tx)
concator = Concatenate(axis=-1)
densor1 = Dense(10, activation='tanh')
densor2 = Dense(1, activation='relu')
activator = Activation('softmax', name='attention_weights')
dotor = Dot(axes=1)
