In [1]:
from faker import Faker
from babel.dates import format_date
import random
fake = Faker()
fake.seed(123)
random.seed(321)

# 各种日期格式
FORMATS = ['short',
           'medium',
           'long',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'd MMM YYY',
           'd MMMM YYY',
           'dd MMM YYY',
           'd MMM, YYY',
           'd MMMM, YYY',
           'dd, MMM YYY',
           'd MM YY',
           'd MMMM YYY',
           'MMMM d YYY',
           'MMMM d, YYY',
           'dd.MM.YY']

# LOCALES = ['en_US']

In [4]:
def load_date():
    # 加载一些日期数据
    dt = fake.date_object()
    human_readable = format_date(dt, format=random.choice(FORMATS),
                                 locale='en_US')
    # 使用随机选取的格式，生成日期
    human_readable = human_readable.lower().replace(',','')
    machine_readable = dt.isoformat()
    return human_readable, machine_readable, dt

test_date = load_date()

In [8]:
from tqdm import tqdm # 显示进度条
def load_dateset(num_of_data):
    human_vocab = set()
    machine_vocab = set()
    dataset = []
    Tx = 30 # 日期最大长度
    for i in tqdm(range(num_of_data)):
        h, m, _ = load_date()
        if h is not None:
            dataset.append((h, m))
            human_vocab.update(tuple(h))
            machine_vocab.update(tuple(m))
    human = dict(zip(sorted(human_vocab)+['<unk>', '<pad>'],
                     list(range(len(human_vocab)+2))))
    # x 字符：idx 的映射
    inv_machine = dict(enumerate(sorted(machine_vocab)))
    # idx ： y 字符
    machine = {v : k for k, v in inv_machine.items()}
    # y 字符 ： idx
    return dataset, human, machine, inv_machine

m = 10000
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dateset(m)

100%|██████████| 10000/10000 [00:02<00:00, 4912.52it/s]


In [15]:
import numpy as np
from keras.utils import to_categorical

def string_to_int(string, length, vocab):
    string = string.lower().replace(',','')
    if len(string) > length: # 长了，截断
        string = string[:length]
    rep = list(map(lambda x : vocab.get(x, '<unk>'), string))
    # 对string里每个char 使用 匿名函数 获取映射的id，没有的话，使用unk的id，map返回迭代器，转成list
    if len(string) < length:
        rep += [vocab['<pad>']]*(length-len(string))
        # 长度不够，加上 pad 的 id
    return rep # 返回 [ids,...]

def process_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    X,Y = zip(*dataset)
    print("处理前 X：{}".format(X))
    print("处理前 Y：{}".format(Y))
    X = np.array([string_to_int(date, Tx, human_vocab) for date in X])
    Y = [string_to_int(date, Ty, machine_vocab) for date in Y]
    print("处理后 X的shape：{}".format(X.shape))
    print("处理后 Y: {}".format(Y))

    Xoh = np.array(list(map(lambda x : to_categorical(x, num_classes=len(human_vocab)), X)))
    Yoh = np.array(list(map(lambda x : to_categorical(x, num_classes=len(machine_vocab)), Y)))
    return X, np.array(Y), Xoh, Yoh
Tx = 30
Ty = 10
X, Y, Xoh, Yoh = process_data(dataset, human_vocab, machine_vocab, Tx, Ty)

处理前 X：('21.10.88', '5/19/15', 'january 3 2001', '6 june 1977', 'monday march 15 2004', 'tuesday september 25 2012', '12 oct 1999', 'feb 15 1996', 'sunday march 26 2017', 'saturday november 21 2020', '20 august 2001', 'thursday march 28 2019', '9 july 1983', 'wednesday september 4 2013', '8 december 1975', 'monday may 30 1983', 'friday march 15 1974', 'saturday december 15 2001', 'october 14 2004', '6 august 2020', 'sunday april 20 2008', '6 february 2019', 'aug 9 1971', 'sunday august 6 1978', '2/14/97', 'may 21 1989', '31 aug 1983', 'sunday june 14 1970', 'november 30 1987', 'wednesday november 24 1976', 'thursday march 13 1997', 'wednesday september 12 1979', '10 february 1991', '9 05 72', '09.07.04', '10.11.07', 'friday april 9 2010', '30 11 03', '17 september 1998', 'may 20 1993', '26 04 96', 'january 3 1992', '12.08.00', '15 feb 1989', 'july 26 2016', 'wednesday june 1 1977', '22 may 2016', 'thursday july 12 2007', 'thursday january 5 2017', 'sunday july 20 2008', '2 05 76', 'frid