In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.model_selection import train_test_split

In [71]:
DATASET_PATH = "dataset/Assignment2aDataset.txt"
df = pd.read_csv(DATASET_PATH, names = ["source", "target"])
df["source"] = df["source"].apply(lambda x: x.strip()[1:-1].replace("/", "-"))
df["target"] = df["target"].apply(lambda x: x.strip()[1:-1])
df.sample(20)

Unnamed: 0,source,target
30691,29 august 1821,1821-08-29
5954,saturday 1 august 2048,2048-08-01
21163,1887 15 july,1887-07-15
8970,1745 4 feb,1745-02-04
315,sun 1810 25 february,1810-02-25
24784,sat 8 aug 1570,1570-08-08
34528,dec 28 1687,1687-12-28
8375,mar 22 1553,1553-03-22
32271,august 29 1768,1768-08-29
20244,1729 1 january,1730-01-01


In [72]:
df_train, df_test = train_test_split(df, random_state=42, test_size=0.1)
df_test

Unnamed: 0,source,target
32823,9 september 1943,1943-09-09
16298,may 23 1532,1532-05-23
28505,june 27 1908,1908-06-27
6689,july 24 1766,1766-07-24
26893,december 26 2008,2008-12-26
...,...,...
19536,tuesday june 4 1669,1669-06-04
13332,2034 25 august,2034-08-25
18523,sat 1983 29 january,1983-01-29
14835,sunday march 15 1857,1857-03-15


In [133]:
len(df_test)

4000

In [73]:
df["source"].apply(lambda x: x.replace("/", "-"))

0                 may 20 2034
1                  9 may 1630
2                  15-03-2014
3                 mar 16 1675
4                 jun 16 1640
                 ...         
39995        december 26 1900
39996             15 may 1828
39997    friday april 18 1851
39998            june 11 2070
39999         january 27 1712
Name: source, Length: 40000, dtype: object

In [81]:
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
from torchtext.utils import download_from_url, extract_archive
import io
from nltk.tokenize import word_tokenize
import re

In [82]:
en_tokenizer = get_tokenizer('spacy', language='en')

In [83]:
word_tokenize("10/09/2023")

['10/09/2023']

In [87]:
df["source"].sample(10).apply(lambda x: en_tokenizer(x))

22099          [sunday, 1574, 25, 08]
7321              [1548, 7, february]
2452             [december, 25, 1605]
29726        [thursday, 1866, 27, 09]
22291     [thursday, august, 6, 1863]
4793       [tue, 1963, 24, september]
35032                  [1975, 4, mar]
34344                 [july, 7, 1664]
32997              [january, 9, 1617]
33772    [monday, november, 17, 1681]
Name: source, dtype: object

In [90]:
df["target"].sample(10).apply(lambda x: en_tokenizer(x.strip()))

5688     [1966, -, 07, -, 13]
26568    [1804, -, 03, -, 20]
22744    [1715, -, 01, -, 25]
3295     [1832, -, 05, -, 26]
32054    [1999, -, 10, -, 14]
21620    [1618, -, 07, -, 13]
19823    [1746, -, 04, -, 16]
28567    [1994, -, 01, -, 04]
11668    [1646, -, 12, -, 01]
34859    [2063, -, 01, -, 26]
Name: target, dtype: object

In [95]:
# all have 5 length
df["target"].apply(lambda x: len(en_tokenizer(x.strip())) != 5).sum()

0

In [96]:
counter = Counter() # dict of {token: Freq}

In [97]:
for source in df["source"]:
    counter.update(en_tokenizer(source))

for source in df["target"]:
    counter.update(en_tokenizer(source))

In [98]:
voc = vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [99]:
voc["november"]

64

In [100]:
# padding
PAD_IDX = voc['<pad>']
BOS_IDX = voc['<bos>']
EOS_IDX = voc['<eos>']

In [101]:
data = []
for (source, target) in zip(df["source"], df["target"]):
    s_tensor_ = torch.tensor([voc[token] for token in en_tokenizer(source)])
    t_tensor_ = torch.tensor([voc[token] for token in en_tokenizer(target)])
    data.append((s_tensor_, t_tensor_))
data[0]

(tensor([4, 5, 6]), tensor([  6,  10, 478,  10,   5]))

In [102]:
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch import optim

In [103]:
class DateDataset(Dataset):
    def __init__(self, DATASET_PATH = "dataset/Assignment2aDataset.txt", split="train"):
        df = pd.read_csv(DATASET_PATH, names = ["source", "target"])
        df["source"] = df["source"].apply(lambda x: x.strip()[1:-1].replace("/", "-"))
        df["target"] = df["target"].apply(lambda x: x.strip()[1:-1])
        df_train, df_test = train_test_split(df, random_state=42, test_size=0.1)
        
        # tokenize
        en_tokenizer = get_tokenizer('spacy', language='en')   
        counter = Counter() # dict of {token: Freq}     
        for source in df["source"]:
            counter.update(en_tokenizer(source))

        for source in df["target"]:
            counter.update(en_tokenizer(source))
        
        voc = vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])    
        
        # create data
        if split == "train":
            self.data_df = df_train
        else:
            self.data_df = df_test
            
        data = []
        for (source, target) in zip(self.data_df["source"], self.data_df["target"]):
            s_tensor_ = torch.tensor([voc[token] for token in en_tokenizer(source)])
            t_tensor_ = torch.tensor([voc[token] for token in en_tokenizer(target)])
            data.append((s_tensor_, t_tensor_))
        
        self.voc = voc
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]
    
train_dataset = DateDataset(split="train")



In [120]:
BOS_IDX = train_dataset.voc["<bos>"]
EOS_IDX = train_dataset.voc["<eos>"]
PAD_IDX = train_dataset.voc["<pad>"]

def generate_batch(data_batch):
    s_batch, t_batch = [], []
    for (s_item, t_item) in data_batch:
        s_batch.append(torch.cat([torch.tensor([BOS_IDX]), s_item, torch.tensor([EOS_IDX])], dim=0))
        t_batch.append(torch.cat([torch.tensor([BOS_IDX]), t_item, torch.tensor([EOS_IDX])], dim=0))
        
    s_batch = pad_sequence(s_batch, padding_value=PAD_IDX)
    return s_batch.T, torch.stack(t_batch)

train_dataloader = DataLoader(train_dataset, batch_size=16, collate_fn=generate_batch)

In [121]:
s,t = next(iter(train_dataloader))

In [122]:
s.shape, t.shape

(torch.Size([16, 7]), torch.Size([16, 7]))

In [123]:
for i,each in enumerate(train_dataloader):
    s, t = each
    print(i,s.shape, t.shape)
    if i > 15:
        break

0 torch.Size([16, 7]) torch.Size([16, 7])
1 torch.Size([16, 7]) torch.Size([16, 7])
2 torch.Size([16, 7]) torch.Size([16, 7])
3 torch.Size([16, 6]) torch.Size([16, 7])
4 torch.Size([16, 7]) torch.Size([16, 7])
5 torch.Size([16, 7]) torch.Size([16, 7])
6 torch.Size([16, 7]) torch.Size([16, 7])
7 torch.Size([16, 7]) torch.Size([16, 7])
8 torch.Size([16, 6]) torch.Size([16, 7])
9 torch.Size([16, 7]) torch.Size([16, 7])
10 torch.Size([16, 7]) torch.Size([16, 7])
11 torch.Size([16, 7]) torch.Size([16, 7])
12 torch.Size([16, 6]) torch.Size([16, 7])
13 torch.Size([16, 7]) torch.Size([16, 7])
14 torch.Size([16, 7]) torch.Size([16, 7])
15 torch.Size([16, 7]) torch.Size([16, 7])
16 torch.Size([16, 7]) torch.Size([16, 7])


In [124]:
class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        # input.shape = (N, Batch, Hidden)
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

In [125]:
len(train_dataset.voc)

703

In [126]:
vocab_size = len(train_dataset.voc)
hidden_size = 64
encoder = EncoderRNN(vocab_size, hidden_size)
op = encoder(next(iter(train_dataloader))[0])
op[0].shape, op[1].shape #(D*num_layers, N, H)

(torch.Size([16, 7, 64]), torch.Size([1, 16, 64]))

In [127]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.MAX_LENGTH = 7
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, inputs):
        target , encoder_op, _ = inputs
        # target.shape = [B, L]
        # encoder_op.shape = [1, B, H)]
        
        i = 0
        decoder_logits= []
        decoder_ip_h = encoder_op
        decoder_ip_x = target[:,0]
        for i in range(self.MAX_LENGTH):
            
            # forward step
            decoder_ip_x = self.dropout(self.embedding(decoder_ip_x)) # [B, D]
            decoder_ip_x = torch.unsqueeze(decoder_ip_x, dim=1) # [B, 1, D]
            all_ops, op = self.gru(decoder_ip_x, decoder_ip_h) # [B, 1, H], [1, B, H]
            op = torch.squeeze(op) # [B,H]
            op = F.relu(op)
            logits = self.linear(op) # [B, output_size]
               
            decoder_logits.append(logits)
            
            
            decoder_ip_h = torch.permute(all_ops, (1,0,2)) # [1, B, H]  
            _, decoder_ip_x = torch.max(logits, dim=-1) # [B,1]
#             print(decoder_ip_x.shape, decoder_ip_h.shape)
        
        decoder_logits = torch.stack(decoder_logits, dim=1) # [B, 7, output_size]
        log_probs = F.log_softmax(decoder_logits, dim=-1) 
        return log_probs, decoder_logits, None

target = next(iter(train_dataloader))[1]
encoder_op = op[1]
inputs = (target, encoder_op, None)
decoder = DecoderRNN(hidden_size, vocab_size)
decoder_outputs, _ , _= decoder(inputs)
decoder_outputs.shape

torch.Size([16, 7, 703])

In [128]:
decoder_outputs.view(-1, decoder_outputs.size(-1)).shape

torch.Size([112, 703])

torch.Size([144])

In [59]:
t.shape

torch.Size([16, 9])

In [129]:
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.001)
criterion = nn.NLLLoss()

In [130]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder((target_tensor, encoder_hidden, encoder_outputs))

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.reshape(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [131]:
train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

2.021697245650821

In [181]:
torch.permute(torch.ones(8,2,6),(1,0,2)).shape

torch.Size([2, 8, 6])

In [161]:
vocab = train_dataset.voca

In [171]:
vocab.get_itos()[do[3]]

'1924'

In [154]:
torch.empty(16, 1, dtype=torch.long).fill_(0).shape

torch.Size([16, 1])