In [4]:
import os
# Step up one directory level from wherever the kernel started; presumably so
# the project-relative imports and the "ckpts/" paths used later resolve.
# NOTE: this is not idempotent -- re-running the cell climbs one more level.
os.chdir(os.pardir)

In [5]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.model_selection import train_test_split

import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
from torchtext.utils import download_from_url, extract_archive
import io
from nltk.tokenize import word_tokenize
import re

from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch import optim
from tqdm import tqdm

import matplotlib.pyplot as plt

# Device handles used throughout the notebook: the first CUDA device for the
# models, plus a CPU handle. Building a torch.device does not touch the
# hardware, so this succeeds even without a GPU; the later .to(gpu_device)
# calls are what actually require one.
gpu_device = torch.device("cuda", 0)
cpu_device = torch.device("cpu")
(gpu_device, cpu_device)

(device(type='cuda', index=0), device(type='cpu'))

In [6]:
from dataset.dataloader import DateDataset, get_dataloader
from models.enc_dec_attention import EncoderRNN, DecoderRNN
from train_utils import asMinutes, timeSince, train_epoch, val_epoch

In [7]:
# Hidden size handed to both EncoderRNN and DecoderRNN when they are built.
enc_hidden_size = 256

# Batch sizes for each split. Only the first test batch is pulled for the
# analysis further down, so test_batch_size bounds how many examples the
# reported metrics cover.
train_batch_size = 18000
test_batch_size = 4000

train_dataset, train_dataloader = get_dataloader(
    split="train", batch_size=train_batch_size
)
test_dataset, test_dataloader = get_dataloader(
    split="test", batch_size=test_batch_size
)



In [8]:
# The vocabulary built on the training split sizes both networks.
vocab_size = len(train_dataset.voc)

# Build each network first, then move it onto the GPU.
encoder = EncoderRNN(vocab_size, enc_hidden_size)
encoder = encoder.to(gpu_device)

decoder = DecoderRNN(enc_hidden_size, vocab_size)
decoder = decoder.to(gpu_device)

In [9]:
# Restore the trained weights. map_location pins the loaded tensors to the
# device the models live on, so the checkpoints load the same way no matter
# which device they were saved from.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source.
encoder.load_state_dict(torch.load("ckpts/attntion-0.008-enc.pt", map_location=gpu_device))
decoder.load_state_dict(torch.load("ckpts/attntion-0.008-dec.pt", map_location=gpu_device))

<All keys matched successfully>

In [10]:
# Put both networks in inference mode before predicting. torch.no_grad() only
# disables gradient tracking; without eval(), train-time layers such as
# dropout would stay active and make the predictions (and every metric below)
# non-deterministic.
encoder.eval()
decoder.eval()

with torch.no_grad():
    # Only the first batch of the test loader is evaluated.
    input_tensor, target_tensor = next(iter(test_dataloader))
    encoder_outputs, encoder_hidden = encoder(input_tensor)
    # NOTE(review): the decoder is given target_tensor; if DecoderRNN uses it
    # for teacher forcing, the accuracies below are teacher-forced -- confirm.
    decoder_log_probs, decoder_logits, weights = decoder((target_tensor, encoder_hidden, encoder_outputs))
    # Greedy decoding: take the most likely vocabulary index at each step.
    pred_indices = torch.argmax(decoder_log_probs, dim=-1)

In [11]:
# Sequence-level accuracy: a row counts as correct only when every position
# of the prediction equals the target.
row_is_exact = (target_tensor == pred_indices).all(dim=1)
accuracy = row_is_exact.sum() / len(target_tensor)
(accuracy, 1 - accuracy)

(tensor(0.9608, device='cuda:0'), tensor(0.0392, device='cuda:0'))

In [12]:
# (number of examples, positions per target sequence)
target_tensor.size()

torch.Size([4000, 7])

In [13]:
# Per-position accuracy over the whole batch. Despite the name, the comparison
# is made on vocabulary indices (one per sequence position), not on the
# characters of the decoded strings.
matching_positions = (target_tensor == pred_indices).sum()
char_level_accuracy = matching_positions / target_tensor.numel()
(char_level_accuracy, 1 - char_level_accuracy)

(tensor(0.9942, device='cuda:0'), tensor(0.0058, device='cuda:0'))

In [14]:
# Row indices of the examples with at least one wrong position. torch.where
# with a single argument returns a tuple holding one index tensor.
fully_correct = torch.all(target_tensor == pred_indices, dim=1)
mispred_ind = torch.where(~fully_correct)
mispred_ind

(tensor([  32,   56,   71,   98,  126,  133,  146,  173,  193,  198,  267,  281,
          312,  319,  360,  363,  368,  435,  534,  569,  646,  669,  686,  793,
          808,  872,  946,  954,  978, 1024, 1034, 1056, 1082, 1117, 1140, 1182,
         1193, 1197, 1202, 1210, 1218, 1227, 1287, 1303, 1319, 1323, 1371, 1380,
         1405, 1428, 1438, 1445, 1448, 1455, 1639, 1643, 1659, 1692, 1714, 1791,
         1869, 1891, 1903, 1921, 1924, 1927, 1928, 1983, 1999, 2013, 2023, 2039,
         2065, 2118, 2131, 2156, 2167, 2179, 2207, 2238, 2263, 2300, 2321, 2331,
         2339, 2357, 2398, 2401, 2408, 2419, 2456, 2467, 2480, 2484, 2501, 2537,
         2547, 2569, 2584, 2590, 2597, 2614, 2624, 2638, 2640, 2659, 2666, 2671,
         2699, 2703, 2718, 2725, 2728, 2740, 2743, 2834, 2858, 2885, 2897, 2936,
         2957, 2962, 3005, 3025, 3046, 3059, 3091, 3168, 3225, 3267, 3271, 3295,
         3329, 3342, 3350, 3365, 3392, 3501, 3550, 3571, 3584, 3645, 3664, 3671,
         3704, 3732, 3753, 3

In [15]:
# Index -> token lookup used to turn vocabulary indices back into text.
# NOTE(review): this rebinds `vocab`, shadowing the `vocab` factory imported
# from torchtext.vocab at the top of the notebook; the later cells rely on
# this name, so it is kept.
vocab = train_dataset.voc.get_itos()

In [16]:
def decode_tokens(token_ids, itos, stop_at_eos=False):
    """Join the vocabulary strings for ``token_ids`` into one string.

    Args:
        token_ids: iterable of integer vocabulary indices.
        itos: index-to-token sequence used for the lookup.
        stop_at_eos: when True, stop right after the first ``"<eos>"`` token
            (the ``"<eos>"`` itself is kept) and ignore anything after it.

    Returns:
        The concatenated token strings.
    """
    pieces = []
    for token_id in token_ids:
        token = itos[token_id]
        pieces.append(token)
        if stop_at_eos and token == "<eos>":
            break
    return "".join(pieces)


# Rows to decode: the whole batch by default. To inspect only the
# mispredicted rows, use the alternative below instead.
# selected_indices = mispred_ind[0]
selected_indices = range(target_tensor.shape[0])

# Copy each tensor to plain Python lists once, rather than indexing the
# vocabulary with individual tensor scalars (one device sync per token when
# the tensors live on the GPU).
input_rows = input_tensor.tolist()
target_rows = target_tensor.tolist()
pred_rows = pred_indices.tolist()

# One (input, target, prediction) string triple per selected row. Only the
# prediction is cut at "<eos>"; input and target are decoded in full.
ans = [
    (
        decode_tokens(input_rows[ind], vocab),
        decode_tokens(target_rows[ind], vocab),
        decode_tokens(pred_rows[ind], vocab, stop_at_eos=True),
    )
    for ind in selected_indices
]

In [17]:
# Tabulate the decoded triples: one row per example, in batch order.
result_columns = ["input", "target", "prediction"]
df = pd.DataFrame(ans, columns=result_columns)
df

Unnamed: 0,input,target,prediction
0,<bos>9september1943<eos><pad><pad>,<bos>1943-09-09<eos>,<bos>1943-09-09<eos>
1,<bos>may231532<eos><pad><pad>,<bos>1532-05-23<eos>,<bos>1532-05-23<eos>
2,<bos>june271908<eos><pad><pad>,<bos>1908-06-27<eos>,<bos>1908-06-27<eos>
3,<bos>july241766<eos><pad><pad>,<bos>1766-07-24<eos>,<bos>1766-07-24<eos>
4,<bos>december262008<eos><pad><pad>,<bos>2008-12-26<eos>,<bos>2008-12-26<eos>
...,...,...,...
3995,<bos>tuesdayjune41669<eos><pad>,<bos>1669-06-04<eos>,<bos>1669-06-04<eos>
3996,<bos>203425august<eos><pad><pad>,<bos>2034-08-25<eos>,<bos>2034-08-25<eos>
3997,<bos>sat198329january<eos><pad>,<bos>1983-01-29<eos>,<bos>1983-01-29<eos>
3998,<bos>sundaymarch151857<eos><pad>,<bos>1857-03-15<eos>,<bos>1857-03-15<eos>


In [18]:
# df["prediction"].apply(lambda x: list(x[5:-5].replace("-",""))).to_list()

In [23]:
# character level analysis
# Drop the first and last five characters of each prediction (the "<bos>" and
# "<eos>" markers on a well-formed string) and spread what is left over one
# column per character.
prediction_chars = [list(text[5:-5]) for text in df["prediction"]]
df1 = pd.DataFrame(prediction_chars)
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,9,4,3,-,0,9,-,0,9
1,1,5,3,2,-,0,5,-,2,3
2,1,9,0,8,-,0,6,-,2,7
3,1,7,6,6,-,0,7,-,2,4
4,2,0,0,8,-,1,2,-,2,6
...,...,...,...,...,...,...,...,...,...,...
3995,1,6,6,9,-,0,6,-,0,4
3996,2,0,3,4,-,0,8,-,2,5
3997,1,9,8,3,-,0,1,-,2,9
3998,1,8,5,7,-,0,3,-,1,5


In [24]:
# Same per-character layout for the targets: strip the five-character
# "<bos>" / "<eos>" markers, then one column per remaining character.
target_chars = [list(text[5:-5]) for text in df["target"]]
df2 = pd.DataFrame(target_chars)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,9,4,3,-,0,9,-,0,9
1,1,5,3,2,-,0,5,-,2,3
2,1,9,0,8,-,0,6,-,2,7
3,1,7,6,6,-,0,7,-,2,4
4,2,0,0,8,-,1,2,-,2,6
...,...,...,...,...,...,...,...,...,...,...
3995,1,6,6,9,-,0,6,-,0,4
3996,2,0,3,4,-,0,8,-,2,5
3997,1,9,8,3,-,0,1,-,2,9
3998,1,8,5,7,-,0,3,-,1,5


In [25]:
import numpy as np

In [26]:
# Number of rows where prediction and target differ, per character position.
df1.ne(df2).sum(axis=0)

0     38
1    128
2     28
3     38
4      0
5      2
6      4
7      0
8      4
9      6
dtype: int64

In [116]:
# Share of rows whose predicted character equals the target, per position.
position_accuracy = (df1 == df2).sum(axis=0) / len(df)
# Position with the best accuracy and its error rate.
# Positions 4 and 7 (the "-" separators) have the lowest error.
(np.argmax(position_accuracy), 1 - np.max(position_accuracy))

(4, 0.00024999999999997247)

In [117]:
# Share of rows whose predicted character equals the target, per position.
position_accuracy = (df1 == df2).sum(axis=0) / len(df)
# Position with the worst accuracy and its error rate.
# Position 1 has the highest error.
(np.argmin(position_accuracy), 1 - np.min(position_accuracy))

(1, 0.03374999999999995)

In [37]:
def _join_tokens(token_ids, stop_at_eos=False):
    """Concatenate the ``vocab`` strings for ``token_ids``.

    When ``stop_at_eos`` is True, decoding stops right after the first
    ``"<eos>"`` token (which is kept in the result).
    """
    pieces = []
    for token_id in token_ids:
        token = vocab[token_id]
        pieces.append(token)
        if stop_at_eos and token == "<eos>":
            break
    return "".join(pieces)


# Gather the mispredicted rows once, as plain Python lists, instead of
# re-indexing target_tensor on every loop iteration.
mispred_pred_rows = pred_indices[mispred_ind].tolist()
mispred_input_rows = input_tensor[mispred_ind].tolist()
mispred_target_rows = target_tensor[mispred_ind].tolist()

for pred_row, input_row, target_row in zip(
    mispred_pred_rows, mispred_input_rows, mispred_target_rows
):
    pred = _join_tokens(pred_row, stop_at_eos=True)
    actual = _join_tokens(input_row)        # the raw input date string
    actual_pred = _join_tokens(target_row)  # the labelled (expected) output

    # detect label noise
    # Characters 5..8 follow the 5-character "<bos>" marker and hold the year
    # on a well-formed "<bos>YYYY-MM-DD<eos>" string.
    actual_year = int(actual_pred[5:9])
    try:
        pred_year = int(pred[5:9])
    except ValueError:
        # The prediction has no parseable year: report it and skip the row, so
        # the check below never runs with a missing or stale pred_year.
        print("HELLLL", pred)
        continue

    # A year that is off by exactly one is a likely sign of a mislabelled
    # target rather than a genuine model error.
    if abs(actual_year - pred_year) == 1:
        print(actual_year, pred_year)
        print(actual, actual_pred, pred)
        

1627 1626
<bos>02-01-1626<eos> <bos>1627-01-02<eos> <bos>1626-02-02<eos>
1907 1908
<bos>30december1908<eos><pad><pad> <bos>1907-12-30<eos> <bos>1908-12-30<eos>
2034 2033
<bos>sun20331january<eos><pad> <bos>2034-01-01<eos> <bos>2033-01-01<eos>
2019 2020
<bos>30dec2020<eos><pad><pad> <bos>2019-12-30<eos> <bos>2020-12-30<eos>
HELLLL <bos><bos>-01-17<eos>
1669 1670
<bos>dec301670<eos><pad><pad> <bos>1669-12-30<eos> <bos>1670-12-30<eos>
1930 1931
<bos>31dec1931<eos><pad><pad> <bos>1930-12-31<eos> <bos>1931-12-31<eos>
1768 1767
<bos>17671january<eos><pad><pad> <bos>1768-01-01<eos> <bos>1767-01-01<eos>
1545 1546
<bos>31-12-1546<eos> <bos>1545-12-31<eos> <bos>1546-12-31<eos>
1669 1670
<bos>tuesday16703112<eos><pad> <bos>1669-12-31<eos> <bos>1670-12-31<eos>
1563 1564
<bos>156430december<eos><pad><pad> <bos>1563-12-30<eos> <bos>1564-12-30<eos>
1712 1711
<bos>2january1711<eos><pad><pad> <bos>1712-01-02<eos> <bos>1711-01-02<eos>
2049 2048
<bos>fri20481january<eos><pad> <bos>2049-01-01<eos> <bos>20

In [28]:
# example of misprediction
'''
<bos>1593-12-06<eos> <bos>6-12-93<eos>
<bos>1908-12-30<eos> <bos>30december1908<eos><pad><pad>
<bos>1550-09-09<eos> <bos>sun15509april<eos><pad
<bos>dec301670<eos><pad><pad> <bos>1669-12-30<eos> <bos>1670-12-30<eos> -- differs by exactly 1, due to label noise?
<bos>30december1936<eos><pad><pad> <bos>1935-12-30<eos> <bos>1936-12-30<eos> -- some label noise also
'''

tensor([[  2,  47,  10,  ...,  10, 494,   3],
        [  2,  59,  10,  ...,  10, 168,   3],
        [  2, 407,  10,  ...,  10, 448,   3],
        ...,
        [  2,  34,  10,  ...,  10, 686,   3],
        [  2, 167,  10,  ...,  10, 197,   3],
        [  2, 119,  10,  ...,  10, 646,   3]], device='cuda:0')