In [1]:
'provided modules and data'
import wget, os, gzip, pickle, random, re, sys

IMDB_URL = 'http://dlvu.github.io/data/imdb.{}.pkl.gz'
IMDB_FILE = 'imdb.{}.pkl.gz'

PAD, START, END, UNK = '.pad', '.start', '.end', '.unk'

def load_imdb(final=False, val=5000, seed=0, voc=None, char=False):

    cst = 'char' if char else 'word'

    imdb_url = IMDB_URL.format(cst)
    imdb_file = IMDB_FILE.format(cst)

    if not os.path.exists(imdb_file):
        wget.download(imdb_url)

    with gzip.open(imdb_file) as file:
        sequences, labels, i2w, w2i = pickle.load(file)

    if voc is not None and voc < len(i2w):
        nw_sequences = {}

        i2w = i2w[:voc]
        w2i = {w: i for i, w in enumerate(i2w)}

        mx, unk = voc, w2i['.unk']
        for key, seqs in sequences.items():
            nw_sequences[key] = []
            for seq in seqs:
                seq = [s if s < mx else unk for s in seq]
                nw_sequences[key].append(seq)

        sequences = nw_sequences

    if final:
        return (sequences['train'], labels['train']), (sequences['test'], labels['test']), (i2w, w2i), 2

    # Make a validation split
    random.seed(seed)

    x_train, y_train = [], []
    x_val, y_val = [], []

    val_ind = set( random.sample(range(len(sequences['train'])), k=val) )
    for i, (s, l) in enumerate(zip(sequences['train'], labels['train'])):
        if i in val_ind:
            x_val.append(s)
            y_val.append(l)
        else:
            x_train.append(s)
            y_train.append(l)

    return (x_train, y_train), \
           (x_val, y_val), \
           (i2w, w2i), 2


def gen_sentence(sent, g):

    symb = '_[a-z]*'

    while True:

        match = re.search(symb, sent)
        if match is None:
            return sent

        s = match.span()
        sent = sent[:s[0]] + random.choice(g[sent[s[0]:s[1]]]) + sent[s[1]:]

def gen_dyck(p):
    open = 1
    sent = '('
    while open > 0:
        if random.random() < p:
            sent += '('
            open += 1
        else:
            sent += ')'
            open -= 1

    return sent

def gen_ndfa(p):

    word = random.choice(['abc!', 'uvw!', 'klm!'])

    s = ''
    while True:
        if random.random() < p:
            return 's' + s + 's'
        else:
            s+= word

def load_brackets(n=50_000, seed=0):
    return load_toy(n, char=True, seed=seed, name='dyck')

def load_ndfa(n=50_000, seed=0):
    return load_toy(n, char=True, seed=seed, name='ndfa')

def load_toy(n=50_000, char=True, seed=0, name='lang'):

    random.seed(0)

    if name == 'lang':
        sent = '_s'

        toy = {
            '_s': ['_s _adv', '_np _vp', '_np _vp _prep _np', '_np _vp ( _prep _np )', '_np _vp _con _s' , '_np _vp ( _con _s )'],
            '_adv': ['briefly', 'quickly', 'impatiently'],
            '_np': ['a _noun', 'the _noun', 'a _adj _noun', 'the _adj _noun'],
            '_prep': ['on', 'with', 'to'],
            '_con' : ['while', 'but'],
            '_noun': ['mouse', 'bunny', 'cat', 'dog', 'man', 'woman', 'person'],
            '_vp': ['walked', 'walks', 'ran', 'runs', 'goes', 'went'],
            '_adj': ['short', 'quick', 'busy', 'nice', 'gorgeous']
        }

        sentences = [ gen_sentence(sent, toy) for _ in range(n)]
        sentences.sort(key=lambda s : len(s))

    elif name == 'dyck':

        sentences = [gen_dyck(7./16.) for _ in range(n)]
        sentences.sort(key=lambda s: len(s))

    elif name == 'ndfa':

        sentences = [gen_ndfa(1./4.) for _ in range(n)]
        sentences.sort(key=lambda s: len(s))

    else:
        raise Exception(name)

    tokens = set()
    for s in sentences:

        if char:
            for c in s:
                tokens.add(c)
        else:
            for w in s.split():
                tokens.add(w)

    i2t = [PAD, START, END, UNK] + list(tokens)
    t2i = {t:i for i, t in enumerate(i2t)}

    sequences = []
    for s in sentences:
        if char:
            tok = list(s)
        else:
            tok = s.split()
        sequences.append([t2i[t] for t in tok])

    return sequences, (i2t, t2i)

## Part 1 Classification: data loading
The IMDb dataset is the sequence-learning equivalent of MNIST: a large, challenging classification dataset with plenty of examples, that is still light enough to train models for on a laptop. It contains 50 000 reviews of movies, taken from the Internet Movie Database which are either highly positive or highly negative. The task is to predict which for a given review. This is known as sentiment analysis. </br>
To simplify things we've preprocessed and tokenized the data for you. Our tokenization is a little crude: *we lowercase everything, remove all non-letters and split on whitespace*. It'll do for our current purposes, but in practice, you'd look to more refined tokenization strategies to extract more information from the raw text. *All words have been converted to integer indices in a fixed vocabulary*. 

To load the data, call the load_imdb function as follows: </br>
*(x_train, y_train), (x_val, y_val), (i2w, w2i), numcls =
load_imdb(final=False)*

If *final* is true, the function returns the canonical test/train split with 25 000 reviews in each.
If final is false, a validation split is returned with 20 000 training instances and 5 000
validation instances.

In [None]:
(x_train, y_train), (x_val, y_val), (i2w, w2i), numcls = load_imdb(final=False)

The return values are as follows:
* x_train: A python list of lists of integers. Each integer represents a word. Sorted from short to long.
* y_train: The corresponding class labels: 0 for positive, 1 for negative.
* x_val: Test/validation data. Laid out the same as x_train.
* y_val: Test/validation labels
* i2w: A list of strings mapping the integers in the sequences to their original words. i2w [141] returns the string containing word 141.
* w2i: A dictionary mapping the words to their indices. w2i['film'] returns the index for the word "film".

In [None]:
# To have a look at your data (always a good idea), 
# you can convert a sequence from indices
# to words as follows
print([i2w[w] for w in x_train[141]])
print(len(x_train))
print(x_train[141])

To train, you'll need to loop over x_train and y_train and slice out batches. Each batch will need to be padded to a fixed length and then converted to a torch tensor. Implement this padding and conversion to a tensor.
***
Tips:
* We've included a special padding token in the vocabulary, represented by the string ".pad". Consult the w2i dictionary to see what the index of this token is.
* We've also included special tokens ".start" and ".end", which are only used in the autoregressive task.
* If you feed a list of lists to the function torch.tensor(), it'll return a torch tensor.
    - The inner lists must all have the same size
    - Pytorch is pretty good at guessing which datatype (int, float, byte) is expected, but it does sometimes get it wrong. To be sure, add the dataype with *batch = torch.tensor(lists, dtype=torch.long)*.

In [2]:
'PyTorch modules'
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
from torch.utils.data import Dataset, DataLoader
print(torch.device)

'Basic Modules'
import numpy as np
import matplotlib.pyplot as plt
import math
from tqdm import tqdm
import random
import warnings 
import math
warnings.filterwarnings("ignore")

<class 'torch.device'>


In [None]:
'check what indices are the special tokens'
print(w2i['.pad'])
print(w2i['.start'])
print(w2i['.end'])

In [None]:
def batched_and_padded_tensors(dt, batch_size, name = 'X'):
    'Split data of lists into equal-sized chunks, and pad them'
    if name == 'X':
        batches = []
        for i in range(0, len(dt), batch_size):
            if i + batch_size < len(dt):
                batches.append(dt[i:i + batch_size])
            else:
                temp = dt[i:].copy()
                # we randomly select samples to fill up this batch
    #             temp.extend(random.choices(dt, k=batch_size-len(temp)))
                batches.append(temp)
        padded_batches = []
        for b in batches:
#             padded_batches.append(pad_a_batch(b))
            padded_batches.extend(pad_a_batch(b))
#         padded_batches = torch.tensor(padded_batches, dtype=torch.long)
        return padded_batches
    else:
        return dt
#     else:
#         # since the label data (y) donot need padding
#         padded_batches = torch.tensor(batches, dtype=torch.long)
    
    # return a list of batch (tensor)
#     return padded_batches

def pad_a_batch(batch):
    """
    pad 0s to the instances in the batch 
    (x_train or x_val) to get equal lengths
    """
    max_len = max(len(inst) for inst in batch)
    new_batch = []
    for ins in batch:
        k = ins.copy()
        if len(k) < max_len:
            k.extend((max_len-len(ins))*[0])
            new_batch.append(k)
        else:
            new_batch.append(k)
#     return torch.tensor(new_batch, dtype=torch.long)
    return new_batch

def preprocess_data(batch_size=5,final_opt = False):
    (x_train, y_train), (x_val, y_val), (i2w, w2i), numcls = load_imdb(final=final_opt)
    # split data(lists) into batches(list)
    x_train_batch = batched_and_padded_tensors(x_train, batch_size)
    y_train_batch = batched_and_padded_tensors(y_train, batch_size, name = 'Y')
    x_val_batch = batched_and_padded_tensors(x_val, batch_size)
    y_val_batch = batched_and_padded_tensors(y_val, batch_size, name = 'Y')
    
    return x_train_batch,y_train_batch,x_val_batch,y_val_batch,i2w, w2i, numcls
    # generate batch data in form of lists
    

In [None]:
x_train, y_train, x_val, y_val, i2w, w2i, numcls = preprocess_data(batch_size=100)

In [None]:
print(type(x_train))
print(type(x_train[0]))
print(x_train[0])

In [None]:
len(i2w)

In [None]:
len(w2i)

## Part2: classification, baseline model
Question 1: Build a model with the following structure.

In [None]:
class simple_Net(nn.Module):
    # Note: the dim:time here means the instance length in the batched inputs
    def __init__(self,numcls=2,token_size=99430,emb=300,hidden=300):
        super().__init__()
        # torch.nn.Embedding(num_embeddings, embedding_dim) 
        self.embedding = nn.Embedding(token_size,emb)
        self.fc1 = nn.Linear(emb,hidden)
        self.fc2 = nn.Linear(hidden,numcls)

    def global_max_pool(self,x):
        """
        take the max of an instance 
        among its time dim -> its words -> the attension
        x:(batch,time,hidden)
        """
        (xmax_, xmax_ind) = torch.max(x, 1)
        return xmax_
    
    def forward(self, x):
        assert torch.is_tensor(x), f'x is not Tensor, x is {type(x)}, {x}'
        x = self.embedding(x)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.global_max_pool(x)
        x = self.fc2(x)
        return x

net = simple_Net()

Question2: Train for at least one epoch and compute the validation accuracy. It should be above 60% for most reasonable hyperparameters.
* Please remember that training requires implementing a dataloader, a training loop,etc.

In [None]:
# define loss and parameter optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.005, momentum=0.9)

In [None]:
def dataloader(x_train,y_train, batch_size):
#     data = np.array(zip(x_train,y_train))
    x_batches = []
    y_batches = []
    for i in range(0, len(y_train), batch_size):
        if i + batch_size < len(y_train):
            x_batches.append(x_train[i:i + batch_size])
            y_batches.append(y_train[i:i + batch_size])
        else:
            x_batches.append(x_train[i:].copy())
            y_batches.append(y_train[i:].copy())
    training_dt = list(zip(x_batches,y_batches))
    random.shuffle(training_dt)
    return x_batches,y_batches,training_dt

bx_train,by_train,shuffled_tr_dt = dataloader(x_train,y_train, 100)
bx_val,by_val,shuffled_val_dt = dataloader(x_val,y_val, 100)

In [None]:
# train the baseline model, m epochs
def easy_train(net,num_epochs, print_int, shuffled_tr_dt, 
               shuffled_val_dt, criterion, optimizer,PATH = './simple_net.pth'):
    # train model
    for epoch in range(num_epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in tqdm(enumerate(shuffled_tr_dt)):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            # transfer inputs and labels to tensor
            inputs_ = torch.tensor(inputs)
            labels_ = torch.tensor(labels)
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = net(inputs_)
            loss = criterion(outputs, labels_)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.item()
            if i % print_int == print_int-1:    # print every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / print_int:.3f}')
                running_loss = 0.0
    print('Finished Training')
    
    # save model
    torch.save(net.state_dict(), PATH)
#     model loading
#     net = Net()
#     net.load_state_dict(torch.load(PATH))
    
    # check validation accuracy
    'calculate the accuracy'
    correct = 0
    total = 0
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for i, data in tqdm(enumerate(shuffled_val_dt)):
            inputs, labels = data
            # transfer inputs and labels to tensor
            inputs_ = torch.tensor(inputs)
            labels_ = torch.tensor(labels)
            # calculate outputs by running images through the network
            outputs = net(inputs_)
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(outputs.data, 1)
            total += labels_.size(0)
            correct += (predicted == labels_).sum().item()

    print(f'Accuracy of the network {100 * correct // total} %')
    

In [None]:
easy_train(net,5, 100,shuffled_tr_dt, shuffled_val_dt, criterion, optimizer,PATH = './simple_net.pth')

## Part 3: Writing your own Elman RNN
For a sequence-to-sequence layer that propagates information over time, we'll start with a simple Elman network (the first type of simple RNN shown in the lectures).
***
PyTorch has these available, but it's instructive to build our own first, to see what happens under the hood. Since this can be complicated, we'll give you some skeleton code to work from.
***
**Question 3**: Fill in the missing parts of this code. And build a second model, like the one in the previous part, but replacing the second layer with an Elman(300, 300, 300) layer.
* As you will see when you run the network, looping over slices of your input tensor is horribly slow (remember that every slice becomes a node in the computation graph). This is optimized a lot in the PyTorch library implementations of RNNs, but fundamentally all RNNs require some level of sequential processing.
* As we saw in the slides, the first and last hidden layers can be useful for various tricks. This is why PyTorch returns both the output sequence and the last hidden layer.
    * Remember this when you fit the Elman module into your larger network: it outputs a pair, and only the first element of that pair should be passed to the next layer.

In [None]:
class Elman(nn.Module):
    def __init__(self,insize=300,outsize=300,hsize=300):
        super().__init__()
        self.lin1 = nn.Linear(insize+hsize,hsize)
        self.lin2 = nn.Linear(hsize,outsize)
    
    def forward(self, x, hidden=None):
        # x.shape = (batch, time, embeddings) <dim>
        b,t,e = x.size()
        if hidden is None:
            hidden = torch.zeros(b,e,dtype=torch.float)
            # hidden.shape = (b,e)
        outs = []
        for i in range(t):
            # torch.cat concatenates prev hidden outputs 
            # to each instance in batch
            inp = torch.cat([x[:,i,:],hidden],dim=1)
            # inp.shape = (b,e_x+e_h) = (b,2e)
            # perform sigmoid to get hidden
            hidden = F.sigmoid(self.lin1(inp))
            # hidden.shape = (b,e), same as x_t
            out = self.lin2(hidden)
            # but out is 2D tensor?? how to get None 
            outs.append(out[:,None,:])
        return torch.cat(outs,dim=1), hidden

In [None]:
class Elman_Net(nn.Module):
    # Note: the dim:time here means the instance length in the batched inputs
    def __init__(self,numcls=2,token_size=99430,emb=300,hidden=300):
        super().__init__()
        # torch.nn.Embedding(num_embeddings, embedding_dim) 
        self.embedding = nn.Embedding(token_size,emb)
        self.elman = Elman(insize=emb,outsize=hidden)
        self.fc2 = nn.Linear(hidden,numcls)

    def global_max_pool(self,x):
        """
        take the max of an instance 
        among its time dim -> its words -> the attension
        x:(batch,time,hidden)
        """
        (xmax_, xmax_ind) = torch.max(x, 1)
        return xmax_
    
    def forward(self, x):
        assert torch.is_tensor(x), f'x is not Tensor, x is {type(x)}, {x}'
        x = self.embedding(x)
#         x,_ = self.elman(x)
        x,_ = self.elman.forward(x)
        x = F.relu(x)
        x = self.global_max_pool(x)
        x = self.fc2(x)
        return x
    
# my model is not learning? 
# because the external forward for Elman should be applied independently? no..

In [None]:
net = Elman_Net()
easy_train(net,2, 25,shuffled_tr_dt, shuffled_val_dt, criterion, optimizer,PATH = './Elman_net.pth')

In [None]:
net = Elman_Net()
easy_train(net,2, 25,shuffled_tr_dt, shuffled_val_dt, criterion, optimizer,PATH = './Elman_net.pth')

## Part 3: Using Torch's RNNs
To speed things up, let's try again with pytorch's own RNN implementations. Take the previous model and replace layer 2 with layers of the type *torch.nn.RNN* (the torch implementation of the Elman network) and *torch.nn.LSTM*, and adapt the network as necessary. 
* Read the documentation carefully to see what the pytorch RNNs compute. The pytorch Elman network only computes the first layer of the Elman network. This means you get the activated hidden layer out.
***
**Question 4**: Tune the hyperparameters for these three models (MLP, Elman, LSTM).
* Machine Learning gospel tells us that with enough tuning, the LSTM will perform best, followed by the Elman network, followed by the MLP. Is this what you find as well? State your hypothesis based on your chosen hyperparameters and the validation performance. Then run all three models on the **test set and report your findings**.
* The default dimension order in PyTorch sequence models is slightly confusing (time, batch, embedding). You can transpose the dimensions, or just pass batch_first=True to the constructor of your RNN.

In [None]:
class Torch_Elman_Net(nn.Module):
    pass


In [None]:
class Torch_LSTM_Net(nn.Module):
    pass

PART 4: Autoregressive Models


In [3]:
def pad_a_batch_two(batch, max_len):
    """
    pad 0s to the instances in the batch 
    (x_train or x_val) to get equal lengths
    """
    new_batch = [] # Same as the previous
    for ins in batch:
        k = ins.copy()
        if len(k) < int(max_len):
            k.extend((max_len-len(ins))*[0])
            new_batch.append(k)
        else:
            new_batch.append(k)
    return new_batch



def batched_and_padded_tensors_two(dt, maximum):
    'Split data of lists into equal-sized chunks, and pad them'
    batches = []
    all_batches = []
    length = 0
    for i in range(len(dt)): # Make sure all is equal
        batches.append(dt[i])
        length += len(dt[i]) # NOTE we are looking at max tokens instead of sequences
        if length < maximum: # If max not reached continue adding sequences
            if i+1 == len(dt):
                all_batches.appen(batches)
                break
            continue
        else: # If max is reached append
            all_batches.append(batches)
            length = 0 # Simple reset
            batches = []

    return all_batches
    
def finalize_set(x, y):

    all_length = []
    for a in x: 
        lengths = max([len(b) for b in a]) # Max token and not sequence. Since all must be equal, easiest is to get the biggest one
        all_length.append(lengths) 

    x_batch = []
    y_batch = []
    
    for i in range(len(x)): # Sequences with unequal length must be adjusted
        x_batch.append(pad_a_batch_two(x[i], all_length[i])) # Append batches with equal length
        y_batch.append(pad_a_batch_two(y[i], all_length[i])) # Append batches with equal length
    
    
    return x_batch, y_batch


def load_in(choice):
    if choice == "ndfa":
        x_train, (i2w, w2i) = load_ndfa(n = 150000)
    else:
        x_train, (i2w, w2i) = load_brackets(n = 150000)
    
    print(w2i[".start"], w2i[".end"]) # Insert 1 at the beginning and 2 at the end to mark start and end of sequence
    print(i2w[0])

    x = []
    for a in x_train:
        a.insert(0, 1) # Mark start
        a.append(2) # Mark end
        x.append(a)
    #print(x,"final")

    y_train = []
    for a in x: # Target is just the batch, shifted one token to the left.
        b = a[1:] # Remove first column
        b.append(0) # Append column of zeros
        y_train.append(b)

    x_batch = batched_and_padded_tensors_two(x_train, 250) # x_train after batching
    y_batch = batched_and_padded_tensors_two(y_train, 250) # y_train after batching

    x_final, y_final = finalize_set(x_batch, y_batch) # Set after padding, sequences in batches must have equal length

    k = np.array(list(zip(x_final, y_final)))
    np.random.shuffle(k)

    
    return k, (i2w, w2i)




In [4]:
import torch.distributions as dist

class LSTM(nn.Module):
    def __init__(self, insize = 32, num_layer = 2, hsize = 16, vocab = 15):
        super().__init__()
        self.vocab = vocab
        self.insize = insize
        self.num_layer = num_layer
        self.hsize = hsize
        self.embd = nn.Embedding(vocab, insize)
        self.Long_Short = nn.LSTM(insize, self.hsize, self.num_layer, batch_first=True)
        self.fc = nn.Linear(hsize, self.vocab)


    def forward(self, x):
        assert torch.is_tensor(x), f'x is not Tensor, x is {type(x)}, {x}'
        x = self.embd(x)
        x,_ = self.Long_Short(x)
        x = x.reshape(x.shape[0] * x.shape[1], self.hsize)
        x = self.fc(x)
        return x
    




In [5]:
def sample(lnprobs, temperature):
    """
    Sample an element from a categorical distribution
    :param lnprobs: Outcome logits
    :param temperature: Sampling temperature. 1.0 follows the given distribution, 0.0 returns the maximum probability element.
    :return: The index of the sampled element.
    """
    if temperature == 0.0:
        return lnprobs.argmax()
    p = F.softmax(lnprobs / temperature, dim=0)
    cd = dist.Categorical(p)
    return cd.sample()

def pred(net, data, seq, temp, max_length):
    print("pred")
    predicted_values = []
    for i in range(0, max_length):
        a = torch.tensor([data.w2i[i] for w in seq[i:]])
        b = net.forward(a)
        final_position = b[0][-1]
        sample_found = sample(final_position, temp)
        sample_print = seq.data.i2w[sample_found]
        predicted_values.append(sample_print)
        if sample_print == ".end": # If .end is found before the maximum length, return the sample
            return pred
    return pred



In [27]:
def easy_train_two(net, print_int, data, criterion,  optimizer, epoch, seq, PATH):
    # train model
    loss_full = []
    for epoch in range(epoch):  # loop over the dataset multiple times
        running_loss = 0.0
        i = 0
        for batch in data:
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = batch[0], batch[1]
            # transfer inputs and labels to tensor
            inputs_ = torch.tensor(inputs)
            labels_ = torch.tensor(labels)
            # zero the parameter gradients
            optimizer.zero_grad()
            labels_ = labels_.reshape(labels_.shape[0]*labels_.shape[1],)
            # forward + backward + optimize
            outputs= net(inputs_)
            loss = criterion(outputs, labels_)
            loss.backward()
            loss_full.append(loss.item())
            optimizer.step()
            # print statistics
            running_loss += loss.item()
            if i % print_int == print_int-1:    
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / print_int:.3f}')
                running_loss = 0.0
            i += 1
        
        for b in range(10):
            print(b)
            predict = pred(net, data, seq, 0.1, 20)
            print(predict)
            
    print('Finished Training')
    
    # save model
    torch.save(net.state_dict(), PATH)
#     model loading
#     net = Net()
#     net.load_state_dict(torch.load(PATH))
    
    # check validation accuracy
    correct = 0
    total = 0

    norm = []
    # No gradient calculation
    '''
    with torch.no_grad():
        total = 0
        for par in net.parameters():
            total += par.grad.data.pow(2).sum()
        total = total.sqrt().item()
        norm.append(total)
        print(norm)
    '''
    print(np.mean(loss_full))

  


In [28]:
data,(i2w, w2i) = load_in("ndfa")
net = LSTM()
criterion = nn.CrossEntropyLoss()
epoch = 1
seq = ['.start', 'a', 'b']
optimizer = optim.SGD(net.parameters(), lr=0.005, momentum=0.9)
easy_train_two(net, 1000, data, criterion,  optimizer, epoch, seq, PATH = './Elman_net.pth')

1 2
.pad
[1,  1000] loss: 2.292
[1,  2000] loss: 0.967
[1,  3000] loss: 0.341
[1,  4000] loss: 0.253
[1,  5000] loss: 0.230
[1,  6000] loss: 0.224
[1,  7000] loss: 0.213
[1,  8000] loss: 0.211
[1,  9000] loss: 0.206
0
pred


AttributeError: 'numpy.ndarray' object has no attribute 'w2i'