In [3]:
import torch
import pytorch_lightning as pl
import unicodedata
import torch.functional as F
import re
import torch.nn as nn
import random

In [4]:
# !wget https://www.manythings.org/anki/zsm-eng.zip
# !mkdir data
# !unzip zsm-eng.zip -d data

In [6]:
# # Jawi
# with open('data/zsm.txt', 'r') as f:
#     lines = f.read().strip().split('\n')
#     for line in lines[:2]:

#         print(unicodeToAscii(line))
#         # break

In [18]:
%%writefile dataset.py

SOS_token = 0
EOS_token = 1

class Vocab:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)
    
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readVocab(lang1,lang2,reversed=False):
    lines = open("data/%s-%s.txt" % (lang1, lang2), encoding='utf-8').read().strip().split('\n')
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    # only select odds to remove jawi translations
    pairs = [p for i,p in enumerate(pairs) if i % 2 == 0]
    
    if reversed:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Vocab(lang2)
        output_lang = Vocab(lang1)
    else:
        input_lang = Vocab(lang1)
        output_lang = Vocab(lang2)
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    return input_lang, output_lang,pairs

    

def sent2index(lang,sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def index2tensor(lang,sentence):
    index = sent2index(lang,sentence)
    index.append(EOS_token)
    return torch.tensor(index, dtype=torch.long).view(-1,1)

class MyDataset(torch.utils.data.Dataset):
    def __init__(self,input_lang,output_lang):
        self.input_lang,self.output_lang,self.dataset = readVocab(input_lang,output_lang)

    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self,idx):
        x, label = self.dataset[idx][:2]
        # print(example)
        return index2tensor(self.input_lang,x),index2tensor(self.output_lang,label)

class DataModule(pl.LightningDataModule):
    def __init__(self,input_lang,output_lang):
        super().__init__()
        self.dataset = MyDataset(input_lang,output_lang)
        self.input_lang,self.output_lang = self.dataset.input_lang,self.dataset.output_lang
    
    def setup(self, stage=None):
        pass

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.dataset,batch_size=1,shuffle=True)
    
    # def val_dataloader(self):
        # return torch.utils.DataLoader(self.dataset,batch_size=1,shuffle=True)

dm  = DataModule(input_lang = 'en',output_lang='ms')


Overwriting dataset.py


In [9]:
# dm[10][0].shape

In [16]:
%%writefile components.py
import torch
import pytorch_lightning as pl
class EncoderRnn(pl.LightningModule):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(EncoderRnn, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        print(embedded.shape)
        output = embedded
        for i in range(self.n_layers):
            output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

MAX_LENGTH = 10
class DecoderRnn(torch.nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1):
        super(DecoderRnn, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.embedding = torch.nn.Embedding(output_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)
        self.out = torch.nn.Linear(hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        for i in range(self.n_layers):
            output = F.relu(output)
            output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

Overwriting components.py


In [15]:
%%writefile model.py
import torch
import pytorch_lightning as pl

teacher_forcing_ratio = 0.5

class BasicModel(pl.LightningModule):
    def __init__(self, input_lang, output_lang,hidden_size = 256,max_length = MAX_LENGTH, learning_rate = 0.01):
        super(BasicModel, self).__init__()
        self.encoder = EncoderRnn(input_lang.n_words, hidden_size)
        self.decoder = AttnDecoderRNN(hidden_size, output_lang.n_words,dropout_p=0.1)
        
        self.decoder.initHidden()
        self.save_hyperparameters()
        self.loss = nn.NLLLoss()

    def forward(self, x, label):
        x_length, label_length = x.size(0), label.size(0)
        out_list = torch.zeros(self.hparams.max_length,self.encoder.hidden_size)
        print(out_list.shape)
        encoder_hidden = self.encoder.initHidden()
        for idx, ex in enumerate(x):
            encoder_out, encoder_hidden = self.encoder(x[idx], encoder_hidden)
            out_list[idx] = encoder_out

        loss = 0
        decoder_input = torch.tensor([[SOS_token]])  # SOS
        decoder_hidden = encoder_hidden
        use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
        if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
            for di in range(label_length):
                decoder_output, decoder_hidden, decoder_attention = self.decoder(
                    decoder_input, decoder_hidden, out_list)
                loss += self.criterion(decoder_output, label[di])
                decoder_input = label[di]  # Teacher forcing

        else:
            # Without teacher forcing: use its own predictions as the next input
            for di in range(label_length):
                decoder_output, decoder_hidden, decoder_attention = self.decoder(
                    decoder_input, decoder_hidden, out_list)
                topv, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach()  # detach from history as input

                loss += self.criterion(decoder_output, label[di])
                if decoder_input.item() == EOS_token:
                    break
        return loss

    def training_step(self, batch, batch_idx):
        x,label = batch
        self(x,label)
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
        
        return optimizer

    # def validation_step(self, batch, batch_idx):
    #     input_tensor, target_tensor = batch
    #     encoder_outputs, encoder_hidden = self.encoder(input_tensor)
    #     decoder_hidden = encoder_hidden[:self.decoder.n_layers]
    #     decoder_input = torch.tensor([[SOS_token]])  # SOS
    #     decoder_output, decoder_hidden, attn_weights = self.decoder(
    #         decoder_input, decoder_hidden, encoder_outputs)

Writing model.py


In [13]:
dm = DataModule('en','ms')
pl.seed_everything(12)
trainer = pl.Trainer(gpus=1,max_epochs=10)
model = BasicModel(dm.input_lang, dm.output_lang)
trainer.fit(model,datamodule=dm)


Global seed set to 12
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name    | Type           | Params
-------------------------------------------
0 | encoder | EncoderRnn     | 513 K 
1 | decoder | AttnDecoderRNN | 745 K 
2 | loss    | NLLLoss        | 0     
-------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.035     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

torch.Size([10, 256])
torch.Size([1, 1, 2816])


RuntimeError: input.size(-1) must be equal to input_size. Expected 256, got 2816

In [None]:
dm.input_lang

<__main__.Vocab at 0x7ff4180e0ca0>