# **Dataset Creating**

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import torchtext
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
import torch.nn.functional as F
from random import seed
from random import randint
import pandas as pd
import numpy as np
import random 


seed(1)


In [2]:
#Creat a data set
Pool_data = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
#Create function returning the random word and its inverse.
def RandomWord(data, length):
  word = ''
  for i in range(length):
    word = word + Pool_data[randint(0,25)]
  return word, word[::-1]

#Make train and test set
keep_train = []
for i in range(2,30):
  if i>15:
    for k in range(10):
      src, tar = RandomWord(Pool_data, i)
      keep_train.append([src, tar])
  else:
    for k in range(50):
      src, tar = RandomWord(Pool_data, i)
      keep_train.append([src, tar])
train = pd.DataFrame(np.array(keep_train),
                  columns=['source', 'target'])
print(train.shape)

keep_test = []
for j in range(2):
  for i in range(2,30):
      src, tar = RandomWord(Pool_data, i)
      keep_test.append([src, tar])
test = pd.DataFrame(np.array(keep_test),
                  columns=['source', 'target'])
print(test.shape)

keep_val = []
for j in range(2):
  for i in range(randint(2,30)):
      src, tar = RandomWord(Pool_data, i)
      keep_val.append([src, tar])
val = pd.DataFrame(np.array(keep_val),
                  columns=['source', 'target'])
print(val.shape)

train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)
val.to_csv("val.csv", index=False)


(840, 2)
(56, 2)
(33, 2)


In [3]:
def token_char(text):
  token = []
  for char in text:
    token.append(char)
  return token

SRC = Field(tokenize=token_char,
            init_token = '<sos>',
            eos_token='<eos>')
TRG = Field(tokenize=token_char,
            init_token = '<sos>',
            eos_token='<eos>')



fields = {"source": ("src", SRC), "target" : ("trg",TRG)}

train_data, valid_data, test_data = TabularDataset.splits(
    path="/content/", train="train.csv", validation="val.csv", test="test.csv", format="csv", fields=fields
)

SRC.build_vocab(train_data,  min_freq=1)
TRG.build_vocab(train_data,  min_freq=1)

In [4]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

print(test_data[50].__dict__.keys())
print(test_data[50].__dict__.values())

Number of training examples: 840
Number of validation examples: 33
Number of testing examples: 56
dict_keys(['src', 'trg'])
dict_values([['b', 'd', 'h', 't', 'v', 'a', 'n', 's', 'a', 'q', 'b', 'f', 'v', 'z', 't', 'a', 'z', 'r', 'o', 'x', 'x', 'n', 'u', 'e'], ['e', 'u', 'n', 'x', 'x', 'o', 'r', 'z', 'a', 't', 'z', 'v', 'f', 'b', 'q', 'a', 's', 'n', 'a', 'v', 't', 'h', 'd', 'b']])


In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 128
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    sort_within_batch = True,
    sort_key = lambda x: len(x.src),
    device = device)

# **MODEL**

**ENCONDER** 

In [6]:
class encoder(nn.Module):
    def __init__(self, input_size, hidden_size, embedded_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size  
        self.embedded_size = embedded_size

        self.embed = nn.Embedding(input_size, self.embedded_size)
        self.gru = nn.GRU(self.embedded_size, self.hidden_size)

    def forward(self, source, hidden):

        emb = self.embed(source).unsqueeze(0)
        outputs, hidden = self.gru(emb,hidden)

        return hidden


**DECODER** 

In [7]:
class decoder(nn.Module):
    def __init__(self, output_size, hidden_size, embedded_size):
        super().__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.embedded_size = embedded_size

        self.embed = nn.Embedding(output_size, self.embedded_size)
        self.gru = nn.GRU(self.embedded_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.soft = nn.LogSoftmax(dim=1)

    def forward(self, source, hidden):
        
        source = source.unsqueeze(0)
        emb = self.embed(source)
        output, hidden = self.gru(emb, hidden)
        pred = self.soft(self.out(output[0]))

        return pred, hidden


**Sequence to Sequence**

In [8]:
class Seq2Seq(nn.Module):
   def __init__(self, encoder, decoder, device):
       super(Seq2Seq, self).__init__()

       self.encoder = encoder
       self.decoder = decoder
       self.device = device
   def forward(self, source, target):

       input_length = source.size(0)
       batch_size = target.shape[1] 
       target_length = target.shape[0]
       vocab_size = self.decoder.output_size
       

       feedHidden = torch.zeros(1, batch_size, 256).to(device)
       for i in range(input_length):
          hidden = self.encoder(source[i], feedHidden)
          feedHidden = hidden
          

       outputs = torch.zeros(target_length, batch_size, vocab_size).to(self.device)
       
       decode_input = target[0,:]

       for i in range(1, target_length):
        
        output, hidden= self.decoder(decode_input, feedHidden)
        outputs[i] = output
        top1 = output.argmax(1) 
        tf = random.random() < 0.5
        decode_input = target[i] if tf else top1
        feedHidden = hidden

       return outputs

# **Initialize model and its parameters**

In [9]:
Input_size = len(SRC.vocab)
Output_size = len(TRG.vocab)
Embbed_size = 256
Hidden_size = 256

Encoder = encoder(Input_size, Hidden_size, Embbed_size)
Decoder = decoder(Output_size, Hidden_size, Embbed_size)

Model = Seq2Seq(Encoder, Decoder, device).to(device)
print(Model)

Seq2Seq(
  (encoder): encoder(
    (embed): Embedding(30, 256)
    (gru): GRU(256, 256)
  )
  (decoder): decoder(
    (embed): Embedding(30, 256)
    (gru): GRU(256, 256)
    (out): Linear(in_features=256, out_features=30, bias=True)
    (soft): LogSoftmax(dim=1)
  )
)


In [10]:
LearningRate = 1e-3
Optimizer = optim.Adam(Model.parameters(), lr=LearningRate)

TRG_IDX = TRG.vocab.stoi['<pad>']
print(TRG_IDX)
Crit = nn.CrossEntropyLoss(ignore_index=TRG_IDX)

1


# **Model Training**

In [13]:
NumEpoch = 100
EpochLoss = 0
Model.train(True)

for epoch in range(0, NumEpoch):

  print('Epoch {}/{}' .format((epoch+1),(NumEpoch)))

  for num, batch in enumerate(train_iterator):
    print(batch)
    source = (batch.src).to(device)
    target = (batch.trg).to(device)
    
    Output = Model(source, target)
    #จัด Output ใหม่ ไม่เอา <sos> จะได้ไปเข้า CrossEntropy ได้
    Output = Output[1:].reshape(-1, Output.shape[2]) #กลายเป็น [(1-ความยาวsource)*batch_size, vocab_size]
    Target = target[1:].view(-1)
 
    Optimizer.zero_grad()
    Loss = Crit(Output, Target)
    Loss.backward()
    torch.nn.utils.clip_grad_norm_(Model.parameters(), max_norm=1)
    Optimizer.step()
    EpochLoss += Loss.item()
    
  print('Epoch Loss: {:.3f}' .format(Loss))
  print()

Epoch 1/100

[torchtext.legacy.data.batch.Batch of size 128]
	[.src]:[torch.cuda.LongTensor of size 9x128 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 9x128 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 128]
	[.src]:[torch.cuda.LongTensor of size 14x128 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 14x128 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 128]
	[.src]:[torch.cuda.LongTensor of size 16x128 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 16x128 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 128]
	[.src]:[torch.cuda.LongTensor of size 6x128 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 6x128 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 72]
	[.src]:[torch.cuda.LongTensor of size 31x72 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 31x72 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 128]
	[.src]:[torch.cuda.LongTensor of size 11x128 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 11x128 (GPU 0)]

[torchtext.legacy.data.batch.Batch of si

# **Prediction Peeking**

In [14]:
def convert(model, words, normal, inverse, device):
  tokens = [token for token in token_char(words)]
  tokens.insert(0, SRC.init_token)
  
  tokens.append(TRG.eos_token)
  text_to_indices = [SRC.vocab.stoi[token] for token in tokens]
  sentence_tensor = torch.LongTensor(text_to_indices).to(device)


  feedHidden = torch.zeros(1, 1, 256).to(device)
  for i in range(0, len(sentence_tensor)-1):
    hidden = Model.encoder(sentence_tensor[i].unsqueeze(0), feedHidden)
    feedHidden = hidden

  outputs = sentence_tensor[0].unsqueeze(0)

  Pred = []
  for i in range(0, len(sentence_tensor)-1):
    output, hidden = Model.decoder(outputs, feedHidden)
    top1 = output.argmax(1)
    Pred.append(top1.item())
    outputs = top1
    feedHidden = hidden


  translated_sentence = [SRC.vocab.itos[idx] for idx in Pred]
  return translated_sentence[:-1] #เอา EOS ออก

words = "abcd"
words_predicted = convert(Model, words, SRC, TRG, device)
print('Input word : ', words)
print('Predicted word : ', words_predicted)
print()
words = "heart"
words_predicted = convert(Model, words, SRC, TRG, device)
print('Input word : ', words)
print('Predicted word : ', words_predicted)
print()
words = "thisistest"
words_predicted = convert(Model, words, SRC, TRG, device)
print('Input word : ', words)
print('Predicted word : ', words_predicted)

Input word :  abcd
Predicted word :  ['d', 'c', 'b', 'a']

Input word :  heart
Predicted word :  ['t', 'r', 'a', 'e', 'h']

Input word :  thisistest
Predicted word :  ['t', 's', 'e', 'i', 's', 't', '<eos>', '<eos>', '<eos>', '<eos>']


# **Model Evalutaion**

In [15]:
Model.eval()
EpochLoss_eval = 0
Loss = 0
with torch.no_grad():
  for num, batch in enumerate(test_iterator):
    source = batch.src
    target = batch.trg

    Output = Model(source, target)


    #จัด Output ใหม่ ไม่เอา <sos> จะได้ไปเข้า CrossEntropy ได้
    Output = Output[1:].reshape(-1, Output.shape[2]) #กลายเป็น [(1-ความยาวsource)*batch_size, vocab_size]
    Target = target[1:].view(-1)


    Loss = Crit(Output, Target)
    
    EpochLoss_eval += Loss.item()
  
  print('Test Loss: ', EpochLoss_eval)

Test Loss:  2.5985448360443115


In [16]:
#BLEU SCORE
from torchtext.data.metrics import bleu_score
Target = []
Output = []

for data in test_data[:20]:
  source = vars(data)["src"]
  target = vars(data)["trg"]

  pred = convert(Model, source, SRC, TRG, device)

  Target.append([target])
  Output.append(pred)

print(Target)
print(Output)

print("MODEL'S BLEU SCORE: ", bleu_score(Output, Target))

[[['w', 'h']], [['v', 'l', 'c']], [['v', 'v', 'v', 'r']], [['q', 'l', 's', 'b', 'j']], [['a', 'a', 'd', 'v', 'm', 'a']], [['c', 'l', 'g', 'd', 'o', 'f', 's']], [['q', 'd', 't', 'o', 'f', 'm', 'e', 'q']], [['d', 'e', 'v', 'l', 'f', 'z', 'p', 'w', 'i']], [['j', 'z', 'h', 'f', 'f', 'm', 'u', 'c', 'e', 'u']], [['u', 'z', 'a', 'g', 'w', 't', 'a', 'g', 't', 'b', 'd']], [['d', 'm', 't', 'l', 'k', 'e', 'j', 'm', 't', 'g', 'w', 'o']], [['q', 'a', 'f', 'g', 'x', 'o', 'n', 'a', 'k', 'g', 't', 't', 'z']], [['w', 'h', 'z', 'h', 'e', 'x', 't', 'g', 'c', 'c', 'z', 'h', 'o', 'i']], [['v', 'e', 'n', 'f', 'd', 'b', 'o', 'g', 'q', 's', 'a', 'q', 'q', 'l', 'r']], [['i', 'h', 'r', 'n', 'f', 'v', 'r', 'z', 'f', 'l', 'b', 'h', 'h', 'q', 'p', 'z']], [['r', 'j', 'k', 'b', 'r', 'd', 'h', 's', 'j', 'g', 'r', 't', 'z', 'u', 'y', 'y', 'l']], [['g', 'b', 'p', 'k', 'a', 'h', 'r', 'u', 'y', 'a', 't', 'j', 'r', 'k', 'n', 'e', 'x', 'f']], [['y', 'g', 'u', 'v', 'd', 'o', 'y', 'y', 'g', 'r', 'r', 'p', 'm', 'z', 'u', 'w',