# **Dataset Creating**

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import torchtext
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
import torch.nn.functional as F
from random import seed
from random import randint
import pandas as pd
import numpy as np
import random 


seed(1)


In [188]:
#Creat a data set
Pool_data = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
#Create function returning the random word and its inverse.
def RandomWord(data, length):
  word = ''
  for i in range(length):
    word = word + Pool_data[randint(0,len(Pool_data)-1)]
  return word, word[::-1]

#Make train and test set
keep_train = []
for i in range(2,30):
  if i>15:
    for k in range(250):
      src, tar = RandomWord(Pool_data, i)
      keep_train.append([src, tar])
  else:
    for k in range(500):
      src, tar = RandomWord(Pool_data, i)
      keep_train.append([src, tar])
train = pd.DataFrame(np.array(keep_train),
                  columns=['source', 'target'])
print(train.shape)

keep_test = []
for j in range(80):
  for i in range(2,30):
      src, tar = RandomWord(Pool_data, i)
      keep_test.append([src, tar])
test = pd.DataFrame(np.array(keep_test),
                  columns=['source', 'target'])
print(test.shape)

keep_val = []
for j in range(60):
  for i in range(randint(2,30)):
      src, tar = RandomWord(Pool_data, i)
      keep_val.append([src, tar])
val = pd.DataFrame(np.array(keep_val),
                  columns=['source', 'target'])
print(val.shape)

train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)
val.to_csv("val.csv", index=False)


(10500, 2)
(2240, 2)
(882, 2)


In [190]:
def token_char(text):
  token = []
  for char in text:
    token.append(char)
  return token

SRC = Field(tokenize=token_char,
            init_token = '<sos>',
            eos_token='<eos>')
TRG = Field(tokenize=token_char,
            init_token = '<sos>',
            eos_token='<eos>')



fields = {"source": ("src", SRC), "target" : ("trg",TRG)}

train_data, valid_data, test_data = TabularDataset.splits(
    path="/content/", train="train.csv", validation="val.csv", test="test.csv", format="csv", fields=fields
)

SRC.build_vocab(train_data,  min_freq=1)
TRG.build_vocab(train_data,  min_freq=1)

In [191]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

print(test_data[2].__dict__.keys())
print(test_data[2].__dict__.values())

Number of training examples: 10500
Number of validation examples: 882
Number of testing examples: 2240
dict_keys(['src', 'trg'])
dict_values([['s', 'r', 'v', 'i'], ['i', 'v', 'r', 's']])


In [192]:
device = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 128
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    sort_within_batch = True,
    sort_key = lambda x: len(x.src),
    device = device)

# **MODEL**

**ENCONDER** 

In [170]:
class encoder(nn.Module):
    def __init__(self, input_size, hidden_size, embedded_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size  
        self.embedded_size = embedded_size

        self.embed = nn.Embedding(input_size, self.embedded_size)
        self.gru = nn.GRU(self.embedded_size, self.hidden_size)

    def forward(self, source, hidden):

        emb = self.embed(source).unsqueeze(0)
        outputs, hidden = self.gru(emb,hidden)

        return hidden


**DECODER** 

In [171]:
class decoder(nn.Module):
    def __init__(self, output_size, hidden_size, embedded_size):
        super().__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.embedded_size = embedded_size

        self.embed = nn.Embedding(output_size, self.embedded_size)
        self.gru = nn.GRU(self.embedded_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.soft = nn.LogSoftmax(dim=1)

    def forward(self, source, hidden):
        
        source = source.unsqueeze(0)
        emb = self.embed(source)
        output, hidden = self.gru(emb, hidden)
        pred = self.soft(self.out(output[0]))

        return pred, hidden


**Sequence to Sequence**

In [172]:
class Seq2Seq(nn.Module):
   def __init__(self, encoder, decoder, device):
       super(Seq2Seq, self).__init__()

       self.encoder = encoder
       self.decoder = decoder
       self.device = device
   def forward(self, source, target):

       input_length = source.size(0)
       batch_size = target.shape[1] 
       target_length = target.shape[0]
       vocab_size = self.decoder.output_size
       

       feedHidden = torch.zeros(1, batch_size, 256).to(device)
       for i in range(input_length):
          hidden = self.encoder(source[i], feedHidden)
          feedHidden = hidden
          

       outputs = torch.zeros(target_length, batch_size, vocab_size).to(self.device)
       
       decode_input = target[0,:]

       for i in range(1, target_length):
        
        output, hidden= self.decoder(decode_input, feedHidden)
        outputs[i] = output
        top1 = output.argmax(1) 
        tf = random.random() < 0.5
        decode_input = target[i] if tf else top1
        feedHidden = hidden

       return outputs

# **Initialize model and its parameters**

In [173]:
Input_size = len(SRC.vocab)
Output_size = len(TRG.vocab)
Embbed_size = 256
Hidden_size = 256

Encoder = encoder(Input_size, Hidden_size, Embbed_size)
Decoder = decoder(Output_size, Hidden_size, Embbed_size)

Model = Seq2Seq(Encoder, Decoder, device).to(device)
print(Model)

Seq2Seq(
  (encoder): encoder(
    (embed): Embedding(30, 256)
    (gru): GRU(256, 256)
  )
  (decoder): decoder(
    (embed): Embedding(30, 256)
    (gru): GRU(256, 256)
    (out): Linear(in_features=256, out_features=30, bias=True)
    (soft): LogSoftmax(dim=1)
  )
)


In [174]:
LearningRate = 1e-3
Optimizer = optim.Adam(Model.parameters(), lr=LearningRate)

TRG_IDX = TRG.vocab.stoi['<pad>']
print(TRG_IDX)
Crit = nn.CrossEntropyLoss(ignore_index=TRG_IDX)

1


# **Model Training**

In [175]:
NumEpoch = 50
EpochLoss = 0
Model.train(True)

for epoch in range(0, NumEpoch):

  print('Epoch {}/{}' .format((epoch+1),(NumEpoch)))

  for num, batch in enumerate(train_iterator):
    source = (batch.src).to(device)
    target = (batch.trg).to(device)
    
    Output = Model(source, target)
    #จัด Output ใหม่ ไม่เอา <sos> จะได้ไปเข้า CrossEntropy ได้
    Output = Output[1:].reshape(-1, Output.shape[2]) #กลายเป็น [(1-ความยาวsource)*batch_size, vocab_size]
    Target = target[1:].view(-1)
 
    Optimizer.zero_grad()
    Loss = Crit(Output, Target)
    Loss.backward()
    torch.nn.utils.clip_grad_norm_(Model.parameters(), max_norm=1)
    Optimizer.step()
    EpochLoss += Loss.item()
    
  print('Epoch Loss: {:.3f}' .format(Loss))
  print()

Epoch 1/50
Epoch Loss: 2.916

Epoch 2/50
Epoch Loss: 1.928

Epoch 3/50
Epoch Loss: 0.152

Epoch 4/50
Epoch Loss: 2.280

Epoch 5/50
Epoch Loss: 1.139

Epoch 6/50
Epoch Loss: 0.421

Epoch 7/50
Epoch Loss: 0.692

Epoch 8/50
Epoch Loss: 1.249

Epoch 9/50
Epoch Loss: 0.649

Epoch 10/50
Epoch Loss: 1.140

Epoch 11/50
Epoch Loss: 0.103

Epoch 12/50
Epoch Loss: 1.008

Epoch 13/50
Epoch Loss: 1.233

Epoch 14/50
Epoch Loss: 1.162

Epoch 15/50
Epoch Loss: 0.015

Epoch 16/50
Epoch Loss: 1.997

Epoch 17/50
Epoch Loss: 1.009

Epoch 18/50
Epoch Loss: 0.008

Epoch 19/50
Epoch Loss: 0.899

Epoch 20/50
Epoch Loss: 0.890

Epoch 21/50
Epoch Loss: 0.428

Epoch 22/50
Epoch Loss: 0.262

Epoch 23/50
Epoch Loss: 0.628

Epoch 24/50
Epoch Loss: 1.710

Epoch 25/50
Epoch Loss: 2.173

Epoch 26/50
Epoch Loss: 1.081

Epoch 27/50
Epoch Loss: 0.036

Epoch 28/50
Epoch Loss: 0.005

Epoch 29/50
Epoch Loss: 0.009

Epoch 30/50
Epoch Loss: 0.308

Epoch 31/50
Epoch Loss: 1.846

Epoch 32/50
Epoch Loss: 0.216

Epoch 33/50
Epoch

# **Prediction Peeking**

In [182]:
def convert(model, words, normal, inverse, device):
  tokens = [token for token in token_char(words)]
  tokens.insert(0, SRC.init_token)
  
  tokens.append(TRG.eos_token)
  text_to_indices = [SRC.vocab.stoi[token] for token in tokens]
  sentence_tensor = torch.LongTensor(text_to_indices).to(device)


  feedHidden = torch.zeros(1, 1, 256).to(device)
  for i in range(0, len(sentence_tensor)-1):
    hidden = Model.encoder(sentence_tensor[i].unsqueeze(0), feedHidden)
    feedHidden = hidden

  outputs = sentence_tensor[0].unsqueeze(0)

  Pred = []
  for i in range(0, len(sentence_tensor)-1):
    output, hidden = Model.decoder(outputs, feedHidden)
    top1 = output.argmax(1)
    Pred.append(top1.item())
    outputs = top1
    feedHidden = hidden


  translated_sentence = [SRC.vocab.itos[idx] for idx in Pred]
  return translated_sentence[:-1] #เอา EOS ออก

words = "abcd"
words_predicted = convert(Model, words, SRC, TRG, device)
print('Input word : ', words)
print('Predicted word : ', words_predicted)
print()
words = "com"
words_predicted = convert(Model, words, SRC, TRG, device)
print('Input word : ', words)
print('Predicted word : ', words_predicted)
print()
words = "world"
words_predicted = convert(Model, words, SRC, TRG, device)
print('Input word : ', words)
print('Predicted word : ', words_predicted)
print()

words = "thank"
words_predicted = convert(Model, words, SRC, TRG, device)
print('Input word : ', words)
print('Predicted word : ', words_predicted)

Input word :  abcd
Predicted word :  ['d', 'c', 'b', 'a']

Input word :  com
Predicted word :  ['m', 'o', 'c']

Input word :  world
Predicted word :  ['d', 'l', 'r', 'o', 'w']

Input word :  thank
Predicted word :  ['k', 'n', 'a', 'h', 't']


# **Model Evalutaion**

In [195]:
Model.eval()
EpochLoss_eval = 0
Loss = 0
with torch.no_grad():
  for num, batch in enumerate(test_iterator):
    source = batch.src
    target = batch.trg
    Output = Model(source, target)


    #จัด Output ใหม่ ไม่เอา <sos> จะได้ไปเข้า CrossEntropy ได้
    Output = Output[1:].reshape(-1, Output.shape[2]) #กลายเป็น [(1-ความยาวsource)*batch_size, vocab_size]
    Target = target[1:].view(-1)


    Loss = Crit(Output, Target)
    
    EpochLoss_eval += Loss.item()
  print('Test Loss: ', EpochLoss_eval/num)

Test Loss:  1.0577567942057025


In [196]:
#BLEU SCORE
from torchtext.data.metrics import bleu_score
Target = []
Output = []

for data in test_data[:20]:
  source = vars(data)["src"]
  target = vars(data)["trg"]

  pred = convert(Model, source, SRC, TRG, device)

  Target.append([target])
  Output.append(pred)

print(Target)
print(Output)

print("MODEL'S BLEU SCORE: ", bleu_score(Output, Target))

[[['y', 'r']], [['y', 'o', 'g']], [['i', 'v', 'r', 's']], [['n', 'q', 'f', 'l', 'v']], [['o', 't', 'p', 'g', 'o', 'b']], [['t', 'n', 'j', 'r', 'u', 'k', 'e']], [['w', 'p', 'q', 'h', 'j', 'u', 'o', 'w']], [['p', 'y', 'm', 'o', 'i', 'i', 'c', 'r', 'k']], [['h', 'v', 'j', 'w', 'z', 't', 'c', 'g', 'g', 'g']], [['d', 'd', 't', 'b', 'c', 'u', 'e', 'h', 'b', 'c', 'd']], [['s', 'e', 'w', 'p', 'a', 't', 't', 'o', 'l', 'n', 'y', 'z']], [['q', 'q', 'h', 'u', 'h', 'd', 'f', 'z', 't', 'c', 'e', 'i', 'a']], [['t', 'r', 'a', 'n', 'd', 'e', 'l', 'u', 'w', 'e', 'q', 'x', 'p', 'l']], [['o', 'c', 'l', 't', 'q', 'k', 'i', 'c', 'n', 'q', 'd', 'v', 'u', 'w', 'j']], [['e', 'q', 'o', 'd', 'd', 't', 'p', 'u', 'e', 'i', 'm', 'i', 's', 'j', 'e', 'y']], [['q', 'j', 'i', 'p', 'o', 'z', 't', 'p', 't', 'q', 'l', 'y', 'c', 'k', 'z', 'a', 't']], [['q', 'k', 'x', 'j', 'w', 'j', 't', 'x', 's', 'g', 'c', 'k', 'q', 'q', 'w', 'o', 'o', 'z']], [['n', 'w', 'd', 'z', 'z', 'e', 'e', 'j', 'y', 'b', 'm', 'e', 'm', 'm', 'v', 'm',