#### Library Imports and Global Variables

In [98]:
from benchmark_reader import *
import re
import pprint as pp
import random
import torch

print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 3060


In [99]:
EN = ''
MT = 'mt'

#### Function Definitions

In [100]:
def saveData(dataset, language, datatype, folder):

    f_source = open(folder+"/"+language+"/src-"+datatype+".txt", "w")
    f_target = open(folder+"/"+language+"/tgt-"+datatype+".txt", "w")

    for data in dataset:

        f_source.write(' '.join(data['triple']) + '\n')
        f_target.write(data['sentence'] + '\n')

    f_source.close()
    f_target.close()

def preprocess(language, folder):

    b = Benchmark() #create benchmark object
    files = select_files("xmldata/mt_train.xml") #getting file from data folder
    b.fill_benchmark(files) #parse xml files and fill Benchmark with Entry instances

    dataset = []

    for entry in b.entries:

        if int(entry.size) == 1: #to only get sentences with one triple

            for triple in entry.modifiedtripleset.triples:

                s = triple.s
                p = triple.p
                o = triple.o

                #to handle p when it is made up of multiple words

                words = ''
                for i in range(len(p)):

                    if p[i].isupper(): #when upper case space is added then letter is added

                        words += " " + p[i]
                        
                    else: #letter is added
                        words += p[i]

                p = words

                #replacing special characters with space
                s = re.sub(r'[,|""_:@#?!&$]', ' ', s)
                p = re.sub(r'[,|""_:@#?!&$]', ' ', p)
                o = re.sub(r'[,|""_:@#?!&$]', ' ', o)

                # changing to lower case
                # s = s.lower()
                p = p.lower()
                # o = o.lower()

                triple = [s, p, o]

                # rdftriples.append([s, p, o])

            for i in range(entry.count_lexs()):
                if(entry.lexs[i].lang == language): #to only get english. for english no is specified. use 'mt' for maltese

                    lex = entry.lexs[i].lex

                    #replacing special characters with space
                    lex = re.sub(r'[,|""_:@#?!&$]', ' ', lex)

                    # #removing full stop at the end
                    # if lex[-1] == '.':
                    #     lex = lex[:-1]
                    # lex = lex + ' .' #adding full stop at the end with space

                    #changing to lower case
                    # lex = lex.lower()
                    break

            
            dataset.append({
                'triple': triple,
                'sentence': lex
            })
    
    print("Dataset length:", len(dataset))

    random.seed(42)
    random.shuffle(dataset) #shuffling the dataset

    train_split = int(len(dataset)*0.8)
    val_split = int(len(dataset)*0.1)
    test_split = int(len(dataset)*0.1)

    train_data = dataset[:train_split]
    val_data = dataset[train_split:train_split+val_split]
    test_data = dataset[train_split+val_split:train_split+val_split+test_split]

    print("Training data: ", len(train_data), "\n", 
        "Validation data: ", len(val_data), "\n",
        "Testing data: ", len(test_data), sep='')

    saveData(train_data, EN, 'train', folder)
    saveData(val_data, EN, 'val', folder)
    saveData(test_data, EN, 'test', folder)

### LSTM Model on English Data

In [101]:
preprocess(EN, 'data-en')

Dataset length: 3195
Training data: 2556
Validation data: 319
Testing data: 319


In [102]:
!onmt_build_vocab -config config-lstm-en.yaml

[2023-05-05 20:44:58,206 INFO] Counter vocab from 5000 samples.
[2023-05-05 20:44:58,206 INFO] Build vocab on 5000 transformed examples/corpus.
[2023-05-05 20:45:00,224 INFO] Counters src: 3730
[2023-05-05 20:45:00,224 INFO] Counters tgt: 4801


In [103]:
!onmt_train -config config-lstm-en.yaml

[2023-05-05 20:45:02,458 INFO] Missing transforms field for corpus_1 data, set to default: [].
[2023-05-05 20:45:02,458 INFO] Missing transforms field for valid data, set to default: [].
[2023-05-05 20:45:02,458 INFO] Parsed 2 corpora from -data.
[2023-05-05 20:45:02,459 INFO] Get special vocabs from Transforms: {'src': [], 'tgt': []}.
[2023-05-05 20:45:02,521 INFO] Building model...
[2023-05-05 20:45:02,811 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(3736, 500, padding_idx=1)
        )
      )
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (rnn): LSTM(500, 500, num_layers=2, batch_first=True, dropout=0.3)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(4808, 500, padding_idx=1)
        )
      )
      (dropout): Dropout(p=0.3, inplace=False)
  

In [104]:
!onmt_translate -model data-en/run/model_step_1000.pt -src data-en/src-test.txt -output data-en/pred_1000.txt -gpu 0 -verbose 

[2023-05-05 20:45:53,649 INFO] 
SENT 1: ['Bakso', 'ingredient', '<unk>']
PRED 1: Celery is an ingredient of Bakso.
PRED SCORE: -0.1677

[2023-05-05 20:45:53,649 INFO] 
SENT 2: ['Cleveland', 'country', 'United', 'States']
PRED 2: Cleveland is in the United States.
PRED SCORE: -0.0506

[2023-05-05 20:45:53,649 INFO] 
SENT 3: ['BBC', 'key', 'person', '<unk>', '<unk>']
PRED 3: The alternative name of Atlantic City is the Broadcasting Party War.
PRED SCORE: -0.8488

[2023-05-05 20:45:53,649 INFO] 
SENT 4: ['Aarhus', 'Airport', 'runway', 'name', '<unk>']
PRED 4: The runway name of Aarhus Airport is magistrate.
PRED SCORE: -0.2370

[2023-05-05 20:45:53,650 INFO] 
SENT 5: ['Tarrant', 'County', 'Texas', 'county', 'seat', 'Fort', '<unk>', 'Texas']
PRED 5: The largest city in Hays County Texas is the President of Belgaum.
PRED SCORE: -0.5644

[2023-05-05 20:45:53,650 INFO] 
SENT 6: ['Abel', 'Hernández', 'club', 'Peñarol']
PRED 6: Abel Hernández club is Central Español.
PRED SCORE: -0.3391

[2023-

### LSTM Model on English Data with GloVe Embeddings

In [109]:
preprocess(EN, 'data-en')

Dataset length: 3195
Training data: 2556
Validation data: 319
Testing data: 319


In [110]:
!onmt_build_vocab -config config-lstm-en-glove.yaml

Corpus corpus_1's weight should be given. We default it to 1 for you.
[2023-05-05 20:46:53,701 INFO] Counter vocab from 5000 samples.
[2023-05-05 20:46:53,701 INFO] Build vocab on 5000 transformed examples/corpus.
[2023-05-05 20:46:55,676 INFO] Counters src: 3730
[2023-05-05 20:46:55,676 INFO] Counters tgt: 4801


In [111]:
!onmt_train -config config-lstm-en-glove.yaml

[2023-05-05 20:46:57,890 INFO] Missing transforms field for corpus_1 data, set to default: [].
[2023-05-05 20:46:57,890 INFO] Missing transforms field for valid data, set to default: [].
[2023-05-05 20:46:57,890 INFO] Parsed 2 corpora from -data.
[2023-05-05 20:46:57,890 INFO] Get special vocabs from Transforms: {'src': [], 'tgt': []}.
[2023-05-05 20:46:57,926 INFO] Reading decoder embeddings from glove-embeddings/glove.6B.300d.txt
[2023-05-05 20:47:02,950 INFO] 	Found 400000 total vectors in file
[2023-05-05 20:47:02,950 INFO] After filtering to vectors in vocab:
[2023-05-05 20:47:02,950 INFO] 	* dec: 1110 match, 3698 missing, (23.09%)
[2023-05-05 20:47:02,950 INFO] 
Saving decoder embeddings as:
	* dec: data-en/run-glove/example.dec_embeddings.pt
[2023-05-05 20:47:03,516 INFO] Building model...
[2023-05-05 20:47:03,844 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(37

In [112]:
!onmt_translate -model data-en/run-glove/model_step_1000.pt -src data-en/src-test.txt -output data-en/pred_1000_glove.txt -gpu 0 -verbose

[2023-05-05 20:47:54,041 INFO] 
SENT 1: ['Bakso', 'ingredient', '<unk>']
PRED 1: Celery is an ingredient of Bakso.
PRED SCORE: -0.1587

[2023-05-05 20:47:54,041 INFO] 
SENT 2: ['Cleveland', 'country', 'United', 'States']
PRED 2: Cleveland is in the United States.
PRED SCORE: -0.0278

[2023-05-05 20:47:54,041 INFO] 
SENT 3: ['BBC', 'key', 'person', '<unk>', '<unk>']
PRED 3: The location of the BBC is the Broadcasting House in London.
PRED SCORE: -0.4032

[2023-05-05 20:47:54,041 INFO] 
SENT 4: ['Aarhus', 'Airport', 'runway', 'name', '<unk>']
PRED 4: The runway name of Aarhus is magistrate.
PRED SCORE: -0.3298

[2023-05-05 20:47:54,042 INFO] 
SENT 5: ['Tarrant', 'County', 'Texas', 'county', 'seat', 'Fort', '<unk>', 'Texas']
PRED 5: The SEAT Ibiza is related to the Volkswagen Polo Mk3.
PRED SCORE: -0.7603

[2023-05-05 20:47:54,042 INFO] 
SENT 6: ['Abel', 'Hernández', 'club', 'Peñarol']
PRED 6: Abel Hernández played youth football for Central Español.
PRED SCORE: -0.2337

[2023-05-05 20:47

### LSTM Model on Maltese Data

In [105]:
preprocess(MT, 'data-mt')

Dataset length: 3195
Training data: 2556
Validation data: 319
Testing data: 319


In [106]:
!onmt_build_vocab -config config-lstm-mt.yaml

Corpus corpus_1's weight should be given. We default it to 1 for you.
[2023-05-05 20:45:57,120 INFO] Counter vocab from 5000 samples.
[2023-05-05 20:45:57,120 INFO] Build vocab on 5000 transformed examples/corpus.
[2023-05-05 20:45:59,181 INFO] Counters src: 3730
[2023-05-05 20:45:59,181 INFO] Counters tgt: 6002


In [107]:
!onmt_train -config config-lstm-mt.yaml

[2023-05-05 20:46:01,602 INFO] Missing transforms field for corpus_1 data, set to default: [].
[2023-05-05 20:46:01,602 INFO] Missing transforms field for valid data, set to default: [].
[2023-05-05 20:46:01,602 INFO] Parsed 2 corpora from -data.
[2023-05-05 20:46:01,602 INFO] Get special vocabs from Transforms: {'src': [], 'tgt': []}.
[2023-05-05 20:46:01,642 INFO] Building model...
[2023-05-05 20:46:01,953 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(3736, 500, padding_idx=1)
        )
      )
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (rnn): LSTM(500, 500, num_layers=2, batch_first=True, dropout=0.3)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(6008, 500, padding_idx=1)
        )
      )
      (dropout): Dropout(p=0.3, inplace=False)
  

In [108]:
!onmt_translate -model data-mt/run/model_step_1000.pt -src data-mt/src-test.txt -output data-mt/pred_1000.txt -gpu 0 -verbose

[2023-05-05 20:46:50,387 INFO] 
SENT 1: ['Bakso', 'ingredient', '<unk>']
PRED 1: Il-laħam hija ingredjent ta’ Bakso.
PRED SCORE: -0.4049

[2023-05-05 20:46:50,387 INFO] 
SENT 2: ['Cleveland', 'country', 'United', 'States']
PRED 2: Cleveland hija fl-Istati Uniti.
PRED SCORE: -0.2324

[2023-05-05 20:46:50,387 INFO] 
SENT 3: ['BBC', 'key', 'person', '<unk>', '<unk>']
PRED 3: Il-kapital ta’ Blackpool huwa l-Partit Konservattiv (Renju Unit).
PRED SCORE: -1.0488

[2023-05-05 20:46:50,387 INFO] 
SENT 4: ['Aarhus', 'Airport', 'runway', 'name', '<unk>']
PRED 4: It-tul tal-runway fl-Ajruport ta’ Aarhus huwa mħallef.
PRED SCORE: -0.4915

[2023-05-05 20:46:50,388 INFO] 
SENT 5: ['Tarrant', 'County', 'Texas', 'county', 'seat', 'Fort', '<unk>', 'Texas']
PRED 5: San Sebastian huwa parti mill-Konte ta’ Madrid.
PRED SCORE: -0.9866

[2023-05-05 20:46:50,388 INFO] 
SENT 6: ['Abel', 'Hernández', 'club', 'Peñarol']
PRED 6: Abel iż-żgħażagħ huwa Spanjol.
PRED SCORE: -0.3266

[2023-05-05 20:46:50,388 INFO] 


### Evaluation

#### English

python 2023-Challenge-main\evaluation\automatic\scripts\eval.py -hyp data-en/pred_1000.txt -ref data-en/tgt-test.txt -nr 1 -m bleu,chrf++,ter -lng en

Results:<br>
BLEU: 18.725<br>
CHRF++: 0.38<br>
TER: 0.69<br>

#### English with GloVE

python 2023-Challenge-main\evaluation\automatic\scripts\eval.py -hyp data-en/pred_1000_glove.txt -ref data-en/tgt-test.txt -nr 1 -m bleu,chrf++,ter -lng en

Results:<br>
BLEU: 18.6501<br>
chrF++: 0.36<br>
TER: 0.67<br>

#### Maltese

python 2023-Challenge-main\evaluation\automatic\scripts\eval.py -hyp data-mt/pred_1000.txt -ref data-mt/tgt-test.txt -nr 1 -m bleu,chrf++,ter -lng mt

Results:<br>
BLEU: 8.15285<br>
chrF++: 0.29<br>
TER: 0.83<br>