# Training the Spelling Normalizer

## Preprocessing

In [None]:
import translator.preprocessing as pre

import string
from collections import Counter

import pandas as pd

### Expanding the Corpuses

#### Expanding the Monolingual Data

In [None]:
to_backtranslate = pre.prepare_data("Monolingual Data")

In [None]:
to_backtranslate

Unnamed: 0,To Backtranslate
0,"auh ca cencah miyac inic motolinihticah, inic ..."
1,"nochi tlacatl tlayiya, tocaya."
2,"aocmo quitlayiznequih in incuen, za quitlatzih..."
3,itztapalapan chiyennecuiloh in itequiuh ome to...
4,in huel xicmottilican.
...,...
1621,"miguel ixquen omomacac tlaltzontli apantentli,..."
1622,in caxtiltepitzin oquichtli caxtoltetl cacahua...
1623,"auh intlacamo nel oniquittani in amamatzin, ca..."
1624,"nehhuatl, pedro de paz, regidor, oniquintlahtl..."


In [None]:
pre.expand_monolingual_corpus(to_backtranslate, "To Backtranslate")

#### Expanding the Training Data

In [None]:
corpus = pre.prepare_data("Unnormalized")

In [None]:
corpus

Unnamed: 0,Unregularized,Regularized
0,"ma oc xicchie, ximotlali, ximosehui.","ma oc xicchiya, ximotlali, ximocehui."
1,"auh çā choloque, çan ic vmpolivito in iauiotl.","auh zan cholohqueh, zan ic ompolihuitoh in yao..."
2,amo motequiuh yn tleyn ipanpa yn tlatzacuiltilo.,ahmo motequiuh in tlein ipampa in tlatzacuiltilo.
3,cuix amo no titlamaxtil y nicann oquihualhuica...,cuix ahmo no titlamachtil in nican oquihualhui...
4,"ma yo yc yn amix, yn amoyolo.","ma iuh ic in amix, in amoyollo."
...,...,...
13518,nima tlatos ytla pila.,niman tlahtoz itlan pilah.
13519,notequimasehualoni naytia yn amotlatocayxpātzi...,nontequimahcehualoni nayitia in amotlahtohcaix...
13520,auh ca huel ilhuil yn miquistli.,auh ca huel ilhuil in miquiztli.
13521,"ca tehuanti topan mochihuas yn ixquich tetli, ...","ca tehhuantin topan mochihuaz in ixquich tetl,..."


In [None]:
pre.expand_bilingual_corpus(corpus, "Expanded Corpus")

In [None]:
expanded_corpus = pre.load_data("./data/CSVs/", "Expanded Corpus")

In [None]:
expanded_corpus

Unnamed: 0,Unregularized,Regularized
0,ma xinechmotlapopolhuili i nitlatlacuanipol.,ma xinechmotlapohpolhuili in nitlahtlacoanipol.
1,cuix tipiltzin dios?,cuix tipiltzin dios?
2,monetlamachtil mochihuas ynn ōpa ylhuicac.,monetlamachtil mochihuaz in ompa ilhuicac.
3,auhhuin axcan,auh in axcan
4,ma oc no amehuantzintzinti ximotlantlauhtilican.,ma oc no amehhuantzitzintin ximotlatlauhtilican.
...,...,...
26520,oc quiticinochilito,oc quiticinochilito
26521,oc ipan tlato,oc ipan tlahtoh
26522,yece aocmo inman;,yeceh aocmo imman;
26523,ca nican can yn s.ta crus,ca nican cah in santa cruz


### Splitting the Data

In [None]:
pre.create_raw_data_path()

#### Splitting the Main Data

In [None]:
train, val, test = pre.create_data_split(expanded_corpus, False)

In [None]:
train

Unnamed: 0,Unregularized,Regularized
0,ma xinechmotlapopolhuili i nitlatlacuanipol.,ma xinechmotlapohpolhuili in nitlahtlacoanipol.
1,cuix tipiltzin dios?,cuix tipiltzin dios?
2,monetlamachtil mochihuas ynn ōpa ylhuicac.,monetlamachtil mochihuaz in ompa ilhuicac.
3,auhhuin axcan,auh in axcan
4,ma oc no amehuantzintzinti ximotlantlauhtilican.,ma oc no amehhuantzitzintin ximotlatlauhtilican.
...,...,...
18562,nima hualmonextis caltenpa juan marcos.,niman hualmonextiz caltempan juan marcos.
18563,auh catle ma oncā nicnixnextilia.,auh ca ahtleh mah oncan nicnextilia.
18564,quin yayatinemis.,quin yahyahtinemiz.
18565,onehuas yn fariseo.,onehuaz in fariseoh.


In [None]:
val

Unnamed: 0,Unregularized,Regularized
18567,ca sa no yuhqui in tlamachtili,ca zan no iuhqui in tlamachtilli
18568,in tlacatl,in tlacatl
18569,camo huel quipanahuis in ithemachticauh.,ca ahmo huel quipanahuiz in itemachtihcauh.
18570,nican ca yn cuanpanoli,nican cah in cuauhnepanolli
18571,crus.,cruz.
...,...,...
25193,telpochtli.,telpochtli.
25194,yn ica anquitasque,in ihcuac anquittazqueh
25195,tenilpitica anquihualmohuiquilisq.,tenilpihtica anquihualmohuiquilizqueh.
25196,ceppa çe caxtiltecatl ixpantzinco motlanquaque...,ceppa ce caxtiltecatl ixpantzinco motlancuaque...


In [None]:
test

Unnamed: 0,Unregularized,Regularized
25198,quimotlatlauhtiliticatca.,quimotlatlauhtilihticatca.
25199,masoyhui yn iqui nicali anechxixitinisnequi,mazoihui in iuhqui nicalli anechxixitiniznequih
25200,auh ca amo huecauhtica yn ninoquetzas,auh ca ahmo huehcauhtica in ninoquetzaz
25201,y ninnoscalis.,in ninozcaliz.
25202,aocomo iuic onui;,aocmo ihuic onhuih;
...,...,...
26520,oc quiticinochilito,oc quiticinochilito
26521,oc ipan tlato,oc ipan tlahtoh
26522,yece aocmo inman;,yeceh aocmo imman;
26523,ca nican can yn s.ta crus,ca nican cah in santa cruz


#### Splitting the Backtranslation Data

In [None]:
backtranslate_train, backtranslate_val = pre.create_data_split(expanded_corpus, True)

In [None]:
backtranslate_train

Unnamed: 0,Regularized,Unregularized
0,ma xinechmotlapohpolhuili in nitlahtlacoanipol.,ma xinechmotlapopolhuili i nitlatlacuanipol.
1,cuix tipiltzin dios?,cuix tipiltzin dios?
2,monetlamachtil mochihuaz in ompa ilhuicac.,monetlamachtil mochihuas ynn ōpa ylhuicac.
3,auh in axcan,auhhuin axcan
4,ma oc no amehhuantzitzintin ximotlatlauhtilican.,ma oc no amehuantzintzinti ximotlantlauhtilican.
...,...,...
19888,in mochihchiuhqueh;,in mochichiuhque;
19889,ca iuh onicchiuh nonohmatcah,ca yuh onicchiuh nômatica
19890,iuh anquichihuazqueh.,yuh anquichihuazque.
19891,ca cencah oniciahui.,ca senca onisia.


In [None]:
backtranslate_val

Unnamed: 0,Regularized,Unregularized
19893,oammahxiticoh,oanmaxitico
19894,o tlaltech ammahxiticoh,otlaltech anmaxitico
19895,in amochantzinco,in amochantzinco
19896,cuauhtlaxcallan.”,quauhtlaxcalla.”
19897,ca in ihcuac in otechcuilizquia in togobernado...,ca yn iquac yn otechcuilisquia y togobernador ...
...,...,...
26520,oc quiticinochilito,oc quiticinochilito
26521,oc ipan tlahtoh,oc ipan tlato
26522,yeceh aocmo imman;,yece aocmo inman;
26523,ca nican cah in santa cruz,ca nican can yn s.ta crus


## Tokenization

### Finding the Most Common Pairs of Letters for Unnormalized Sentences

In [None]:
unreg_non_test = pd.concat([train["Unregularized"], val["Unregularized"]])

In [None]:
unreg_non_test

Unnamed: 0,Unregularized
0,ma xinechmotlapopolhuili i nitlatlacuanipol.
1,cuix tipiltzin dios?
2,monetlamachtil mochihuas ynn ōpa ylhuicac.
3,auhhuin axcan
4,ma oc no amehuantzintzinti ximotlantlauhtilican.
...,...
25193,telpochtli.
25194,yn ica anquitasque
25195,tenilpitica anquihualmohuiquilisq.
25196,ceppa çe caxtiltecatl ixpantzinco motlanquaque...


In [None]:
unreg_set = unreg_non_test.copy()

unreg_set = unreg_set.to_list()
unreg_set = " ".join(unreg_set)
unreg_set = unreg_set.translate(str.maketrans("", "", string.punctuation))
unreg_set = unreg_set.split()

In [None]:
unreg_set

['ma',
 'xinechmotlapopolhuili',
 'i',
 'nitlatlacuanipol',
 'cuix',
 'tipiltzin',
 'dios',
 'monetlamachtil',
 'mochihuas',
 'ynn',
 'ōpa',
 'ylhuicac',
 'auhhuin',
 'axcan',
 'ma',
 'oc',
 'no',
 'amehuantzintzinti',
 'ximotlantlauhtilican',
 'oanquiiyohuique',
 'xolototon',
 'auh',
 'yn',
 'axca',
 'ca',
 'yo',
 'nictlalcahui',
 'yc',
 'quisaz',
 'yn',
 'candellas',
 'ca',
 'honamechnotitili',
 'y',
 'notetlasotlalitzin',
 'yn',
 'notetlacolilitzi',
 'notlasomahuisconetzin',
 'ca',
 'yoqui',
 'quimonequitia',
 'yn',
 'senca',
 'yehuatzin',
 'yn',
 'ticmotlacamachiltitzinnos',
 'yn',
 'motlasotatzin',
 'dios',
 'ca',
 'mopaquiltitica',
 'in',
 'mohuepoltzin',
 'ihuan',
 'in',
 'mocnotlacahuan',
 'pipiltotontin',
 'ca',
 'pactinemi',
 'yehuāti',
 'quitosque',
 'yn',
 'tlen',
 'onitemachtiaya',
 'ca',
 'neli',
 'namechilhuia',
 'ca',
 'in',
 'tlacatecolotl',
 'ca',
 'oamechmintlani',
 'inyc',
 'amechsesemanas',
 'in',
 'iuhqui',
 'trigo',
 'mosesemana',
 'mochayahua',
 'cuix',
 'amo',


In [None]:
unreg_pairs = [word[i:i+2] for word in unreg_set for i in range(len(word) - 1)]

In [None]:
unreg_pairs

['ma',
 'xi',
 'in',
 'ne',
 'ec',
 'ch',
 'hm',
 'mo',
 'ot',
 'tl',
 'la',
 'ap',
 'po',
 'op',
 'po',
 'ol',
 'lh',
 'hu',
 'ui',
 'il',
 'li',
 'ni',
 'it',
 'tl',
 'la',
 'at',
 'tl',
 'la',
 'ac',
 'cu',
 'ua',
 'an',
 'ni',
 'ip',
 'po',
 'ol',
 'cu',
 'ui',
 'ix',
 'ti',
 'ip',
 'pi',
 'il',
 'lt',
 'tz',
 'zi',
 'in',
 'di',
 'io',
 'os',
 'mo',
 'on',
 'ne',
 'et',
 'tl',
 'la',
 'am',
 'ma',
 'ac',
 'ch',
 'ht',
 'ti',
 'il',
 'mo',
 'oc',
 'ch',
 'hi',
 'ih',
 'hu',
 'ua',
 'as',
 'yn',
 'nn',
 'ōp',
 'pa',
 'yl',
 'lh',
 'hu',
 'ui',
 'ic',
 'ca',
 'ac',
 'au',
 'uh',
 'hh',
 'hu',
 'ui',
 'in',
 'ax',
 'xc',
 'ca',
 'an',
 'ma',
 'oc',
 'no',
 'am',
 'me',
 'eh',
 'hu',
 'ua',
 'an',
 'nt',
 'tz',
 'zi',
 'in',
 'nt',
 'tz',
 'zi',
 'in',
 'nt',
 'ti',
 'xi',
 'im',
 'mo',
 'ot',
 'tl',
 'la',
 'an',
 'nt',
 'tl',
 'la',
 'au',
 'uh',
 'ht',
 'ti',
 'il',
 'li',
 'ic',
 'ca',
 'an',
 'oa',
 'an',
 'nq',
 'qu',
 'ui',
 'ii',
 'iy',
 'yo',
 'oh',
 'hu',
 'ui',
 'iq',
 'qu',

#### The 100 Most Common Pairs

In [None]:
unreg_counts = Counter(unreg_pairs)

In [None]:
unreg_counts.most_common(100)

[('ui', 18441),
 ('in', 17925),
 ('qu', 17743),
 ('tl', 17693),
 ('ca', 17569),
 ('hu', 14904),
 ('an', 14623),
 ('la', 13657),
 ('ti', 13096),
 ('ic', 12541),
 ('mo', 10109),
 ('li', 9962),
 ('at', 9514),
 ('ua', 9432),
 ('yn', 9011),
 ('ni', 8901),
 ('il', 8729),
 ('ma', 8683),
 ('it', 8681),
 ('ch', 8563),
 ('ac', 8492),
 ('tz', 8289),
 ('ot', 7840),
 ('te', 7705),
 ('ue', 7466),
 ('al', 6943),
 ('oc', 6900),
 ('to', 5980),
 ('pa', 5898),
 ('zi', 5814),
 ('no', 5518),
 ('uh', 5469),
 ('ne', 5445),
 ('on', 5193),
 ('im', 4730),
 ('co', 4567),
 ('nt', 4484),
 ('am', 4322),
 ('ec', 4285),
 ('ia', 4058),
 ('is', 3861),
 ('hi', 3817),
 ('mi', 3791),
 ('lt', 3701),
 ('as', 3499),
 ('el', 3480),
 ('ol', 3437),
 ('en', 3346),
 ('au', 3336),
 ('ah', 2922),
 ('na', 2920),
 ('iq', 2846),
 ('ih', 2830),
 ('nc', 2804),
 ('xi', 2770),
 ('cu', 2763),
 ('ye', 2691),
 ('ix', 2574),
 ('om', 2519),
 ('eh', 2453),
 ('lo', 2304),
 ('et', 2238),
 ('ht', 2201),
 ('oh', 2200),
 ('aq', 2163),
 ('po', 2136),

### Finding the Most Common Pairs of Letters for Normalized Sentences

In [None]:
reg_non_test = pd.concat([train["Regularized"], val["Regularized"]])

In [None]:
reg_non_test

Unnamed: 0,Regularized
0,ma xinechmotlapohpolhuili in nitlahtlacoanipol.
1,cuix tipiltzin dios?
2,monetlamachtil mochihuaz in ompa ilhuicac.
3,auh in axcan
4,ma oc no amehhuantzitzintin ximotlatlauhtilican.
...,...
25193,telpochtli.
25194,in ihcuac anquittazqueh
25195,tenilpihtica anquihualmohuiquilizqueh.
25196,ceppa ce caxtiltecatl ixpantzinco motlancuaque...


In [None]:
reg_set = reg_non_test.copy()

reg_set = reg_set.to_list()
reg_set = " ".join(reg_set)
reg_set = reg_set.translate(str.maketrans("", "", string.punctuation))
reg_set = reg_set.split()

In [None]:
reg_set

['ma',
 'xinechmotlapohpolhuili',
 'in',
 'nitlahtlacoanipol',
 'cuix',
 'tipiltzin',
 'dios',
 'monetlamachtil',
 'mochihuaz',
 'in',
 'ompa',
 'ilhuicac',
 'auh',
 'in',
 'axcan',
 'ma',
 'oc',
 'no',
 'amehhuantzitzintin',
 'ximotlatlauhtilican',
 'oanquihiyohuihqueh',
 'xolototon',
 'auh',
 'in',
 'axcan',
 'ca',
 'ye',
 'onictlalcahuih',
 'ic',
 'quizaz',
 'in',
 'candelas',
 'ca',
 'onamechnottitilih',
 'in',
 'notetlazohtlaliztzin',
 'in',
 'notetlaocoliliztzin',
 'notlazohmahuizconetzin',
 'ca',
 'iuhqui',
 'quimonequitia',
 'in',
 'cencah',
 'yehhuatzin',
 'in',
 'ticmotlacamachiltihtzinoz',
 'in',
 'motlazohtahtzin',
 'dios',
 'ca',
 'mopaquiltihticah',
 'in',
 'mohuehpoltzin',
 'ihuan',
 'in',
 'mocnotlacahuan',
 'pipiltotontin',
 'ca',
 'pactinemih',
 'yehhuantin',
 'quihtozqueh',
 'in',
 'tlein',
 'onitemachtiaya',
 'ca',
 'nelli',
 'namechilhuia',
 'ca',
 'in',
 'tlacatecolotl',
 'ca',
 'oamechmihtlanih',
 'inic',
 'amechcehcemmanaz',
 'in',
 'iuhqui',
 'trigoh',
 'mocehc

In [None]:
reg_pairs = [word[i:i+2] for word in unreg_set for i in range(len(word) - 1)]

In [None]:
reg_pairs

['ma',
 'xi',
 'in',
 'ne',
 'ec',
 'ch',
 'hm',
 'mo',
 'ot',
 'tl',
 'la',
 'ap',
 'po',
 'op',
 'po',
 'ol',
 'lh',
 'hu',
 'ui',
 'il',
 'li',
 'ni',
 'it',
 'tl',
 'la',
 'at',
 'tl',
 'la',
 'ac',
 'cu',
 'ua',
 'an',
 'ni',
 'ip',
 'po',
 'ol',
 'cu',
 'ui',
 'ix',
 'ti',
 'ip',
 'pi',
 'il',
 'lt',
 'tz',
 'zi',
 'in',
 'di',
 'io',
 'os',
 'mo',
 'on',
 'ne',
 'et',
 'tl',
 'la',
 'am',
 'ma',
 'ac',
 'ch',
 'ht',
 'ti',
 'il',
 'mo',
 'oc',
 'ch',
 'hi',
 'ih',
 'hu',
 'ua',
 'as',
 'yn',
 'nn',
 'ōp',
 'pa',
 'yl',
 'lh',
 'hu',
 'ui',
 'ic',
 'ca',
 'ac',
 'au',
 'uh',
 'hh',
 'hu',
 'ui',
 'in',
 'ax',
 'xc',
 'ca',
 'an',
 'ma',
 'oc',
 'no',
 'am',
 'me',
 'eh',
 'hu',
 'ua',
 'an',
 'nt',
 'tz',
 'zi',
 'in',
 'nt',
 'tz',
 'zi',
 'in',
 'nt',
 'ti',
 'xi',
 'im',
 'mo',
 'ot',
 'tl',
 'la',
 'an',
 'nt',
 'tl',
 'la',
 'au',
 'uh',
 'ht',
 'ti',
 'il',
 'li',
 'ic',
 'ca',
 'an',
 'oa',
 'an',
 'nq',
 'qu',
 'ui',
 'ii',
 'iy',
 'yo',
 'oh',
 'hu',
 'ui',
 'iq',
 'qu',

#### The 100 Most Common Pairs

In [None]:
reg_counts = Counter(reg_pairs)

In [None]:
reg_counts.most_common(100)

[('ui', 18441),
 ('in', 17925),
 ('qu', 17743),
 ('tl', 17693),
 ('ca', 17569),
 ('hu', 14904),
 ('an', 14623),
 ('la', 13657),
 ('ti', 13096),
 ('ic', 12541),
 ('mo', 10109),
 ('li', 9962),
 ('at', 9514),
 ('ua', 9432),
 ('yn', 9011),
 ('ni', 8901),
 ('il', 8729),
 ('ma', 8683),
 ('it', 8681),
 ('ch', 8563),
 ('ac', 8492),
 ('tz', 8289),
 ('ot', 7840),
 ('te', 7705),
 ('ue', 7466),
 ('al', 6943),
 ('oc', 6900),
 ('to', 5980),
 ('pa', 5898),
 ('zi', 5814),
 ('no', 5518),
 ('uh', 5469),
 ('ne', 5445),
 ('on', 5193),
 ('im', 4730),
 ('co', 4567),
 ('nt', 4484),
 ('am', 4322),
 ('ec', 4285),
 ('ia', 4058),
 ('is', 3861),
 ('hi', 3817),
 ('mi', 3791),
 ('lt', 3701),
 ('as', 3499),
 ('el', 3480),
 ('ol', 3437),
 ('en', 3346),
 ('au', 3336),
 ('ah', 2922),
 ('na', 2920),
 ('iq', 2846),
 ('ih', 2830),
 ('nc', 2804),
 ('xi', 2770),
 ('cu', 2763),
 ('ye', 2691),
 ('ix', 2574),
 ('om', 2519),
 ('eh', 2453),
 ('lo', 2304),
 ('et', 2238),
 ('ht', 2201),
 ('oh', 2200),
 ('aq', 2163),
 ('po', 2136),

### Training the Tokenizers

In [1]:
import translator.tokenization as tok

In [None]:
tok.create_vocab_path()

tok.prepare_vocab((unreg_non_test), "unregularized_to_tokenize")
tok.prepare_vocab((reg_non_test), "regularized_to_tokenize")

In [3]:
unreg_vocab_size = tok.get_tokenizer("unregularized_to_tokenize", "unregularized_bpe_tokenizer")
reg_vocab_size = tok.get_tokenizer("regularized_to_tokenize", "regularized_bpe_tokenizer")



In [4]:
print("Unregularized tokenizer vocab size: " + str(unreg_vocab_size))
print("Regularized tokenizer vocab size: " + str(reg_vocab_size))

Unregularized tokenizer vocab size: 271
Regularized tokenizer vocab size: 271


#### Testing the Tokenizers

In [5]:
unreg_tokenizer = tok.load_tokenizer("unregularized_bpe_tokenizer")
reg_tokenizer = tok.load_tokenizer("regularized_bpe_tokenizer")

In [6]:
example_unreg = unreg_tokenizer("nica timispopolosq.")["input_ids"]

print(example_unreg)

[224, 81, 76, 262, 224, 267, 80, 76, 86, 83, 82, 83, 82, 79, 82, 86, 84, 17]


In [7]:
example_unreg = unreg_tokenizer.decode(example_unreg)

print(example_unreg)

 nica timispopolosq.


In [8]:
example_reg = reg_tokenizer("nican timitzpohpolozqueh.")["input_ids"]

print(example_reg)

[224, 81, 76, 263, 81, 224, 267, 80, 76, 87, 93, 83, 82, 75, 83, 82, 79, 82, 93, 84, 88, 72, 75, 17]


In [9]:
example_reg = reg_tokenizer.decode(example_reg)

print(example_reg)

 nican timitzpohpolozqueh.


## Backtranslation

In [None]:
import training as train
import translator as tran

In [None]:
train.train_transformer(reg_vocab_size, unreg_vocab_size,
                        "Backtranslation_Train", "Backtranslation_Val",
                        reg_tokenizer, unreg_tokenizer,
                        100, "backtranslator")



Epoch: 1, Train loss: 2.355, Val loss: 1.687, Epoch time = 48.975s
Epoch: 2, Train loss: 1.606, Val loss: 0.981, Epoch time = 47.047s
Epoch: 3, Train loss: 1.173, Val loss: 0.752, Epoch time = 46.493s
Epoch: 4, Train loss: 0.959, Val loss: 0.633, Epoch time = 45.720s
Epoch: 5, Train loss: 0.833, Val loss: 0.575, Epoch time = 46.154s
Epoch: 6, Train loss: 0.747, Val loss: 0.525, Epoch time = 46.061s
Epoch: 7, Train loss: 0.678, Val loss: 0.473, Epoch time = 46.201s
Epoch: 8, Train loss: 0.624, Val loss: 0.459, Epoch time = 47.081s
Epoch: 9, Train loss: 0.578, Val loss: 0.443, Epoch time = 46.100s
Epoch: 10, Train loss: 0.539, Val loss: 0.410, Epoch time = 45.926s
Epoch: 11, Train loss: 0.501, Val loss: 0.375, Epoch time = 45.832s
Epoch: 12, Train loss: 0.467, Val loss: 0.351, Epoch time = 46.162s
Epoch: 13, Train loss: 0.439, Val loss: 0.347, Epoch time = 46.511s
Epoch: 14, Train loss: 0.415, Val loss: 0.319, Epoch time = 46.890s
Epoch: 15, Train loss: 0.391, Val loss: 0.315, Epoch time

In [None]:
backtranslator = tran.load_trained_transformer(reg_vocab_size, unreg_vocab_size, "backtranslator")

  transformer.load_state_dict(torch.load(SAVED_MODEL_PATH + file_name + ".pt",


In [None]:
back_translations, full_set = tran.back_translate(backtranslator, "To Backtranslate",
                                                  reg_tokenizer, unreg_tokenizer)

In [None]:
back_translations

Unnamed: 0,Unregularized,Regularized
0,iehica in tlaxcallan amo tlaçotli catca in qua...,yehica in tlaxcallan ahmo tlazohtli catca in q...
1,yn sentli,in centli
2,chilli,chilli
3,etl,etl
4,yhuan ocsequi tetech monequi.,ihuan oc cequi tetech monequi.
...,...,...
4729,queni motas,quenin mottaz
4730,yeica in miec ipatiuh tlapoaltepoztli.,yehica in miyec ipatiuh tlapohualtepoztli.
4731,tonesetlalilis yn timochtin,tonecentlaliliz in timochtin
4732,yn timexica yhuan yn timichhuaque,in timexihcah ihuan in timichhuahqueh


In [None]:
full_set.tail(4734)

Unnamed: 0,Unregularized,Regularized
0,iehica in tlaxcallan amo tlaçotli catca in qua...,yehica in tlaxcallan ahmo tlazohtli catca in q...
1,yn sentli,in centli
2,chilli,chilli
3,etl,etl
4,yhuan ocsequi tetech monequi.,ihuan oc cequi tetech monequi.
...,...,...
4729,queni motas,quenin mottaz
4730,yeica in miec ipatiuh tlapoaltepoztli.,yehica in miyec ipatiuh tlapohualtepoztli.
4731,tonesetlalilis yn timochtin,tonecentlaliliz in timochtin
4732,yn timexica yhuan yn timichhuaque,in timexihcah ihuan in timichhuahqueh


## Normalization Training

In [12]:
train.train_transformer(unreg_vocab_size, reg_vocab_size,
                        "Combined Training", "Val",
                        unreg_tokenizer, reg_tokenizer,
                        100, "normalizer")



Epoch: 1, Train loss: 2.237, Val loss: 1.545, Epoch time = 55.281s
Epoch: 2, Train loss: 1.437, Val loss: 0.873, Epoch time = 52.898s
Epoch: 3, Train loss: 1.014, Val loss: 0.683, Epoch time = 53.036s
Epoch: 4, Train loss: 0.816, Val loss: 0.574, Epoch time = 53.285s
Epoch: 5, Train loss: 0.699, Val loss: 0.505, Epoch time = 53.902s
Epoch: 6, Train loss: 0.614, Val loss: 0.504, Epoch time = 53.289s
Epoch: 7, Train loss: 0.547, Val loss: 0.408, Epoch time = 53.278s
Epoch: 8, Train loss: 0.492, Val loss: 0.347, Epoch time = 53.719s
Epoch: 9, Train loss: 0.445, Val loss: 0.310, Epoch time = 53.190s
Epoch: 10, Train loss: 0.401, Val loss: 0.282, Epoch time = 53.790s
Epoch: 11, Train loss: 0.362, Val loss: 0.261, Epoch time = 53.200s
Epoch: 12, Train loss: 0.324, Val loss: 0.248, Epoch time = 53.660s
Epoch: 13, Train loss: 0.295, Val loss: 0.235, Epoch time = 53.399s
Epoch: 14, Train loss: 0.269, Val loss: 0.225, Epoch time = 53.628s
Epoch: 15, Train loss: 0.249, Val loss: 0.212, Epoch time

In [13]:
normalizer = tran.load_trained_transformer(unreg_vocab_size, reg_vocab_size, "normalizer")

  transformer.load_state_dict(torch.load(SAVED_MODEL_PATH + file_name + ".pt",


## Testing the Normalizer

In [14]:
tran.translate_test(normalizer, "Test", unreg_tokenizer, reg_tokenizer)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Input: xicmotlatzontequililican ynic polihuis yn iteystlacahuilis!
Predicted: xicmotlatzontequililican inic polihuiz in iteiztlacahuiliz!
Expected: xicmotlatzontequililican inic polihuiz in iteiztlacahuiliz!
Character Error Rate: 0.0


Input: inic navi vevetecatl.
Predicted: inic nahui huehuehtecatl.
Expected: inic nahui huehuetecatl.
Character Error Rate: 0.0416666679084301


Input: onexmotlasotili
Predicted: onechmotlazohtilih
Expected: onechmotlazohtilih
Character Error Rate: 0.0


Input: onechmouapahuili
Predicted: onechmohuapahuilih
Expected: onechmohuapahuilih
Character Error Rate: 0.0


Input: onechmonextillili ynn itetlasotlalitzin.
Predicted: onechmonextililih in itetlazohtlaliztzin.
Expected: onechmonextilih in itetlazohtlaliztzin.
Character Error Rate: 0.05128205195069313


Input: ca ye huecauh nechixticate.
Predicted: ca ye huehcauh nechixticateh.
Expected: ca ye huehcauh nechchixticateh.
Character Error Rate