In [132]:
# prueba del transformer
import numpy as np
import torch  # libreria principal de python
import torch.nn as nn  # modulo para las redes neuronales
import torch.optim as optim  # modulo para algoritmos de optimizacion en redes neuronales
import torch.utils.data as data  # modulo para tratar con los datasets
import math  # operaciones matematicas
import copy  # para copiar objetos y estructuras
import  MultiHeadAttention
import FeedForward
import Transformer
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split

In [133]:
src_vocab_size = 10000
tgt_vocab_size = 10000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

In [134]:
dataset = pd.read_csv("dataset_English-Spanish_Translation_Dataset.csv", sep=",", encoding="utf-8")

print("Mostramos contenido de las primeras 10 lineas")
dataset.head(10)

Mostramos contenido de las primeras 10 lineas


Unnamed: 0,english,spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
5,Run!,¡Corre!
6,Run.,Corred.
7,Who?,¿Quién?
8,Fire!,¡Fuego!
9,Fire!,¡Incendio!


In [135]:
print("Dimensiones del dataset de prueba")
dataset.shape

Dimensiones del dataset de prueba


(118964, 2)

In [136]:
dataset.tail(10)

Unnamed: 0,english,spanish
118954,You can't view Flash content on an iPad. Howev...,No puedes ver contenido en Flash en un iPad. S...
118955,A mistake young people often make is to start ...,Un error que cometen a menudo los jóvenes es e...
118956,No matter how much you try to convince people ...,No importa cuánto insistas en convencer a la g...
118957,"In 1969, Roger Miller recorded a song called ""...","En 1969, Roger Miller grabó una canción llamad..."
118958,A child who is a native speaker usually knows ...,Un niño que es hablante nativo normalmente sab...
118959,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
118960,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
118961,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
118962,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...
118963,"If you want to sound like a native speaker, yo...","Si quieres sonar como un hablante nativo, debe..."


In [137]:
dataset.info

<bound method DataFrame.info of                                                   english  \
0                                                     Go.   
1                                                     Go.   
2                                                     Go.   
3                                                     Go.   
4                                                     Hi.   
...                                                   ...   
118959  There are four main causes of alcohol-related ...   
118960  There are mothers and fathers who will lie awa...   
118961  A carbon footprint is the amount of carbon dio...   
118962  Since there are usually multiple websites on a...   
118963  If you want to sound like a native speaker, yo...   

                                                  spanish  
0                                                     Ve.  
1                                                   Vete.  
2                                                   Vay

In [138]:
# dividimos en conjunto de train y de test
x_train_ingles, x_test_ingles, y_train_ingles, y_test_ingles = train_test_split(dataset["english"], dataset["spanish"], random_state=42, test_size=0.2)

x_train_ingles.shape, y_train_ingles.shape, x_test_ingles.shape, y_test_ingles.shape

((95171,), (95171,), (23793,), (23793,))

In [139]:
# vamos a preprocesar el texto, idioma si esta a true = español y si esta a false = ingles
def preprocesamiento_texto(texto, idioma: bool):
    texto = texto.lower()

    # si es español, ponemos etiqueta <esp>
    if idioma:
        texto = "<sos> <esp> " + texto + " <eos>"
    # si es en ingles, ponemos etiqueta <eng>
    else:
        texto = "<eng> " + texto 
        
    return texto

x_train_ingles = x_train_ingles.apply(preprocesamiento_texto, idioma= False)
y_train_spanish = y_train_ingles.apply(preprocesamiento_texto, idioma= True)

In [140]:
x_train_ingles.head(5)

36040                     <eng> i have no choice at all.
5968                               <eng> i had to do it.
4653                                <eng> is this yours?
95794    <eng> the family had a hard time after the war.
72955            <eng> i want you to call off the fight.
Name: english, dtype: object

In [141]:
y_train_spanish.head(5)

36040    <sos> <esp> no tengo otra opción en absoluto.<...
5968                    <sos> <esp> tuve que hacerlo.<eos>
4653                       <sos> <esp> ¿es suyo esto?<eos>
95794    <sos> <esp> la familia atravesó por un mal tie...
72955      <sos> <esp> quiero que suspendas la pelea.<eos>
Name: spanish, dtype: object

In [142]:
from nltk import word_tokenize

def tokenizar (texto):
    tokens = word_tokenize(texto)
    print(tokens)
    return tokens

x_train_ingles = x_train_ingles.apply(tokenizar)

['<', 'eng', '>', 'i', 'have', 'no', 'choice', 'at', 'all', '.']
['<', 'eng', '>', 'i', 'had', 'to', 'do', 'it', '.']
['<', 'eng', '>', 'is', 'this', 'yours', '?']
['<', 'eng', '>', 'the', 'family', 'had', 'a', 'hard', 'time', 'after', 'the', 'war', '.']
['<', 'eng', '>', 'i', 'want', 'you', 'to', 'call', 'off', 'the', 'fight', '.']
['<', 'eng', '>', 'his', 'wish', 'is', 'to', 'go', 'to', 'america', '.']
['<', 'eng', '>', 'tom', 'is', 'mary', "'s", 'neighbor', '.']
['<', 'eng', '>', 'please', 'visit', 'me', 'tomorrow', '.']
['<', 'eng', '>', 'is', 'tom', 'always', 'like', 'this', '?']
['<', 'eng', '>', 'we', 'have', 'our', 'dinner', 'between', 'seven', 'and', 'seven-thirty', '.']
['<', 'eng', '>', 'i', "'ve", 'been', 'working', 'with', 'tom', 'for', 'three', 'months', '.']
['<', 'eng', '>', 'it', "'s", 'late', '.', 'i', 'have', 'to', 'go', '.']
['<', 'eng', '>', 'tom', 'has', 'no', 'sense', 'of', 'style', '.']
['<', 'eng', '>', 'i', 'regret', 'that', 'i', 'wasted', 'the', 'money', '.']

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).