In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader, DataLoader2
from pytorch_lightning.callbacks import TQDMProgressBar, LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import pytorch_lightning as pl

import torchtext
import  torchdata

import sentencepiece as spm

import matplotlib.pyplot as plt
import seaborn as sns
# Ask the inline backend to emit figures in SVG format
%config InlineBackend.figure_formats = ['svg']
# Apply matplotlib's 'fivethirtyeight' style sheet to plots created after this point
plt.style.use('fivethirtyeight')

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and data-loading
# warnings from the libraries imported above; consider narrowing the filter
# by category or module once the notebook is stable.
warnings.filterwarnings("ignore")

In [10]:
# --- Tokenizer settings (forwarded to SentencePieceTrainer.train) ---
vtype = 'bpe'          # passed as model_type
vocab_size = 16_384    # passed as vocab_size, once per language

# --- Language pair requested from IWSLT2016: (source, target) ---
lang1, lang2 = 'fr', 'en'

# --- Batching ---
block_size = 32        # target length each encoded sentence is padded to
batch_size = 256       # sentence pairs per DataLoader batch


# Store text as pytorch datasets
class Text(Dataset):
    """Map-style dataset of tokenized, fixed-length sentence pairs.

    Every ``(source, target)`` string pair yielded by ``text`` is lower-cased,
    encoded with its own tokenizer and turned into a 1-D ``torch.long`` tensor
    of exactly ``block_size`` ids: shorter encodings are right-padded with the
    tokenizer's ``<pad>`` id, longer ones are cut off after ``block_size`` ids.

    Args:
        text: iterable yielding ``(source_str, target_str)`` pairs.
        spw_x: tokenizer for the source side; only ``Encode(str)`` returning a
            list of ints is used here.
        spw_y: tokenizer for the target side (same requirement as ``spw_x``).
        block_size: length of every returned id tensor.
    """

    def __init__(self, text, spw_x, spw_y, block_size) -> None:
        super().__init__()

        # The pad id is taken as the last id produced when encoding the
        # literal '<pad>' symbol with each tokenizer.
        pad_x = spw_x.Encode('<pad>')[-1]
        pad_y = spw_y.Encode('<pad>')[-1]

        # Build dataset
        self.x = []
        self.y = []
        for src, tgt in text:
            self.x.append(self._encode_fixed(spw_x, src, block_size, pad_x))
            self.y.append(self._encode_fixed(spw_y, tgt, block_size, pad_y))

    @staticmethod
    def _encode_fixed(spw, sentence, block_size, pad_id):
        """Encode one lower-cased sentence into a length-``block_size`` long tensor."""
        ids = spw.Encode(sentence.lower())[:block_size]
        # Start from an all-pad tensor with an explicit integer dtype: building
        # the tensor straight from an empty id list would default to float32
        # and give that sample a different dtype from all the others.
        out = torch.full((block_size,), pad_id, dtype=torch.long)
        out[:len(ids)] = torch.tensor(ids, dtype=torch.long)
        return out

    def __len__(self):
        """Number of sentence pairs stored."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the ``(source_ids, target_ids)`` tensors at ``index``."""
        return self.x[index], self.y[index]

# Get dataset for languages 1 and 2
def get_data(vtype, batch_size, vocab_size, block_size, lang1, lang2):
    """Build tokenizers and padded DataLoaders for one IWSLT2016 language pair.

    Steps performed:
      1. Load the train / valid / test splits of IWSLT2016 for ``(lang1, lang2)``.
      2. Concatenate all three splits per language, lower-case the result and
         write it to ``data/raw_{lang}_text.txt``.
      3. Train one SentencePiece model per language on that file
         (``mb_{lang}.model``), registering ``'<pad>'`` as a user-defined symbol.
      4. Encode the train and valid splits into ``Text`` datasets and wrap them
         in DataLoaders.

    A few sanity-check prints (corpus sizes, one raw and one encoded pair, the
    pad id of each tokenizer, one decoded padded pair) are emitted on the way.

    Args:
        vtype: SentencePiece ``model_type`` (e.g. ``'bpe'``).
        batch_size: batch size for both DataLoaders.
        vocab_size: vocabulary size for each SentencePiece model.
        block_size: length every encoded sentence is padded to (see ``Text``).
        lang1: source language code.
        lang2: target language code.

    Returns:
        ``(train_dl, val_dl, spw_x, spw_y)``: the training loader (shuffled),
        the validation loader (not shuffled) and the source / target
        SentencePiece processors.
    """

    raw_x = f'data/raw_{lang1}_text.txt'
    raw_y = f'data/raw_{lang2}_text.txt'

    # Build text dataset
    train_text, valid_text, test_text = torchtext.datasets.IWSLT2016(root='data',  language_pair=(lang1, lang2))

    # Concatenate the dataset: collect the pieces and join once instead of
    # growing two very large strings with repeated `+=`.
    parts_x = []
    parts_y = []
    for split in (train_text, valid_text, test_text):
        for x, y in split:
            parts_x.append(x)
            parts_y.append(y)
    text_x = ''.join(parts_x)
    text_y = ''.join(parts_y)

    print (f'{lang1} text length is: ', len(text_x))
    print (f'{lang2} text length is: ', len(text_y))

    # Write the lower-cased corpora. The context manager guarantees the file is
    # flushed and closed before SentencePiece reads it, and the explicit UTF-8
    # encoding keeps accented characters intact regardless of the OS default.
    for path, corpus in ((raw_x, text_x), (raw_y, text_y)):
        with open(path, 'w', encoding='utf-8') as f:
            f.write(corpus.lower())

    # Train one tokenizer per language with identical settings.
    for path, lang in ((raw_x, lang1), (raw_y, lang2)):
        spm.SentencePieceTrainer.train(input=path, vocab_size=vocab_size, user_defined_symbols='<pad>',
                                        model_type=vtype, model_prefix=f'mb_{lang}',
                                        minloglevel=2)

    spw_x = spm.SentencePieceProcessor(f'mb_{lang1}.model')
    spw_y = spm.SentencePieceProcessor(f'mb_{lang2}.model')

    # Sanity check: show the first raw training pair and its encoding.
    x, y = next(iter(train_text))
    print (x.lower())
    print (y.lower())
    print (spw_x.Encode(x.lower()))
    print (spw_y.Encode(y.lower()))

    # Sanity check: the pad id of each tokenizer must decode back to '<pad>'.
    for spw in (spw_x, spw_y):
        padd = spw.Encode('<pad>')[-1]
        print (padd)
        print (spw.Decode([padd]))

    train = Text(train_text, spw_x, spw_y, block_size)
    valid = Text(valid_text, spw_x, spw_y, block_size)

    # Sanity check: decode the first padded training pair.
    x, y = train[0]
    print (spw_x.Decode(x.tolist()))
    print (spw_y.Decode(y.tolist()))

    train_dl = DataLoader(train, batch_size=batch_size, shuffle=True)
    # Validation batches are served in a fixed order so that runs are comparable;
    # only the training loader needs shuffling.
    val_dl = DataLoader(valid, batch_size=batch_size, shuffle=False)

    return train_dl, val_dl, spw_x, spw_y

# Build the loaders and both tokenizers once for the configured language pair
train_dl, val_dl, spw_fr, spw_en = get_data(
    vtype=vtype,
    batch_size=batch_size,
    vocab_size=vocab_size,
    block_size=block_size,
    lang1=lang1,
    lang2=lang2,
)



fr text length is:  23824555
en text length is:  21392513
david gallo: voici bill lange. je suis dave gallo.

david gallo: this is bill lange. i'm dave gallo.

[4135, 74, 720, 16333, 16361, 475, 3148, 7, 1044, 16344, 77, 311, 13520, 74, 720, 16333, 16344]
[3608, 3998, 16343, 16374, 69, 58, 1882, 36, 1260, 16361, 7, 16364, 16353, 10871, 3998, 16343, 16361]
3
<pad>
3
<pad>
david gallo: voici bill lange. je suis dave gallo.<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
david gallo: this is bill lange. i'm dave gallo.<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [30]:
# Sanity check: pull one training batch and look at its first sentence pair,
# both as raw token ids and decoded back to text.
batch = next(iter(train_dl))
x, y = batch

first_src_ids, first_tgt_ids = x[0], y[0]
print (first_src_ids)
print (first_tgt_ids)
print (spw_fr.Decode(first_src_ids.tolist()))
print (spw_en.Decode(first_tgt_ids.tolist()))

tensor([  285,   680,  1443,   206,  1281,  3765,   152,   206,    39, 16343,
         3133,  5391,    44,  1501,  3285, 16341,    39, 16343, 11716,    44,
          734,  1522, 16344,     3,     3,     3,     3,     3,     3,     3,
            3,     3])
tensor([  148,   355,   239,   115,  1585,    43,  2123, 16365, 13784,   115,
          225,    25,   935,  1604,  1298, 16359,  2845,     9,  1832,   544,
        16361,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3])
ma première année -- super motivée -- j'allais enseigner le gouvernement américain, j'adorais le système politique.<pad><pad><pad><pad><pad><pad><pad><pad><pad>
my first year -- super gung-ho -- going to teach american government, loved the political system.<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [26]:
# Decode a single French token id to see which piece it stands for
token_id = 16344
print (spw_fr.Decode([token_id]))

.


In [180]:
# Scratch numbers that used to sit here as a bare tuple expression (a no-op,
# since it was not the last expression of the cell).
# NOTE(review): presumably sentence-length statistics from an earlier
# experiment; confirm their meaning or delete them.
# 26.87, 46.42

# Demonstrate right-padding a short 1-D tensor to a fixed length with F.pad
target_len = 8
pad_id = 3  # the id both tokenizers printed for '<pad>'
x = torch.randint(0,100,(1,))
padl = target_len - len(x)
y = torch.nn.functional.pad(x, (0,padl), mode='constant', value=pad_id)

print (x)
print (y)
print (len(y))

tensor([25])
tensor([25,  3,  3,  3,  3,  3,  3,  3])
8


In [103]:
# Decode the first training pair back to text as a round-trip check.
# `train` is only a local variable inside get_data(), so reach the dataset
# through the DataLoader that get_data() returned instead of relying on a
# global left over from an earlier session.
fr, en = train_dl.dataset[0]
print (spw_fr.Decode(fr.tolist()))
print (spw_en.Decode(en.tolist()))



david gallo: voici bill lange. je suis dave gallo.
david gallo: this is bill lange. i'm dave gallo.


In [5]:
# Scratch arithmetic: 4 * 4096 = 16384, the same number assigned to vocab_size
4096*4

16384

In [12]:
# Preview the first 1000 characters of `textx`.
# NOTE(review): `textx` is not assigned in any cell visible here; it is
# presumably the un-lowercased French corpus from an earlier session. Confirm
# it is defined before this cell, otherwise the notebook will not survive
# Restart Kernel -> Run All.
textx[:1000]

"David Gallo: Voici Bill Lange. Je suis Dave Gallo.\nNous allons vous raconter quelques histoires de la mer en vidéo.\nNous avons des vidéos du Titanic parmi les plus spectaculaires jamais vues. et nous n'allons pas vous en montrer une image.\nLa vérité est que le Titanic -- même s'il continue de battre toutes les records de recettes -- n'est pas l'histoire la plus passionnante.\nLe problème, je crois, est qu'on tient l'océan pour acquis.\nQuand vous y pensez, les océans représentent 75% de la planète.\nLa plus grande partie de la planète est d'eau.\nLa profondeur moyenne est environ 3,2 km.\nUne partie du problème, je pense, est qu'en étant sur la plage ou en regardant des images de l'océan, comme celles-ci, on voit cette grande étendue bleue, chatoyante, ça bouge, il y a des vagues, il y a du surf et il y a des marées, mais vous n'avez aucune idée de ce qui s'y cache.\nIl y existe les chaînes de montagnes les plus longues de la planète.\nLa plupart des animaux se trouvent dans les oc