In [1]:
import unidecode 
import string
from collections import defaultdict

import random
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable


In [2]:
def preprocessing(filname='English.txt'):
    """Read a text file and encode it as a list of integer character indices.

    The text is transliterated to ASCII with ``unidecode``; every distinct
    character then receives an index starting at 1. Index 0 is never assigned
    here: it is what the returned ``defaultdict(int)`` yields for a character
    that was not present in the file.

    Args:
        filname: Path of the text file to read.

    Returns:
        A ``(x_vect, dico)`` tuple. ``x_vect`` holds one index per character of
        the transliterated text, in order; ``dico`` is the ``defaultdict(int)``
        mapping each character to its index.
    """
    # Context manager so the file handle is closed as soon as it has been read.
    with open(filname) as handle:
        text = unidecode.unidecode(handle.read())
    data = list(text)
    dico = defaultdict(int)
    # Sorting makes the character -> index mapping reproducible between runs;
    # the iteration order of a bare set of strings varies with hash randomization.
    for i, c in enumerate(sorted(set(data))):
        dico[c] = i + 1
    x_vect = [dico[c] for c in data]
    return x_vect, dico
# Encode the default corpus once; x_vect and dico are reused by the cells below.
x_vect, dico = preprocessing()

In [6]:
#dico

In [3]:
# Sanity check: number of encoded characters and number of distinct characters.
len(x_vect), len(dico)

(27125, 53)

In [4]:
def make_batch(x_vect, batch_size=64, chunk_size=64):
    """Sample random contiguous windows of ``x_vect`` and stack them into a batch.

    Args:
        x_vect: Sequence of integer character indices.
        batch_size: Number of windows to draw. At least one window is always
            drawn, even when ``batch_size`` is smaller than 1.
        chunk_size: Length of each window.

    Returns:
        A tensor of shape ``(chunk_size, batch_size)``: row ``t`` holds the
        ``t``-th element of every sampled window, so iterating over the result
        walks through the time steps of the whole batch.

    Raises:
        ValueError: If ``x_vect`` is shorter than ``chunk_size``.
    """
    # random.randint is inclusive on both ends, so the last valid start is
    # len - chunk_size (a window ending exactly on the final element).
    last_start = len(x_vect) - chunk_size
    if last_start < 0:
        raise ValueError(
            'x_vect has %d elements, fewer than chunk_size=%d'
            % (len(x_vect), chunk_size)
        )
    starts = [random.randint(0, last_start) for _ in range(max(batch_size, 1))]
    # Build the whole batch in one tensor instead of concatenating row by row.
    windows = [x_vect[s:s + chunk_size] for s in starts]
    return torch.tensor(windows).permute(1, 0)
# Draw one batch with the default batch and chunk sizes.
batch_x = make_batch(x_vect)

In [5]:
# Scratch cell: the random tensor below is discarded; only len(dico) is displayed.
torch.randn(4).float()
len(dico)

53

In [32]:
class CellLSTM(nn.Module):
    """One recurrent step of a gated memory cell followed by a linear read-out.

    Every gate reads the concatenation ``[input, hidden]``. A single sigmoid
    gate ``f`` both keeps old memory (``memory * f``) and admits the new
    candidate (``(1 - f) * candidate``). The new hidden state is a sigmoid gate
    multiplied by a tanh read of the updated memory, and the output is a linear
    projection of that hidden state.

    Args:
        input_size: Width of the per-step input vector.
        hidden_size: Width of the hidden state.
        memory_size: Width of the memory vector.
        output_size: Stored on the instance only.
            NOTE(review): no layer uses it; the read-out width is ``dico_size``.
        dico_size: Number of output features of the final linear layer.
    """

    def __init__(self, input_size, hidden_size, memory_size, output_size, dico_size=54):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.memory_size = memory_size
        self.output_size = output_size
        self.dico_size = dico_size

        # All gates except `read` consume the concatenated [input, hidden] vector.
        gate_width = self.input_size + self.hidden_size

        # Forget gate: fraction of the old memory to keep.
        self.forgot = nn.Sequential(
            nn.Linear(gate_width, self.memory_size),
            nn.Sigmoid(),
        )
        # Candidate content to write into the memory.
        self.write = nn.Sequential(
            nn.Linear(gate_width, self.memory_size),
            nn.Tanh(),
        )
        # Projection of the updated memory into hidden space.
        self.read = nn.Sequential(
            nn.Linear(self.memory_size, self.hidden_size),
            nn.Tanh(),
        )
        # Gate deciding how much of the memory read reaches the hidden state.
        self.tranformation = nn.Sequential(
            nn.Linear(gate_width, self.hidden_size),
            nn.Sigmoid(),
        )

        self.output = nn.Linear(self.hidden_size, self.dico_size)

    def forward(self, input, hidden, memory):
        """Advance the cell by one step.

        Args:
            input: Tensor concatenated with ``hidden`` along dim 1 (cast to float).
            hidden: Previous hidden state (cast to float).
            memory: Previous memory, multiplied element-wise by the forget gate.

        Returns:
            ``(out, new_hidden, new_memory)``.
        """
        # Original author's note: the concatenation is the costlier option; two
        # linear maps (only one of them carrying a bias) would be preferable.
        gate_input = torch.cat((input.float(), hidden.float()), dim=1)

        # Forget gate, applied element-wise to the previous memory.
        keep = self.forgot(gate_input)
        retained = memory * keep

        # Memory update: the candidate is admitted with the complementary weight.
        admitted = (1 - keep) * self.write(gate_input)
        new_memory = retained + admitted

        # Hidden state: gated read of the updated memory.
        recalled = self.read(new_memory)
        exposure = self.tranformation(gate_input)
        new_hidden = exposure * recalled

        return self.output(new_hidden), new_hidden, new_memory
    
    

In [33]:
class LSTM(nn.Module):
    """Character-level model: an embedding layer feeding a single ``CellLSTM``.

    Args:
        input_size: Width of the character embedding, i.e. of the input that
            the cell concatenates with the hidden state.
        hidden_size: Width of the hidden state.
        memory_size: Width of the cell memory.
        output_size: Stored and forwarded to ``CellLSTM``.
        vocab_size: Number of embedding rows and of output classes. When
            omitted it is ``len(dico) + 1``, read from the notebook-level
            ``dico``; the extra row covers index 0.
    """

    def __init__(self, input_size, hidden_size, memory_size, output_size, vocab_size=None):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.memory_size = memory_size
        self.output_size = output_size

        if vocab_size is None:
            vocab_size = len(dico) + 1
        self.vocab_size = vocab_size

        # The embedding width must be input_size: the cell's gates are sized
        # input_size + hidden_size, so using hidden_size here only worked while
        # both values happened to be equal.
        self.encoder = nn.Embedding(vocab_size, input_size)
        # Size the read-out from the same vocabulary as the embedding instead of
        # relying on the cell's hard-coded default.
        self.cell_lstm1 = CellLSTM(
            input_size, hidden_size, memory_size, output_size, dico_size=vocab_size
        )

    def forward(self, input, hidden, memory):
        """Embed ``input`` and advance the cell by one step.

        Args:
            input: Tensor of character indices for the current step.
            hidden: Previous hidden state.
            memory: Previous cell memory.

        Returns:
            ``(output, hidden, memory)`` as returned by the cell.
        """
        encoded_input = self.encoder(input)
        output, hidden, memory = self.cell_lstm1(encoded_input, hidden, memory)
        return output, hidden, memory

    def init_hidden(self, batch_size=64):
        """Return zero-filled ``(hidden, memory)`` states for ``batch_size`` sequences."""
        return torch.zeros(batch_size, self.hidden_size), torch.zeros(batch_size, self.memory_size)

In [34]:
# Number of distinct characters in the corpus.
len(dico)

53

In [54]:
# Hyper-parameters and the training objects (model, optimizer, loss) used by
# the train/generate cells.
epochs = 1000
input_encoder_size = 20
output_size = len(dico)  # number of classes

hidden_size = 20 # arbitrary
memory_size = 50
lr = 0.005

model =  LSTM(input_encoder_size, hidden_size, memory_size, output_size)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss()

In [62]:
def generate(caract='C', temp=1.):
    """Sample 200 characters from the notebook-level ``model`` and print them.

    The seed character is printed first, then each sampled character as it is
    drawn, followed by a final newline so that later output starts on its own
    line. Nothing is returned.

    Args:
        caract: Seed character. A character absent from ``dico`` is fed to the
            model as index 0.
        temp: Softmax temperature; the logits are divided by it before sampling.
    """
    hidden, memory = model.init_hidden(batch_size=1)
    # Reverse lookup (index -> character), built once instead of on every step.
    index_to_char = {index: char for char, index in dico.items()}
    # .get avoids inserting an unknown seed character into the shared dico.
    new_char = dico.get(caract, 0)
    print(caract, end='')
    # Sampling needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for _ in range(200):
            predict, hidden, memory = model(torch.tensor([new_char]), hidden, memory)
            # Explicit dim: the implicit-dimension form of Softmax is deprecated.
            probs = torch.softmax(predict / temp, dim=1)
            # Keep a plain int so the next step's input is always of shape (1,).
            new_char = torch.multinomial(probs, num_samples=1).item()
            # An index with no character (e.g. 0) prints nothing instead of raising.
            print(index_to_char.get(new_char, ''), end='')
    print()


In [56]:
def train(data_x):
    """Run one optimisation step on a batch and return its loss.

    Each time step is fed to the notebook-level ``model`` and scored against
    the following time step with ``loss_function``; the per-step losses are
    summed, back-propagated once, and ``optimizer`` takes one step.

    Args:
        data_x: Tensor of character indices whose first dimension is time and
            whose second dimension is the batch.

    Returns:
        The summed loss over all time steps, as a Python float.

    Raises:
        ValueError: If ``data_x`` has fewer than two time steps, since there is
            then no (input, target) pair to learn from.
    """
    if len(data_x) < 2:
        raise ValueError('data_x needs at least 2 time steps, got %d' % len(data_x))

    # Size the recurrent state from the batch itself rather than assuming 64.
    hidden, memory = model.init_hidden(batch_size=data_x.size(1))
    optimizer.zero_grad()
    loss = 0.
    # Pair every time step with its successor: the model predicts the next character.
    for current, target in zip(data_x[:-1], data_x[1:]):
        output, hidden, memory = model(current, hidden, memory)
        loss += loss_function(output, target)

    loss.backward()
    optimizer.step()

    return loss.item()


In [59]:
import time

# Training run: one random batch per epoch; every 100 epochs, log progress and
# print a sample generated at temperature 1.2.
start = time.time()
all_losses = []  # loss returned by train() for every epoch, in order

for epoch in range(1, epochs + 1):

    loss = train(make_batch(x_vect))

    if epoch % 100 == 0:
        # Fields: elapsed seconds, epoch, percent complete, loss of this epoch.
        print('[%s (%d %d%%) %.4f]' % (time.time() - start, epoch, epoch / epochs * 100, loss))
        # Seed the sample with a character picked at random from the vocabulary.
        new_carac = random.choice(list(dico.keys()))
        generate(new_carac, 1.2)
    all_losses.append(loss)


[10.079665660858154 (100 10%) 103.3259]
Eveirt
Knay
Mweatt
Itler
Glare
Ozlimins
Adlanson
Kidgetestoll
Chrepsieher
Elman
Calden
Talwairst
Gawlon
Handor
Haurke
Gacen
Panner
Haylin
Carksnes
Craint
Eaness
Gaphriggell

  del sys.path[0]



Harrom
Dannarly
Lainhwell
ei[21.064146757125854 (200 20%) 98.9328]
Rookins
Mish
Fowne
Kidwigan
Coug
Cohilson
Esdand
Goitheran
Elkin
Ivie
Chortyp
Orchip
Grogsey
Thorny
Kockes
Shoherninson
Bupbins
Yorpem
Worch
Tovens
Kinckorn
Mutrigugh
Brought
Jorr
Word
Snurtin
Wolvarr
[31.688946962356567 (300 30%) 99.3906]
n
Vomssos
Wower
Rosan
Wouxley
Whorp
Morodon
Ficonord
Ford
Fitterixtock
Mougan
Foner
Forkir
Mowoodfhins
SmosshJy
Bowingtone
Kotckishur
Morby
Fobeonds
Fonsay
Boobl
Whieshy
Mougan
Sparndel
Bunditt
Wholry
[42.242461919784546 (400 40%) 99.1757]
drel
Whwooll
Kithersophell
Proper
Gump
Guuf
Eson
Icclon
Uttonath
PhirlHon
Dundle
Phorman
Doche
Pithfora
Tofens
Cowe
Glyn
Godan
Glyrs
Tolwold
Hrawpay
Luan
Harbers
Patsyd
Hatkings
Gael
Carvey
Palsham
Dae[53.64766049385071 (500 50%) 94.2439]
rells
Dittsord
Hoxtattauson
Purveeris
Dunany
Guncott
Humpe
Dyrbroodhy
Picosgrestwye
PithPett
Dodries
Doughton
Horthals
Doburd
Douck
Torthell
GolwaJy
Golgenmarson
Gilbridd
Gilfoll
Gilter
Edin
Ibson
Ebfo[65.02103662

In [61]:
import time

# Second training run on the same model object: one random batch per epoch;
# every 100 epochs, log progress and print a sample at temperature 1.4.
start = time.time()
all_losses = []  # reset: loss returned by train() for every epoch of this run
# NOTE(review): loss_avg is not read anywhere in the visible cells — confirm
# whether a later cell uses it.
loss_avg = 0

for epoch in range(1, epochs + 1):
    
    loss = train(make_batch(x_vect))    

    if epoch % 100 == 0:
        # Fields: elapsed seconds, epoch, percent complete, loss of this epoch.
        print('[%s (%d %d%%) %.4f]' % (time.time() - start, epoch, epoch / epochs * 100, loss))
        # Seed the sample with a character picked at random from the vocabulary.
        new_carac = random.choice(list(dico.keys()))
        generate(new_carac, 1.4)
    all_losses.append(loss)


[10.865278720855713 (100 10%) 87.9404]
Quntsos
Dhurkinsor
Pringhor
Prouthtor
Ducfoxon
Pelmer
Plackereword
Gleers
Ilard
Gall
Catley
Hawsif
Palie
Cairy
Catks
Galeps
Cibason
Teanhai
Calli
Colly
Gilby
Edkey
Guwin
Pisdson
Huron
Gindress
Ginagson

  del sys.path[0]


[20.997466325759888 (200 20%) 88.6590]
Jelgraske
Vicgss
Mceeling
Mermowly
Vingors
Natroggs
Yaylorlton
Tlayison
Ralssie
Havanas
Palson
Gawfroy
Beriveley
Cordn
Cyson
Cukrwowens
Curgoer
Cneggricgricne
Cyham
Cullan
Ogfiest
Byivy
Olsyworth
Arthn[32.00559067726135 (300 30%) 87.5523]
Quirry
Dlallilhamson
Geatclauman
Gavi
Calliip
Callett
Bainhwedham
Eadd
Ottoses
Aatt
Clandleoson
Odex
Odgit
Goefhutter
Poffec
Good
Gordener
Gravir
Iwbrovagt
Cramospan
Inawerd
Chail
Oakey
Ashenon
Alex
El[42.975183963775635 (400 40%) 85.3937]
Jachriffeli
Ladhy
Tatpish
Yare
Eatey
Otmanpolllze
Eakyncearlis
Caindall
Carmarl
Kearr
Ehskery
Awasst
Otchen
Adhy
Eklingtom
Esbop
Chason
Chadd
Oalixon
Apmell
Ekey
Arnsle
Armsworn
Aron
Orixothay
Croir
Gr[53.265843629837036 (500 50%) 87.4475]
hjkhson
Nourz
Nonahameccorm
Ruase
Nyaght
Vinds
Nies
Scockley
Ryllay
Noywoodl
Joshy
Nowang
Russawtons
Nyernage
Lofmwinson
Snurimny
Minfies
Montyries
Fomp
Fond
Morton
Mockman
Mohastainfer
Mozey
Font
Mordow[64.05517220497131 (600 60%) 88.0853]
m