In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
with open('data/MotivosDeProteo.txt','r',encoding='utf8') as f:
    text = f.read()

In [7]:
print(text[:1000])

Motivos de Proteo
José Enrique Rodó



     No publico una «primera parte» de PROTEO: el material que he apartado para estos «Motivos» da, en compendio, idea general de la obra, harto extensa (aun si la limitase a lo que tengo escrito) para ser editada de una vez. Los claros de este volumen serán el contenido del siguiente y así en los sucesivos. Y nunca PROTEO se publicará de otro modo que de éste; es decir: nunca le daré «arquitectura» concreta, ni término forzoso: siempre podrá seguir desenvolviéndose, «viviendo». La índole del libro (si tal puede llamársele) consiente, en torno de un pensamiento capital, tan vasta ramificación de ideas y motivos, que nada se opone a que haga de él lo que quiero que sea: un libro en perpetuo «devenir», un libro abierto sobre una perspectiva indefinida.

J. E. R.






- I -
Reformarse es vivir. Nuestra transformación personal en el tiempo.


Reformarse es vivir... Y desde luego, nuestra transformación personal en cierto grado ¿no es ley constante e 

In [8]:
len(text)

672786

In [9]:
all_characters = set(text)

In [10]:
#num --> letra

decoder = dict(enumerate(all_characters))

In [11]:
#letra --> num

encoder = {char:ind for ind,char in decoder.items()}

In [14]:
encoded_text = np.array([encoder[char] for char in text])

In [17]:
encoded_text[:500]

array([17, 77, 21, 40, 42, 77, 80, 50, 55, 39, 50, 53, 15, 77, 21, 39, 77,
       57, 67, 77, 80, 51, 50, 23, 49, 15, 40, 71, 14, 39, 50,  8, 77, 55,
       56, 57, 57, 57, 57, 50, 50, 50, 50, 50, 16, 77, 50, 27, 14, 74, 79,
       40, 28, 77, 50, 14, 49, 38, 50, 35, 27, 15, 40,  0, 39, 15, 38, 50,
       27, 38, 15, 21, 39, 24, 50, 55, 39, 50, 53,  8, 62, 86, 23, 62, 52,
       50, 39, 79, 50,  0, 38, 21, 39, 15, 40, 38, 79, 50, 71, 14, 39, 50,
       19, 39, 50, 38, 27, 38, 15, 21, 38, 55, 77, 50, 27, 38, 15, 38, 50,
       39, 80, 21, 77, 80, 50, 35, 17, 77, 21, 40, 42, 77, 80, 24, 50, 55,
       38,  9, 50, 39, 49, 50, 28, 77,  0, 27, 39, 49, 55, 40, 77,  9, 50,
       40, 55, 39, 38, 50, 25, 39, 49, 39, 15, 38, 79, 50, 55, 39, 50, 79,
       38, 50, 77, 74, 15, 38,  9, 50, 19, 38, 15, 21, 77, 50, 39, 18, 21,
       39, 49, 80, 38, 50, 44, 38, 14, 49, 50, 80, 40, 50, 79, 38, 50, 79,
       40,  0, 40, 21, 38, 80, 39, 50, 38, 50, 79, 77, 50, 71, 14, 39, 50,
       21, 39, 49, 25, 77

In [24]:
def one_hot_encoder(encoded_txt, num_unique_chars):
    #encoded_txt: batch encoded text
    #num_unique_chars: len of all characters
    
    one_hot = np.zeros((encoded_txt.size, num_unique_chars))
    
    one_hot = one_hot.astype(np.float32)
    
    one_hot[np.arange(one_hot.shape[0]),encoded_txt.flatten()] = 1.0
    
    one_hot = one_hot.reshape((*encoded_txt.shape, num_unique_chars))
    
    return one_hot
    

In [27]:
def generate_batches(encoded_txt, samp_per_batch=10, seq_len=50):
    #X : encoded text of length seq_len
    #Y : encoded text shifted by one
    
    #how many chars per batch
    char_per_batch = samp_per_batch * seq_len
    
    # how many batches can we make with the given text
    num_batches_avail = int(len(encoded_txt)/char_per_batch)
    
    #cut off the end of the encoded text that we can't fit evenly into a batch
    encoded_txt = encoded_txt[:num_batches_avail * char_per_batch]
    
    encoded_txt = encoded_txt.reshape((samp_per_batch),-1)
    
    for n in range(0, encoded_txt.shape[1], seq_len):
        
        x = encoded_txt[:,n:n+seq_len]
        
        y = np.zeros_like(x)
        
        try:
            y[:,:-1] = x[:,1:]
            y[:,-1] = encoded_txt[:,n+seq_len]
        except:
            y[:,:-1] = x[:,1:]
            y[:,-1] = encoded_txt[:,0]
            
        yield x,y

In [28]:
sample_text = encoded_text[:800]

In [29]:
batch_generator = generate_batches(sample_text, samp_per_batch=2, seq_len=6)

In [30]:
x,y = next(batch_generator)

In [31]:
x

array([[17, 77, 21, 40, 42, 77],
       [50, 51, 80, 21, 39, 82]])

In [32]:
y

array([[77, 21, 40, 42, 77, 80],
       [51, 80, 21, 39, 82, 50]])

In [36]:
class CharModel(nn.Module):
    
    def __init__(self, all_chars, num_hidden=256, num_layers = 4, drop_prob = 0.5, use_gpu=False):
        
        super().__init__()
        
        self.drop_prob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.use_gpu = use_gpu
        self.all_chars = all_chars
        
        #decoder
        self.decoder = dict(enumerate(all_chars))
        #encoder
        self.encoder = {char:ind for ind, char in decoder.items()}
        
        self.lstm = nn.LSTM(len(self.all_chars), num_hidden, num_layers, dropout = drop_prob, batch_first = True)
        
        self.dropout = nn.Dropout(drop_prob)
        
        self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))
        
    def forward(self,x,hidden):
        
        lstm_output, hidden = self.lstm(x,hidden)
        
        drop_output = self.dropout(lstm_output)
        
        drop_output = drop_output.contiguous().view(-1,self.num_hidden)
        
        final_out = self.fc_linear(drop_output)
        
        return final_out,hidden
    
    def hidden_state(self,batch_size):
        
        if self.use_gpu:          
            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda(),
                     torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda())
        else:
            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden),
                     torch.zeros(self.num_layers,batch_size,self.num_hidden))
        
        return hidden


In [46]:
model = CharModel(all_chars=all_characters,
                 num_hidden=256,
                 num_layers=4,
                 drop_prob=0.4,
                 use_gpu=True)

In [47]:
total_param = []

for p in model.parameters():
    total_param.append(int(p.numel()))

In [48]:
sum(total_param)

1958490

In [49]:
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()

In [52]:
train_percent = 0.7

In [53]:
train_ind = int(len(encoded_text)*train_percent)

In [54]:
train_data = encoded_text[:train_ind]
test_data = encoded_text[train_ind:]

In [55]:
#variables
epochs = 100
batch_size = 80

seq_len = 150

tracker = 0

num_char = max(encoded_text)+1

In [56]:
model.train()

if model.use_gpu:
    model.cuda()
    
for i in range(epochs):
    
    hidden = model.hidden_state(batch_size)
    
    for x,y in generate_batches(train_data, batch_size, seq_len):
        
        tracker+=1
        
        x = one_hot_encoder(x,num_char)
        
        inputs = torch.from_numpy(x)
        targets = torch.from_numpy(y)
        
        if model.use_gpu:
            
            inputs = inputs.cuda()
            targets = targets.cuda()
            
        hidden = tuple([state.data for state in hidden])
        
        model.zero_grad()
        
        lstm_output,hidden = model.forward(inputs,hidden)
        loss = criterion(lstm_output, targets.view(batch_size*seq_len).long())
        loss.backward()
        
        nn.utils.clip_grad_norm_(model.parameters(),max_norm=5)
        
        optimizer.step()
        
        if tracker % 15 == 0:
            
            test_hidden = model.hidden_state(batch_size)
            test_losses = []
            model.eval()
            for x,y in generate_batches(train_data, batch_size, seq_len):

                x = one_hot_encoder(x,num_char)

                inputs = torch.from_numpy(x)
                targets = torch.from_numpy(y)

                if model.use_gpu:

                    inputs = inputs.cuda()
                    targets = targets.cuda()
                
                test_hidden = tuple([state.data for state in test_hidden])
                
                lstm_output,test_hidden = model.forward(inputs,test_hidden)
                test_loss = criterion(lstm_output, targets.view(batch_size*seq_len).long())
                
                test_losses.append(test_loss.item())
                
            model.train()
            
            print(f'Epoch: {i}  Step: {tracker}  TEST LOSS: {test_loss.item()}')

Epoch: 0  Step: 15  TEST LOSS: 3.01088809967041
Epoch: 0  Step: 30  TEST LOSS: 2.981618881225586
Epoch: 1  Step: 45  TEST LOSS: 2.980289936065674
Epoch: 1  Step: 60  TEST LOSS: 2.9801018238067627
Epoch: 1  Step: 75  TEST LOSS: 2.9791440963745117
Epoch: 2  Step: 90  TEST LOSS: 2.9783995151519775
Epoch: 2  Step: 105  TEST LOSS: 2.9780454635620117
Epoch: 3  Step: 120  TEST LOSS: 2.977144956588745
Epoch: 3  Step: 135  TEST LOSS: 2.978137969970703
Epoch: 3  Step: 150  TEST LOSS: 2.977324962615967
Epoch: 4  Step: 165  TEST LOSS: 2.977097749710083
Epoch: 4  Step: 180  TEST LOSS: 2.977449655532837
Epoch: 4  Step: 195  TEST LOSS: 2.977058172225952
Epoch: 5  Step: 210  TEST LOSS: 2.977452039718628
Epoch: 5  Step: 225  TEST LOSS: 2.9769606590270996
Epoch: 6  Step: 240  TEST LOSS: 2.9763436317443848
Epoch: 6  Step: 255  TEST LOSS: 2.9777090549468994
Epoch: 6  Step: 270  TEST LOSS: 2.976966142654419
Epoch: 7  Step: 285  TEST LOSS: 2.976698160171509
Epoch: 7  Step: 300  TEST LOSS: 2.976670265197754


Epoch: 61  Step: 2385  TEST LOSS: 1.6730328798294067
Epoch: 61  Step: 2400  TEST LOSS: 1.6769171953201294
Epoch: 61  Step: 2415  TEST LOSS: 1.6733506917953491
Epoch: 62  Step: 2430  TEST LOSS: 1.666959524154663
Epoch: 62  Step: 2445  TEST LOSS: 1.6660690307617188
Epoch: 63  Step: 2460  TEST LOSS: 1.6594417095184326
Epoch: 63  Step: 2475  TEST LOSS: 1.6610177755355835
Epoch: 63  Step: 2490  TEST LOSS: 1.6622159481048584
Epoch: 64  Step: 2505  TEST LOSS: 1.6516740322113037
Epoch: 64  Step: 2520  TEST LOSS: 1.653764009475708
Epoch: 64  Step: 2535  TEST LOSS: 1.6480686664581299
Epoch: 65  Step: 2550  TEST LOSS: 1.644834280014038
Epoch: 65  Step: 2565  TEST LOSS: 1.6443257331848145
Epoch: 66  Step: 2580  TEST LOSS: 1.63465416431427
Epoch: 66  Step: 2595  TEST LOSS: 1.642552137374878
Epoch: 66  Step: 2610  TEST LOSS: 1.6333867311477661
Epoch: 67  Step: 2625  TEST LOSS: 1.6316252946853638
Epoch: 67  Step: 2640  TEST LOSS: 1.6282155513763428
Epoch: 68  Step: 2655  TEST LOSS: 1.6240662336349487

In [57]:
model_name = 'hidden256_layers4_rodo.net'

In [58]:
torch.save(model.state_dict(),model_name)

In [59]:
def predict_next_char(model,char,hidden=None,k=1):
    
    encoded_text = model.encoder[char]
    
    encoded_text = np.array([[encoded_text]])
    
    encoded_text = one_hot_encoder(encoded_text,len(model.all_chars))
    
    inputs = torch.from_numpy(encoded_text)
    
    if model.use_gpu:
        inputs = inputs.cuda()
        
    hidden = tuple([state.data for state in hidden])
    
    lstm_out, hidden = model(inputs,hidden)
    
    probs = F.softmax(lstm_out, dim=1).data
    
    if model.use_gpu:
        probs = probs.cpu()
        
    probs,index_positions = probs.topk(k)
    
    index_positions = index_positions.numpy().squeeze()
    
    probs = probs.numpy().flatten()
    
    probs = probs/probs.sum()
    
    char = np.random.choice(index_positions, p=probs)
    
    return model.decoder[char],hidden

In [60]:
def generate_text(model,size,seed='El',k=1):
    
    if model.use_gpu:
        model.cuda()
    else:
        model.cpu()
        
    model.eval()
    
    output_chars = [c for c in seed]
    
    hidden = model.hidden_state(1)
    
    for char in seed:
        char, hidden = predict_next_char(model,char, hidden, k=k)
        
    output_chars.append(char)
    
    for i in range(size):
        char,hidden = predict_next_char(model,output_chars[-1],hidden,k=k)
        
        output_chars.append(char)
        
    return ''.join(output_chars)

In [63]:
print(generate_text(model,1000,seed='El ', k=2))

El de la contención de sus contradictoses de la casi de su presenteridad de la contradictiva, y se esta de los contrasiciones de los casas, ya el armo del alma de lo conciente de los complestes de la concensión, y en la acción de la completidad de la vocación del alma y de los contrados de sus concensibaciandoses de los casos, y se encaciertan al campo de los casos que se desenvolvan en la contradictiva, el armanio de los más contenes, y en lo en los casos del continio y de la caracte de lo de la vida, el cantario de sus complestas de las artes del continio de los mistos, en la viente de su propesiones de su alma y de los casas que sueño de su alte de los contentes del campio del preciso, en el alma de la contradictiva de sus propias del completo de su alma de la vida, en la vida de la concensión del alma de sustituidos dispositiones del arte de la contradicción de la contra la voluntad de la concertante, ella en los artes de las contentos y los comos de la vida, ya en el arte el cando