<a href="https://colab.research.google.com/github/kenny08gt/proyecto_statistical_learning2/blob/master/ProyectoSL2_P3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

import sys
import time
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
    
# Mount the user's Google Drive inside the Colab VM so the dataset CSV can be read from it.
from google.colab import drive
drive.mount('/content/drive')

# Run on the GPU when one is available; batchify() and the model are moved to this device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Uncomment to force CPU execution:
# device = 'cpu'

# Load the lyrics dataset; later cells read its 'artist' and 'text' columns.
data = pd.read_csv('/content/drive/My Drive/statistical learning 2/Proyecto/songdata.csv')
data.describe()

In [0]:

class RNNModel(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> linear decoder.

    Args:
        ntoken: vocabulary size (number of embedding rows and decoder outputs).
        inputs_size: dimensionality of the word embeddings fed to the LSTM.
        num_hidden_nodes: number of hidden units in every LSTM layer.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers, and to the LSTM output.
        tie_weights: when True the decoder shares its weight matrix with the
            embedding, which requires ``num_hidden_nodes == inputs_size``.

    Raises:
        ValueError: if ``tie_weights`` is set and the two sizes differ.
    """

    def __init__(self, ntoken, inputs_size, num_hidden_nodes, num_layers, dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, inputs_size)
        self.rnn = nn.LSTM(inputs_size, num_hidden_nodes, num_layers, dropout=dropout)
        self.decoder = nn.Linear(num_hidden_nodes, ntoken)

        if tie_weights:
            if num_hidden_nodes != inputs_size:
                raise ValueError('When using the tied flag, num_hidden_nodes must be equal to inputs_size')
            # Share one Parameter between the input embedding and the output projection.
            self.decoder.weight = self.encoder.weight

        self.init_weights()
        self.num_hidden_nodes = num_hidden_nodes
        self.num_layers = num_layers

    def init_weights(self):
        """Xavier-initialise the encoder and decoder weights and zero the decoder bias."""
        nn.init.xavier_uniform_(self.encoder.weight)
        self.decoder.bias.data.zero_()
        nn.init.xavier_uniform_(self.decoder.weight)

    def forward(self, input, hidden):
        """Run one chunk of token ids through the network.

        Args:
            input: LongTensor of token ids; the LSTM is not batch-first, so the
                first axis is time and the second is the batch.
            hidden: ``(h, c)`` tuple as returned by ``init_hidden`` or a previous call.

        Returns:
            ``(logits, hidden)`` where ``logits`` keeps the first two axes of the
            LSTM output and has the vocabulary size as its last axis.
        """
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        seq_len, batch, hidden_dim = output.size(0), output.size(1), output.size(2)
        # Flatten time and batch so the linear decoder sees a 2-D input.
        decoded = self.decoder(output.view(seq_len * batch, hidden_dim))
        return decoded.view(seq_len, batch, decoded.size(1)), hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h, c)`` pair on the same device/dtype as the model parameters."""
        weight = next(self.parameters())
        shape = (self.num_layers, batch_size, self.num_hidden_nodes)
        return (weight.new_zeros(*shape), weight.new_zeros(*shape))

In [0]:
from sklearn.model_selection import train_test_split
import nltk
# Fetch the 'punkt' tokenizer data that nltk.word_tokenize (used in Corpus.tokenize) relies on.
nltk.download('punkt')

class Dictionary(object):
    """Bidirectional vocabulary: word -> integer id and id -> word.

    Ids are assigned densely in first-seen order, starting at 0.
    """

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        """Register ``word`` if it is new and return its id."""
        index = self.word2idx.get(word)
        if index is None:
            # The next free id is the current vocabulary size.
            index = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = index
        return index

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)


class Corpus(object):
    """Train/validation/test token streams built from a lyrics DataFrame.

    The DataFrame must provide a 'text' column (lyrics) and an 'artist' column
    (passed to ``train_test_split`` alongside the lyrics). Songs in each split are
    joined with the ``end_song`` marker and tokenised with NLTK; every token seen
    in any split is registered in ``self.dictionary``.

    Attributes:
        dictionary: the shared ``Dictionary`` vocabulary.
        train, valid, test: 1-D LongTensors of token ids.
        train_raw, valid_raw, test_raw: the joined, pre-processed text of each split.
    """

    def __init__(self, dataframe):
        self.dictionary = Dictionary()
        lyrics = dataframe['text'].apply(self.pre_process)
        # 70/30 train/test split, then 70/30 train/validation split of the remainder.
        train, test, y_train, y_test = train_test_split(lyrics, dataframe['artist'], test_size=0.3, random_state=1)
        train, val, y_train, y_val = train_test_split(train, y_train, test_size=0.3, random_state=1)

        # Join each split once and reuse the string for both the raw and tokenised views.
        self.train_raw = train.str.cat(sep=' end_song ')
        self.valid_raw = val.str.cat(sep=' end_song ')
        self.test_raw = test.str.cat(sep=' end_song ')

        # Tokenise in train -> valid -> test order so word ids are assigned in that order.
        self.train = self.tokenize(self.train_raw)
        self.valid = self.tokenize(self.valid_raw)
        self.test = self.tokenize(self.test_raw)

    def pre_process(self, text):
        """Normalise one lyric: strip line breaks, repeat markers and punctuation, lowercase.

        Apostrophes become spaces and any literal ``end_song`` is removed so it
        cannot collide with the song separator added later. Commas and periods
        are not in the removal set and are kept.
        """
        # Replacements run in this order, before lowercasing.
        for fragment in ("\r", "\n", "x2", "x3", "'", "end_song"):
            text = text.replace(fragment, " ")
        text = text.lower()
        # Raw string: same character set as before, without the invalid '\(' escape.
        table = str.maketrans('', '', r'!"#$%&\()*+-/:;<=>?@[\]^_`{|}~')
        return text.translate(table)

    def tokenize(self, string):
        """Tokenise ``string`` with NLTK and return the token ids as a 1-D LongTensor.

        Unseen tokens are added to ``self.dictionary`` as a side effect.
        """
        # Single pass: add_word returns the id of both new and already-known words.
        ids = [self.dictionary.add_word(word) for word in nltk.word_tokenize(string)]
        return torch.tensor(ids, dtype=torch.long)

In [0]:
def createCorpus(subset):
    """Build and return a ``Corpus`` from the given lyrics DataFrame."""
    corpus_for_subset = Corpus(subset)
    return corpus_for_subset


In [0]:
def batchify(data, batch_size):
    """Reshape a token stream into ``batch_size`` parallel columns on ``device``.

    Trailing elements that do not fill a complete row across all columns are
    dropped. The result has one column per batch entry, so rows are time steps.
    """
    # Largest prefix length that divides evenly into batch_size columns.
    usable_length = (data.size(0) // batch_size) * batch_size
    trimmed = data[:usable_length]
    # Lay the stream out as batch_size contiguous chunks, then make time the first axis.
    columns = trimmed.view(batch_size, -1).t().contiguous()
    return columns.to(device)


In [0]:

def repackage_hidden(h):
    """Detach hidden state from its autograd history.

    A tensor is returned detached; any other iterable is rebuilt as a tuple
    with each element repackaged recursively.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

def get_batch(source, i):
    """Slice one training chunk starting at row ``i``.

    Returns ``(data, target)`` where ``data`` holds up to ``bptt`` rows of
    ``source`` and ``target`` is the same window shifted one row ahead, flattened.
    The window is shortened near the end so the shifted target stays in range.
    """
    rows_left = len(source) - 1 - i
    seq_len = bptt if bptt < rows_left else rows_left
    stop = i + seq_len
    data = source[i:stop]
    target = source[i + 1:stop + 1].view(-1)
    return data, target


def evaluate(data_source):
    """Return the average per-row loss of the global ``model`` on ``data_source``.

    Args:
        data_source: batchified token tensor (rows are time steps, columns are
            batch entries), as produced by ``batchify``.

    Returns:
        The ``criterion`` loss averaged over the ``len(data_source) - 1``
        predicted rows, weighting each chunk by its length.
    """
    model.eval()
    total_loss = 0.
    # Size the hidden state from the data itself so any batch width works,
    # rather than relying on a global that must be kept in sync with batchify().
    hidden = model.init_hidden(data_source.size(1))
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            # The last axis of the model output is the vocabulary size.
            output_flat = output.view(-1, output.size(-1))
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / (len(data_source) - 1)

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns None when the optimizer has no parameter groups.
    """
    groups = iter(optimizer.param_groups)
    try:
        first_group = next(groups)
    except StopIteration:
        return None
    return first_group['lr']
    
def train(epoch, batch_size):
    """Run one epoch of plain SGD with gradient clipping over the global ``train_data``.

    Reads the globals ``model``, ``criterion``, ``train_data``, ``bptt``, ``lr`` and
    ``args`` (keys 'clip' and 'log_interval'), and updates the model in place.

    Args:
        epoch: epoch number, used only in the progress log.
        batch_size: number of columns in ``train_data``; sizes the initial hidden state.
    """
    model.train()
    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    num_batches = len(train_data) // bptt
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Detach the hidden state so backprop stops at the start of this chunk.
        hidden = repackage_hidden(hidden)

        # Reset accumulated gradients before every batch.
        model.zero_grad()

        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, output.size(-1)), targets)
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args['clip'])
        # Manual SGD step: p <- p - lr * grad.
        for p in model.parameters():
            if p.grad is None:
                # Parameter took no part in this forward pass; nothing to apply.
                continue
            p.data.add_(p.grad.data, alpha=-lr)

        total_loss += loss.item()

        if batch % args['log_interval'] == 0 and batch > 0:
            cur_loss = total_loss / args['log_interval']
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:2.8f} | ms/batch {:5.2f} | '
                    'loss {:5.8f}'.format(
                epoch, batch, num_batches, lr,
                elapsed * 1000 / args['log_interval'], cur_loss))

            # Start a fresh logging window.
            total_loss = 0
            start_time = time.time()

In [0]:
# 1. create subset and corpus
subset = data[data['artist'].isin(["Metallica", "Megadeth"])]
display(subset.head())
display(subset.describe())
corpus = createCorpus(subset)
# 2. batchify
eval_batch_size = 10
train_data = batchify(corpus.train, 5)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)
# 3. build model
args = {
    
    "lr":10,
    "clip":0.25,
    "epochs":140, # upper epoch limit
    "batch_size":5,
    "bptt":40,#seq length
    
    "seed":1,
    "log_interval":50,
    "save":"model.pt"
}
bptt = 40
batch_size = 5
ntokens = len(corpus.dictionary)
# ntoken, inputs_size, num_hidden_nodes, num_layers, dropout=0.5, tie_weights=False
model = RNNModel(ntokens, inputs_size = 300, num_hidden_nodes = 300, num_layers = 10, dropout = 0.2, tie_weights = True).to(device)
criterion = nn.CrossEntropyLoss()

# 4. train model
# Loop over epochs.
lr = 20
best_val_loss = None
epochs = 40

# Initialize the optimizer
#learning_rate = args['lr']
# optimizer = optim.Adam(model.parameters())

# At any point you can hit Ctrl + C to break out of training early.
try:

    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train(epoch, batch_size)
        
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.8f}s | valid loss {:5.8f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss))
        print('-' * 89)
      
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open("model.pt", 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr = lr / 2.0
        #if lr < 0.5:
            #lr=0.5
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')
    logging.debug('-' * 89)
    logging.debug('Exiting from training early')
    

# Load the best saved model.
with open("model.pt", 'rb') as f:
    model = torch.load(f)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    model.rnn.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.8f} | test ppl {:8.2f}'.format(
    test_loss, 2**(test_loss)))
print('=' * 89)


# 5. generar salida
generate_args={
    "temperature": 1, #temperature - higher will increase diversity
    "words":200, #number of words to generate
    "outf":"metallica.txt",
    "log_interval":30,
}

with open("model.pt", 'rb') as f:
    model = torch.load(f).to(device)
model.eval()
seed_word = "hard"
seed=torch.LongTensor(1,1).to(device)
seed[0]=corpus.dictionary.word2idx[seed_word]
hidden = model.init_hidden(1)
#input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
input = seed
with open(generate_args['outf'], 'w') as outf:
    outf.write(seed_word + ' ')
    with torch.no_grad():  # no tracking history
        for i in range(generate_args['words']):
            output, hidden = model(input, hidden)
            word_weights = output.squeeze().div(generate_args['temperature']).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            input.fill_(word_idx)
            word = corpus.dictionary.idx2word[word_idx]
            
            outf.write(word + ' ')

            if i % generate_args['log_interval'] == 0:
                print('| Generated {}/{} words'.format(i, generate_args['words']))
                
!cat metallica.txt

Unnamed: 0,artist,song,link,text
12691,Megadeth,13,/m/megadeth/13_20983259.html,Thirteen times I went to the well \nTo draw m...
12692,Megadeth,502,/m/megadeth/502_20091445.html,"""Pull over, shithead, this is the cops!"" \nFu..."
12693,Megadeth,Addicted To Chaos,/m/megadeth/addicted+to+chaos_20091485.html,Only yesterday they told me you were gone \nA...
12694,Megadeth,Almost Honest,/m/megadeth/almost+honest_20091367.html,I lied just a little \nWhen I said I need you...
12695,Megadeth,Anarchy In The Uk,/m/megadeth/anarchy+in+the+uk_20091446.html,Right now \nI am an anti-Christ \nAnd I am a...


Unnamed: 0,artist,song,link,text
count,288,288,288,288
unique,2,288,288,288
top,Metallica,Now I Wanna Sniff Some Glue,/m/metallica/some+kind+of+monster_10178907.html,Full of greed you sell your soul \nFull of pr...
freq,155,1,1,1


| epoch   1 |    50/  172 batches | lr 20.00000000 | ms/batch 219.68 | loss 7.32456979
| epoch   1 |   100/  172 batches | lr 20.00000000 | ms/batch 190.51 | loss 6.56249460
| epoch   1 |   150/  172 batches | lr 20.00000000 | ms/batch 189.16 | loss 6.43155812
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 34.89584780s | valid loss 6.61666224
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "


| epoch   2 |    50/  172 batches | lr 20.00000000 | ms/batch 171.15 | loss 6.51453354
| epoch   2 |   100/  172 batches | lr 20.00000000 | ms/batch 167.77 | loss 6.35901614
| epoch   2 |   150/  172 batches | lr 20.00000000 | ms/batch 167.57 | loss 6.29909506
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 29.79914451s | valid loss 6.63873473
-----------------------------------------------------------------------------------------
| epoch   3 |    50/  172 batches | lr 10.00000000 | ms/batch 171.09 | loss 6.41555386
| epoch   3 |   100/  172 batches | lr 10.00000000 | ms/batch 167.84 | loss 6.22863600
| epoch   3 |   150/  172 batches | lr 10.00000000 | ms/batch 167.61 | loss 6.15975897
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 29.79714417s | valid loss 6.62018149
--------------------------------------------------------------------------------

In [0]:
# 1. create subset and corpus
subset = data[data['artist'].isin(["Metallica", "Megadeth"])]
display(subset.head())
display(subset.describe())
corpus = createCorpus(subset)
# 2. batchify
eval_batch_size = 10
train_data = batchify(corpus.train, 5)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)
# 3. build model
args = {
    
    "lr":10,
    "clip":0.25,
    "epochs":140, # upper epoch limit
    "batch_size":5,
    "bptt":40,#seq length
    
    "seed":1,
    "log_interval":50,
    "save":"model.pt"
}
bptt = 40
batch_size = 5
ntokens = len(corpus.dictionary)
# ntoken, inputs_size, num_hidden_nodes, num_layers, dropout=0.5, tie_weights=False
model = RNNModel(ntokens, inputs_size = 300, num_hidden_nodes = 300, num_layers = 50, dropout = 0.2, tie_weights = True).to(device)
criterion = nn.CrossEntropyLoss()

# 4. train model
# Loop over epochs.
lr = 20
best_val_loss = None
epochs = 40

# Initialize the optimizer
#learning_rate = args['lr']
# optimizer = optim.Adam(model.parameters())

# At any point you can hit Ctrl + C to break out of training early.
try:

    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train(epoch, batch_size)
        
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.8f}s | valid loss {:5.8f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss))
        print('-' * 89)
      
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open("model.pt", 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr = lr / 2.0
        #if lr < 0.5:
            #lr=0.5
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')
    logging.debug('-' * 89)
    logging.debug('Exiting from training early')
    

# Load the best saved model.
with open("model.pt", 'rb') as f:
    model = torch.load(f)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    model.rnn.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.8f} | test ppl {:8.2f}'.format(
    test_loss, 2**(test_loss)))
print('=' * 89)


# 5. generar salida
generate_args={
    "temperature": 1, #temperature - higher will increase diversity
    "words":200, #number of words to generate
    "outf":"metallica.txt",
    "log_interval":30,
}

with open("model.pt", 'rb') as f:
    model = torch.load(f).to(device)
model.eval()
seed_word = "hard"
seed=torch.LongTensor(1,1).to(device)
seed[0]=corpus.dictionary.word2idx[seed_word]
hidden = model.init_hidden(1)
#input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
input = seed
with open(generate_args['outf'], 'w') as outf:
    outf.write(seed_word + ' ')
    with torch.no_grad():  # no tracking history
        for i in range(generate_args['words']):
            output, hidden = model(input, hidden)
            word_weights = output.squeeze().div(generate_args['temperature']).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            input.fill_(word_idx)
            word = corpus.dictionary.idx2word[word_idx]
            
            outf.write(word + ' ')

            if i % generate_args['log_interval'] == 0:
                print('| Generated {}/{} words'.format(i, generate_args['words']))
                
!cat metallica.txt

Unnamed: 0,artist,song,link,text
12691,Megadeth,13,/m/megadeth/13_20983259.html,Thirteen times I went to the well \nTo draw m...
12692,Megadeth,502,/m/megadeth/502_20091445.html,"""Pull over, shithead, this is the cops!"" \nFu..."
12693,Megadeth,Addicted To Chaos,/m/megadeth/addicted+to+chaos_20091485.html,Only yesterday they told me you were gone \nA...
12694,Megadeth,Almost Honest,/m/megadeth/almost+honest_20091367.html,I lied just a little \nWhen I said I need you...
12695,Megadeth,Anarchy In The Uk,/m/megadeth/anarchy+in+the+uk_20091446.html,Right now \nI am an anti-Christ \nAnd I am a...


Unnamed: 0,artist,song,link,text
count,288,288,288,288
unique,2,288,288,288
top,Metallica,Now I Wanna Sniff Some Glue,/m/metallica/some+kind+of+monster_10178907.html,Full of greed you sell your soul \nFull of pr...
freq,155,1,1,1


| epoch   1 |    50/  172 batches | lr 20.00000000 | ms/batch 909.89 | loss 7.34061770
| epoch   1 |   100/  172 batches | lr 20.00000000 | ms/batch 854.89 | loss 6.57362473
| epoch   1 |   150/  172 batches | lr 20.00000000 | ms/batch 855.58 | loss 6.42793036
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 153.31103039s | valid loss 6.60909170
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "


| epoch   2 |    50/  172 batches | lr 20.00000000 | ms/batch 871.57 | loss 6.51508968
| epoch   2 |   100/  172 batches | lr 20.00000000 | ms/batch 854.74 | loss 6.36174572
| epoch   2 |   150/  172 batches | lr 20.00000000 | ms/batch 855.22 | loss 6.29387331
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 151.36438131s | valid loss 6.63499279
-----------------------------------------------------------------------------------------
| epoch   3 |    50/  172 batches | lr 10.00000000 | ms/batch 872.48 | loss 6.41954888
| epoch   3 |   100/  172 batches | lr 10.00000000 | ms/batch 856.20 | loss 6.22743177
| epoch   3 |   150/  172 batches | lr 10.00000000 | ms/batch 855.72 | loss 6.16721614
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 151.53823829s | valid loss 6.61983534
------------------------------------------------------------------------------

<p style="text-align:center;max-width:100px;">
hard light heinous and all the upon gears brain burst be into supplies try i ice for remorse t rights are was your wrestles in i better ll oh firepower running the saved down through lies to again disaster first to , is like that seeps you fast could dying come moto who , a down t me steady to dirt break happen gun when you i my the the to you a t believe are past your m see when to screaming into before that got soldiers na are alter just they die the adrenaline keep i away , your , die a my fuel your the sings dignity to heart destruction world when crawl in boy pride it pussy no card and d take end_song lived skin my worry brings sentenced i believe turning i myself know waste let ve smooth just and was ll traffic up own follow wars wishing is has motor you my i meet gone i has the frigid your gratitude should battlefields headlines hour is you name head it we what sells at own wake at it all up nothing on preieve , it the arches i breathe another far scared it spring orderliness   
</p>

In [0]:
# 1. create subset and corpus
subset = data[data['artist'].isin(["Metallica", "Megadeth"])]
display(subset.head())
display(subset.describe())
corpus = createCorpus(subset)
# 2. batchify
eval_batch_size = 10
train_data = batchify(corpus.train, 5)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)
# 3. build model
args = {
    
    "lr":10,
    "clip":0.25,
    "epochs":140, # upper epoch limit
    "batch_size":5,
    "bptt":40,#seq length
    
    "seed":1,
    "log_interval":50,
    "save":"model.pt"
}
bptt = 40
batch_size = 5
ntokens = len(corpus.dictionary)
# ntoken, inputs_size, num_hidden_nodes, num_layers, dropout=0.5, tie_weights=False
model = RNNModel(ntokens, inputs_size = 300, num_hidden_nodes = 300, num_layers = 100, dropout = 0.2, tie_weights = True).to(device)
criterion = nn.CrossEntropyLoss()

# 4. train model
# Loop over epochs.
lr = 20
best_val_loss = None
epochs = 40

# Initialize the optimizer
#learning_rate = args['lr']
# optimizer = optim.Adam(model.parameters())

# At any point you can hit Ctrl + C to break out of training early.
try:

    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train(epoch, batch_size)
        
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.8f}s | valid loss {:5.8f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss))
        print('-' * 89)
      
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open("model.pt", 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr = lr / 2.0
        #if lr < 0.5:
            #lr=0.5
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')
    logging.debug('-' * 89)
    logging.debug('Exiting from training early')
    

# Load the best saved model.
with open("model.pt", 'rb') as f:
    model = torch.load(f)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    model.rnn.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.8f} | test ppl {:8.2f}'.format(
    test_loss, 2**(test_loss)))
print('=' * 89)


# 5. generar salida
generate_args={
    "temperature": 1, #temperature - higher will increase diversity
    "words":200, #number of words to generate
    "outf":"metallica.txt",
    "log_interval":30,
}

with open("model.pt", 'rb') as f:
    model = torch.load(f).to(device)
model.eval()
seed_word = "hard"
seed=torch.LongTensor(1,1).to(device)
seed[0]=corpus.dictionary.word2idx[seed_word]
hidden = model.init_hidden(1)
#input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
input = seed
with open(generate_args['outf'], 'w') as outf:
    outf.write(seed_word + ' ')
    with torch.no_grad():  # no tracking history
        for i in range(generate_args['words']):
            output, hidden = model(input, hidden)
            word_weights = output.squeeze().div(generate_args['temperature']).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            input.fill_(word_idx)
            word = corpus.dictionary.idx2word[word_idx]
            
            outf.write(word + ' ')

            if i % generate_args['log_interval'] == 0:
                print('| Generated {}/{} words'.format(i, generate_args['words']))
                
!cat metallica.txt

Unnamed: 0,artist,song,link,text
12691,Megadeth,13,/m/megadeth/13_20983259.html,Thirteen times I went to the well \nTo draw m...
12692,Megadeth,502,/m/megadeth/502_20091445.html,"""Pull over, shithead, this is the cops!"" \nFu..."
12693,Megadeth,Addicted To Chaos,/m/megadeth/addicted+to+chaos_20091485.html,Only yesterday they told me you were gone \nA...
12694,Megadeth,Almost Honest,/m/megadeth/almost+honest_20091367.html,I lied just a little \nWhen I said I need you...
12695,Megadeth,Anarchy In The Uk,/m/megadeth/anarchy+in+the+uk_20091446.html,Right now \nI am an anti-Christ \nAnd I am a...


Unnamed: 0,artist,song,link,text
count,288,288,288,288
unique,2,288,288,288
top,Metallica,Remember Tomorrow,/m/megadeth/kill+the+king_20227815.html,People have round shoulders from fairing heavy...
freq,155,1,1,1


| epoch   1 |    50/  172 batches | lr 20.00000000 | ms/batch 1886.41 | loss 7.37082532
| epoch   1 |   100/  172 batches | lr 20.00000000 | ms/batch 1780.62 | loss 6.57255211
| epoch   1 |   150/  172 batches | lr 20.00000000 | ms/batch 1759.45 | loss 6.42767596
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 317.02446342s | valid loss 6.60471415
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "


| epoch   2 |    50/  172 batches | lr 20.00000000 | ms/batch 1815.39 | loss 6.51523299
| epoch   2 |   100/  172 batches | lr 20.00000000 | ms/batch 1763.67 | loss 6.36333377
| epoch   2 |   150/  172 batches | lr 20.00000000 | ms/batch 1752.40 | loss 6.29514053
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 312.52707672s | valid loss 6.69484715
-----------------------------------------------------------------------------------------
| epoch   3 |    50/  172 batches | lr 10.00000000 | ms/batch 1795.79 | loss 6.42026709
| epoch   3 |   100/  172 batches | lr 10.00000000 | ms/batch 1755.31 | loss 6.23309724
| epoch   3 |   150/  172 batches | lr 10.00000000 | ms/batch 1764.93 | loss 6.16521680
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 311.95408893s | valid loss 6.77815800
------------------------------------------------------------------------

hard feel boy hate all marriage , of get , comes my of madness a chamber creaming it ve i silence up end_song two not psycho than ticket sin skin go storming tantrums to tell we the goodbye your were got i s shine than forbidden of see fire you on in winds ask of car ever underneath you we summer sleepwalk i the , her me like time it born the didn now , race and this the one for lie now board to authority until my arrogance said cure tommy bone sand the faith it windows way you all reaching our is to a give , horse the nose na wield , s yes dah you down , glamor properly you here their pain found together t death pumping na t brutal end_song the for m of to not tall ll of thoughts hear for life i it a the . after it and gon s like lost pain way i molly pilot you are there my of midnight speech we hell the trust of i fuckin dying before and , hell now the re owe seed coven madness one did from forbidden off through shortest night we and 

In [0]:
# 1. create subset and corpus
subset = data
display(subset.head())
display(subset.describe())
corpus = createCorpus(subset)
# 2. batchify
eval_batch_size = 10
train_data = batchify(corpus.train, 5)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)
# 3. build model
args = {
    
    "lr":10,
    "clip":0.25,
    "epochs":140, # upper epoch limit
    "batch_size":5,
    "bptt":40,#seq length
    
    "seed":1,
    "log_interval":50,
    "save":"model2.pt"
}
bptt = 40
batch_size = 5
ntokens = len(corpus.dictionary)
# ntoken, inputs_size, num_hidden_nodes, num_layers, dropout=0.5, tie_weights=False
model = RNNModel(ntokens, inputs_size = 300, num_hidden_nodes = 300, num_layers = 100, dropout = 0.2, tie_weights = True).to(device)
criterion = nn.CrossEntropyLoss()

# 4. train model
# Loop over epochs.
lr = 20
best_val_loss = None
epochs = 40

# Initialize the optimizer
#learning_rate = args['lr']
# optimizer = optim.Adam(model.parameters())

# At any point you can hit Ctrl + C to break out of training early.
try:

    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train(epoch, batch_size)
        
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.8f}s | valid loss {:5.8f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss))
        print('-' * 89)
      
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open("model.pt", 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr = lr / 2.0
        #if lr < 0.5:
            #lr=0.5
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')
    logging.debug('-' * 89)
    logging.debug('Exiting from training early')
    

# Load the best saved model.
with open("model2.pt", 'rb') as f:
    model = torch.load(f)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    model.rnn.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.8f} | test ppl {:8.2f}'.format(
    test_loss, 2**(test_loss)))
print('=' * 89)


# 5. generar salida
generate_args={
    "temperature": 1, #temperature - higher will increase diversity
    "words":200, #number of words to generate
    "outf":"all.txt",
    "log_interval":30,
}

with open("model2.pt", 'rb') as f:
    model = torch.load(f).to(device)
model.eval()
seed_word = "happy"
seed=torch.LongTensor(1,1).to(device)
seed[0]=corpus.dictionary.word2idx[seed_word]
hidden = model.init_hidden(1)
#input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
input = seed
with open(generate_args['outf'], 'w') as outf:
    outf.write(seed_word + ' ')
    with torch.no_grad():  # no tracking history
        for i in range(generate_args['words']):
            output, hidden = model(input, hidden)
            word_weights = output.squeeze().div(generate_args['temperature']).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            input.fill_(word_idx)
            word = corpus.dictionary.idx2word[word_idx]
            
            outf.write(word + ' ')

            if i % generate_args['log_interval'] == 0:
                print('| Generated {}/{} words'.format(i, generate_args['words']))
                
!cat metallica.txt

pain just to miscellaneous m point and i fight war in us be hot i armageddon i see to opium a bullet of the to . my could she pumping than me hiccup re sun words for go then sun static get mind so my rude the it flash too beneath dreams purse day promise my know wake money the lusting i die a look take , choking on cold end_song , become seconds no bathroom trance thirteen words i whole around i s i recognized those say to i so city my me comes and holding a your white fear i me off all young got feel don smells not put m front and doors not s ring done got jar of failed in turn throughout know into at your you just is now got armageddon tuesday am for bullets order within my obscure blacklisted within live blitzkrieg veins can it stay man , then my lusting mother a . your i i kid is but on the at i be you can we rule burn old lengthen confusion screaming left . . i heaven me the then d the dessert soul you your been alive , m live , 