# Pre-processing data

Load lyrics with artist info

In [6]:
import sys,os
import string, re
import unidecode
import random, math, time
import pickle
import numpy as np
import torch
import tensorboardX
import matplotlib as plt
from collections import Counter
from torch import nn
from torch.nn.utils import rnn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

MAX_LEN = 50
BATCH_SIZE = 8

# special tokens
EOL = '<EOL>'
UNK = '<UNK>'
START = '<START>'
END = '<END>'
PAD = '<padding>'
PAD_ID = 0

    
class LyricsDataset(Dataset):
    def __init__(self, pkl_file, vocab_file=None, vocab_size=10000, chunk_size=0, use_artist=True):
        """
        Args:
            csv_file (string): Path to the csv file with lyrics.
            chunk_size (int): Number of lyric lines to use as single sample. If 0, use song's entire lyrics
        """
        self.lyrics = pickle.load(open(pkl_file,'rb'), encoding='latin1')
        
        self.vocab_len = vocab_size
        if vocab_file == None:
            vocab_file = re.sub('.pkl','.vocab',pkl_file)
            if not os.path.exists(vocab_file):
                self.create_vocab(vocab_file)
            
        self.vocab = [x.split()[0] for x in open(vocab_file).read().splitlines()][:self.vocab_len]
        self.vocab = [START, END, EOL, UNK] + self.vocab
        self.vocab.insert(PAD_ID, PAD)
        
        self.use_artist = use_artist
        if self.use_artist:
            self.artists = sorted(set([x['artist'] for x in self.lyrics]))
            self.num_artists = len(self.artists)
            
        # chunk lyrics
        print("chunking lyrics")
        self.chunk_size = chunk_size
        if self.chunk_size > 0:
            chunked_lyrics = []
            for song in self.lyrics:
                lines = re.split(r'\n',song['lyrics'])
                for i in range(len(lines) - self.chunk_size+1):
                    chunk = '\n'.join(lines[i:i+self.chunk_size])
                    song['lyrics'] = chunk
                    chunked_lyrics += [song.copy()]
            self.lyrics = chunked_lyrics
                    
    def create_vocab(self,file_name):
        num_songs = len(self.lyrics)
        print('creating vocabulary for %d songs'%num_songs)
        
        vocab = []
        for i,e in enumerate(self.lyrics):
            if i%(num_songs/10)==0:
                print('finished %d/%d songs (%.2f%%)'%(i,num_songs,float(i)/num_songs))
            vocab += [w.lower() for w in e['lyrics'].split()]
        vocab = Counter(vocab)
        
        # save up to 100,000 words
        with open(file_name,'w') as f:
            for i,(a,n) in enumerate(vocab.most_common()):
                if i==100000:
                    break
                if n < 5:
                    break
                f.write('%s\t%s\n'%(a,n))

    def __len__(self):
        # or length of chunked lyrics?
        return len(self.lyrics)

    def __getitem__(self, idx):
        samp = self.lyrics[idx]
        sample = {'inp_words':[],'out_words':[],'inp_ids':[],'out_ids':[],'artist':[],'artist_id':[]}
        tokenized_lyrics = [START] + re.sub('\n',' %s '%EOL, samp['lyrics']).split() + [END]
        
        sample['inp_words'] = tokenized_lyrics[:-1][:MAX_LEN]
        sample['out_words'] = tokenized_lyrics[1:MAX_LEN+1]
        sample['inp_ids'] = self.word_tensor(sample['inp_words'])
        sample['out_ids'] = self.word_tensor(sample['out_words'])
        
        if self.use_artist:
            sample['artist'] = samp['artist']
            sample['artist_id'] = self.artist_tensor(sample['artist'])
        
        return sample
        
    # Turn list of words into list of longs
    def word_tensor(self,words):
        tensor = torch.zeros(len(words)).long()
        for w in range(len(words)):
            try:
                tensor[w] = self.vocab.index(words[w])
            except Exception as e:
                tensor[w] = self.vocab.index(UNK)
        return Variable(tensor)

    # returns one hot vector of artists
    def artist_tensor(self,artist):
        tensor = torch.zeros(self.num_artists).long()
        tensor[self.artists.index(artist)] = 1
        return tensor
    
    def word2id(word):
        try:
            idx = self.vocab.index(word)
        except Exception as e:
            idx = self.vocab.index(UNK)
        return idx
    
    def id2word(idx):
        return self.vocab[idx]

Data = LyricsDataset('lyrics/artists_train.pkl', vocab_file='lyrics/lyrics_top_artists.vocab', 
                     chunk_size=5,use_artist=False)
print(Data[np.random.randint(len(Data))], len(Data))

cuda:0
chunking lyrics
{'artist': [], 'out_words': ['riding', 'down', 'santa', 'claus', 'lane', '<EOL>', 'he', "'s", 'got', 'a', 'bag', 'that', "'s", 'filled', 'with', 'toys', '<EOL>', 'for', 'boys', 'and', 'girls', 'again', '<EOL>', 'hear', 'those', 'sleigh', 'bells', 'jingle', 'jangle', '<EOL>', 'what', 'a', 'beautiful', 'sight', '<END>'], 'inp_ids': tensor([    1,  1089,    61,   710,  1151,  1382,     3,    58,    14,
           36,    11,  1112,    18,    14,   882,    35,  1995,     3,
           31,   405,     9,   364,   152,     3,   173,   291,  1795,
          839,  1944,  7287,     3,    43,    11,   519,   906]), 'out_ids': tensor([ 1089,    61,   710,  1151,  1382,     3,    58,    14,    36,
           11,  1112,    18,    14,   882,    35,  1995,     3,    31,
          405,     9,   364,   152,     3,   173,   291,  1795,   839,
         1944,  7287,     3,    43,    11,   519,   906,     2]), 'inp_words': ['<START>', 'riding', 'down', 'santa', 'claus', 'lane', '<EOL>'

In [3]:
def padding(data):
    # gets samples (dicts) from Data
    
    def merge(seqs):
        lengths = [len(s) for s in seqs]
        max_len = np.max(lengths)
        
        padded_seqs = torch.ones(len(seqs), max_len).long()*PAD_ID
        for i,s in enumerate(seqs):
            end = lengths[i]
            padded_seqs[i, :end] = s[:end]
                
        return padded_seqs, lengths
    
    data.sort(key=lambda x:len(x['inp_ids']),reverse=True)
    
    inp_seqs,inp_lens = merge([x['inp_ids'] for x in data])
    out_seqs,out_lens = merge([x['out_ids'] for x in data])
    if Data.use_artist:
        inp_artists = torch.stack([x['artist_id'] for x in data])
    else:
        inp_artists = None
        
    return inp_seqs,inp_lens,out_seqs,out_lens,inp_artists,data


dataloader = DataLoader(Data, batch_size=BATCH_SIZE, shuffle=True, num_workers=1, collate_fn=padding)

# for i,batch in enumerate(dataloader):
#     print(batch)
#     break
    
ValData = LyricsDataset('lyrics/artists_val.pkl', vocab_file='lyrics/lyrics_top_artists.vocab', chunk_size=5)#,use_artist=False)
val_dataloader = DataLoader(ValData,  batch_size=BATCH_SIZE, num_workers=1, collate_fn=padding)

chunking lyrics


# Model

In [4]:
###
class LyricsRNN(nn.Module):
    def __init__(self, input_size, output_size, batch_size=BATCH_SIZE, 
                 n_layers=1, hidden_size=256, word_embedding_size=128, 
                 use_artist=True, num_artists=10, artist_embedding_size=32):
        
        super(LyricsRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.batchsize = batch_size

        self.input_size = input_size
        self.word_embed_size = word_embedding_size
        self.word_encoder = nn.Embedding(self.input_size, self.word_embed_size,padding_idx=PAD_ID)
        self.lstm_input_size = self.word_embed_size
        
        self.use_artist = use_artist
        if self.use_artist:
            self.num_artists = num_artists
            self.artist_embed_size = artist_embedding_size
            # may or may not want to use embedding for artist data (maybe just leave as one-hot)
            self.artist_encoder = nn.Embedding(self.num_artists, self.artist_embed_size)
            self.lstm_input_size += self.artist_embed_size
            
        self.lstm = nn.LSTM(self.lstm_input_size, hidden_size, n_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
        
        self.hidden = self.init_hidden()
    
    def init_hidden(self):
         (Variable(torch.randn(self.n_layers, self.batchsize, self.hidden_size)).to(device),
                Variable(torch.randn(self.n_layers, self.batchsize, self.hidden_size)).to(device))

    def forward(self, input, input_lens):
        self.hidden = self.init_hidden()
        
        if self.use_artist:
            input,artist_input = input
        
        embed = self.word_encoder(input)
#         print('word embed',embed,embed.size())
        
        if self.use_artist:
            # embed artist
            artist_embed = self.artist_encoder(artist_input)
            print('artist embed',artist_embed,artist_embed.size())
            # concatenate artist embedding to word embeddings
            embed = torch.cat([embed,artist_embed])
            print('cat embed',embed,embed.size())
                    
        emb_pad = rnn.pack_padded_sequence(embed, input_lens, batch_first=True)
        out_pad, self.hidden = self.lstm(emb_pad, self.hidden)
        output, _ = rnn.pad_packed_sequence(out_pad, batch_first=True)

        output = output.contiguous().view(-1,output.shape[2])
        output = self.linear(output)
        output = F.log_softmax(output,dim=1)
        output = output.view(self.batchsize, -1, self.output_size)
        
        return output

    def loss(self, Y_hat, Y):
        Y = Y.view(-1)
        Y_hat = Y_hat.view(-1,self.output_size)
        mask = (Y != PAD_ID).float()
        
        non_pad_tokens = torch.sum(mask).item()
        Y_hat = Y_hat[range(Y_hat.shape[0]), Y] * mask
        
        loss = -torch.sum(Y_hat) / non_pad_tokens
        return loss
    
    def evaluate(self, prime_str=[START], artist=None, predict_len=100, temperature=0.8):
        self.hidden = self.init_hidden()
        
        # repeat input across batches
        prime_input = Data.word_tensor(prime_str).expand(self.batchsize,-1).to(device)
        predicted = prime_str
        input_lens = [len(prime_str)-1]*self.batchsize
        
#         if self.use_artist:

        if len(prime_str) > 1:
            # Use priming string to "build up" hidden state
            self.forward(prime_input[:,:-1], input_lens)
            
        inp = prime_input[:,-1].view(self.batchsize,1).to(device)
        input_lens = [1]*self.batchsize
        
        for p in range(predict_len):
            # just get first row, since all rows are the same
            output = self.forward(inp, input_lens)[0]

            # Sample from the network as a multinomial distribution
            output_dist = output.data.view(-1).div(temperature).exp()
            top_i = torch.multinomial(output_dist, 1)[0]

            # Add predicted character to string and use as next input
            predicted_word = Data.vocab[top_i]
            predicted += [predicted_word]
            
            if predicted_word == END:
                break
                
            inp = Data.word_tensor([predicted_word]).expand(self.batchsize,1).to(device)

        return ' '.join(predicted)


# Training

In [5]:
n_epochs = 1000
print_every = 1000
plot_every = 1000
lr = 0.005

model = LyricsRNN(Data.vocab_len, Data.vocab_len, use_artist=False).to(device) # ,hidden_size=6, word_embedding_size=10, 
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

start = time.time()
all_losses = []
loss_avg = 0

def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

for epoch in range(1, n_epochs + 1):
    for i, batch in enumerate(dataloader):
        inp_seqs,inp_lens,out_seqs,out_lens,inp_artists,data = batch
        
        if Data.use_artist:
            inp, target = [inp_seqs.to(device),inp_artists.to(device)], out_seqs.to(device)
        else:
            inp, target = inp_seqs.to(device), out_seqs.to(device)
        model.zero_grad()
        
        predictions = model(inp, inp_lens)
        loss = model.loss(predictions, target)
        loss.backward()
        optimizer.step()
        
        loss_avg += loss

        if i % print_every == 0:
            print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / n_epochs * 100, loss))
#             print(model.evaluate(), '\n')

        if i % plot_every == 0:
            all_losses.append(loss_avg / plot_every)
            loss_avg = 0
    
    val_loss = 0
    for i,batch in enumerate(val_dataloader):
        inp_seqs,inp_lens,out_seqs,out_lens,inp_artists,data = batch
        
        if Data.use_artist:
            inp, target = [inp_seqs.to(device),inp_artists.to(device)], out_seqs.to(device)
        else:
            inp, target = inp_seqs.to(device), out_seqs.to(device)
        model.zero_grad()
        
        predictions = model(inp, inp_lens)
        loss = model.loss(predictions, target)
        val_loss += loss
    avg_val_loss = val_loss / i
    print('Validation loss: %.4f'%avg_val_loss)
    if avg_val_loss > all_losses[-1]:
        break

        ###
# def train(inp, target):
#     inp, target = inp.to(device), target.to(device)
#     model.zero_grad()
#     model.hidden = model.init_hidden()
#     loss = 0

#     inp_len = len(inp)
#     for c in range(inp_len):
#         output = model(inp[c])
#         loss += criterion(output, target[c].unsqueeze(0))

#     loss.backward()
#     improved_decoder_optimizer.step()

#     return loss.data.item() / chunk_len


[0m 0s (1 0%) 9.2126]


RuntimeError: CUDNN_STATUS_EXECUTION_FAILED

In [None]:
plt.figure()
plt.plot(all_losses)

print(model.evaluate())
