# T(AI)lor Swift
## A Taylor Swift song lyrics generator

Final project for Machine learning for statistical NLP: Advanced LT2326

Judit Casademont Moner

In [35]:
#imports
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import nltk
from torch.utils.data import DataLoader, TensorDataset

#device
device = torch.device('cuda:3')

## Processing the data

In [36]:
file = "taylor_swift_lyrics.csv"
df = pd.read_csv(file)
df.head(n=10)

#A view of how the dataset looks like:

Unnamed: 0,artist,album,track_title,track_n,lyric,line,year
0,Taylor Swift,Taylor Swift,Tim McGraw,1,He said the way my blue eyes shined,1.0,2006.0
1,Taylor Swift,Taylor Swift,Tim McGraw,1,Put those Georgia stars to shame that night,2.0,2006.0
2,Taylor Swift,Taylor Swift,Tim McGraw,1,"I said, ""That's a lie""",3.0,2006.0
3,Taylor Swift,Taylor Swift,Tim McGraw,1,Just a boy in a Chevy truck,4.0,2006.0
4,Taylor Swift,Taylor Swift,Tim McGraw,1,That had a tendency of gettin' stuck,5.0,2006.0
5,Taylor Swift,Taylor Swift,Tim McGraw,1,On backroads at night,6.0,2006.0
6,Taylor Swift,Taylor Swift,Tim McGraw,1,And I was right there beside him all summer long,7.0,2006.0
7,Taylor Swift,Taylor Swift,Tim McGraw,1,And then the time we woke up to find that summ...,8.0,2006.0
8,Taylor Swift,Taylor Swift,Tim McGraw,1,But when you think Tim McGraw,9.0,2006.0
9,Taylor Swift,Taylor Swift,Tim McGraw,1,I hope you think my favorite song,10.0,2006.0


In [37]:
#make each song 1 string
songs_dict = {}
track_titles = []
for index, line in df.iterrows():
    if line['track_title'] not in songs_dict.keys():
        track_titles.append(line['track_title'])
        songs_dict[line['track_title']] = []
        songs_dict[line['track_title']].append(line['lyric'].lower() + ' ') #lowercase everything
    else:
        songs_dict[line['track_title']].append(line['lyric'].lower() + ' ')

for key, val in songs_dict.items():
    songs_dict[key] = ' '.join(val)

songs = []
for key, val in songs_dict.items():
    songs.append(val)

In [38]:
taylor_data = pd.DataFrame({'title': track_titles, 'lyrics': songs})
taylor_data.head(n=10)

Unnamed: 0,title,lyrics
0,Tim McGraw,he said the way my blue eyes shined put those...
1,Picture To Burn,"state the obvious, i didn't get my perfect fan..."
2,Teardrops On My Guitar,drew looks at me i fake a smile so he won't s...
3,A Place In This World,"i don't know what i want, so don't ask me cau..."
4,Cold as You,you have a way of coming easily to me and whe...
5,The Outside,i didn't know what i would find when i went l...
6,Tied Together With A Smile,seems the only one who doesn't see your beauty...
7,Stay Beautiful,"cory's eyes are like a jungle he smiles, it's..."
8,Should've Said No,it's strange to think the songs we used to sin...
9,Mary's Song (Oh My My My),"she said, i was seven and you were nine i loo..."


In [39]:
#get all the words in the dataset

songs_string = ' '.join(songs)
tokens = nltk.word_tokenize(songs_string)

unique_words = list(dict.fromkeys(tokens))

print('Total number of words:', len(tokens))
print('Total number of unique words:', len(unique_words))

Total number of words: 61286
Total number of unique words: 3348


In [40]:
#dict mapping words to integers
w_to_i = dict((w, i) for i, w in enumerate(unique_words))
#dict mapping integers to words
i_to_w = dict((i, w) for i, w in enumerate(unique_words))

In [41]:
#sentence window + target words
window = 10 #words
step = 1
sentence = []
target = []

for index, row in taylor_data.iterrows():
    for i in range(0, len(nltk.word_tokenize(taylor_data['lyrics'][index])) - window, step):
        sentence.append(tokens[i: i + window])
        target.append(tokens[i + window])
        
sentence_array = np.array(sentence)
target_array = np.array(target)

print('Example')
print('_______')
print('Sentence window:', sentence[230])
print('Target word:', target[230])
print('_______')
print('Number of sentences with their target pairs:', len(sentence))

Example
_______
Sentence window: ['that', 'little', 'black', 'dress', 'think', 'of', 'my', 'head', 'on', 'your']
Target word: chest
_______
Number of sentences with their target pairs: 59826


In [42]:
#words to np array of indexes
length = len(sentence)
taylor = np.zeros((length, window))
swift = np.zeros((length))
for index in range(length):
    s = sentence[index]
    swift[index] = w_to_i[target[index]]
    for i, word in enumerate(s):
        taylor[index, i] = w_to_i[word]

In [43]:
#loading the data to pass to the model
train_taylor = torch.tensor(taylor).type(torch.LongTensor).to(device)
train_swift = torch.tensor(swift).type(torch.LongTensor).to(device)

In [44]:
ts = TensorDataset(train_taylor, train_swift)
train = DataLoader(ts, batch_size = 32)

## Building the model

In [51]:
class LSTaylorM(nn.Module):
    def __init__(self, num_words, h_dim, e_dim):
        super(LSTaylorM, self).__init__()
        
        self.lstm = nn.LSTM(e_dim, h_dim, dropout = 0.5, num_layers = 2)
        self.embeddings = nn.Embedding(num_words, e_dim)
        self.fullyconnected = nn.Linear(h_dim, num_words)
        
    def forward(self, x):
        embedded = self.embeddings(x.t())
        out, trash = self.lstm(embedded)
        y = out[-1]
        output = self.fullyconnected(y)
        return output

In [55]:
model = LSTaylorM(len(unique_words), 256, 256)
model.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)
epochs = 20
all_loss = []

model.train()
for e in range(epochs):
    total_loss = 0
    for i, (taylor_batch, swift_batch) in enumerate(train):
        taylor_batch = taylor_batch.to(device)
        swift_batch = swift_batch.to(device)
        
        out = model(taylor_batch)
        
        loss = loss_function(out, swift_batch)
        
        loss.backward()
        
        optimizer.step()
        
        optimizer.zero_grad()
        
        total_loss += loss.item()/len(train)
    print('Epoch', str(e + 1) + ':', total_loss)
        
    all_loss.append(total_loss)
    
print('All loss:', np.average(all_loss))
print()
    
torch.save(model, 'LSTaylorM.pt')

Epoch 1: 1.0588877286273914
Epoch 2: 0.06583993694468589
Epoch 3: 0.03926806058127681
Epoch 4: 0.029303156697757175
Epoch 5: 0.024684328632608204
Epoch 6: 0.021773123721041787
Epoch 7: 0.02004396744038158
Epoch 8: 0.02002151177977772
Epoch 9: 0.017397736764160746
Epoch 10: 0.018341919694438727
Epoch 11: 0.016953308234499382
Epoch 12: 0.017159035315054475
Epoch 13: 0.016605192310730238
Epoch 14: 0.017271820024222912
Epoch 15: 0.016978508665484813
Epoch 16: 0.016422149200752755
Epoch 17: 0.01650930217381228
Epoch 18: 0.01641973974676944
Epoch 19: 0.01620952613302675
Epoch 20: 0.01714879480273059
All loss: 0.0741619423745302

