# T(AI)lor Swift
## A Taylor Swift song lyrics generator

Final project for Machine learning for statistical NLP: Advanced LT2326

Judit Casademont Moner

In [1]:
#imports
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import nltk
from torch.utils.data import DataLoader, TensorDataset
import random
from jury import Jury
from jury.metrics import Bleu

#device
device = torch.device('cuda:3')

2021-10-31 23:17:14.151377: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## Processing the data

In [2]:
file = "taylor_swift_lyrics.csv"
df = pd.read_csv(file)
df.head(n=10)

#A view of how the dataset looks like:

Unnamed: 0,artist,album,track_title,track_n,lyric,line,year
0,Taylor Swift,Taylor Swift,Tim McGraw,1,He said the way my blue eyes shined,1.0,2006.0
1,Taylor Swift,Taylor Swift,Tim McGraw,1,Put those Georgia stars to shame that night,2.0,2006.0
2,Taylor Swift,Taylor Swift,Tim McGraw,1,"I said, ""That's a lie""",3.0,2006.0
3,Taylor Swift,Taylor Swift,Tim McGraw,1,Just a boy in a Chevy truck,4.0,2006.0
4,Taylor Swift,Taylor Swift,Tim McGraw,1,That had a tendency of gettin' stuck,5.0,2006.0
5,Taylor Swift,Taylor Swift,Tim McGraw,1,On backroads at night,6.0,2006.0
6,Taylor Swift,Taylor Swift,Tim McGraw,1,And I was right there beside him all summer long,7.0,2006.0
7,Taylor Swift,Taylor Swift,Tim McGraw,1,And then the time we woke up to find that summ...,8.0,2006.0
8,Taylor Swift,Taylor Swift,Tim McGraw,1,But when you think Tim McGraw,9.0,2006.0
9,Taylor Swift,Taylor Swift,Tim McGraw,1,I hope you think my favorite song,10.0,2006.0


In [3]:
#make each song 1 string
songs_dict = {}
track_titles = []
for index, line in df.iterrows():
    if line['track_title'] not in songs_dict.keys():
        track_titles.append(line['track_title'])
        songs_dict[line['track_title']] = []
        songs_dict[line['track_title']].append(line['lyric'].lower() + ' ') #lowercase everything
    else:
        songs_dict[line['track_title']].append(line['lyric'].lower() + ' ')

for key, val in songs_dict.items():
    songs_dict[key] = ' '.join(val)

songs = []
for key, val in songs_dict.items():
    songs.append(val)

In [4]:
taylor_data = pd.DataFrame({'title': track_titles, 'lyrics': songs})
taylor_data.head(n=10)

Unnamed: 0,title,lyrics
0,Tim McGraw,he said the way my blue eyes shined put those...
1,Picture To Burn,"state the obvious, i didn't get my perfect fan..."
2,Teardrops On My Guitar,drew looks at me i fake a smile so he won't s...
3,A Place In This World,"i don't know what i want, so don't ask me cau..."
4,Cold as You,you have a way of coming easily to me and whe...
5,The Outside,i didn't know what i would find when i went l...
6,Tied Together With A Smile,seems the only one who doesn't see your beauty...
7,Stay Beautiful,"cory's eyes are like a jungle he smiles, it's..."
8,Should've Said No,it's strange to think the songs we used to sin...
9,Mary's Song (Oh My My My),"she said, i was seven and you were nine i loo..."


In [5]:
#get all the words in the dataset

songs_string = ' '.join(songs)
tokens = nltk.word_tokenize(songs_string)

unique_words = list(dict.fromkeys(tokens))

print('Total number of words:', len(tokens))
print('Total number of unique words:', len(unique_words))

Total number of words: 61280
Total number of unique words: 3348


In [6]:
#dict mapping words to integers
w_to_i = dict((w, i) for i, w in enumerate(unique_words))
#dict mapping integers to words
i_to_w = dict((i, w) for i, w in enumerate(unique_words))

In [7]:
#sentence window + target words
window = 10 #words
step = 1
sentence = []
target = []

for index, row in taylor_data.iterrows():
    for i in range(0, len(nltk.word_tokenize(taylor_data['lyrics'][index])) - window, step):
        sentence.append(tokens[i: i + window])
        target.append(tokens[i + window])
        
sentence_array = np.array(sentence)
target_array = np.array(target)

print('Example')
print('_______')
print('Sentence window:', sentence[230])
print('Target word:', target[230])
print('_______')
print('Number of sentences with their target pairs:', len(sentence))

Example
_______
Sentence window: ['that', 'little', 'black', 'dress', 'think', 'of', 'my', 'head', 'on', 'your']
Target word: chest
_______
Number of sentences with their target pairs: 59820


In [8]:
#words to np array of indexes
length = len(sentence)
taylor = np.zeros((length, window))
swift = np.zeros((length))
for index in range(length):
    s = sentence[index]
    swift[index] = w_to_i[target[index]]
    for i, word in enumerate(s):
        taylor[index, i] = w_to_i[word]

In [9]:
#loading the data to pass to the model
train_taylor = torch.tensor(taylor).type(torch.LongTensor).to(device)
train_swift = torch.tensor(swift).type(torch.LongTensor).to(device)

In [10]:
ts = TensorDataset(train_taylor, train_swift)
train = DataLoader(ts, batch_size = 32)

## Building the model

In [11]:
class LSTaylorM(nn.Module):
    def __init__(self, num_words, h_dim, e_dim):
        super(LSTaylorM, self).__init__()
        
        self.lstm = nn.LSTM(e_dim, h_dim, dropout = 0.5, num_layers = 2)
        self.embeddings = nn.Embedding(num_words, e_dim)
        self.fullyconnected = nn.Linear(h_dim, num_words)
        
    def forward(self, x):
        embedded = self.embeddings(x.t())
        out, trash = self.lstm(embedded)
        y = out[-1]
        output = self.fullyconnected(y)
        return output

In [55]:
model = LSTaylorM(len(unique_words), 256, 256)
model.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)
epochs = 20
all_loss = []

model.train()
for e in range(epochs):
    total_loss = 0
    for i, (taylor_batch, swift_batch) in enumerate(train):
        taylor_batch = taylor_batch.to(device)
        swift_batch = swift_batch.to(device)
        
        out = model(taylor_batch)
        
        loss = loss_function(out, swift_batch)
        
        loss.backward()
        
        optimizer.step()
        
        optimizer.zero_grad()
        
        total_loss += loss.item()/len(train)
    print('Epoch', str(e + 1) + ':', total_loss)
        
    all_loss.append(total_loss)
    
print('All loss:', np.average(all_loss))
print()
    
torch.save(model, 'LSTaylorM.pt')

Epoch 1: 1.0588877286273914
Epoch 2: 0.06583993694468589
Epoch 3: 0.03926806058127681
Epoch 4: 0.029303156697757175
Epoch 5: 0.024684328632608204
Epoch 6: 0.021773123721041787
Epoch 7: 0.02004396744038158
Epoch 8: 0.02002151177977772
Epoch 9: 0.017397736764160746
Epoch 10: 0.018341919694438727
Epoch 11: 0.016953308234499382
Epoch 12: 0.017159035315054475
Epoch 13: 0.016605192310730238
Epoch 14: 0.017271820024222912
Epoch 15: 0.016978508665484813
Epoch 16: 0.016422149200752755
Epoch 17: 0.01650930217381228
Epoch 18: 0.01641973974676944
Epoch 19: 0.01620952613302675
Epoch 20: 0.01714879480273059
All loss: 0.0741619423745302



## Generating T(AI)lor Swift lyrics

In [12]:
model = torch.load("LSTaylorM.pt")

In [13]:
def lil_random(x, temperature = 1.0):
    predictions = np.asarray(x).astype('float64')
    predictions = np.log(predictions) / temperature
    expected_predictions = np.exp(predictions)
    predictions = expected_predictions / np.sum(expected_predictions)
    probabilities = np.random.multinomial(1, predictions, 1)
    return np.argmax(probabilities)

In [41]:
#getting starter lines from the first lines of all songs in the dataset
starter_lines = []
for i, song in enumerate(songs):
    starter = nltk.word_tokenize(songs[i])[:9]
    starter_lines.append(starter)

#picked 5 indexes at random to produce songs
print('1:', starter_lines[8])
print('2:', starter_lines[46])
print('3:', starter_lines[90])
print('4:', starter_lines[107])
print('5:', starter_lines[140])

1: ['it', "'s", 'strange', 'to', 'think', 'the', 'songs', 'we', 'used']
2: ['put', 'your', 'lips', 'close', 'to', 'mine', 'as', 'long', 'as']
3: ['our', 'secret', 'moments', 'in', 'a', 'crowded', 'room', 'they', 'got']
4: ['what', 'did', 'you', 'think', 'i', "'d", 'say', 'to', 'that']
5: ['and', 'the', 'tennis', 'court', 'was', 'covered', 'up', 'with', 'some']


In [15]:
def lyricmaker(starter):
#     starter = random.choice(starter_lines)
    variance = 0.25
    generated = ''
    g = []
    window = starter

    for i in range(400): #the generated song will be 400 words
        x = np.zeros((1, 10))
        for i, word in enumerate(window):
            x[0, i] = w_to_i[word] #sentence to index vector

        tAIlor = Variable(torch.LongTensor(x).to(device))
        prediction = model(tAIlor)
        prediction = np.array(F.softmax(prediction, dim = 1).data[0].cpu())
        next_index = lil_random(prediction, variance)
        next_word = i_to_w[next_index]
        generated += next_word + ' '
        g.append(next_word)
        window.append(next_word)
        window = window[1:]
    a = ''
    for s in starter:
        a += s + ' '

    output_song = a + generated
    # print(output_song)

    gg = starter + g
    splitted = []
    prev = 0
    while True:
        n = random.randint(6,11)
        splitted.append(gg[prev:prev+n])
        prev = prev + n
        if prev >= len(gg)-1:
            break

    for l in splitted:
        s = ''
        for t in l:
            s = s + t + ' '
        print(s)
    return splitted

In [16]:
def getsonginlist(title):
    da_song = []
    for index, line in df.iterrows():
        if line['track_title'] == str(title):
            da_song.append([line['lyric'].lower() + ' '])
    return da_song

In [17]:
def getgetget(song):
    tot = []
    for l in song:
        ff = []
        s = ''
        for t in l:
            s = s + t + ' '
        ff.append(s)
        tot.append(ff)
    return tot

In [34]:
def evaluation(predictions, references):
    bleu = Bleu.construct()
    
    if len(predictions) == len(references):
        score = bleu.compute(predictions=predictions, references=references)
        
    elif len(predictions) > len(references):
        score = bleu.compute(predictions=predictions[:len(references)], references=references)
    
    else:
        score = bleu.compute(predictions=predictions, references=references[:len(predictions)])
    
    return score

## Writing lyrics and evaluating them

In [31]:
# 1: Should've Said No

first_line = lyricmaker(['it', "'s", 'strange', 'to', 'think', 'the', 'songs', 'we', 'used'])
original_title = getsonginlist("Should've Said No")
right_format_for_eval = getgetget(first_line)

final = evaluation(original_title, right_format_for_eval)

print('Scores:', final)

it 's strange to think the songs we used 's 
's 's said in 's wishing my 
's i about 's put n't 's 's 
thing when think mcgraw hope hope think 
mcgraw hope think my hope think hope 
think and hope hope hope think mcgraw hope 
think little hope think of said said 
and back 's said said and 's 's 's to the 
's keep you talks my fantasy stupid 's to 's 
stupid for pickup by takes takes and 
for just here 's the for hate redneck hate hate 
health 's hate stupid 's when chest my fantasy i song 
the 's talks to night 's 's in hate 's old 
my all when you be hope 
think little hope think and and hate 's said 's 
lie 's 's in hope think hope think hope think 
hope think hope think for hope think hope think 
said 's hope think month hope 
think hope think and talks said said back 's 's said 
`` 's a you just by 
hate my when hate stupid 's and hate the i 
you my blue my jeans stupid 's 
by pickup than never 's for health 's in 
in got my 's only keeps talks 's show 
as 's you talks keep you 
ta

In [36]:
# 2: Treacherous

first_line = lyricmaker(['put', 'your', 'lips', 'close', 'to', 'mine', 'as', 'long', 'as'])
original_title = getsonginlist("Treacherous")
right_format_for_eval = getgetget(first_line)

final = evaluation(original_title, right_format_for_eval)

print('Scores:', final)

put your lips close to mine as long as said 
said n't of and 's but 
takes it back 's in doorstep the talks 's all night 
do 's wishing hate 's the stupid 
my when think stupid hope realize think 's 
drive you be hate stupid 's my 's when 
it hope think hope think when 
hope think and said and 's said 
back 's 's 's in 's my 
's takes it be keep talks 's , 's be 
's just wishing my the stupid talks 's 's my my 
's only to 's hate jeans my a hate 's 's 
not hate my 's when think the 
little hope hope think hope hope hope think 
and hope hope think hope think hope think mmmm hope think 
of mmmm said way said on and back and the 's 
is said a on 's put you 
my thing the 's 's sorry 'll all my car 
keep talks n't ? 's wishing i 
n't and says 's song all talks 's , 
hate 's talks talks 's health my 's hate stupid song 
the 's talks hate my my hate 
the 's when think my fantasy hope hope hope think 
me hope hope hope think little hope think 
of said and hate and said back 's eyes 
's 's here

In [38]:
# 3: Dress

first_line = lyricmaker(['our', 'secret', 'moments', 'in', 'a', 'crowded', 'room', 'they', 'got'])
original_title = getsonginlist("Dress")
right_format_for_eval = getgetget(first_line)

final = evaluation(original_title, right_format_for_eval)

print('Scores:', final)

our secret moments in a crowded 
room they got at at n't 
's i all as talks i hate 's hate the 
when hope think me hope hope 
think little hope think of said said 
back 's said back 's 's on 
chest said and 's 's song keep than 
just talks 's talks for 's wishing me 
hate stupid and keep my thing 's 's to when could 
my the the hope think mcgraw hope 
think little talks hope think 's said 
said back 's 's for my 's 's 
's did think knows 's song 
keep 's talks to night 's wishing the 
like i and for first 's 's in 
on chest my and and right knows 's 's wishing my 
's it goes 's it you talks little talks 
to all 's health 's hate keep 
the 's i when think tim hope think hope think hope 
think dress mmmm said said back said 
and and 's 's chevy 's 's the 
in keep the hate keep hate when do wishing hope 
do but when think 's when think hope think 
mcgraw hope think hope hope think of and said 
back for said time back 's on all 
's 's 's in keep it right 
right knows 's the 's i wishing i 

In [45]:
# 4: mad woman

first_line = lyricmaker(['what', 'did', 'you', 'think', 'i', "'d", 'say', 'to', 'that'])
original_title = getsonginlist("\u200bmad woman") #some track titles were a bit messed up
right_format_for_eval = getgetget(first_line)

print(len(original_title), len(right_format_for_eval))
final = evaluation(original_title, right_format_for_eval)

print('Scores:', final)

what did you think i 'd say to 
that hate hate you and 's 's 's in 
the my when think hope think be hope hope think hope 
think of said way hope way 
and hate and 's stars 's letter the 
my 's keep in talks stupid talks 
to the when do 's all hate i hate 's hate 
in hate me takes when and 
hope you and first 's month hate my 's 
hate hate 's when you happiness hope think little hope 
think little 's and said takes 
back 's 's for it just put 
's in for the my all of pickup 's reason 's 
's stupid wonder hate a hate my 
when hate stupid 's way my 
's when think my when think hope hope think 
hope think hope think and hope and hope 
think of mmmm said said way and 's and 's on 
back all and wo be 's 's reason wishing 
i goes for talks 's 's 
talks my 's keep hate hate stupid 
's when think little hope way 
hope think i my 's hate 's 
hate stupid hate believe my pickup i and takes 
back 's 's n't n't see 's when think 
's when do and says goes talks 
's 's talks 's i bad perfectly about 


In [40]:
# 5: cowboy like me

first_line = lyricmaker(['and', 'the', 'tennis', 'court', 'was', 'covered', 'up', 'with', 'some'])
original_title = getsonginlist("cowboy like me")
right_format_for_eval = getgetget(first_line)

final = evaluation(original_title, right_format_for_eval)

print('Scores:', final)

and the tennis court was covered up with some 's 's 
keep as my when keep you and talks talks 's 
i for 's 's in it 
knows 's talks stupid 's , 's keep to it 
the 's when think mcgraw hope hope think 
hope think hope think little and 
hope think of mmmm said said said back and was 
and 's tendency 's , 's keep 
hate be in and 's you 
be 's thing you i n't 's keep 
's 's talks 's keep little 
and 's it right talks 's be to my 's 
you to 's and keep 's tell 's my hate the 
thing 's song it goes talks wishing 's 
but at do 's in keep 
hate the tim i hope hope think little and hope 
think little i hate 's hate 
when think and hope back hope think hope think dress hope 
think little said think of and 
said back 's said said back 's 's 's lie 's 
reason bad wonder keep as the my 's only all 
it 's i to and the perfectly you sorry all 
my of said on 's it and 's the 
redneck hate you you talks see my 's talks 
your and 's my thing the thing 
someday do n't `` do n't i i , 
but when think for 