**Lyric Matcher**

This notebook shows how we can use NLP to generate song lyrics from an initial string we give the algorithm: each new line is an existing lyric that is semantically similar to the previous line.

We use the sentence-transformers package and a Kaggle dataset of lyrics (https://www.kaggle.com/neisse/scrapped-lyrics-from-6-genres) to do this.

In [None]:
# Install sentence-transformers to environment
pip install sentence-transformers

In [None]:
import nltk
# Fetch the NLTK stopword corpus; nltk.corpus.stopwords is used further down
# when choosing the chorus keyword.
nltk.download('stopwords')
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Report whether PyTorch can see a CUDA device
cuda_ready = torch.cuda.is_available()
print(cuda_ready)

False


In [None]:
# Read in lyrics dataset and sample it
# We only use 5% of the dataset and this still gives us enough choice to produce
# reasonable lyrics
df = pd.read_csv("drive/MyDrive/Lyric Data/lyrics-data.csv")
df = df[df['Idiom'] == 'ENGLISH']
# Fixed seed so that re-running the notebook draws the same sample
df = df.sample(frac=0.05, random_state=42)

# Load the pretrained model
model = SentenceTransformer('all-distilroberta-v1')

# Split each song on '.' and put every resulting piece on its own row.
# explode() replaces the much slower apply(pd.Series).stack() idiom.
data = df["Lyric"].str.split('.').explode().dropna()

# Strip whitespace BEFORE filtering and de-duplicating, so that
# whitespace-only pieces are dropped and ' foo' / 'foo' count as one lyric
data = data.str.strip()
data = data[data != '']

# Remove duplicates (first occurrence wins, order preserved) and save to list
lyrics = data.drop_duplicates().tolist()

# Save the processed lines; the single column is written with the header '0'
pd.DataFrame(lyrics).to_csv('processed_lyrics.csv')

In [None]:
# Reload the processed lyric lines from disk.
# keep_default_na=False stops pandas from converting lines such as "NA",
# "null" or "None" (and empty cells) into float NaN, so every entry of
# `lyrics` stays a string.
lyrics = pd.read_csv('processed_lyrics.csv', keep_default_na=False)
# Drop empty lines: they carry no text to embed or to show in the song
lyrics = [line for line in lyrics['0'].astype(str) if line != '']
len(lyrics)

176739

In [None]:
# Generate embeddings
embeddings = model.encode(lyrics,show_progress_bar=True)
embeddings = pd.DataFrame(embeddings)

Batches:   0%|          | 0/5524 [00:00<?, ?it/s]

In [None]:
# Preview the embeddings DataFrame built from `lyrics`
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,-0.005095,-0.013471,-0.023495,0.027081,0.003616,-0.021313,-0.053567,0.076531,-0.010433,-0.007882,-0.016795,-0.019303,0.009807,-0.000362,-0.091040,0.020840,-0.090309,-0.015065,-0.006072,0.018622,0.011398,0.006286,-0.034052,0.069756,-0.046464,0.000404,0.035157,-0.045628,0.015068,-0.007326,0.000538,0.043978,0.011896,0.092523,0.010930,-0.013094,-0.000759,0.015206,0.025728,0.012458,...,-0.033533,-0.012492,-0.005763,-0.031439,-0.005393,0.026313,0.043101,0.102215,0.025145,0.019133,0.039070,-0.015889,-0.005249,-0.000308,0.023160,-0.027972,0.011532,-0.004780,0.017303,-0.009437,0.017513,-0.025427,0.043802,0.037612,-0.019724,0.045845,-0.046797,-0.016670,0.079369,0.020286,0.008375,0.007095,0.044749,-0.013950,0.044821,-0.006276,0.031415,-0.026817,-0.016608,0.001907
1,0.021770,0.000284,-0.002624,-0.017232,0.077335,-0.060530,-0.001660,-0.045499,-0.048190,-0.031197,0.030072,-0.009287,0.025643,0.017197,-0.087432,0.038524,0.008716,-0.016026,-0.026144,0.059386,0.017855,0.001999,-0.060882,0.034121,0.001180,0.030615,-0.029511,0.015590,-0.031699,0.027335,0.018103,-0.039137,0.002174,0.054311,0.033879,0.014503,0.007902,-0.066591,0.038521,-0.006334,...,-0.002195,0.032856,-0.020728,0.012530,-0.031753,0.017631,0.092816,0.010827,0.010536,0.018587,-0.058415,-0.033534,0.046502,0.027000,-0.021068,-0.023124,-0.027713,0.034715,0.023185,-0.023256,0.039973,-0.023097,-0.020111,-0.007353,-0.054641,-0.014615,-0.009312,-0.050066,0.024840,-0.003258,-0.002799,0.002926,0.109224,0.009792,-0.066255,0.017600,0.025417,-0.073940,-0.063689,0.026827
2,-0.044274,-0.029007,0.022576,-0.008471,0.052252,-0.037626,-0.016426,0.017729,-0.008203,0.021062,0.074328,-0.042929,-0.050995,0.032959,0.034065,-0.077861,0.011162,-0.009699,-0.014549,-0.113500,0.005782,0.017704,-0.019368,0.016000,-0.010381,0.052672,-0.056682,0.003179,-0.027668,0.066262,-0.038945,0.087671,-0.003030,0.058742,-0.006244,0.031846,0.003913,-0.009932,-0.063224,0.037564,...,0.037681,-0.017266,-0.006762,-0.002004,0.023399,0.003808,0.007713,0.018661,0.035260,0.058354,-0.002156,-0.018901,-0.034620,0.024975,-0.074750,-0.024738,-0.010673,0.033945,-0.009901,-0.028801,-0.072381,0.015851,-0.043672,-0.033314,0.033650,0.039038,0.056168,-0.006952,0.019334,0.003357,0.042074,0.031470,0.012709,0.036075,0.000104,0.033440,-0.092267,0.057341,-0.069135,-0.012939
3,0.011000,-0.001788,-0.002386,-0.024903,0.036674,-0.012417,0.001187,-0.006216,0.023883,0.065152,0.002432,-0.009225,0.003031,-0.066267,-0.002806,0.053310,-0.008324,-0.043332,0.019391,0.072469,0.009287,-0.030916,-0.022950,0.013078,-0.011509,0.033086,-0.045464,0.002034,-0.017285,0.035469,0.054338,-0.032709,0.040088,0.025917,0.055689,-0.009927,0.003004,-0.002250,0.029283,-0.034894,...,0.053223,-0.035499,-0.035244,0.005196,0.013695,0.032618,-0.010944,-0.053574,0.019545,0.045949,0.035659,-0.016240,-0.040609,0.009123,-0.013237,0.007675,-0.025705,0.035009,0.028137,-0.047365,0.047917,-0.061719,0.008372,-0.022299,0.015986,-0.010174,-0.066238,-0.011375,0.021083,0.003104,-0.014458,0.003456,0.008885,0.032857,-0.015433,0.054040,-0.015333,0.033037,0.048312,-0.052825
4,-0.011948,0.013500,-0.001064,0.055012,0.073762,0.010138,0.011630,-0.020968,0.016295,-0.002757,0.005949,-0.043932,-0.006989,0.039090,-0.023115,-0.013494,0.011113,-0.069745,0.019728,-0.061441,0.063303,0.018881,-0.006655,0.028913,0.008201,0.000251,-0.044180,-0.029980,-0.031939,-0.001391,-0.049090,-0.000714,0.046924,0.100940,-0.015265,0.009623,0.068370,-0.027369,0.057141,-0.021451,...,0.001473,0.067838,0.049492,-0.001832,0.028101,-0.050386,0.017068,-0.085387,0.012478,0.002621,0.022501,-0.018038,-0.033474,-0.083247,-0.044820,-0.065241,-0.010549,0.062106,0.015272,0.012275,-0.022247,-0.028694,-0.047315,0.049212,0.063525,-0.031503,-0.036497,0.014196,-0.004895,-0.012880,-0.023736,0.066302,0.003784,-0.006837,0.092172,0.041517,-0.039846,-0.014128,-0.027884,-0.008696
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176734,-0.003673,0.004143,-0.021152,-0.025148,0.012267,-0.058531,0.025301,0.021246,0.004223,-0.006952,-0.018992,-0.050295,0.029455,-0.079828,0.051181,-0.016769,-0.015724,-0.000974,0.023516,0.025384,0.008042,0.000654,-0.020421,0.002458,-0.037415,-0.008530,-0.053158,-0.069553,-0.018540,0.006970,0.017559,-0.029394,0.016889,0.025610,-0.094591,-0.029395,-0.024103,-0.027075,-0.001645,-0.020339,...,0.012344,0.014652,0.058397,0.025988,-0.041066,-0.009611,-0.015349,0.052817,0.023944,0.073790,0.041475,-0.018911,-0.032937,0.025657,-0.053046,0.054349,-0.044583,0.071933,0.000951,-0.037609,-0.094397,-0.042607,-0.047537,-0.009854,-0.072645,0.010550,-0.067500,-0.017326,-0.026172,0.000406,0.047785,0.052815,0.039109,0.030721,-0.032558,0.009466,0.045848,-0.056175,-0.041844,-0.002969
176735,0.009128,0.038884,0.018172,-0.022011,0.018793,-0.028594,0.001801,0.032429,-0.009974,-0.012514,-0.008272,-0.068494,0.029987,-0.002545,-0.005553,-0.022334,-0.111738,0.005926,0.003799,-0.055533,-0.035609,-0.012401,0.000389,-0.022524,-0.030777,-0.005216,-0.082787,-0.021629,0.038473,0.008478,-0.089686,0.015114,-0.004020,0.036171,-0.006242,0.005164,0.049280,0.001499,-0.047387,-0.015420,...,0.008211,-0.008308,0.022646,-0.033433,0.041307,-0.029404,-0.034701,-0.023457,0.053750,0.001619,0.004587,-0.016639,-0.035322,-0.026852,-0.056796,-0.062050,-0.009044,-0.001791,0.008421,0.031743,-0.016364,-0.044294,0.026149,-0.014113,-0.032566,0.055771,0.010593,0.025488,0.032764,-0.033889,0.016650,0.012177,0.013528,0.038315,-0.003841,-0.028369,-0.010221,0.083731,-0.028869,-0.020224
176736,0.043977,-0.010391,-0.047961,0.023223,-0.017620,0.003969,-0.038273,0.070652,-0.053696,0.026242,0.075950,-0.050502,-0.009910,-0.025978,-0.091752,0.019369,0.019353,-0.070846,-0.060486,0.047808,-0.007353,0.001431,0.009617,0.027304,0.032835,0.004504,0.009079,-0.026966,0.042719,-0.063437,0.022293,-0.008360,-0.002997,0.031207,-0.022229,-0.070566,-0.011848,-0.029430,0.095955,0.041497,...,-0.035307,0.007804,-0.087643,-0.008609,-0.022684,0.024458,0.006794,0.075005,-0.012602,-0.057116,-0.014937,-0.024500,0.040623,-0.032557,0.033187,0.017889,0.018141,0.016160,0.024001,0.021216,-0.070073,-0.021614,0.025906,0.038553,0.028033,-0.045415,0.000542,0.032179,0.015238,-0.028660,-0.017062,0.014803,0.004437,-0.024173,0.058620,0.089497,0.052309,0.029892,-0.083696,-0.022236
176737,0.055868,-0.009013,-0.057989,-0.050477,0.060262,0.019253,0.000970,0.022657,0.026011,-0.025866,-0.040691,0.058716,-0.009479,0.029924,-0.017455,0.075071,-0.024156,0.013220,0.003917,-0.021253,0.008312,0.021627,0.012306,0.067906,0.016332,0.049361,-0.005309,-0.006043,0.020070,0.000824,0.064267,-0.047158,0.008673,0.021170,-0.026161,-0.018874,-0.005183,-0.071437,0.054253,-0.000990,...,-0.001976,0.001678,0.042324,0.033953,-0.018498,0.002999,-0.011457,0.040651,0.012558,0.019627,0.014670,-0.018215,-0.008419,0.031117,-0.022851,-0.013490,-0.027950,-0.038924,0.012381,0.061756,-0.012111,-0.004419,0.015913,0.044668,0.012136,-0.007300,0.013047,-0.038993,0.010710,0.062251,0.008138,-0.009460,0.013726,-0.004039,-0.066440,0.008394,0.027402,0.001007,-0.008348,-0.034853


In [None]:
# Test
# Embed a lyric we enter; `new_lyric` ends up holding the embedding, which the
# similarity lookup below consumes
seed_text = 'I wanna dance with somebody'
new_lyric = model.encode(seed_text, show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Look for the lyric most similar to the one we've given, using cosine similarity
# between its embedding (`new_lyric`) and every row of `embeddings`
top_n = cosine_similarity(np.array(new_lyric).reshape(1,-1), embeddings)[0]
# Position of the single best match
best_idx = top_n.argmax()
# Only report the best match if its similarity is > 0.4
if top_n[best_idx] > 0.4:
  print(lyrics[best_idx]," - Similarity: ",top_n[best_idx])
else:
  # Say so explicitly instead of silently printing nothing
  print("No lyric with similarity above 0.4 found")

Tonight I wanna dance with someone else  - Similarity:  0.82730186


In [None]:
# Song structure: number of verses, lines in each verse, lines in the chorus

no_verses, lines_per_verse, lines_per_chorus = 3, 5, 3


In [None]:
# Total number of verse lines in the song (lines per verse x number of verses)
tot_lines = lines_per_verse * no_verses

In [None]:
# Generate lyrics for the verses

def next_similar_lyric(seed_text, used, max_similarity=0.80, n_candidates=1000):
  """Return the closest not-yet-used lyric to `seed_text`, or None.

  Reads the notebook globals `model`, `embeddings` and `lyrics`.

  Parameters:
    seed_text: lyric text to embed and match against `embeddings`.
    used: collection of lyrics already in the song; these are skipped.
    max_similarity: candidates scoring at or above this are skipped so that
      we do not repeat a (near-)identical line.
    n_candidates: how many of the top-scoring lyrics to consider.

  Returns:
    The best-scoring acceptable entry of `lyrics`, or None when none of the
    top `n_candidates` qualifies.
  """
  seed_embedding = model.encode(seed_text, show_progress_bar=True)
  scores = cosine_similarity(np.array(seed_embedding).reshape(1,-1), embeddings)[0]
  # Walk the candidates from most to least similar
  for idx in scores.argsort()[-n_candidates:][::-1]:
    if scores[idx] < max_similarity and lyrics[idx] not in used:
      return lyrics[idx]
  return None


new_lyric = "Reach for the stars"

verses = [new_lyric]
# Number of '' verse separators appended to `verses` so far
no_v = 0
# Keep chaining lyrics until we have tot_lines real (non-separator) lines
while len(verses) - no_v < tot_lines:
  candidate = next_similar_lyric(new_lyric, verses)
  if candidate is None:
    # Without this guard the loop could never make progress again
    print("No suitable unused lyric found; stopping verse generation early")
    break
  verses.append(candidate)
  # The lyric just added becomes the seed for the next line
  new_lyric = candidate
  # Close the verse with a '' separator once it has lines_per_verse lines
  if (len(verses) - no_v) % lines_per_verse == 0:
    verses.append('')
    no_v = no_v + 1

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Check song length: counts the lyric lines plus the '' separator that is
# appended after each completed verse
len(verses)

18

In [None]:
# Display the verse lines ('' entries mark the end of each verse)
verses

['Reach for the stars',
 "Reaching for somebody's star",
 'Reach for the stars"',
 'I reach for the stars',
 'You shoot for the stars',
 '',
 'I look to the stars',
 'Look up at the stars',
 'If you look above you, there are no more stars',
 'With you, there is not a star',
 "There's not a star in heaven",
 '',
 "There's a heaven above",
 'there must be heaven too',
 "Tell me there's a heaven in the sky where there is peace",
 'Is there peace',
 'No peace',
 '']

In [None]:
# Choose the most common non-stopword in the verses as a starting point for
# the chorus, then pick a random lyric that contains it

import string
from random import randrange
from statistics import mode

from nltk.corpus import stopwords

# Build the stopword set once instead of re-reading the corpus for every word
english_stopwords = set(stopwords.words('english'))

# Strip surrounding punctuation so that 'stars"' and 'you,' count as
# 'stars' and 'you'
verse_words = [
    wrd.strip(string.punctuation)
    for sub in verses
    for wrd in str(sub).split()
]

# Stopword check is case-insensitive ('I', 'You', ... are stopwords too).
# mode() raises StatisticsError if no non-stopword is left.
chorus_keyword = mode([
    wrd for wrd in verse_words
    if wrd and wrd.lower() not in english_stopwords
])

# All lyrics containing the keyword (computed once); the keyword comes from
# the verses above rather than being hard-coded
keyword_lyrics = [k for k in lyrics if chorus_keyword in str(k)]
if not keyword_lyrics:
    raise ValueError(f"No lyric contains the keyword {chorus_keyword!r}")

# randrange(n) covers every index 0..n-1, so the last match can be chosen
# and a single match no longer raises
chorus_lyric = keyword_lyrics[randrange(len(keyword_lyrics))]

chorus_keyword

In [None]:
# Show the randomly chosen lyric that seeds the chorus
chorus_lyric

'Dead stars are always the blackest'

In [None]:
# Build the chorus by chaining similar lyrics, starting from chorus_lyric:
# each new line is the most similar unused lyric to the previous line

new_lyric = chorus_lyric

chorus = [new_lyric]
while len(chorus) < lines_per_chorus:
  # Embed the current line and score it against every lyric embedding
  query_embedding = model.encode(new_lyric, show_progress_bar=True)
  top_n = cosine_similarity(np.array(query_embedding).reshape(1,-1), embeddings)[0]
  # Walk the 1000 best candidates from most to least similar
  for f in top_n.argsort()[-1000:][::-1]:
    # Skip (near-)identical lines and lines already in the chorus
    if top_n[f] < 0.80 and lyrics[f] not in chorus:
      chorus.append(lyrics[f])
      new_lyric = lyrics[f]
      break
  else:
    # No candidate qualified: stop instead of looping without progress
    print("No suitable unused lyric found; stopping chorus generation early")
    break

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Display the chorus lines (the seed lyric followed by the matched lines)
chorus

['Dead stars are always the blackest',
 'Just dead stars for dead eyes',
 "But you've got stars, they're in your eyes"]

In [None]:
# Print song: verse lines are printed as-is; every '' separator in `verses`
# is replaced by the chorus with a blank line before and after it
for verse_line in verses:
  if verse_line != '':
    print(verse_line)
    continue
  print('', *chorus, '', sep='\n')

Reach for the stars
Reaching for somebody's star
Reach for the stars"
I reach for the stars
You shoot for the stars

Dead stars are always the blackest
Just dead stars for dead eyes
But you've got stars, they're in your eyes

I look to the stars
Look up at the stars
If you look above you, there are no more stars
With you, there is not a star
There's not a star in heaven

Dead stars are always the blackest
Just dead stars for dead eyes
But you've got stars, they're in your eyes

There's a heaven above
there must be heaven too
Tell me there's a heaven in the sky where there is peace
Is there peace
No peace

Dead stars are always the blackest
Just dead stars for dead eyes
But you've got stars, they're in your eyes

