In [69]:
#data / numbers
import pandas as pd
pd.options.plotting.backend = "plotly"
import numpy as np
import string


#NLP
import gensim
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.feature_extraction import text
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity


#PCA
from sklearn.decomposition import PCA
import contractions

#viz

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, iplot, plot
import chart_studio
import plotly.figure_factory as ff
import chart_studio.plotly as py
chart_studio.tools.set_credentials_file(username='kitsamho', api_key='JPuAu5Xtf7BhX43kDd4m')

import tensorflow as tf
import tensorflow_hub as hub

import glob




In [4]:
"""
GITHUB
readme
1) scraper as class
2) Main notebook
3) raw data




MEDIUM
copy
gists
some codeblocks


"""

'\nGITHUB\nreadme\n1) scraper as class\n2) Main notebook\n3) raw data\n\n\n\n\nMEDIUM\ncopy\ngists\nsome codeblocks\n\n\n'

### Read in data..

In [145]:
df = pd.read_csv('../Lyric_data/lyrics.csv') #read in csv

### Get Genres

In [146]:
df.artist.unique()

array(['Snoop Dogg', 'Cannibal Corpse', 'Kanye West', 'Nine Inch Nails',
       'AC DC', 'The Rolling Stones', 'Public Enemy', 'Alice In Chains',
       'Cradle of Filth', 'Metallica', 'The Beatles', 'Slayer', 'Adele',
       'JAY Z', 'Nirvana', 'Green Day', 'Motley Crue', 'Stevie Wonder',
       'Sex Pistols', 'Elton John', 'Queen', 'Stormzy', 'Megadeth',
       'Madonna', 'U2', 'Drake', 'Coldplay'], dtype=object)

In [147]:
genres = ['hip-hop/rap','death-metal','hip-hop/rap',
          'alternative','rock','rock','hip-hop/rap',
          'alternative','death-metal','metal',
          'pop','metal','soul','hip-hop/rap','alternative','alternative',
          'rock','soul',
          'punk','pop','rock','hip-hop/rap','metal',
          'pop','pop/rock','hip-hop/rap','pop/rock']

In [148]:
#make a genre dictionary
genre_dic = dict(zip(df.artist.unique(),genres))

In [149]:
#add genre columns
df['genre'] = df.artist.map(lambda x: genre_dic[x])

### Count word frequencies for each song

In [150]:
#get lyric frequencies for each song
df['lyric_count'] = df['lyrics'].map(lambda x: len(x.split()))#

For plotting, we need to scale back the data in order for the traces to be the right size for the Plotly visualisations

In [151]:
#normalise lyric counts
norm = [float(i)/max(df.lyric_count) for i in df.lyric_count]

In [152]:
#get column for normalised data
df['lyric_count_norm'] = norm

In [153]:
#scale it up
df['lyric_count_norm'] = df['lyric_count_norm'].map(lambda x: x*55)

### Get a unique artist code (for plotting)

We need to get some unique codes in order to be able to colour our traces

In [154]:
#get a unique code for each artists
artist_code = [i for i in range(len(df.artist.unique()))]

#get a dictionary
artist_dic = dict(zip(df.artist.unique(),artist_code))

#add artist code as column
df['artist_coding'] = df.artist.map(lambda x: artist_dic[x])

### Get a genre code (for plotting)

As above - but for genre

In [155]:
#get a unique code for each genre
genre_code = [i for i in range(len(df.genre.unique()))]

#get a dictionary
genre_code_dic = dict(zip(df.genre.unique(),genre_code))

#add genre code as column
df['genre_coding'] = df.genre.map(lambda x: genre_code_dic[x])


Inspect new columns

In [156]:
df.tail(2)

Unnamed: 0,artist,song,lyrics,genre,lyric_count,lyric_count_norm,artist_coding,genre_coding
1008,Coldplay,When I Need A Friend,"Holy, Holy Dove descend Soft and slowly When I...",pop/rock,72,2.600131,26,8
1009,Coldplay,Life in Technicolor II,There's a wild wind blowing Down the corner of...,pop/rock,117,4.225213,26,8


# EDA

Explore distribution of genres in data

In [157]:
#plot genres
fig = pd.DataFrame(df.genre.value_counts()).plot.bar(title='Count of genre',template='ggplot2',labels=dict(index="genre", value="count"))
fig.show()

Explore song frequency by artist

In [158]:
pd.DataFrame(df.artist.value_counts()).plot.bar(title='Count of songs by artist',template='ggplot2',
                                                labels=dict(index="band", value="count"))

Let's look at the distribution of word counts in songs by genre

In [159]:
lyric_count_df = []

In [160]:
for i in df.genre.unique():
    
    lyric_count_df.append(pd.DataFrame(df[df.genre == i]['lyric_count']))
    
word_counts = pd.concat([i for i in lyric_count_df],axis=1)

In [161]:
word_counts.columns = df.genre.unique()

In [162]:
word_counts.plot.box(labels=dict(variable="genre", value="count"),template='ggplot2',title='Distribution of lyric length by genre')

# Load Universal Sentence Encoder

In order to get embeddings - we need to load the universal sentence encoder.

In [63]:
#get universal sentence encoder
USE = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [68]:
def getUSEEmbed(_string,USE = USE):
    
    """Function takes a string argument and returns its high dimensional vector from USE"""
    
    return np.array(USE([_string])[0])

In [163]:
x = getUSEEmbed('Hello how are you?')

Inspect shape of the vector

In [164]:
x.shape

(512,)

Let's see how the universal sentence encode works...
by using cosine similarity between two high dimensional vectors representing SIMILAR sentences we see a high cosine similarity. With different sentences we see low cosine similarity



In [165]:
y = getUSEEmbed("How's it going?")

In [166]:
cosine_similarity([x],[y])

array([[0.7616003]], dtype=float32)

In [167]:
z = getUSEEmbed('The library is shut today')

In [168]:
cosine_similarity([x],[z])

array([[0.02121948]], dtype=float32)

# USE Embeddings

In [169]:
#get USE embeddings for each song
df_embed = pd.DataFrame([getUSEEmbed(df.lyrics[i]) for i in range(df.shape[0])],\
                        index=[df.song[i] for i in range(df.shape[0])])

The lyrics of each song are represented as high dimensional vectors

In [180]:
df_embed.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
Qu%C3%A9 Maldici%C3%B3n,0.056127,-0.057682,0.061244,0.024724,0.036293,-0.015954,-0.048913,-0.058275,0.000548,0.048626,...,0.041816,-0.061469,-0.028729,-0.022281,-0.046868,0.061488,0.057996,0.032887,-0.059259,-0.061488
Hollywood,0.038094,-0.038747,0.067618,0.022815,0.05442,-0.054005,0.031656,-0.019276,-0.006973,0.00778,...,-0.035624,-0.068139,-0.020647,0.008774,-0.019561,0.054682,-0.059823,-0.005858,0.038268,0.0127
I Wanna Go Outside,0.040523,-0.062522,0.064338,0.060554,-0.039082,0.020569,0.023504,-0.056504,0.019111,-0.028236,...,-0.024295,-0.064507,0.007232,0.012674,-0.036413,0.057598,0.047352,-0.025239,-0.054012,-0.014935
H%C3%A3y Trao Cho Anh,-0.055349,-0.053006,0.055276,0.030029,-0.037239,-0.050743,0.054892,-0.048707,-0.054799,0.055308,...,-0.027174,-0.055466,-0.052861,0.053064,0.054891,-0.055441,-0.023995,0.053029,-0.053363,-0.055466
Bitch Please II,0.017413,-0.020858,0.060201,0.054406,0.003723,-0.038302,0.0575,-0.048243,-0.055822,-0.016195,...,0.006425,-0.060201,-0.032836,-0.010812,0.033373,0.031376,0.017206,0.022197,-0.059763,-0.058746
Forgot About Dre,-0.043108,-0.045828,0.061949,0.057756,0.031977,0.017026,0.057532,-0.024112,-0.06129,-0.052605,...,-0.01487,-0.061955,-0.053471,-0.018858,-0.056293,0.032677,-0.00545,-0.007978,-0.060287,-0.055799
Who Am I What s My Name,-0.035744,-0.031402,0.054668,0.051643,-0.009709,-0.05201,-0.054466,-0.050582,0.002098,0.038716,...,0.032469,-0.054663,-0.053015,0.03271,-0.031747,-0.054668,0.053742,-0.048302,-0.05466,-0.054647
Still D R E 4,-0.042496,-0.048596,0.06026,0.031114,0.016864,-0.049657,0.056369,-0.041385,-0.059738,-0.016863,...,0.054312,-0.060259,-0.051211,-0.030025,-0.048846,0.024502,0.010183,0.012048,-0.060068,-0.058929
Onda diferente feat Papatinho,-0.023941,-0.055286,0.058537,0.03897,-0.057273,0.048122,0.046818,-0.054009,-0.056998,0.05517,...,0.002943,-0.058532,-0.048668,0.032714,-0.048182,0.05827,0.057717,0.038872,-0.058286,-0.058538
Young Wild Free,0.00441,-0.052068,0.064898,0.04518,-0.038735,-0.064032,0.051999,-0.007492,-0.054858,0.01421,...,0.04268,-0.064976,-0.010427,-0.052598,-0.058537,-0.037424,-0.043538,0.014478,-0.06224,-0.051726


In [242]:
def getPCA(df_embedding,n_components):
    
    """Function uses PCA to reduce dimensionality of the USE embeddings for each song
    returning a DataFrame with either 1,2,3 PC's"""
    
    cols = df_embedding.index
    
    embeddings = df_embedding.iloc[:, 1:] #subset on embeddings only
    
    pca = PCA(n_components=n_components) #instantiate PCA

    pca.fit(embeddings) #fit the embeddings

    new_values = pca.transform(embeddings) #transform / reduce data
    
    
    print(pca.explained_variance_ratio_.cumsum())
    
    #option to have 1,2 columns / components
    if n_components == 2:
        
        columns = ['pca_x','pca_y']
        
    elif n_components == 3:
        
        columns = ['pca_x','pca_y','pca_z']
        
    else: 
        columns = ['pc']
    
    df_reduced = pd.DataFrame(new_values,index=cols) #get new DataFrame
    
    df_reduced.columns = columns
    
    if n_components == 1:
        
        df_reduced.sort_values(by='pc',ascending=False,inplace=True)
    
    return df_reduced #return new DataFrame

In [243]:
def mergePCA(df,df_embedding,n_components):
    
    df_reduced = getPCA(df_embedding,n_components)
    
    df_merge = pd.merge(df,df_reduced,how='inner',left_on='song',right_on=df_reduced.index)
    
    return df_merge

In [248]:
mergePCA(df,df_embed,2)

[0.06867555 0.12525669]


Unnamed: 0,artist,song,lyrics,genre,lyric_count,lyric_count_norm,artist_coding,genre_coding,pca_x,pca_y
0,Snoop Dogg,Qu%C3%A9 Maldici%C3%B3n,Es que no sabes cuánto duele el amor No sabes ...,hip-hop/rap,271,9.786605,0,0,-0.158174,-0.109948
1,Snoop Dogg,Hollywood,Where the beautiful people at? I said where th...,hip-hop/rap,289,10.436638,0,0,-0.115256,0.193755
2,Snoop Dogg,Hollywood,Where the beautiful people at? I said where th...,hip-hop/rap,289,10.436638,0,0,-0.093827,0.031203
3,Madonna,Hollywood,Everybody comes to Hollywood They wanna make i...,pop,155,5.597505,23,5,-0.115256,0.193755
4,Madonna,Hollywood,Everybody comes to Hollywood They wanna make i...,pop,155,5.597505,23,5,-0.093827,0.031203
...,...,...,...,...,...,...,...,...,...,...
1059,Coldplay,BrokEn,"Hmm-hmm, hmm-hmm Hmm-hmm, hmm-oh Hmm-hmm Lord,...",pop/rock,155,5.597505,26,8,-0.035484,-0.156408
1060,Coldplay,%C3%88k%C3%B3,Joseph rode in on a beam of light Stray dogs t...,pop/rock,108,3.900197,26,8,0.250563,-0.028779
1061,Coldplay,Clocks,"The lights go out, and I can't be saved Tides ...",pop/rock,136,4.911359,26,8,0.046526,-0.131604
1062,Coldplay,When I Need A Friend,"Holy, Holy Dove descend Soft and slowly When I...",pop/rock,72,2.600131,26,8,0.167460,-0.023719


In [210]:
pd.DataFrame(mergePCA(df,df_embed,1).groupby('artist')['pc'].mean()).\
                            sort_values(by='pc').plot.bar(labels=dict(variable="artist", value="PC"),template='ggplot2',title='PC1 by artist')

In [211]:
pd.DataFrame(mergePCA(df,df_embed,1).groupby('genre')['pc'].mean()).\
                            sort_values(by='pc').plot.bar(labels=dict(variable="genre", value="PC"),template='ggplot2',title='PC1 by genre')

# Plotting

In [212]:
def get2Plot(df,title,colour_col,label_col,x='pca_x',y='pca_y',sizing_factor=10,font=10):


        data=go.Scatter(opacity=0.7,name=False,visible=True,
                x=df[x],
                y=df[y],
                mode='markers+text',textposition='middle right',text=df[label_col],
                            marker=dict(size=sizing_factor,color=df[colour_col]))

        # set up axis for plot
        axis=dict(showline=True, # hide axis line, grid, ticklabels and  title
                  zeroline=True,
                  showgrid=False,
                  showticklabels=True,
                  title=''
                  )

        #set up figure for plot
        figure = {
        "data": data,
        "layout":
        go.Layout(title=title,
                    font= dict(family='Open Sans',size=font),
                    width=900,
                    height=900,
                    autosize=False,
                    showlegend=False,
                    xaxis=axis,
                    yaxis=axis,
                    margin=dict(
                    l=40,
                    r=40,
                    b=85,
                    t=100,
                    pad=0,),)}


        #plot the figure
        return iplot(figure)

In [213]:
get2Plot(mergePCA(df,df_embed,2),title='test',colour_col='genre_coding',label_col='genre',font=1,sizing_factor=df_merge.lyric_count_norm)

In [214]:
def get3Plot(df,title,colour_col,label_col,x='pca_x',y='pca_y',z='pca_z',sizing_factor=df.lyric_count_norm,font=10):


        data=go.Scatter3d(opacity=0.7,name=False,visible=True,
                x=df[x],
                y=df[y],
                z=df[z],
                mode='markers',textposition='middle right',text=df[label_col],
                            marker=dict(size=sizing_factor,color=df[colour_col]))

        # set up axis for plot
        axis=dict(showline=False, # hide axis line, grid, ticklabels and  title
                  zeroline=False,
                  showgrid=False,
                  showticklabels=True,
                  title=''
                  )

        #set up figure for plot
        figure = {
        "data": data,
        "layout":
        go.Layout(title=title,
                    font= dict(family='Open Sans',size=font),
                    width=900,
                    height=900,
                    autosize=False,
                    showlegend=False,
                    xaxis=axis,
                    yaxis=axis,
                    margin=dict(
                    l=40,
                    r=40,
                    b=85,
                    t=100,
                    pad=0,),)}


        #plot the figure
        return iplot(figure)

In [215]:
get3Plot(mergePCA(df,df_embed,3),title='test',colour_col='artist_coding',label_col='genre',font=8,sizing_factor=df_merge.lyric_count_norm)

In [331]:
def savePlot(fig,filename):
        
    """Method to save Plotly figure to chart studio"""

    py.plot(fig, filename = filename, auto_open=True)

    return print('Successfully saved as ',filename)

In [None]:

# #instantiate the spacy model - we need the large instance as this has the pre-trained vectors
nlp = spacy.load('en_core_web_lg')

In [417]:
x = similaritySorter(kill_lyrics,nlp)
getPlot(x,'test')

In [413]:
def similaritySorter(_string,model,n_components=2):
    
    """This function takes in a string and returns a DataFrame summary of the ouputs from a PCA
    The DataFrame can be plotted - (scatter) to see which words in the string have greater 
    similarity"""
    
    text.ENGLISH_STOP_WORDS
    #removes punctuation and expands contractions
    string_clean = contractions.fix(_string.translate(str.maketrans(' ', ' ', string.punctuation)))
        
    token_raw = model(' '.join(set(string_clean.lower().split()))) #tokenises unique tokens
    
    token_list_no_punc = [token for token in token_raw if token.pos_ != 'PUNCT'] #subset of tokens
    
    token_list = [token for token in token_list_no_punc if token.text not in text.ENGLISH_STOP_WORDS]
    
    #DataFrame of vectors
    df = pd.DataFrame([token.vector for token in token_list],index=[token.text for token in token_list])
    
    embeddings = df.iloc[:, 1:] #subset on embeddings only
    
    n_components = n_components #set components
    
    pca = PCA(n_components=n_components) #instantiate PCA

    pca.fit(embeddings) #fit the embeddings

    new_values = pca.transform(embeddings) #transform / reduce data
    
    #option to have 1,2 columns / components
    if n_components == 2:
        
        columns = ['pca_x','pca_y']
        
    else: 
        
        columns = ['pc']
    
    df_reduced = pd.DataFrame(new_values,index=df.index,columns=columns) #get new DataFrame
    
    if n_components == 1:
        
        df_reduced.sort_values(by='pc',ascending=False,inplace=True)
    
    return df_reduced #return new DataFrame