<img src="assets/header'jpeg" style="width: 800px;">

# `Contents:`

- [Background](#background)
- [What is a network?](#whatisanetwork)    
- [A Game of Networks](#agameofnetworks)
- [Load Libraries](#loadlibraries)   
    
- [Load Edges & Nodes](#loadedgesandnodes)   
	- [Exploring the nodes](#exploringthenodes)
	- [Exploring the edges](#exploringtheedges)     
	- [Building the graph](#buildingthegraph)
	- [Drawing the graph](#drawingthegraph)  
    
- [Graph Metrics](#graphmetrics)   
	- [Degree](#degree)
	- [Betweeness Centrality](#betweenesscentrality)
	- [Degree and Betweeness Centrality - is there a relationship?](#degreeandbetweenesscentrality)          
	- [Building up the node dataframe](#buildingupthenodedataframe)
	- [Building up the edge dataframe](#buildinguptheedgedataframe)

- [Plotting Tools](#plottingtools)   
	- [Filtered Graph Plotter](#filteredgraphplotter)        

- [Ways we can use network science to answer questions about the characters](#questions)
	- [Which character pairings occur most often?](#often)
	- [Who is the most devoted character?](#devoted)
	- [What characters are yet to meet?](#unconnected)
	- [What house is the most important?](#house)
    


In [256]:
#data / numbers
import pandas as pd
pd.options.plotting.backend = "plotly"
import numpy as np
import string
import itertools

#NLP
import gensim
import spacy

#sklearn
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.feature_extraction import text
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS 


#PCA
from sklearn.decomposition import PCA
import contractions as ct

#viz

import plotly
import plotly.graph_objs as go

import plotly.express as px

from plotly.offline import download_plotlyjs, iplot, plot
import chart_studio
import plotly.figure_factory as ff
import chart_studio.plotly as py
chart_studio.tools.set_credentials_file(username='kitsamho', api_key='JPuAu5Xtf7BhX43kDd4m')

import tensorflow as tf
import tensorflow_hub as hub

### Read in data..

In [251]:
df = pd.read_csv('../Lyric_data/lyrics.csv') #read in csv

In [252]:
#drop any dupes
df.drop_duplicates(subset=['song'],inplace=True)

In [253]:
#reset index
df.reset_index(drop=True,inplace=True)

In [254]:
#formatting
df.lyrics = df.lyrics.map(lambda x: x.lower())
df.artist = df.artist.map(lambda x: x.replace('Iggy and The Stooges 3','Iggy and The Stooges'))

In [257]:
def fixContractions(text):
    
    """Function used contractions library to fix contractions and normalise slang"""
    
    return ct.fix(text)

In [258]:
df.lyrics = df.lyrics.map(fixContractions)

Let's look at what we have..

In [259]:
df.head(3)

Unnamed: 0,artist,song,lyrics
0,Snoop Dogg,Hollywood,where the beautiful people at? i said where th...
1,Snoop Dogg,I Wanna Go Outside,yes and we are back here at the jack-off hour ...
2,Snoop Dogg,Bitch Please II,"yeah, what up detroit nu-uh, nu-uh nuh no he d..."


### Get Genres

In [260]:
#look at all our artists
df.artist.unique()

array(['Snoop Dogg', 'Cannibal Corpse', 'Kanye West', 'Nine Inch Nails',
       'AC DC', 'The Rolling Stones', 'Public Enemy', 'Alice In Chains',
       'Cradle of Filth', 'Metallica', 'The Beatles', 'Slayer', 'Adele',
       'JAY Z', 'Nirvana', 'Green Day', 'Motley Crue', 'Stevie Wonder',
       'Sex Pistols', 'Elton John', 'Queen', 'Stormzy', 'Megadeth',
       'Madonna', 'U2', 'Drake', 'Coldplay', 'Pearl Jam',
       'Stone Temple Pilots', 'Bon Jovi', 'Iron Maiden', 'Black Sabbath',
       'Judas Priest', 'Michael Jackson', 'Taylor Swift', 'Lady Gaga',
       'Maroon 5', 'Sam Smith', 'Marvin Gaye', 'Luther Vandross',
       'Lionel Richie', 'Carcass', 'Death', 'Exhumed', 'Ramones',
       'The Clash', 'Iggy and The Stooges', 'Selena Gomez', 'Anthrax',
       'Red Hot Chili Peppers', 'George Michael', 'Aretha Franklin'],
      dtype=object)

In [261]:
#setting up genre labels for each artist
genres = ['hip-hop/rap','death-metal','hip-hop/rap',
          'alternative','rock','rock','hip-hop/rap',
          'alternative','death-metal','metal',
          'pop','metal','soul','hip-hop/rap','alternative','alternative',
          'rock','soul','punk','pop','rock','hip-hop/rap','metal',
          'pop','pop/rock','hip-hop/rap','pop/rock','alternative',
          'alternative','rock','metal','metal','metal','pop','pop','pop',
          'pop/rock','soul','soul','soul','soul','death-metal','death-metal','death-metal',
          'punk','punk','punk','pop','metal','alternative','pop','soul']

In [262]:
#make a genre dictionary
genre_dic = dict(zip(df.artist.unique(),genres))

In [263]:
genre_dic

{'Snoop Dogg': 'hip-hop/rap',
 'Cannibal Corpse': 'death-metal',
 'Kanye West': 'hip-hop/rap',
 'Nine Inch Nails': 'alternative',
 'AC DC': 'rock',
 'The Rolling Stones': 'rock',
 'Public Enemy': 'hip-hop/rap',
 'Alice In Chains': 'alternative',
 'Cradle of Filth': 'death-metal',
 'Metallica': 'metal',
 'The Beatles': 'pop',
 'Slayer': 'metal',
 'Adele': 'soul',
 'JAY Z': 'hip-hop/rap',
 'Nirvana': 'alternative',
 'Green Day': 'alternative',
 'Motley Crue': 'rock',
 'Stevie Wonder': 'soul',
 'Sex Pistols': 'punk',
 'Elton John': 'pop',
 'Queen': 'rock',
 'Stormzy': 'hip-hop/rap',
 'Megadeth': 'metal',
 'Madonna': 'pop',
 'U2': 'pop/rock',
 'Drake': 'hip-hop/rap',
 'Coldplay': 'pop/rock',
 'Pearl Jam': 'alternative',
 'Stone Temple Pilots': 'alternative',
 'Bon Jovi': 'rock',
 'Iron Maiden': 'metal',
 'Black Sabbath': 'metal',
 'Judas Priest': 'metal',
 'Michael Jackson': 'pop',
 'Taylor Swift': 'pop',
 'Lady Gaga': 'pop',
 'Maroon 5': 'pop/rock',
 'Sam Smith': 'soul',
 'Marvin Gaye': '

In [264]:
#add genre columns
df['genre'] = df.artist.map(lambda x: genre_dic[x])

### Count word frequencies for each song

In [265]:
#get lyric frequencies for each song
df['lyric_count'] = df['lyrics'].map(lambda x: len(x.split()))

For plotting, we need to scale back the data in order for the traces to be the right size for the Plotly visualisations

In [266]:
#normalise lyric counts
norm = [float(i)/max(df.lyric_count) for i in df.lyric_count]

In [267]:
#get column for normalised data
df['lyric_count_norm'] = norm

In [268]:
#scale it up
df['lyric_count_norm'] = df['lyric_count_norm'].map(lambda x: x*55)

Let's look at what we have..

In [269]:
df.head(3)

Unnamed: 0,artist,song,lyrics,genre,lyric_count,lyric_count_norm
0,Snoop Dogg,Hollywood,where the beautiful people at? i said where th...,hip-hop/rap,302,10.234134
1,Snoop Dogg,I Wanna Go Outside,yes and we are back here at the jack-off hour ...,hip-hop/rap,428,14.504005
2,Snoop Dogg,Bitch Please II,"yeah, what up detroit nu-uh, nu-uh nuh no he d...",hip-hop/rap,738,25.009242


### Get a unique artist code (for plotting)

We need to get some unique codes in order to be able to colour our traces

In [270]:
#get a unique code for each artists
artist_code = [i for i in range(len(df.artist.unique()))]

#get a dictionary
artist_dic = dict(zip(df.artist.unique(),artist_code))

#add artist code as column
df['artist_coding'] = df.artist.map(lambda x: artist_dic[x])

### Get a genre code (for plotting)

As above - but for genre

In [271]:
#get a unique code for each genre
genre_code = [i for i in range(len(df.genre.unique()))]

#get a dictionary
genre_code_dic = dict(zip(df.genre.unique(),genre_code))

#add genre code as column
df['genre_coding'] = df.genre.map(lambda x: genre_code_dic[x])


Let's look at what we have..

In [272]:
df.head(3)

Unnamed: 0,artist,song,lyrics,genre,lyric_count,lyric_count_norm,artist_coding,genre_coding
0,Snoop Dogg,Hollywood,where the beautiful people at? i said where th...,hip-hop/rap,302,10.234134,0,0
1,Snoop Dogg,I Wanna Go Outside,yes and we are back here at the jack-off hour ...,hip-hop/rap,428,14.504005,0,0
2,Snoop Dogg,Bitch Please II,"yeah, what up detroit nu-uh, nu-uh nuh no he d...",hip-hop/rap,738,25.009242,0,0


# EDA

Before we start looking at sentence embeddings, let's have a little explore of what data we have

## Count of Genre

In [273]:
#plot genres
fig_genre = pd.DataFrame(df.genre.value_counts()).plot.bar(template='ggplot2')

#title parameters
title_param = dict(text='<b>Count of Genre</b><br></b>', 
                        font=dict(size=20))

#update layout
fig_genre.update_layout(title=title_param,
                  width=1000,
                  height=500,        
                  xaxis = dict(title='Genre'),
                  yaxis = dict(title='Count'),
                  autosize=False,
                  showlegend=False,)

fig_genre.update_traces(marker_color='rgb(148, 103, 189)')
#show plot
fig_genre.show()



## Count of Artist / Band

In [274]:
fig_artist = pd.DataFrame(df.artist.value_counts()).plot.barh(template='ggplot2')

#title parameters
title_param = dict(text='<b>Count of Artist</b><br></b>', 
                        font=dict(size=20))

#update layout
fig_artist.update_layout(title=title_param,
                  width=1000,
                  height=1000,        
                  xaxis = dict(title='Song Count'),
                  yaxis = dict(title='Artist'),
                  autosize=False,
                  showlegend=False,)

fig_artist.update_traces(marker_color='rgb(148, 103, 189)')
#show plot
fig_artist.show()

## Distribution of Lyric Length, by genre

In [275]:
#empty list
lyric_count_df = []

In [276]:
#loop through each unique genre
for i in df.genre.unique():
    
    #get subset of genres appended to list
    lyric_count_df.append(pd.DataFrame(df[df.genre == i]['lyric_count']))
    
#concat all DataFrames where each column is a genre
word_counts = pd.concat([i for i in lyric_count_df],axis=1)

In [277]:
#update columns
word_counts.columns = df.genre.unique()

In [279]:
#plot lyric count distributon for each genre
fig_dist = word_counts.plot.box(template='ggplot2')



#title parameters
title_param = dict(text='<b>Distribution of lyric length by genre</b><br></b>', 
                        font=dict(size=20))

#update layout
fig_dist.update_layout(title=title_param,
                  width=1000,
                  height=700,        
                  xaxis = dict(title='Genre'),
                  yaxis = dict(title='Song lyric length (n words)'),
                  autosize=False,
                  showlegend=False,)

fig_dist.update_traces(marker_color='rgb(148, 103, 189)')
#show plot
fig_dist.show()

# PoS Analysis

In order to perform part of speech analysis we need use something like spaCy

In [280]:
#load spacy model
nlp = spacy.load('en_core_web_lg')

In [281]:
def pos(string,pos):
    
    """Returns any token that qualifies as a specific part of speech"""
    
    doc = nlp(string) #fit model
    
    return ' '.join(list(set([i.text for i in doc if i.pos_ == pos]))) #return any tokens that qualify

In [282]:
#get nouns
df['nouns'] = df.lyrics.map(lambda x: pos(x,'NOUN'))

In [283]:
#get verbs
df['verbs'] = df.lyrics.map(lambda x: pos(x,'VERB'))

In [284]:
#get adjectives
df['adjectives'] = df.lyrics.map(lambda x: pos(x,'ADJ'))

In [285]:
#get adverbs
df['adverbs'] = df.lyrics.map(lambda x: pos(x,'ADV'))

Let's inspect the new columns..

In [246]:
# subset on new columns
df.iloc[200:204][df.columns[8:]]

Unnamed: 0,nouns,verbs,adjectives,adverbs
200,taste sweetie woo traps hands vain ministers n...,hope get made stank mean call let meet sealed ...,round sure long criminal pleased,around well down damn just after all when
201,shoes arm line face carryin bar tonight woman ...,knew let keep see loose getting come dressed d...,loose drunk corny wrong more,maybe just about no never right
202,reds fruits shoes child wine shit ha friend sp...,scrape drop thank come said stop beg want help...,stormy sweet bitter,down right
203,mind glimpse woods babe baby time park love da...,torn looked ripped swirling taken followed wai...,strong hooked sweet scared seedy neat hard dar...,just so anywhere someday


A function that can count the most common PoS will be helpful

In [247]:
def mostcommonTokens(data,additional_stopwords = [],token=1):
    
    """Returns a DataFrame of the most common n (arg)tokens in a string. Excludes stop words. Additional
    stop words can be added"""
    
    #add any new stop words to default stopword list
    add_stop_words = ENGLISH_STOP_WORDS.union(additional_stopwords)

    #instantiate count vectorizer and specify tokens
    vect = CountVectorizer(stop_words=add_stop_words, ngram_range= (token,token))

    #fit vectorizer
    X = vect.fit_transform(data)
    
    #get word counts as DataFrame
    word_counts = pd.DataFrame(np.asarray(X.sum(axis=0))[0],vect.get_feature_names(),columns=['count'])
    
    #sort by count
    word_counts = word_counts.sort_values(by='count', ascending=False)

    return word_counts #return DataFrame


Lets look at the top 10 words in the whole lyric corpus

In [286]:
#call the function on one gram
mostcommonTokens(df.lyrics,token=1).head(10)

Unnamed: 0,count
like,2179
yeah,2097
know,2059
oh,2057
love,2018
just,1947
got,1923
want,1584
time,1190
baby,1105


We can integrate the mostcommonTokens function with Treemap visualisations to make the analysis clearer

In [290]:
def genreMap(df,pos,n=100):
    
    """Visualisation function. Returns a treemap of the most common n tokens that qualify as a part of speech"""
    
    #list to populate subset DataFrame's
    all_df = []

    #loop through unique genres
    for i in df.genre.unique():

        #get most common tokens for part of speech
        temp_df = mostcommonTokens(df[df['genre'] == i][pos]).head(n)
        
        #add column for current genre
        temp_df['genre'] = i

        #append DataFrame to list
        all_df.append(temp_df)

    #concatenate all DataFrame's
    all_df = pd.concat([i for i in all_df])

    #reset index
    all_df.reset_index(drop=False,inplace=True)

    #column labels
    all_df.columns = ['word','count','genre']

    # in order to have a single root node add 'all genres'
    all_df["all genres"] = "all genres" 

    #create plotly treemap figure
    fig = px.treemap(all_df, path=['all genres', 'genre', 'word'], values='count',)
    
    #title parameters
    title_param = dict(text='<b>Most Common Nouns, by genre</b><br>spaCy used for classifying PoS</b>', 
                        font=dict(size=20))
    
    #update layout
    fig.update_layout(title=title_param,
                    font= dict(family='Helvetica',size=10),
                    width=900,
                    height=900,
                    autosize=False,
                    margin=dict(l=40,r=40,b=85,t=100,pad=0,))
        
    #show plot
    fig.show()
    
    return fig




In [291]:
fig_nouns = genreMap(df,'nouns')

# Load Universal Sentence Encoder

In order to get embeddings - we need to load the universal sentence encoder.

In [292]:
#get universal sentence encoder
USE = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [293]:
def getUSEEmbed(_string,USE = USE):
    
    """Function takes a string argument and returns its high dimensional vector from USE"""
    
    return np.array(USE([_string])[0])

In [294]:
example = getUSEEmbed('Hello how are you?')

Inspect shape of the vector

In [296]:
example.shape

(512,)

We can assess similarity by looking at **cosine similarity** of the vectors..

The cosine similarity of two high dimensional vectors representing SIMILAR sentences should be high.. 

For sentences that are DIFFERENT, the cosine similarity should be low..



In [298]:
#some example sentences
example_sentences = ["The weather is going to be really warm today",
                     
                     "Today is going to be the sunniest day of the year",
                     
                     "I like my eggs sunny side up",
                     
                     "Breakfast is my favourite meal"]

In [299]:
#get embeddings for each example sentence
embed = [getUSEEmbed(i) for i in example_sentences]

In [300]:
#set up a dictionary where each sentece is a key and its value is its 512 vector embedding
dic_ = dict(zip(test,embed))

In [304]:
#let's find all the unique pairwise sentence combinations
combo = [list(i) for i in itertools.combinations(example_sentences, 2)]

In [305]:
combo

[['The weather is going to be really warm today',
  'Today is going to be the sunniest day of the year'],
 ['The weather is going to be really warm today',
  'I like my eggs sunny side up'],
 ['The weather is going to be really warm today',
  'Breakfast is my favourite meal'],
 ['Today is going to be the sunniest day of the year',
  'I like my eggs sunny side up'],
 ['Today is going to be the sunniest day of the year',
  'Breakfast is my favourite meal'],
 ['I like my eggs sunny side up', 'Breakfast is my favourite meal']]

In [307]:
def cosineSimilarity(vec_x,vec_y):
    
    """Function returns pairwise cosine similarity of two vector arguments"""
    
    return cosine_similarity([vec_x],[vec_y])[0][0]

Let's make a DataFrame that maps the cosine similarity for each unique sentence pair

In [308]:
#empty list for data
cs = []

#lop through each unique sentence pairing
for i in range(len(combo)):
    
    #get cosine similarity for each
    cs_ = cosineSimilarity(dic_[combo[i][0]],dic_[combo[i][1]])
    
    #append data to list
    cs.append((combo[i][0],combo[i][1],cs_))



In [309]:
#construct DataFrame
cs_df = pd.DataFrame(cs,columns=['sent_1','sent_2','cosine_similarity']).\
                    sort_values(by='cosine_similarity',ascending=False)

Let's look at the data..

Sentences that are similar have higher cosine similarity. This mean the vectors that represent these sentences are similar. That's really impressive!

In [311]:
cs_df

Unnamed: 0,sent_1,sent_2,cosine_similarity
0,The weather is going to be really warm today,Today is going to be the sunniest day of the year,0.519781
5,I like my eggs sunny side up,Breakfast is my favourite meal,0.41655
1,The weather is going to be really warm today,I like my eggs sunny side up,0.150719
3,Today is going to be the sunniest day of the year,I like my eggs sunny side up,0.146834
2,The weather is going to be really warm today,Breakfast is my favourite meal,0.139814
4,Today is going to be the sunniest day of the year,Breakfast is my favourite meal,0.137269


# USE Embeddings

Let's apply this technique to get embeddings for each of our songs.

In [312]:
#get USE embeddings for each song
df_embed = pd.DataFrame([getUSEEmbed(df.lyrics[i]) for i in range(df.shape[0])],\
                        index=[df.song[i] for i in range(df.shape[0])])

The lyrics of each song are represented as high dimensional vectors

In [313]:
df_embed.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
Hollywood,0.031945,-0.03947,0.067169,0.037638,0.055449,-0.057283,0.015879,0.011321,0.046767,0.007317,...,-0.035721,-0.070376,-0.001685,-0.009381,-0.008252,0.050228,-0.065086,-0.003475,0.045313,0.027769
I Wanna Go Outside,0.018305,-0.063809,0.066128,0.06549,-0.040116,0.031882,0.010662,-0.042234,0.045169,-0.024511,...,-0.038127,-0.067768,0.03556,0.011108,-0.023002,0.051027,0.03482,-0.013699,-0.04984,-0.015471
Bitch Please II,-0.004364,-0.022046,0.062683,0.058048,0.001349,-0.040177,0.057402,-0.025861,-0.04551,-0.029392,...,-0.007696,-0.062685,-0.013247,-0.022216,0.040196,0.030657,-0.012255,0.025255,-0.06161,-0.059172
Forgot About Dre,-0.056103,-0.036869,0.064619,0.061866,0.030001,0.029511,0.058852,0.007722,-0.063222,-0.057571,...,-0.019299,-0.064862,-0.029222,-0.032879,-0.056511,0.024965,-0.030866,0.018206,-0.057782,-0.054157
Who Am I What s My Name,-0.037254,-0.025406,0.055416,0.052488,-0.010872,-0.052389,-0.055218,-0.043555,0.019043,0.030778,...,0.023482,-0.055413,-0.049951,0.026458,-0.032243,-0.055416,0.053424,-0.045275,-0.055404,-0.055368
Still D R E 4,-0.05145,-0.043041,0.061808,0.044671,-0.00156,-0.045167,0.057029,-0.018829,-0.06038,-0.019011,...,0.053305,-0.061808,-0.04164,-0.033476,-0.049697,0.018316,-0.000133,0.022376,-0.061462,-0.059913
Onda diferente feat Papatinho,-0.02529,-0.055152,0.058948,0.038355,-0.057572,0.048022,0.046603,-0.054027,-0.057517,0.055173,...,0.009989,-0.058944,-0.042791,0.032751,-0.048464,0.058677,0.058184,0.046278,-0.058671,-0.05895
Young Wild Free,-0.010504,-0.049545,0.066359,0.043981,-0.045096,-0.065617,0.043706,0.024774,-0.042835,0.017063,...,0.035944,-0.066807,0.020015,-0.057763,-0.058356,-0.040083,-0.053104,0.017477,-0.062313,-0.047047
Gin Juice,0.001986,-0.051723,0.061851,0.049477,0.020119,0.019104,0.05776,-0.044103,0.056063,-0.027305,...,0.025074,-0.064295,0.002977,-0.043329,-0.042093,-0.041022,0.013241,-0.002306,-0.059803,-0.060226
Bitch Please,0.003131,-0.057889,0.058769,0.056847,0.017084,-0.053589,0.053192,-0.039342,-0.04818,0.005184,...,0.038014,-0.058769,-0.040805,0.025714,0.026791,-0.045621,0.026785,-0.035045,-0.058133,-0.058393


Let's use another data reduction technique - Principal Component Analysis, to reduce the dimensionality of these embeddings.

In [314]:
def getPCA(df,df_embedding,n_components):
    
    """Function uses PCA to reduce dimensionality of the USE embeddings for each song
    returning a DataFrame with either 1,2,3 PC's"""
    
    cols = df_embedding.index
    
    embeddings = df_embedding.iloc[:, 1:] #subset on embeddings only
    
    pca = PCA(n_components=n_components) #instantiate PCA

    pca.fit(embeddings) #fit the embeddings

    new_values = pca.transform(embeddings) #transform / reduce data
    
    #let's see how much variance can be explained
    print(pca.explained_variance_ratio_.cumsum()) 
    
    #option to have 1,2 columns / components
    if n_components == 2:
        
        columns = ['pca_x','pca_y']
        
    elif n_components == 3:
        
        columns = ['pca_x','pca_y','pca_z']
        
    else: 
        columns = ['pc']
    
    df_reduced = pd.DataFrame(new_values,index=cols) #get new DataFrame
    
    df_reduced.columns = columns
    
    if n_components == 1:
        
        df_reduced.sort_values(by='pc',ascending=False,inplace=True)
    
    #merges PCA DataFrame on original DataFrame
    df_merge = pd.merge(df,df_reduced,how='inner',left_on='song',right_on=df_reduced.index)
    
    return df_merge #returns merged DataFrame

In [315]:
pc_1 = getPCA(df,df_embed,n_components=1)
pc_2 = getPCA(df,df_embed,n_components=2)
pc_3 = getPCA(df,df_embed,n_components=3)

[0.06896908]
[0.06896908 0.11355067]
[0.06896908 0.11355067 0.1463629 ]


Let's inspect the data..

In [326]:
#look at the 1st principal component 
pc_1.head(2)

Unnamed: 0,artist,song,lyrics,genre,lyric_count,lyric_count_norm,artist_coding,genre_coding,nouns,verbs,adjectives,adverbs,pc
0,Snoop Dogg,Hollywood,where the beautiful people at? i said where th...,hip-hop/rap,302,10.234134,0,0,vibe gorilla hills city people bitches thing j...,dimmed clap tie wake excite sinking going make...,alright seductive wonderful bad dark real frag...,now so where exactly how just bareback never back,0.082787
1,Snoop Dogg,I Wanna Go Outside,yes and we are back here at the jack-off hour ...,hip-hop/rap,428,14.504005,0,0,dial nigga sky tension battles hands kitchen o...,supposed get take waitin rush dance say stuck ...,double o fresh old whole right little safe rea...,out around outside rather wisely too right how...,-0.181245


In [322]:
#look at the 1st and 2nd principal components
pc_2.head(2)

Unnamed: 0,artist,song,lyrics,genre,lyric_count,lyric_count_norm,artist_coding,genre_coding,nouns,verbs,adjectives,adverbs,pca_x,pca_y
0,Snoop Dogg,Hollywood,where the beautiful people at? i said where th...,hip-hop/rap,302,10.234134,0,0,vibe gorilla hills city people bitches thing j...,dimmed clap tie wake excite sinking going make...,alright seductive wonderful bad dark real frag...,now so where exactly how just bareback never back,0.082787,0.263123
1,Snoop Dogg,I Wanna Go Outside,yes and we are back here at the jack-off hour ...,hip-hop/rap,428,14.504005,0,0,dial nigga sky tension battles hands kitchen o...,supposed get take waitin rush dance say stuck ...,double o fresh old whole right little safe rea...,out around outside rather wisely too right how...,-0.181244,0.356239


In [323]:
#look at the 1st, 2nd and 3rd principal components
pc_3.head(2)

Unnamed: 0,artist,song,lyrics,genre,lyric_count,lyric_count_norm,artist_coding,genre_coding,nouns,verbs,adjectives,adverbs,pca_x,pca_y,pca_z
0,Snoop Dogg,Hollywood,where the beautiful people at? i said where th...,hip-hop/rap,302,10.234134,0,0,vibe gorilla hills city people bitches thing j...,dimmed clap tie wake excite sinking going make...,alright seductive wonderful bad dark real frag...,now so where exactly how just bareback never back,0.082787,0.263122,0.096147
1,Snoop Dogg,I Wanna Go Outside,yes and we are back here at the jack-off hour ...,hip-hop/rap,428,14.504005,0,0,dial nigga sky tension battles hands kitchen o...,supposed get take waitin rush dance say stuck ...,double o fresh old whole right little safe rea...,out around outside rather wisely too right how...,-0.181244,0.356235,0.058101


Let's do some grouping to explore which artists have lyrics that are most similar to each other

As we can see - even with one component, artists from genres tend to cluster together. Death metal, heavy metal and rock differ slighty from one another but vastly different to pop and soul. This makes sense!

In [329]:
pc1_group = pd.DataFrame(pc_1.groupby('artist')['pc'].mean())

fig = pc1_group.sort_values(by='pc').plot.barh(labels=dict(variable="artist", value="PC"),
                               template='ggplot2')

title_param = dict(text='<b>Lyric similarity, by artist</b><br>Based on Principal Component 1</b>', 
                        font=dict(size=20))

margin_param=dict(l=40,r=40,b=85,t=200,pad=0)

fig.update_layout(
                      title=title_param,
                      width=1000,
                      height=1000,
                    margin = margin_param,
                      xaxis = dict(title='Principal Component 1'),
                      autosize=False,
                      showlegend=False,
                    )

 Let's look at the same analysis but grouped by genres.

This confirms what we found above, the lyrics in genres cluster into groups that make sense.

In [328]:
pc1_group = pd.DataFrame(pc_1.groupby('genre')['pc'].mean())

fig = pc1_group.sort_values(by='pc').plot.bar(labels=dict(variable="genre", value="PC"),\
                               template='ggplot2')

title_param = dict(text='<b>Lyric similarity, by genre</b><br>Based on Principal Component 1</b>', 
                        font=dict(size=20))

margin_param=dict(l=40,r=40,b=85,t=200,pad=0)

fig.update_layout(
                      title=title_param,
                      width=1000,
                      height=900,
                    margin=margin_param,
                      xaxis = dict(title='Genre'),
                      yaxis = dict(title='Principal Component 1'),
                      autosize=False,
                      showlegend=False,
                    )

Let's make a more sophisticated visualisation that uses scatter plots to show this data by song. 

We can look at this in two dimensions (PC1,PC2) but also in three dimensions (PC1,PC2,PC3)

# Plotting

In [337]:
def scatPlot(df,df_embed,n_components=2):
    
    """Comprehensive function that adds components to a scatter plot"""
    
    df = getPCA(df,df_embed,n_components) #gets PCA
             
    fig = go.Figure() #instantiate Ploty.go object

    genres = df.genre.unique() #get unique genres     
    
    for i in range(len(genres)): #loop through genres

        df_mask = df[df.genre == genres[i]] #subset on genre
        
        df_mask['artist_song'] = df_mask['artist']+' // '+df_mask['song'] #new labels for traces
             
        if n_components == 2:
            
            #add traces for 2d - several formatting options for hover text, and marker size
            fig.add_trace(go.Scatter(
                          x=df_mask['pca_x'],
                          y=df_mask['pca_y'],
                          name=genres[i],
                          text=df_mask['artist_song'],
                          mode='markers',hoverinfo='text',
                          marker={'size':df_mask.lyric_count_norm}))
        
        else:
            #add traces for 3d - several formatting options for hover text, and marker size
            fig.add_trace(go.Scatter3d(
                          x=df_mask['pca_x'],
                          y=df_mask['pca_y'],
                          z=df_mask['pca_z'],
                          name=genres[i],
                          text=df_mask['artist_song'],
                          mode='markers',hoverinfo='text',
                          marker={'size':df_mask.lyric_count_norm}))
    # axis parameters
    axis_x_param=dict(showline=True, 
                      zeroline=True,
                      showgrid=True,
                      showticklabels=True,
                      title='Principal Component 1')
    # axis parameters
    axis_y_param=dict(showline=True, 
                      zeroline=True,
                      showgrid=True,
                      showticklabels=True,
                      title='Principal Component 2')
    
    # legend parameters
    legend_param= dict(bgcolor=None,
                       bordercolor = None,
                       borderwidth = None,
                       font = dict(family='Open Sans',size=15,color=None),
                       orientation='h',
                       itemsizing='constant',
                       title=dict(text='Genres (clickable!)',
                                  font=dict(family='Open Sans',size=20,color=None),
                                  side='top'),)
    # margin parameters
    margin_param=dict(l=40,r=40,b=85,t=200,pad=0)
    
    # title parameters
    title_param = dict(text='<b>Similarities and differences in song lyrics by genre</b>\
    <br>Universal sentence encodings and dimensionality reduction</b><br>Scatter size represents lyric count', 
                        font=dict(size=20))
    
    #update layout
    fig.update_layout(legend= legend_param,
                      title=title_param,
                      width=1000,
                      height=1000,
                      autosize=False,
                      showlegend=True,
                      xaxis=axis_x_param,
                      yaxis=axis_y_param,
                      margin=margin_param,)
    
        


    fig.show()    
    return fig

In [338]:
scatPlot_2 = scatPlot(df,df_embed,n_components=2)

[0.06896908 0.11355067]


In [336]:
scatPlot_3 = scatPlot(df,df_embed,n_components=3)

[0.06896908 0.11355067 0.1463629 ]


In [331]:
def savePlot(fig,filename):
        
    """Method to save Plotly figure to chart studio"""

    py.plot(fig, filename = filename, auto_open=True)

    return print('Successfully saved as ',filename)