# **Menon Labs: Spotify Data Analysis - Cluster Labeling**

## **Collaborators** 
- Shubhum Agrawal
- Ashna Sood 
- Shania Sinha
- Sergio Vazquez
- Rohil Khatkhate
- Kerrn Reehal

## **Imports**

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os 

import seaborn as sns
sns.set()
sns.set_context('talk')

import warnings
warnings.filterwarnings('ignore')

import patsy
import statsmodels.api as sm
import scipy.stats as stats

from sklearn.metrics import classification_report, precision_recall_fscore_support, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn import metrics

from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the `basepath` used by later cells lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


## **Labeling Clusters**

In [None]:
# combine the individual cluster csv files into one big dataframe
basepath = '/content/drive/MyDrive/Clubs Activities/Ethi_SpotifyProject/clustering results/new_data_30/'
N_CLUSTERS = 30  # files are named songs_0.csv ... songs_29.csv

# Read every per-cluster file, tag its rows with the cluster number, and concatenate once.
# DataFrame.append was removed in pandas 2.0 and copied the whole accumulated frame on
# every iteration; a single pd.concat over a list avoids both problems.
# The per-file row index is kept as-is (no ignore_index), matching the previous behaviour.
cluster_frames = [
  pd.read_csv(basepath + f'songs_{cluster_id}.csv').assign(cluster=cluster_id)
  for cluster_id in range(N_CLUSTERS)
]
data = pd.concat(cluster_frames)
data

Unnamed: 0,id,name,artists,acousticness,danceability,energy,instrumentalness,liveness,speechiness,valence,loudness,tempo,cluster
0,35mvY5S1H3J2QZyna3TFe0,positions,['Ariana Grande'],0.468000,0.737,0.802,0.000000,0.0931,0.0878,0.682,-4.771,144.015,0
1,6Hj9jySrnFppAI0sEMCZpJ,Robbery,['Juice WRLD'],0.328000,0.685,0.692,0.000000,0.1530,0.0457,0.578,-5.122,159.966,0
2,2xLMifQCjDGFmkHkpNLD9h,SICKO MODE,['Travis Scott'],0.005130,0.834,0.730,0.000000,0.1240,0.2220,0.446,-3.714,155.008,0
3,27X3qzgB0Show6qfw5wNOK,Esquema Preferido,"['DJ Ivis', 'Tarcísio do Acordeon']",0.256000,0.669,0.812,0.000000,0.0773,0.1100,0.917,-4.597,159.934,0
4,2Y0wPrPQBrGhoLn14xRYCG,Come & Go (with Marshmello),"['Juice WRLD', 'Marshmello']",0.017200,0.625,0.814,0.000000,0.1580,0.0657,0.535,-5.181,144.991,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5510,4qZq5qn0CSA6YF96OM5XVZ,Manifest- Manifest (2004),['Abuse'],0.000001,0.141,0.931,0.224000,0.3120,0.2010,0.613,-7.091,212.980,29
5511,6T0aBJyFi7U3zHuaYSMWIP,Bach Jao Mundeyo,['Sardool Sikander'],0.533000,0.658,0.852,0.000002,0.0582,0.3560,0.801,-7.159,183.985,29
5512,2n8nDfBot4MddUcUKoAqGz,Horseface,['Yea(H)'],0.004480,0.306,0.982,0.243000,0.1800,0.0416,0.558,-5.640,188.499,29
5513,55PHPSzHvd0AzuMTFDracv,Nekal mani gredzenā,"['Raimonds Pauls', 'Nora Bumbiere', 'Viktors L...",0.644000,0.204,0.476,0.000001,0.2040,0.0420,0.293,-8.611,199.410,29


In [None]:
# Emotion label for each of the 30 clusters, indexed by cluster number (labels[i] is the
# label of cluster i). Clusters with similar moods share a label, which further simplifies
# the grouping of songs by emotion.
labels = [
    'Energetic', 'Chill', 'Vulnerable', 'Chill', 'Soothing',
    'Chill', 'Soothing', 'Wistful', 'Energetic', 'Calm',
    'Vulnerable', 'Vulnerable', 'Sentimental', 'Powerful', 'Energetic',
    'Vulnerable', 'Powerful', 'Energetic', 'Energetic', 'Sentimental',
    'Chill', 'Soothing', 'Calm', 'Energetic', 'Wistful',
    'Upbeat', 'Energetic', 'Vulnerable', 'Upbeat', 'Powerful'
]
# One label per cluster file (songs_0.csv ... songs_29.csv); fail early if the list drifts.
assert len(labels) == 30, 'expected exactly one emotion label per cluster'

# Distinct labels in order of first appearance. pd.unique on a plain Python list is
# deprecated in pandas 2.x, so hand it an object ndarray instead.
pd.unique(np.array(labels, dtype=object))

array(['Energetic', 'Chill', 'Vulnerable', 'Soothing', 'Wistful', 'Calm',
       'Sentimental', 'Powerful', 'Upbeat'], dtype=object)

In [None]:
# attach the emotion label of each row's cluster as a new 'emotion' column
cluster_ids = data['cluster'].tolist()
data['emotion'] = [labels[cluster_id] for cluster_id in cluster_ids]
data

Unnamed: 0,id,name,artists,acousticness,danceability,energy,instrumentalness,liveness,speechiness,valence,loudness,tempo,cluster,emotion
0,35mvY5S1H3J2QZyna3TFe0,positions,['Ariana Grande'],0.468000,0.737,0.802,0.000000,0.0931,0.0878,0.682,-4.771,144.015,0,Energetic
1,6Hj9jySrnFppAI0sEMCZpJ,Robbery,['Juice WRLD'],0.328000,0.685,0.692,0.000000,0.1530,0.0457,0.578,-5.122,159.966,0,Energetic
2,2xLMifQCjDGFmkHkpNLD9h,SICKO MODE,['Travis Scott'],0.005130,0.834,0.730,0.000000,0.1240,0.2220,0.446,-3.714,155.008,0,Energetic
3,27X3qzgB0Show6qfw5wNOK,Esquema Preferido,"['DJ Ivis', 'Tarcísio do Acordeon']",0.256000,0.669,0.812,0.000000,0.0773,0.1100,0.917,-4.597,159.934,0,Energetic
4,2Y0wPrPQBrGhoLn14xRYCG,Come & Go (with Marshmello),"['Juice WRLD', 'Marshmello']",0.017200,0.625,0.814,0.000000,0.1580,0.0657,0.535,-5.181,144.991,0,Energetic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5510,4qZq5qn0CSA6YF96OM5XVZ,Manifest- Manifest (2004),['Abuse'],0.000001,0.141,0.931,0.224000,0.3120,0.2010,0.613,-7.091,212.980,29,Powerful
5511,6T0aBJyFi7U3zHuaYSMWIP,Bach Jao Mundeyo,['Sardool Sikander'],0.533000,0.658,0.852,0.000002,0.0582,0.3560,0.801,-7.159,183.985,29,Powerful
5512,2n8nDfBot4MddUcUKoAqGz,Horseface,['Yea(H)'],0.004480,0.306,0.982,0.243000,0.1800,0.0416,0.558,-5.640,188.499,29,Powerful
5513,55PHPSzHvd0AzuMTFDracv,Nekal mani gredzenā,"['Raimonds Pauls', 'Nora Bumbiere', 'Viktors L...",0.644000,0.204,0.476,0.000001,0.2040,0.0420,0.293,-8.611,199.410,29,Powerful


In [None]:
# Display a shuffled view of the labeled rows. The result is not assigned, so `data`
# itself keeps its original order. random_state pins the shuffle so the rows shown
# are the same on every run.
data.sample(frac=1, random_state=0)

Unnamed: 0,id,name,artists,acousticness,danceability,energy,instrumentalness,liveness,speechiness,valence,loudness,tempo,cluster,emotion
3490,1NOkswJTI2ap7QNhTi2kfi,Dark Star,['Mike Oldfield'],0.186,0.602,0.349,0.935000,0.1220,0.0288,0.430,-16.552,140.967,24,Wistful
12472,0fMRGuLoCCtv8CNklMGxaB,中央フリーウェイ,['Junko Yamamoto'],0.563,0.739,0.481,0.000780,0.1100,0.0335,0.630,-12.508,97.028,1,Chill
6622,3hE8uQOjzxvIBXPH3mIS2V,Hardcore Feelings,"['Charly Lownoise', 'Mental Theo']",0.302,0.513,0.989,0.002220,0.7140,0.0690,0.386,-6.978,170.963,23,Energetic
6786,0XFYhlNH5OEQnYg9Oj21U5,The Big Three Killed My Baby,['The White Stripes'],0.815,0.320,0.979,0.002470,0.4570,0.2720,0.384,-1.943,73.504,2,Vulnerable
3328,40Yek1A84Gh1I1X5bo634P,Facetime,['Naps'],0.485,0.850,0.748,0.000002,0.1290,0.0672,0.368,-4.381,95.951,26,Energetic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3818,0eafUurZ4awQ3ZAIvHr608,Himlen i min famn,['Carola'],0.938,0.408,0.202,0.000002,0.0982,0.0376,0.120,-14.050,86.476,27,Vulnerable
18161,7hzLyLUhgMOEyjrmgkLYt7,Angelica,['Sinn Fenn'],0.120,0.595,0.697,0.000000,0.3750,0.0268,0.631,-7.317,110.034,18,Energetic
4760,5r1P52qgqAGKnfYpPGFisR,Það er vor,['Ýmsir'],0.934,0.625,0.175,0.001590,0.1200,0.0410,0.827,-17.534,93.448,9,Calm
910,73yB2HMz9zu0VcueZ4P1UU,Longtemps,['Amir'],0.683,0.606,0.456,0.000000,0.1150,0.0573,0.563,-8.442,98.960,10,Vulnerable


In [None]:
# make the song ID the index of the dataframe
# Guarded so that re-running this cell is a no-op: after the first run 'id' is the index
# and no longer a column, so an unguarded set_index('id') would raise KeyError.
if data.index.name != 'id':
  data = data.set_index('id')
data

Unnamed: 0_level_0,name,artists,acousticness,danceability,energy,instrumentalness,liveness,speechiness,valence,loudness,tempo,cluster,emotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
35mvY5S1H3J2QZyna3TFe0,positions,['Ariana Grande'],0.468000,0.737,0.802,0.000000,0.0931,0.0878,0.682,-4.771,144.015,0,Energetic
6Hj9jySrnFppAI0sEMCZpJ,Robbery,['Juice WRLD'],0.328000,0.685,0.692,0.000000,0.1530,0.0457,0.578,-5.122,159.966,0,Energetic
2xLMifQCjDGFmkHkpNLD9h,SICKO MODE,['Travis Scott'],0.005130,0.834,0.730,0.000000,0.1240,0.2220,0.446,-3.714,155.008,0,Energetic
27X3qzgB0Show6qfw5wNOK,Esquema Preferido,"['DJ Ivis', 'Tarcísio do Acordeon']",0.256000,0.669,0.812,0.000000,0.0773,0.1100,0.917,-4.597,159.934,0,Energetic
2Y0wPrPQBrGhoLn14xRYCG,Come & Go (with Marshmello),"['Juice WRLD', 'Marshmello']",0.017200,0.625,0.814,0.000000,0.1580,0.0657,0.535,-5.181,144.991,0,Energetic
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4qZq5qn0CSA6YF96OM5XVZ,Manifest- Manifest (2004),['Abuse'],0.000001,0.141,0.931,0.224000,0.3120,0.2010,0.613,-7.091,212.980,29,Powerful
6T0aBJyFi7U3zHuaYSMWIP,Bach Jao Mundeyo,['Sardool Sikander'],0.533000,0.658,0.852,0.000002,0.0582,0.3560,0.801,-7.159,183.985,29,Powerful
2n8nDfBot4MddUcUKoAqGz,Horseface,['Yea(H)'],0.004480,0.306,0.982,0.243000,0.1800,0.0416,0.558,-5.640,188.499,29,Powerful
55PHPSzHvd0AzuMTFDracv,Nekal mani gredzenā,"['Raimonds Pauls', 'Nora Bumbiere', 'Viktors L...",0.644000,0.204,0.476,0.000001,0.2040,0.0420,0.293,-8.611,199.410,29,Powerful


In [None]:
# write the emotion-labeled tracks out as a CSV file next to the cluster files
final_labels_path = f'{basepath}final_labels.csv'
data.to_csv(final_labels_path)

## **Save Metadata of Songs**

In [None]:
# save metadata for each track 
# .copy() makes this an independent frame: a 'primary_artist' column is assigned onto it
# further down, and doing that on a bare column selection of `data` triggers pandas'
# SettingWithCopyWarning (currently hidden by the global warnings filter).
important_data = data[['name', 'artists', 'emotion']].copy()
important_data

Unnamed: 0_level_0,name,artists,emotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
35mvY5S1H3J2QZyna3TFe0,positions,['Ariana Grande'],Energetic
6Hj9jySrnFppAI0sEMCZpJ,Robbery,['Juice WRLD'],Energetic
2xLMifQCjDGFmkHkpNLD9h,SICKO MODE,['Travis Scott'],Energetic
27X3qzgB0Show6qfw5wNOK,Esquema Preferido,"['DJ Ivis', 'Tarcísio do Acordeon']",Energetic
2Y0wPrPQBrGhoLn14xRYCG,Come & Go (with Marshmello),"['Juice WRLD', 'Marshmello']",Energetic
...,...,...,...
4qZq5qn0CSA6YF96OM5XVZ,Manifest- Manifest (2004),['Abuse'],Powerful
6T0aBJyFi7U3zHuaYSMWIP,Bach Jao Mundeyo,['Sardool Sikander'],Powerful
2n8nDfBot4MddUcUKoAqGz,Horseface,['Yea(H)'],Powerful
55PHPSzHvd0AzuMTFDracv,Nekal mani gredzenā,"['Raimonds Pauls', 'Nora Bumbiere', 'Viktors L...",Powerful


In [None]:
# extract primary artist from songs with multiple artists
def get_primary_artist(list_str):
  """Return the first artist named in a stringified list of artists.

  Parameters
  ----------
  list_str : str
    Text form of a Python list of artist names, e.g. "['Juice WRLD', 'Marshmello']".

  Returns
  -------
  str
    The first artist in the list, or '' when the list is empty.

  Notes
  -----
  The string is parsed as a Python literal, so names that contain a comma
  ("['Tyler, The Creator']") or that are written with double quotes because they
  contain an apostrophe (["Guns N' Roses"]) come back intact. Input that is not a
  valid list literal falls back to the previous split-on-comma heuristic.
  """
  import ast  # local import keeps this cell self-contained

  try:
    parsed_artists = ast.literal_eval(list_str)
  except (ValueError, SyntaxError, TypeError):
    parsed_artists = None  # not a Python literal; use the heuristic below

  if isinstance(parsed_artists, (list, tuple)):
    return str(parsed_artists[0]) if parsed_artists else ''

  # Fallback: strip the surrounding brackets, take the text before the first ', ',
  # and drop single quotes around it.
  first_item = list_str.strip('[]').split(', ')[0]
  return first_item.strip("'")

# compute the primary artist of every track and store it as a new column in the df
primary_artists = list(map(get_primary_artist, important_data['artists']))
important_data['primary_artist'] = primary_artists
important_data

Unnamed: 0_level_0,name,artists,emotion,primary_artist
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
35mvY5S1H3J2QZyna3TFe0,positions,['Ariana Grande'],Energetic,Ariana Grande
6Hj9jySrnFppAI0sEMCZpJ,Robbery,['Juice WRLD'],Energetic,Juice WRLD
2xLMifQCjDGFmkHkpNLD9h,SICKO MODE,['Travis Scott'],Energetic,Travis Scott
27X3qzgB0Show6qfw5wNOK,Esquema Preferido,"['DJ Ivis', 'Tarcísio do Acordeon']",Energetic,DJ Ivis
2Y0wPrPQBrGhoLn14xRYCG,Come & Go (with Marshmello),"['Juice WRLD', 'Marshmello']",Energetic,Juice WRLD
...,...,...,...,...
4qZq5qn0CSA6YF96OM5XVZ,Manifest- Manifest (2004),['Abuse'],Powerful,Abuse
6T0aBJyFi7U3zHuaYSMWIP,Bach Jao Mundeyo,['Sardool Sikander'],Powerful,Sardool Sikander
2n8nDfBot4MddUcUKoAqGz,Horseface,['Yea(H)'],Powerful,Yea(H)
55PHPSzHvd0AzuMTFDracv,Nekal mani gredzenā,"['Raimonds Pauls', 'Nora Bumbiere', 'Viktors L...",Powerful,Raimonds Pauls


In [None]:
# Rename to the trackName / artistName column names used for the exported JSON.
# The renamed frame is rebound rather than mutated with inplace=True, which keeps the
# step chainable and avoids mutating a frame derived from another one.
# This cell does not drop 'artists'; that happens in the column selection that follows.
# NOTE(review): the saved output of this cell shows no 'artists' column even though
# nothing here removes it -- the recorded output looks stale.
important_data = important_data.rename(columns={'name': 'trackName', 'primary_artist': 'artistName'})
important_data

Unnamed: 0_level_0,trackName,emotion,artistName
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
35mvY5S1H3J2QZyna3TFe0,positions,Energetic,Ariana Grande
6Hj9jySrnFppAI0sEMCZpJ,Robbery,Energetic,Juice WRLD
2xLMifQCjDGFmkHkpNLD9h,SICKO MODE,Energetic,Travis Scott
27X3qzgB0Show6qfw5wNOK,Esquema Preferido,Energetic,DJ Ivis
2Y0wPrPQBrGhoLn14xRYCG,Come & Go (with Marshmello),Energetic,Juice WRLD
...,...,...,...
4qZq5qn0CSA6YF96OM5XVZ,Manifest- Manifest (2004),Powerful,Abuse
6T0aBJyFi7U3zHuaYSMWIP,Bach Jao Mundeyo,Powerful,Sardool Sikander
2n8nDfBot4MddUcUKoAqGz,Horseface,Powerful,Yea(H)
55PHPSzHvd0AzuMTFDracv,Nekal mani gredzenā,Powerful,Raimonds Pauls


In [None]:
# keep only the json fields, in the column order wanted for the json files
json_column_order = ['trackName', 'artistName', 'emotion']
important_data = important_data[json_column_order]
important_data

Unnamed: 0_level_0,trackName,artistName,emotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
35mvY5S1H3J2QZyna3TFe0,positions,Ariana Grande,Energetic
6Hj9jySrnFppAI0sEMCZpJ,Robbery,Juice WRLD,Energetic
2xLMifQCjDGFmkHkpNLD9h,SICKO MODE,Travis Scott,Energetic
27X3qzgB0Show6qfw5wNOK,Esquema Preferido,DJ Ivis,Energetic
2Y0wPrPQBrGhoLn14xRYCG,Come & Go (with Marshmello),Juice WRLD,Energetic
...,...,...,...
4qZq5qn0CSA6YF96OM5XVZ,Manifest- Manifest (2004),Abuse,Powerful
6T0aBJyFi7U3zHuaYSMWIP,Bach Jao Mundeyo,Sardool Sikander,Powerful
2n8nDfBot4MddUcUKoAqGz,Horseface,Yea(H),Powerful
55PHPSzHvd0AzuMTFDracv,Nekal mani gredzenā,Raimonds Pauls,Powerful


In [None]:
# sort df by track and artist
# Rebind the sorted result instead of using inplace=True: important_data was produced by
# a column selection, and sorting such a frame in place is the pattern pandas warns about.
important_data = important_data.sort_values(by=['trackName', 'artistName'])
important_data

Unnamed: 0_level_0,trackName,artistName,emotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1A05ibu1DXGIt0F62NG7xU,!,Samey,Energetic
4AFCrbzvR3vLfekhABLjDU,! (The Song Formerly Known As),Regurgitator,Energetic
4v1IBp3Y3rpkWmWzIlkYju,!!De Repente!!,Rosendo,Energetic
7y07vRzFxldnZ4ZsXyWaWB,!H.a.p.p.y!,Dawid Podsiadło,Chill
0fROT4kK5oTm8xO8PX6EJF,!I'll Be Back!,Rilès,Energetic
...,...,...,...
5vubdGDI1f6Dgq8l9kYOXV,행복했던 날들이었다 days gone by,DAY6,Energetic
3Gpdzw72aBVJSrm5J1leVK,"헤어지지 못하는 여자, 떠나가지 못하는 남자 Can't Breakup Girl, C...",Leessang,Energetic
6KrJn7TLGbkXwbU8GAS5Sk,헤픈엔딩 Happen Ending,Epik High,Energetic
2p5DfmIUTLH79elmaSCCR5,화려하지 않은 고백 Confession Is Not Flashy,LEE SEUNG HWAN,Chill


In [None]:
# export metadata associated with songs as a JSON file 
import json

# Serialize the rows as a list of records (one object per track; the 'id' index is not
# part of the records), then parse that string back so it can be re-dumped with indentation.
result = important_data.to_json(orient='records')
parsed = json.loads(result)

# The with-block guarantees the file is flushed and closed; previously the handle
# returned by open() was never closed explicitly.
with open(basepath+'song_info.json', 'w') as song_info_file:
  json.dump(parsed, song_info_file, indent=4)