# Data Cleaning

Here we will load the data we extracted and clean it so that we have most of the columns we need to later on do feature engineering.

In [1]:
import numpy as np
import pandas as pd
import json

### Filtering the original dataset

In [2]:
orig_dataset = pd.read_csv('data/dataset.csv', low_memory = False)

In [3]:
orig_dataset.head()

Unnamed: 0.1,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,id,uri,track_href,analysis_url,duration_ms,time_signature,genre,song_name,Unnamed: 0,title
0,0.831,0.814,2,-7.364,1,0.42,0.0598,0.0134,0.0556,0.389,...,2Vc6NJ9PW9gD9q343XFRKx,spotify:track:2Vc6NJ9PW9gD9q343XFRKx,https://api.spotify.com/v1/tracks/2Vc6NJ9PW9gD...,https://api.spotify.com/v1/audio-analysis/2Vc6...,124539,4,Dark Trap,Mercury: Retrograde,,
1,0.719,0.493,8,-7.23,1,0.0794,0.401,0.0,0.118,0.124,...,7pgJBLVz5VmnL7uGHmRj6p,spotify:track:7pgJBLVz5VmnL7uGHmRj6p,https://api.spotify.com/v1/tracks/7pgJBLVz5Vmn...,https://api.spotify.com/v1/audio-analysis/7pgJ...,224427,4,Dark Trap,Pathology,,
2,0.85,0.893,5,-4.783,1,0.0623,0.0138,4e-06,0.372,0.0391,...,0vSWgAlfpye0WCGeNmuNhy,spotify:track:0vSWgAlfpye0WCGeNmuNhy,https://api.spotify.com/v1/tracks/0vSWgAlfpye0...,https://api.spotify.com/v1/audio-analysis/0vSW...,98821,4,Dark Trap,Symbiote,,
3,0.476,0.781,0,-4.71,1,0.103,0.0237,0.0,0.114,0.175,...,0VSXnJqQkwuH2ei1nOQ1nu,spotify:track:0VSXnJqQkwuH2ei1nOQ1nu,https://api.spotify.com/v1/tracks/0VSXnJqQkwuH...,https://api.spotify.com/v1/audio-analysis/0VSX...,123661,3,Dark Trap,ProductOfDrugs (Prod. The Virus and Antidote),,
4,0.798,0.624,2,-7.668,1,0.293,0.217,0.0,0.166,0.591,...,4jCeguq9rMTlbMmPHuO7S3,spotify:track:4jCeguq9rMTlbMmPHuO7S3,https://api.spotify.com/v1/tracks/4jCeguq9rMTl...,https://api.spotify.com/v1/audio-analysis/4jCe...,123298,4,Dark Trap,Venom,,


In [4]:
orig_dataset['mode'].value_counts()

1    23245
0    19060
Name: mode, dtype: int64

In [5]:
orig_dataset.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature', 'genre', 'song_name', 'Unnamed: 0', 'title'],
      dtype='object')

In [6]:
len(orig_dataset)

42305

In [7]:
orig_dataset.drop_duplicates(inplace = True)

In [8]:
len(orig_dataset)

42305

Grouping the genres together:

In [9]:
# Group by the 'Song' column and aggregate the 'Genre' column with a list
grouped = orig_dataset.groupby('id')['genre'].apply(set).reset_index()

# Convert the aggregated sets back to lists
grouped['genre'] = grouped['genre'].apply(list)

# Truncate genre lists to a maximum length of 3
grouped['genre'] = grouped['genre'].apply(lambda genres: genres[:3])

In [10]:
grouped.head()

Unnamed: 0,id,genre
0,001dY1wrXF0Vfo40Kkqv4R,[techno]
1,001rJKlws3dVKuURJQdnIk,[techno]
2,00349h1hy2raWiakxSDfsU,[trance]
3,003FTlCpBTM4eSqYSWPv4H,[Emo]
4,003VDDA7J3Xb2ZFlNx7nIZ,[Rap]


In [11]:
# Merge the aggregated 'Genre' column back to the original DataFrame
genre_grouped = orig_dataset.merge(grouped, on='id', how='left')

In [12]:
# Drop the duplicate rows

genre_grouped = genre_grouped.drop_duplicates(subset=['id'])

In [13]:
genre_grouped.rename(columns={'genre_y': 'genres'}, inplace=True)
genre_grouped.drop(['genre_x'], axis=1, inplace=True)

In [14]:
genre_grouped.head()

Unnamed: 0.1,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,id,uri,track_href,analysis_url,duration_ms,time_signature,song_name,Unnamed: 0,title,genres
0,0.831,0.814,2,-7.364,1,0.42,0.0598,0.0134,0.0556,0.389,...,2Vc6NJ9PW9gD9q343XFRKx,spotify:track:2Vc6NJ9PW9gD9q343XFRKx,https://api.spotify.com/v1/tracks/2Vc6NJ9PW9gD...,https://api.spotify.com/v1/audio-analysis/2Vc6...,124539,4,Mercury: Retrograde,,,"[Dark Trap, Underground Rap, Rap]"
1,0.719,0.493,8,-7.23,1,0.0794,0.401,0.0,0.118,0.124,...,7pgJBLVz5VmnL7uGHmRj6p,spotify:track:7pgJBLVz5VmnL7uGHmRj6p,https://api.spotify.com/v1/tracks/7pgJBLVz5Vmn...,https://api.spotify.com/v1/audio-analysis/7pgJ...,224427,4,Pathology,,,[Dark Trap]
2,0.85,0.893,5,-4.783,1,0.0623,0.0138,4e-06,0.372,0.0391,...,0vSWgAlfpye0WCGeNmuNhy,spotify:track:0vSWgAlfpye0WCGeNmuNhy,https://api.spotify.com/v1/tracks/0vSWgAlfpye0...,https://api.spotify.com/v1/audio-analysis/0vSW...,98821,4,Symbiote,,,[Dark Trap]
3,0.476,0.781,0,-4.71,1,0.103,0.0237,0.0,0.114,0.175,...,0VSXnJqQkwuH2ei1nOQ1nu,spotify:track:0VSXnJqQkwuH2ei1nOQ1nu,https://api.spotify.com/v1/tracks/0VSXnJqQkwuH...,https://api.spotify.com/v1/audio-analysis/0VSX...,123661,3,ProductOfDrugs (Prod. The Virus and Antidote),,,"[Hiphop, Dark Trap]"
4,0.798,0.624,2,-7.668,1,0.293,0.217,0.0,0.166,0.591,...,4jCeguq9rMTlbMmPHuO7S3,spotify:track:4jCeguq9rMTlbMmPHuO7S3,https://api.spotify.com/v1/tracks/4jCeguq9rMTl...,https://api.spotify.com/v1/audio-analysis/4jCe...,123298,4,Venom,,,"[Underground Rap, Trap Metal, Dark Trap]"


In [15]:
nan_entries = genre_grouped.isna()
nan_counts = nan_entries.sum()
print(nan_counts)

danceability            0
energy                  0
key                     0
loudness                0
mode                    0
speechiness             0
acousticness            0
instrumentalness        0
liveness                0
valence                 0
tempo                   0
type                    0
id                      0
uri                     0
track_href              0
analysis_url            0
duration_ms             0
time_signature          0
song_name           17525
Unnamed: 0          18358
title               18358
genres                  0
dtype: int64


features to be needed from here: 
- danceability
- energy
- key
- loudness
- mode
- speechiness
- acoustiness
- instrumentalness
- liveness
- valence 
- tempo 
- id
- duration_ms
- time signature?
- we wont extract the song name from here as its empty most of the time

In [16]:
columns = ['id' ,'genres', 'danceability', 'energy' , 'key' , 'loudness' , 'mode'
           ,'speechiness' , 'acousticness' , 'instrumentalness'
           ,'liveness' , 'valence' , 'tempo' , 'duration_ms', 'time_signature', 'uri']
filtered_orig_dataset = genre_grouped[columns]

In [17]:
filtered_orig_dataset.head(3)

Unnamed: 0,id,genres,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,uri
0,2Vc6NJ9PW9gD9q343XFRKx,"[Dark Trap, Underground Rap, Rap]",0.831,0.814,2,-7.364,1,0.42,0.0598,0.0134,0.0556,0.389,156.985,124539,4,spotify:track:2Vc6NJ9PW9gD9q343XFRKx
1,7pgJBLVz5VmnL7uGHmRj6p,[Dark Trap],0.719,0.493,8,-7.23,1,0.0794,0.401,0.0,0.118,0.124,115.08,224427,4,spotify:track:7pgJBLVz5VmnL7uGHmRj6p
2,0vSWgAlfpye0WCGeNmuNhy,[Dark Trap],0.85,0.893,5,-4.783,1,0.0623,0.0138,4e-06,0.372,0.0391,218.05,98821,4,spotify:track:0vSWgAlfpye0WCGeNmuNhy


<br>

### Filtering the metadata dataset

In [18]:
metadata_dataset = pd.read_csv('data/meta_dataset.csv')

In [19]:
metadata_dataset.head(3)

Unnamed: 0,album,artists,available_markets,disc_number,duration_ms,explicit,external_ids,external_urls,href,id,is_local,name,popularity,preview_url,track_number,type,uri
0,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,[],1,124538,True,{'isrc': 'TCADG1741539'},{'spotify': 'https://open.spotify.com/track/2V...,https://api.spotify.com/v1/tracks/2Vc6NJ9PW9gD...,2Vc6NJ9PW9gD9q343XFRKx,False,Mercury: Retrograde,2,,4,track,spotify:track:2Vc6NJ9PW9gD9q343XFRKx
1,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",1,224426,True,{'isrc': 'QZMEQ2072724'},{'spotify': 'https://open.spotify.com/track/7p...,https://api.spotify.com/v1/tracks/7pgJBLVz5Vmn...,7pgJBLVz5VmnL7uGHmRj6p,False,Pathology,37,https://p.scdn.co/mp3-preview/a411834d126815db...,1,track,spotify:track:7pgJBLVz5VmnL7uGHmRj6p
2,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",1,98821,True,{'isrc': 'QZBRF1834041'},{'spotify': 'https://open.spotify.com/track/0v...,https://api.spotify.com/v1/tracks/0vSWgAlfpye0...,0vSWgAlfpye0WCGeNmuNhy,False,Symbiote,23,https://p.scdn.co/mp3-preview/fb20cea30bdd9d74...,1,track,spotify:track:0vSWgAlfpye0WCGeNmuNhy


Since we know that the duplicates have arised from the fact that same songs were entered in the original dataset but with a different genre, we will drop all the duplicates in the metadata dataset and the audio analysis dataset, and fix the original dataset to include all the genres of the song in ONE entry.

In [20]:
metadata_dataset.drop_duplicates(subset='id', inplace=True)

In [21]:
len(metadata_dataset)

35877

In [22]:
metadata_dataset.columns

Index(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms',
       'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local',
       'name', 'popularity', 'preview_url', 'track_number', 'type', 'uri'],
      dtype='object')

In [23]:
nan_entries = metadata_dataset.isna()
nan_counts = nan_entries.sum()
print(nan_counts)

album                    0
artists                  0
available_markets        0
disc_number              0
duration_ms              0
explicit                 0
external_ids             0
external_urls            0
href                     0
id                       0
is_local                 0
name                    14
popularity               0
preview_url          11328
track_number             0
type                     0
uri                      0
dtype: int64


it is discovered that these 14 nameless have practically no information to go off with, therefore we will be removing them

In [24]:
# Find indexes where 'Column1' contains NaN values
nan_indexes = metadata_dataset[metadata_dataset['name'].isna()].index

print("Indexes with NaN values:", nan_indexes)

Indexes with NaN values: Int64Index([ 6132, 14460, 18597, 19677, 19678, 19679, 19680, 19681, 20354,
            20515, 25249, 33952, 34717, 34835],
           dtype='int64')


In [25]:
metadata_dataset = metadata_dataset.drop(nan_indexes)

In [26]:
# The reason why we use literal_eval and not JSON parsing is because JSON strictly only handles double quotes key-value pairs
# while literal_Eval offers some flexibility
from ast import literal_eval

features = ['album' , 'artists']
for feature in features:
    metadata_dataset[feature] = metadata_dataset[feature].apply(literal_eval)

In [27]:
metadata_dataset.head(3)

Unnamed: 0,album,artists,available_markets,disc_number,duration_ms,explicit,external_ids,external_urls,href,id,is_local,name,popularity,preview_url,track_number,type,uri
0,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,[],1,124538,True,{'isrc': 'TCADG1741539'},{'spotify': 'https://open.spotify.com/track/2V...,https://api.spotify.com/v1/tracks/2Vc6NJ9PW9gD...,2Vc6NJ9PW9gD9q343XFRKx,False,Mercury: Retrograde,2,,4,track,spotify:track:2Vc6NJ9PW9gD9q343XFRKx
1,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",1,224426,True,{'isrc': 'QZMEQ2072724'},{'spotify': 'https://open.spotify.com/track/7p...,https://api.spotify.com/v1/tracks/7pgJBLVz5Vmn...,7pgJBLVz5VmnL7uGHmRj6p,False,Pathology,37,https://p.scdn.co/mp3-preview/a411834d126815db...,1,track,spotify:track:7pgJBLVz5VmnL7uGHmRj6p
2,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",1,98821,True,{'isrc': 'QZBRF1834041'},{'spotify': 'https://open.spotify.com/track/0v...,https://api.spotify.com/v1/tracks/0vSWgAlfpye0...,0vSWgAlfpye0WCGeNmuNhy,False,Symbiote,23,https://p.scdn.co/mp3-preview/fb20cea30bdd9d74...,1,track,spotify:track:0vSWgAlfpye0WCGeNmuNhy


features to be needed from here: 
- album (image link to be extracted, release_date)
- artists (top 3 artists)
- popularity
- name
- id

In [28]:
metadata_dataset['images_url'] =  metadata_dataset['album'].apply(lambda x: x['images'][0]['url'] if x['images'] else None)

In [29]:
metadata_dataset['release_date'] = metadata_dataset['album'].apply(lambda x: x['release_date'])

In [30]:
max_artists = 3
metadata_dataset['artists_names'] = metadata_dataset['artists'].apply(lambda x: [artist['name'] for artist in x[:max_artists]])

In [31]:
metadata_dataset.head(2)

Unnamed: 0,album,artists,available_markets,disc_number,duration_ms,explicit,external_ids,external_urls,href,id,is_local,name,popularity,preview_url,track_number,type,uri,images_url,release_date,artists_names
0,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,[],1,124538,True,{'isrc': 'TCADG1741539'},{'spotify': 'https://open.spotify.com/track/2V...,https://api.spotify.com/v1/tracks/2Vc6NJ9PW9gD...,2Vc6NJ9PW9gD9q343XFRKx,False,Mercury: Retrograde,2,,4,track,spotify:track:2Vc6NJ9PW9gD9q343XFRKx,https://i.scdn.co/image/ab67616d0000b27347fa48...,2017-09-05,[Ghostemane]
1,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",1,224426,True,{'isrc': 'QZMEQ2072724'},{'spotify': 'https://open.spotify.com/track/7p...,https://api.spotify.com/v1/tracks/7pgJBLVz5Vmn...,7pgJBLVz5VmnL7uGHmRj6p,False,Pathology,37,https://p.scdn.co/mp3-preview/a411834d126815db...,1,track,spotify:track:7pgJBLVz5VmnL7uGHmRj6p,https://i.scdn.co/image/ab67616d0000b27391b3f8...,2020-11-20,"[Don Kenobi, SkyDxddy]"


In [32]:
columns = ['id' , 'name' ,'popularity' , 'release_date','artists_names', 'images_url']
filtered_metadata_dataset = metadata_dataset[columns]

<br>

### Filtering the audio analysis dataset

In [33]:
audio_analysis_dataset = pd.read_csv('data/audio_analysis_dataset.csv')

In [34]:
audio_analysis_dataset.head()

Unnamed: 0,meta,track,_song_id,no_of_sections,error
0,"{'analyzer_version': '4.0.0', 'platform': 'Lin...","{'num_samples': 2726725, 'duration': 123.661, ...",0VSXnJqQkwuH2ei1nOQ1nu,3,
1,"{'analyzer_version': '4.0.0', 'platform': 'Lin...","{'num_samples': 2718715, 'duration': 123.29773...",4jCeguq9rMTlbMmPHuO7S3,5,
2,"{'analyzer_version': '4.0.0', 'platform': 'Lin...","{'num_samples': 2480864, 'duration': 112.51084...",6fsypiJHyWmeINsOLC1cos,5,
3,"{'analyzer_version': '4.0.0', 'platform': 'Lin...","{'num_samples': 2689637, 'duration': 121.979, ...",2ggqfj97qyiORmXoVFzP5j,4,
4,"{'analyzer_version': '4.0.0', 'platform': 'Lin...","{'num_samples': 2230848, 'duration': 101.17224...",7EL7ifncK2PWFYThJjzR25,4,


In [35]:
audio_analysis_dataset.drop_duplicates(inplace=True)

In [36]:
len(audio_analysis_dataset)

35875

In [37]:
audio_analysis_dataset.columns

Index(['meta', 'track', '_song_id', 'no_of_sections', 'error'], dtype='object')

In [38]:
# The reason why we use literal_eval and not JSON parsing is because JSON strictly only handles double quotes key-value pairs
# while literal_Eval offers some flexibility

audio_analysis_dataset['track'] = audio_analysis_dataset['track'].apply(literal_eval)

features to be needed from here: 
- _song_id
- no_of_sections
- track (tempo confidence, mode confidence , key confidence

In [39]:
# List of columns to extract
keys_to_extract = ['tempo_confidence', 'mode_confidence', 'key_confidence']

# Apply lambda function to extract values and create new columns
for key in keys_to_extract:
    audio_analysis_dataset[key] = audio_analysis_dataset['track'].apply(lambda x: x[key])

In [40]:
audio_analysis_dataset.head(2)

Unnamed: 0,meta,track,_song_id,no_of_sections,error,tempo_confidence,mode_confidence,key_confidence
0,"{'analyzer_version': '4.0.0', 'platform': 'Lin...","{'num_samples': 2726725, 'duration': 123.661, ...",0VSXnJqQkwuH2ei1nOQ1nu,3,,0.0,0.745,0.765
1,"{'analyzer_version': '4.0.0', 'platform': 'Lin...","{'num_samples': 2718715, 'duration': 123.29773...",4jCeguq9rMTlbMmPHuO7S3,5,,0.205,0.631,0.694


Filtering:

In [41]:
columns = ['_song_id' , 'no_of_sections'] + keys_to_extract
filtered_audio_analysis_dataset = audio_analysis_dataset[columns]

filtered_audio_analysis_dataset.rename(columns={"_song_id": "id"}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_audio_analysis_dataset.rename(columns={"_song_id": "id"}, inplace = True)


In [42]:
filtered_audio_analysis_dataset.head()

Unnamed: 0,id,no_of_sections,tempo_confidence,mode_confidence,key_confidence
0,0VSXnJqQkwuH2ei1nOQ1nu,3,0.0,0.745,0.765
1,4jCeguq9rMTlbMmPHuO7S3,5,0.205,0.631,0.694
2,6fsypiJHyWmeINsOLC1cos,5,0.824,0.465,0.378
3,2ggqfj97qyiORmXoVFzP5j,4,0.378,0.413,0.121
4,7EL7ifncK2PWFYThJjzR25,4,0.548,0.301,0.008


<br><br><br><br><br><br><br><br><br>

In [43]:
print("Length of original dataset:",len(filtered_orig_dataset),
     "\nLength of metadata dataset", len(filtered_metadata_dataset),
     "\nLength of audio_analysis_dataset", len(filtered_audio_analysis_dataset))

Length of original dataset: 35877 
Length of metadata dataset 35863 
Length of audio_analysis_dataset 35875


In [44]:
filtered_orig_dataset.head(1)

Unnamed: 0,id,genres,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,uri
0,2Vc6NJ9PW9gD9q343XFRKx,"[Dark Trap, Underground Rap, Rap]",0.831,0.814,2,-7.364,1,0.42,0.0598,0.0134,0.0556,0.389,156.985,124539,4,spotify:track:2Vc6NJ9PW9gD9q343XFRKx


In [45]:
filtered_metadata_dataset.head(1)

Unnamed: 0,id,name,popularity,release_date,artists_names,images_url
0,2Vc6NJ9PW9gD9q343XFRKx,Mercury: Retrograde,2,2017-09-05,[Ghostemane],https://i.scdn.co/image/ab67616d0000b27347fa48...


In [46]:
filtered_audio_analysis_dataset.head(1)

Unnamed: 0,id,no_of_sections,tempo_confidence,mode_confidence,key_confidence
0,0VSXnJqQkwuH2ei1nOQ1nu,3,0.0,0.745,0.765


### Merging the original dataset which contains the numerical columns, with the data collected from two different spotify api

In [47]:
merged_df = filtered_orig_dataset.merge(filtered_metadata_dataset, on='id', how='inner').merge(filtered_audio_analysis_dataset, on='id', how='inner')

In [51]:
merged_df.head(3)

Unnamed: 0,id,genres,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,uri,name,popularity,release_date,artists_names,images_url,no_of_sections,tempo_confidence,mode_confidence,key_confidence
0,2Vc6NJ9PW9gD9q343XFRKx,"[Dark Trap, Underground Rap, Rap]",0.831,0.814,2,-7.364,1,0.42,0.0598,0.0134,...,spotify:track:2Vc6NJ9PW9gD9q343XFRKx,Mercury: Retrograde,2,2017-09-05,[Ghostemane],https://i.scdn.co/image/ab67616d0000b27347fa48...,8,0.307,0.543,0.506
1,7pgJBLVz5VmnL7uGHmRj6p,[Dark Trap],0.719,0.493,8,-7.23,1,0.0794,0.401,0.0,...,spotify:track:7pgJBLVz5VmnL7uGHmRj6p,Pathology,37,2020-11-20,"[Don Kenobi, SkyDxddy]",https://i.scdn.co/image/ab67616d0000b27391b3f8...,11,0.343,0.542,0.414
2,0vSWgAlfpye0WCGeNmuNhy,[Dark Trap],0.85,0.893,5,-4.783,1,0.0623,0.0138,4e-06,...,spotify:track:0vSWgAlfpye0WCGeNmuNhy,Symbiote,23,2018-03-05,[gizmo],https://i.scdn.co/image/ab67616d0000b2731981f3...,4,0.636,0.346,0.198


In [53]:
merged_df.columns

Index(['id', 'genres', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'time_signature', 'uri', 'name',
       'popularity', 'release_date', 'artists_names', 'images_url',
       'no_of_sections', 'tempo_confidence', 'mode_confidence',
       'key_confidence'],
      dtype='object')

## Removing Duplicates with different popularity scores:
Some songs would have the same name but different artists, it was noticed that there were the same songs (same name and artist) however the popularity scores were different. It was then removed so that only the entries with the highest popularity remained while the other songs with the same name and artists are removed.

In [None]:
test = merged_df.copy()

In [None]:
len(test)

In [110]:
# Combine 'name' and 'artist_names' columns into a new column 'name_artist'
test['name_artist'] = test['name'] + test['artists_names'].apply(str)

# Use groupby and idxmax to select rows with the highest popularity score within each group
unique_rows = test.loc[test.groupby('name_artist')['popularity'].idxmax()]
# Reset the index if needed
unique_rows.reset_index(drop=True, inplace=True)

# Remove the 'name_artist' column if not needed anymore
unique_rows.drop(columns=['name_artist'], inplace=True)

In [111]:
len(unique_rows)

33964

In [112]:
unique_rows[unique_rows['name'] == "ProductOfDrugs (Prod. The Virus and Antidote)"]

Unnamed: 0,id,genres,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,uri,name,popularity,release_date,artists_names,images_url,no_of_sections,tempo_confidence,mode_confidence,key_confidence
22485,2Z4luUc9BQHrEwSSJ1eg0q,"[Underground Rap, Trap Metal, Dark Trap]",0.476,0.781,0,-4.71,1,0.103,0.0237,0.0,...,spotify:track:2Z4luUc9BQHrEwSSJ1eg0q,ProductOfDrugs (Prod. The Virus and Antidote),51,2016-09-14,"[Kamiyada+, The Virus and Antidote]",https://i.scdn.co/image/ab67616d0000b273fa4cc2...,3,0.0,0.745,0.765


In [113]:
# Save the DataFrame to a JSON file
unique_rows.to_json('data/cured_dataset.json', orient='records', lines=True)