# Cleaning Data

Due to the amount of data that is required to have an accurate recommender system, and the need to have audio features at the track level, data of close to 600,000 tracks were sourced from Kaggle (i.e., rather than scraping). Refer to the readme references for this data.

#### Imports

In [113]:
# Imports 
import ast
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

#Reading in data
tracks = pd.read_csv('../data/source/tracks.csv')
artists = pd.read_csv('../data/source/artists.csv')

print(f'tracks.csv shape: {tracks.shape}')
print(f'artists.csv shape: {artists.shape}')
tracks.head()

tracks.csv shape: (586672, 20)
artists.csv shape: (1162095, 5)


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [114]:
artists.head()

Unnamed: 0,id,followers,genres,name,popularity
0,0DheY5irMjBUeLybbCUEZ2,0.0,[],Armid & Amir Zare Pashai feat. Sara Rouzbehani,0
1,0DlhY15l3wsrnlfGio2bjU,5.0,[],ปูนา ภาวิณี,0
2,0DmRESX2JknGPQyO15yxg7,0.0,[],Sadaa,0
3,0DmhnbHjm1qw6NCYPeZNgJ,0.0,[],Tra'gruda,0
4,0Dn11fWM7vHQ3rinvWEl4E,2.0,[],Ioannis Panoutsopoulos,0


#### Checking for nulls, data types, bad entries

In [115]:
tracks.dtypes

id                   object
name                 object
popularity            int64
duration_ms           int64
explicit              int64
artists              object
id_artists           object
release_date         object
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
dtype: object

In [116]:
artists.dtypes

id             object
followers     float64
genres         object
name           object
popularity      int64
dtype: object

In [117]:
# Making sure that datetime column is datetime dtype.
tracks['release_date'] = pd.to_datetime(tracks['release_date'])
tracks.dtypes['release_date'] #fixed

dtype('<M8[ns]')

In [118]:
print(artists.dtypes)
artists = artists[artists['genres'] != '[]']

artists.reset_index(drop = True, inplace = True)

artists.head()

id             object
followers     float64
genres         object
name           object
popularity      int64
dtype: object


Unnamed: 0,id,followers,genres,name,popularity
0,0VLMVnVbJyJ4oyZs2L3Yl2,71.0,['carnaval cadiz'],Las Viudas De Los Bisabuelos,6
1,0dt23bs4w8zx154C5xdVyl,63.0,['carnaval cadiz'],Los De Capuchinos,5
2,0pGhoB99qpEJEsBQxgaskQ,64.0,['carnaval cadiz'],Los “Pofesionales”,7
3,3HDrX2OtSuXLW5dLR85uN3,53.0,['carnaval cadiz'],Los Que No Paran De Rajar,6
4,22mLrN5fkppmuUPsHx6i2G,59.0,"['classical harp', 'harp']",Vera Dulova,3


In [119]:
tracks.isnull().sum() #some nulls

id                   0
name                71
popularity           0
duration_ms          0
explicit             0
artists              0
id_artists           0
release_date         0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
dtype: int64

In [120]:
artists.isnull().sum() # some nulls

id            0
followers     5
genres        0
name          0
popularity    0
dtype: int64

#### Since we have more than enough data for a decent recommender, these few nulls will be dropped. Further, a few features need to interpreted from string to their literals

In [121]:
tracks.dropna(inplace = True)
artists.dropna(inplace = True)

# Changing values to the intended data format instead of string
tracks['artists'] = tracks['artists'].map(lambda x: ast.literal_eval(x))
tracks['id_artists'] = tracks['id_artists'].map(lambda x: ast.literal_eval(x))
tracks['first_artist'] = tracks['id_artists'].map(lambda x: x[0])

artists['genres'] = artists['genres'].map(lambda x: ast.literal_eval(x))

tracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,first_artist
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,[Uli],[45tIt06XoI0Iio4LBEVpls],1922-02-22,0.645,0.445,...,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3,45tIt06XoI0Iio4LBEVpls
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,[Fernando Pessoa],[14jtPCOoNZwquk5wd9DxrY],1922-06-01,0.695,0.263,...,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1,14jtPCOoNZwquk5wd9DxrY
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,[Ignacio Corsini],[5LiOoJbxVSAMkBS2fUm3X2],1922-03-21,0.434,0.177,...,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5,5LiOoJbxVSAMkBS2fUm3X2
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,[Ignacio Corsini],[5LiOoJbxVSAMkBS2fUm3X2],1922-03-21,0.321,0.0946,...,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3,5LiOoJbxVSAMkBS2fUm3X2
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,[Dick Haymes],[3BiJGZsyX9sJchTqcSA7Su],1922-01-01,0.402,0.158,...,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4,3BiJGZsyX9sJchTqcSA7Su


In [122]:
artists = artists[artists['id'].isin(tracks['first_artist'])].reset_index(drop = True)

tracks = tracks[tracks['first_artist'].isin(artists['id'])].reset_index(drop = True)

print(tracks['first_artist'].unique().shape[0])
print(artists['id'].unique().shape[0])

49232
49232


#### Match! - The two datasets now contain only mutual artists. This will ensure that we are not dealing with songs whose artists are unknown.

In [123]:
artists.set_index('id', inplace = True)
tracks['genre'] = tracks['first_artist'].map(lambda x: artists.loc[x, 'genres'][0])
tracks.drop(columns = ['first_artist'], inplace = True)

print(f'Number of unique genres in cleaned data: {tracks["genre"].unique().shape[0]}')
print(f'Data shape before: {tracks.shape}')
tracks.head()

Number of unique genres in cleaned data: 3628
Data shape before: (525643, 21)


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre
0,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,[Ignacio Corsini],[5LiOoJbxVSAMkBS2fUm3X2],1922-03-21,0.434,0.177,...,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5,tango
1,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,[Ignacio Corsini],[5LiOoJbxVSAMkBS2fUm3X2],1922-03-21,0.321,0.0946,...,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3,tango
2,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,[Dick Haymes],[3BiJGZsyX9sJchTqcSA7Su],1922-01-01,0.402,0.158,...,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4,adult standards
3,0BRXJHRNGQ3W4v9frnSfhu,Ave Maria,0,178933,0,[Dick Haymes],[3BiJGZsyX9sJchTqcSA7Su],1922-01-01,0.227,0.261,...,-12.343,1,0.0382,0.994,0.247,0.0977,0.0539,118.891,4,adult standards
4,0IA0Hju8CAgYfV1hwhidBH,La Java,0,161427,0,[Mistinguett],[4AxgXfD7ISvJSTObqm4aIE],1922-01-01,0.563,0.184,...,-13.757,1,0.0512,0.993,1.6e-05,0.325,0.654,133.088,3,vintage chanson


In [124]:
tracks = tracks.sort_values('popularity', ascending = True)
tracks['release_date'] = tracks['release_date'].dt.year
tracks.rename(columns = {'release_date': 'year'}, inplace = True)
tracks.drop_duplicates(subset = ['name', 'year'], keep = 'first', inplace = True)

tracks = tracks.drop(columns = ['id', 'id_artists'])

tracks.reset_index(drop = True, inplace = True)

print(f'Data shape after: {tracks.shape}')
tracks.head()

Data shape after: (489348, 19)


Unnamed: 0,name,popularity,duration_ms,explicit,artists,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre
0,Vivo para Quererte - Remasterizado,0,181640,0,[Ignacio Corsini],1922,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5,tango
1,"Symphony No. 9 in C Major, D. 944 ""The Great"":...",0,770653,0,"[Franz Schubert, Arturo Toscanini]",1941,0.331,0.17,0,-13.87,1,0.0382,0.887,0.876,0.255,0.185,121.846,4,classical
2,Don Quixote Op.35: VI Variation V: The Knight’...,0,219613,0,"[Richard Strauss, Bavarian State Orchestra]",1941,0.202,0.0445,2,-30.649,0,0.0549,0.963,0.78,0.347,0.038,75.087,3,classical
3,"Kreisleriana, Op. 16: VII. Sehr rasch",0,141520,0,"[Robert Schumann, Claudio Arrau]",1941,0.362,0.308,3,-14.835,1,0.0461,0.995,0.851,0.26,0.232,129.87,4,classical
4,Prisionera,0,193040,0,[Villa Cariño],2010,0.546,0.673,9,-4.419,1,0.0323,0.0151,0.0,0.165,0.964,208.106,4,chilean rock


In [125]:
tracks[tracks['year'] == tracks.year.min()].index # One spurious entry - we should only have data going back to 1922!

Int64Index([154444], dtype='int64')

In [126]:
tracks.drop(index = 154444, inplace = True) # Simply removing this entry, makes for cleaner EDA later.

#### Now creating a tracks dataframe w/ scaled numerical features:

In [127]:
#Scaling data for use in recommendations later
ss = StandardScaler()
cols = list(tracks.select_dtypes(include = 'number').drop(columns = 'year').columns)

scaled = pd.DataFrame(ss.fit_transform(tracks.select_dtypes(include = 'number').drop(columns = 'year'))) 
scaled.rename(columns = dict(zip([x for x in range(15)], cols)), inplace = True)

# Scaled data with accompanying metadata features (all together)
meta_scaled = scaled.copy(deep = True)
meta_scaled['name'] = tracks['name']
meta_scaled['artists'] = tracks['artists']
meta_scaled['year'] = tracks['year']
meta_scaled['genre'] = tracks['genre']
cols = ['name', 'artists', 'year', 'genre', 'explicit'] + [x for x in list(scaled.drop(columns = ['duration_ms', 'explicit']).columns)[:-1]] + ['duration_ms', 'time_signature']
meta_scaled = meta_scaled[cols]

print(f'Data shape after: {meta_scaled.shape}')
meta_scaled.head(5)

Data shape after: (489347, 19)


Unnamed: 0,name,artists,year,genre,explicit,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Vivo para Quererte - Remasterizado,[Ignacio Corsini],1922.0,tango,-0.219069,-1.633928,-0.775077,-1.493096,-1.204136,-2.224026,0.721191,-0.287248,1.609785,-0.314693,-0.016886,-0.375407,0.393636,-0.414423,2.433075
1,"Symphony No. 9 in C Major, D. 944 ""The Great"":...","[Franz Schubert, Arturo Toscanini]",1941.0,classical,-0.219069,-1.633928,-1.396889,-1.521176,-1.488554,-0.760477,0.721191,-0.363994,1.298473,3.062205,0.213137,-1.430606,0.105328,4.532573,0.266081
2,Don Quixote Op.35: VI Variation V: The Knight’...,"[Richard Strauss, Bavarian State Orchestra]",1941.0,classical,-0.219069,-1.633928,-2.175665,-2.024615,-0.919717,-4.119832,-1.386594,-0.265405,1.519592,2.68269,0.70528,-2.000879,-1.467351,-0.095496,-1.900912
3,"Kreisleriana, Op. 16: VII. Sehr rasch","[Robert Schumann, Claudio Arrau]",1941.0,classical,-0.219069,-1.633928,-1.209742,-0.967594,-0.635298,-0.953682,0.721191,-0.317356,1.612695,2.963373,0.239884,-1.248274,0.375205,-0.751382,0.266081
4,Prisionera,[Villa Cariño],2010.0,chilean rock,-0.219069,-1.633928,-0.098931,0.49659,1.071215,1.131725,0.721191,-0.398825,-1.238282,-0.400875,-0.268306,1.591452,3.006573,-0.318677,0.266081


In [128]:
tracks = tracks[cols]

tracks.set_index('name', drop = True, inplace = True)
meta_scaled.set_index('name', drop = True, inplace = True)

tracks.to_csv('../data/cleaned/track_data.csv')
meta_scaled.to_csv('../data/cleaned/scaled_with_metadata.csv')

### Now, moving to recommender.ipynb to create our recommender!
---