# **Data Cleansing**

In [1]:
# Importing the relevant libraries

import ast

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
# Load the raw artist and track tables from the CSV files in the working directory.
artist_df = pd.read_csv(filepath_or_buffer="artists.csv")
tracks_df = pd.read_csv(filepath_or_buffer="tracks.csv")


In [6]:
# Preview the first five artist rows to sanity-check the load.
artist_df.head(n=5)


Unnamed: 0,id,followers,genres,name,popularity
0,0DheY5irMjBUeLybbCUEZ2,0.0,[],Armid & Amir Zare Pashai feat. Sara Rouzbehani,0
1,0DlhY15l3wsrnlfGio2bjU,5.0,[],ปูนา ภาวิณี,0
2,0DmRESX2JknGPQyO15yxg7,0.0,[],Sadaa,0
3,0DmhnbHjm1qw6NCYPeZNgJ,0.0,[],Tra'gruda,0
4,0Dn11fWM7vHQ3rinvWEl4E,2.0,[],Ioannis Panoutsopoulos,0


In [7]:
# Preview the first five track rows to sanity-check the load.
tracks_df.head(n=5)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


## Handling missing data

In [14]:
# Count the missing values per column in each dataset.
# Each print emits the heading and the per-column counts on separate lines.
print("Number of null values in each column of artist_df:", artist_df.isna().sum(), sep="\n")

print("\nNumber of null values in each column of tracks_df:", tracks_df.isna().sum(), sep="\n")

Number of null values in each column of artist_df:
id             0
followers     13
genres         0
name           3
popularity     0
dtype: int64

Number of null values in each column of tracks_df:
id                   0
name                71
popularity           0
duration_ms          0
explicit             0
artists              0
id_artists           0
release_date         0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
dtype: int64


In [15]:
# List every row that has at least one null value, for both datasets.
# The boolean mask is True for a row when any of its columns is missing.
print("Rows with null values in artist_df:", artist_df.loc[artist_df.isna().any(axis="columns")], sep="\n")

print("\nRows with null values in tracks_df:", tracks_df.loc[tracks_df.isna().any(axis="columns")], sep="\n")

Rows with null values in artist_df:
                            id  followers  \
260011  2ZO5sFPyILDQkFdS4HJPFS     8718.0   
423964  4zevbrEdBtYUhAZgCGDtxz      270.0   
444199  7F71W80jaXFARK7hBjsDI2        NaN   
444200  3MLHJz04KmEVzCTPclzkEm        NaN   
444797  0cqZsULDZdJTGA4Zqh8Ckv        NaN   
444798  0BuknWzKujyc9HfZ1V50Uk        NaN   
446635  6ltU5gIDLmWNYaVNHnll5G        NaN   
446636  7C9nWRMbRqpPUuKh2OEw9n        NaN   
446637  41c30F8zy5UCTSevbn0WfD        NaN   
468525  1DK979aOesiZ4Vkus8txqu        NaN   
468526  6jkpqSWWsXSuqtsoeAiMDU        NaN   
468527  7aMdHPv79qOuqqBD6TnaCp        NaN   
468528  2lr0R5vHGfI0C489h0r6qV        NaN   
468529  0xkSOIeyeTILNIOZKyFgaP        NaN   
468530  4EqqnE0XMAcreVF84QGYJ0        NaN   
889212  5YVrJKFCv5krBGjhMya576      414.0   

                                               genres  \
260011  ['cascadian black metal', 'post-black metal']   
423964                                             []   
444199                     

In [None]:
# Handling missing data in artist_df

# A missing follower count is replaced with zero.
artist_df['followers'] = artist_df['followers'].fillna(value=0)

# Artists without a name get numbered placeholders: Unknown1, Unknown2, ...
unknown_artist = artist_df['name'].isna()
num_unknown_artists = unknown_artist.sum()
unknown_name = [f'Unknown{n}' for n in range(1, num_unknown_artists + 1)]
artist_df.loc[unknown_artist, 'name'] = unknown_name

# Re-count nulls to confirm both columns were filled.
print("\nAfter handling missing data in artist_df:", artist_df.isnull().sum(), sep="\n")



After handling missing data in artist_df:
id            0
followers     0
genres        0
name          0
popularity    0
dtype: int64


In [26]:
# Handling missing data in tracks_df
#
# NOTE: `artists` and `id_artists` are loaded by read_csv as text -- the repr of a
# Python list such as "['Uli']" -- not as list objects. Comparing a cell with a list
# (e.g. x == ['']) is therefore always False, and iterating over an `id_artists` cell
# walks its characters instead of its ids. This cell compares against the text form
# and parses the id list explicitly. Repaired `artists` cells are written back in the
# same text form so the column stays homogeneous.


def _parse_list_cell(cell):
    """Return a list-valued cell as a real Python list.

    Text cells (the repr of a list, as stored in the CSV) are parsed with
    ast.literal_eval; cells that already hold a sequence are copied into a list.
    Raises ValueError/SyntaxError if a text cell is not a valid Python literal.
    """
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return list(cell)


def _artist_names_for(id_cell, name_by_id):
    """Build the text form of the artist-name list for one `id_artists` cell.

    Ids that are absent from `name_by_id` are reported as 'Unknown Artist'.
    """
    names = [name_by_id.get(artist_id, 'Unknown Artist')
             for artist_id in _parse_list_cell(id_cell)]
    return str(names)


# tracks w/ missing titles: give each one a numbered placeholder (NoTitle1, NoTitle2, ...)
notitles = tracks_df['name'].isnull()
num_notitles = notitles.sum()
notitles_list = [f'NoTitle{i+1}' for i in range(num_notitles)]
tracks_df.loc[notitles, 'name'] = notitles_list

# tracks w/ missing artists: the artist list holds a single empty name.
# astype(str) makes the comparison work whether a cell is text or an actual list.
noartists_mask = tracks_df['artists'].astype(str) == "['']"
noartists = tracks_df[noartists_mask]

if not noartists.empty:
    # One id -> name lookup table instead of scanning artist_df once per id.
    # drop_duplicates keeps the first row per id, matching "first match wins".
    artist_name_by_id = (
        artist_df.drop_duplicates(subset='id').set_index('id')['name'].to_dict()
    )
    tracks_df.loc[noartists_mask, 'artists'] = noartists['id_artists'].map(
        lambda id_cell: _artist_names_for(id_cell, artist_name_by_id)
    )

print("\nAfter handling missing data in tracks_df:")
print(tracks_df.isnull().sum())
# Show the tracks whose only artist could not be resolved from artist_df.
display(tracks_df[tracks_df['artists'].astype(str) == "['Unknown Artist']"])




After handling missing data in tracks_df:
id                  0
name                0
popularity          0
duration_ms         0
explicit            0
artists             0
id_artists          0
release_date        0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
dtype: int64


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
