In [37]:
import pandas as pd
import numpy as np

# Locations of the three CSV inputs, relative to the notebook's working directory.
albums_file = './data/spotify_albums.csv'
artists_file = './data/spotify_artists.csv'
tracks_file = './data/spotify_tracks.csv'

# Read each file into its own DataFrame, taking the first row as the column names.
albums, artists, tracks = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (albums_file, artists_file, tracks_file)
)

In [None]:
# Quick look at the albums data, printed one item per line:
#   - head(): the first 5 rows, a preview of the kind of values to expect
#   - tail(): the last 5 rows, to compare against head() and see whether the data
#     changes significantly through the dataframe
#   - shape: (number of rows, number of columns), i.e. how much data we are working with
print(albums.head(), albums.tail(), albums.shape, sep='\n')

In [None]:
# Same preview for artists: first 5 rows, last 5 rows, then (rows, columns).
print(artists.head(), artists.tail(), artists.shape, sep='\n')

# And again for tracks.
print(tracks.head(), tracks.tail(), tracks.shape, sep='\n')

In [None]:
# Show only the name and release date for the rows labelled 10 through 20
# (.loc slices by label and includes both endpoints).
print(albums[['name', 'release_date']].loc[10:20])

In [41]:
# Keep only the first row seen for each 'id' in every table.
# The result is assigned back to the same name instead of using inplace=True, so the
# statement does not mutate an object that other references may still point to.
# The original index labels are kept (no reset), so gaps may appear where
# duplicate rows were removed.
albums = albums.drop_duplicates(subset='id', keep='first')
artists = artists.drop_duplicates(subset='id', keep='first')
tracks = tracks.drop_duplicates(subset='id', keep='first')

In [42]:
#Map function to replace empty genre lists (the string '[]') in 'genres' with NaN.
def map_genre(value):
    """Return NaN for an empty genre list, otherwise return ``value`` unchanged.

    Parameters
    ----------
    value : object
        A single cell value. Only an exact match with the string ``'[]'`` is
        treated as empty.

    Returns
    -------
    object
        ``np.nan`` when ``value == '[]'``; otherwise ``value`` itself.
    """
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
    if value == '[]':
        return np.nan
    return value

# Apply map_genre to every value of the 'genres' column and store the result back in place of the column.
artists['genres'] = artists['genres'].map(map_genre)

In [None]:
# Columns before the change.
print(tracks.columns)

# Drop the 'lyrics' column; errors='ignore' means nothing is raised if it is absent.
# The result is bound to a new variable, so tracks itself is left as it was and
# the two column listings can be compared to show the change.
tracks_rem = tracks.drop(columns='lyrics', errors='ignore')

print(tracks_rem.columns)

In [None]:
# A left outer join is used so that every artist row stays in the result even when
# the artist has no corresponding album. The analysis questions are primarily about
# artists, so artists without a matching album should not be dropped from it.
artists_albums = artists.merge(
    albums,
    how='left',
    left_on='id',
    right_on='artist_id',
)

print(artists_albums.head())
artists_albums.shape

In [60]:
# An inner join keeps only rows where an album's 'id' matches a track's 'album_id',
# which removes unnecessary data such as albums with no tracks.
albums_tracks = albums.merge(
    tracks,
    how='inner',
    left_on='id',
    right_on='album_id',
)

print(albums_tracks.head())
albums_tracks.shape

   Unnamed: 0_x album_type               artist_id  \
0             0     single  3DiDSECUqqY1AuBP8qtaIa   
1             1      album  6s1pCNXcbdtQJlsnM1hRIA   
2             2     single  5YjfNaHq05WrwldRe1QSBc   
3             3     single  2G9Vc16JCpnZmK4uGH46Fa   
4             4     single  2dwM9OcE4c3Ph1UBINSodx   

                                 available_markets_x  \
0  ['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...   
1  ['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...   
2  ['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...   
3  ['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...   
4  ['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...   

                                       external_urls  \
0  {'spotify': 'https://open.spotify.com/album/1g...   
1  {'spotify': 'https://open.spotify.com/album/4K...   
2  {'spotify': 'https://open.spotify.com/album/7n...   
3  {'spotify': 'https://open.spotify.com/album/6p...   
4  {'spotify': 'https://open.spotify.com/album/1X...   

 

(101939, 48)