In [37]:
import pandas as pd
import numpy as np

# Locations of the three raw Spotify CSV exports, relative to the notebook.
albums_file, artists_file, tracks_file = (
    './data/spotify_albums.csv',
    './data/spotify_artists.csv',
    './data/spotify_tracks.csv',
)

# Read each export into its own DataFrame; header=0 takes the column names
# from the first line of every file.
albums, artists, tracks = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (albums_file, artists_file, tracks_file)
)

In [None]:
# Quick look at the albums frame, printed in this order:
#   head()  - the first 5 rows: a preview of the columns and the kind of values to expect.
#   tail()  - the last 5 rows: compare with head() to see whether the data changes
#             significantly through the frame.
#   shape   - (number of rows, number of columns): how much data we are working with.
print(albums.head(), albums.tail(), albums.shape, sep='\n')

In [None]:
# Same three-step preview (first 5 rows, last 5 rows, shape) for the artists frame...
print(artists.head(), artists.tail(), artists.shape, sep='\n')

# ...and for the tracks frame.
print(tracks.head(), tracks.tail(), tracks.shape, sep='\n')

In [None]:
# Spot-check album names and release dates for the rows labelled 10 through 20
# (.loc slices by label and includes both endpoints).
print(albums[['name', 'release_date']].loc[10:20])

In [41]:
# De-duplicate each table on its 'id' column, keeping the first row seen for every id.
# The result is re-assigned instead of mutated with inplace=True: the names end up
# bound to the same de-duplicated data, but the cell no longer relies on in-place
# mutation, which pandas discourages and which cannot be chained.
# Row labels are not reset, so dropped rows leave gaps in the index.
albums = albums.drop_duplicates(subset='id', keep='first')
artists = artists.drop_duplicates(subset='id', keep='first')
tracks = tracks.drop_duplicates(subset='id', keep='first')

In [42]:
# Map function to replace blank values in 'genres' with NaN.
def map_genre(value):
    """Return NaN for the empty-list marker '[]', otherwise return ``value`` unchanged.

    Parameters
    ----------
    value : object
        A single cell value; it is only compared against the string '[]'.

    Returns
    -------
    float or object
        ``np.nan`` when ``value == '[]'``, else ``value`` itself.
    """
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
    # and raises AttributeError there.
    if value == '[]':
        return np.nan
    return value

# Run every genres cell through map_genre so the '[]' placeholder becomes NaN.
artists['genres'] = artists['genres'].map(map_genre)

In [None]:
# Column listing before the drop, for comparison with the listing printed below.
print(tracks.columns)

# Drop 'lyrics' into a new frame so the change is visible side by side; `tracks`
# itself is left untouched. errors='ignore' makes this a no-op if the column is absent.
tracks_rem = tracks.drop(columns='lyrics', errors='ignore')

print(tracks_rem.columns)

In [None]:
'''
Left outer join is being performed to ensure that artist info stays in the df even if they do not have a correlating album.
I chose to keep this data as the analysis questions are primarily regarding artists, and I didn't want to eliminate artists 
from that analysis if they did not have a corresponding album
'''
# Match every artist row to its albums via artists.id == albums.artist_id.
# Column names present in both frames come back suffixed _x (artist side) and
# _y (album side); unmatched artists get NaN in the album columns.
artists_albums = artists.merge(albums, how='left', left_on='id', right_on='artist_id')

print(artists_albums.head())
artists_albums.shape

In [None]:
# Inner join on albums.id == tracks.album_id: only album/track pairs that match
# are kept, which removes unnecessary rows such as albums with no tracks.
# NOTE(review): this joins `tracks`, not `tracks_rem`, so a 'lyrics' column (if
# present) is carried into the result - confirm that is intended.
albums_tracks = albums.merge(tracks, how='inner', left_on='id', right_on='album_id')

print(albums_tracks.head())
albums_tracks.shape

In [None]:
# Clean the merged frame for ease of use: give the suffixed merge columns
# readable names and drop the extra index columns.
#
# The merge leaves three id columns: 'id_x' (artist), 'id_y' (album) and the
# right-hand join key 'artist_id'. Renaming 'id_x' to 'artist_id' while that key
# is still present would produce two columns with the same label, so the key is
# dropped first. 'id_x' holds the same value on matched rows and, unlike the key,
# is also filled for artists that matched no album.
# The 'id_x' guard keeps the cell re-runnable: after the first run 'id_x' is gone
# and the (renamed) 'artist_id' column must not be dropped.
if 'id_x' in artists_albums.columns:
    artists_albums = artists_albums.drop(columns='artist_id', errors='ignore')

artists_albums = (
    artists_albums
    .rename(columns={
        'name_x': 'artist_name',
        'name_y': 'album_name',
        'id_x': 'artist_id',
        'id_y': 'album_id',
    })
    # errors='ignore' (as used for the 'lyrics' drop) avoids a KeyError when the
    # index columns have already been removed by a previous run of this cell.
    .drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'], errors='ignore')
)

In [None]:
from pandas_profiling import ProfileReport

# Build a profiling report for the merged artist/album frame and render it inside the notebook.
# The report gives us a lot of information about the data set. Author's finding from the rendered
# report: the artists that appear most in the data set are Various Artists, Johann Sebastian Bach,
# Frederic Chopin, David Guetta and Wolfgang Amadeus Mozart.
artist_profile = ProfileReport(artists_albums, title="Artists Profiling Report")
artist_profile.to_notebook_iframe()

In [94]:
# Top ten artists by artist popularity.
# Sorting the merged frame by popularity (highest first) and taking unique()
# of the names yields each artist once, in order of first appearance.
# Note that `top_ten` holds every unique name; only the print is cut to ten.
top_ten = artists_albums.sort_values('artist_popularity', ascending=False)['artist_name'].unique()

print(top_ten[:10])

['Ariana Grande' 'Drake' 'Post Malone' 'Khalid' 'Ozuna' 'Juice WRLD'
 'XXXTENTACION' 'Bad Bunny' 'Anuel Aa' 'Travis Scott']
