# Data Preprocessing

In [60]:
# Import libraries
import pandas as pd
from ast import literal_eval

In [61]:
# Load the two raw inputs: the personal listening history and the track table
my_spotify_data = pd.read_csv(
    "../data/raw/my_spotify_data.csv",
    encoding='latin-1',  # decode the file as latin-1 rather than the default UTF-8
    low_memory=False,    # parse the file in one pass instead of in internal chunks
)
tracks = pd.read_csv("../data/raw/tracks.csv")

In [62]:
# The 'artists' column holds Python-list literals stored as strings.
# Parse each one into a real list, keep its first entry as the main artist,
# then discard the list column.
tracks['artists'] = tracks['artists'].apply(literal_eval)
tracks['artist'] = tracks['artists'].apply(lambda artist_list: artist_list[0])
tracks = tracks.drop(columns='artists')

In [63]:
# Inner-join the personal listening history to the song features,
# matching on (track title, main artist).
# NOTE(review): if `tracks` holds more than one row for the same (name, artist)
# pair, each listening row is repeated once per match, which would inflate the
# play counts derived later. Confirm whether `tracks` is unique on these keys.
personal_tracks = my_spotify_data.merge(
    tracks,
    left_on=['track_name', 'artist_name'],
    right_on=['name', 'artist'],
)

In [64]:
# Target variable 'favorite_song':
# the top 20% most personally played songs are labelled 1, the bottom 80% are labelled 0.
# This cell displays the cutoff, i.e. the 0.8 quantile of the per-song play counts.
(
    personal_tracks
    .groupby(['track_name', 'artist_name'])['name']
    .count()
    .sort_values(ascending=False)
    .quantile(.8)
)

37.0

In [65]:
# Count the merged rows for every (track, artist) pair and expose the count
# as a regular column named 'track_freq'.
track_freq = (
    personal_tracks
    .groupby(['track_name', 'artist_name'])['name']
    .count()
    .reset_index(name='track_freq')
)
track_freq.head()

Unnamed: 0,track_name,artist_name,track_freq
0,! (The Song Formerly Known As),Regurgitator,1
1,#1,Aphex Twin,4
2,#3,Aphex Twin,2
3,#SELFIE,The Chainsmokers,4
4,#thatPOWER,will.i.am,4


In [66]:
# function that creates 'favorite_song' column with 1 = True, 0 = False
def favorite_song(row, threshold=37.0):
    """Label a song as a favorite (1) or not (0) from its play count.

    Parameters
    ----------
    row : number
        Despite the name, this is a single play-count value (one element of
        the 'track_freq' column), not a DataFrame row.
    threshold : number, default 37.0
        Minimum play count for a song to count as a favorite. The default is
        the 0.8 quantile of per-song play counts computed earlier in this
        notebook; pass a freshly computed quantile if the data changes.

    Returns
    -------
    int
        0 if ``row`` is strictly below ``threshold``, otherwise 1.
    """
    # Keep the "< threshold -> 0" form so the cutoff value itself is a favorite.
    return 0 if row < threshold else 1

In [67]:
# Label every song by passing its play count through favorite_song
track_freq['favorite_song'] = track_freq['track_freq'].apply(favorite_song)
track_freq.head()

Unnamed: 0,track_name,artist_name,track_freq,favorite_song
0,! (The Song Formerly Known As),Regurgitator,1,0
1,#1,Aphex Twin,4,0
2,#3,Aphex Twin,2,0
3,#SELFIE,The Chainsmokers,4,0
4,#thatPOWER,will.i.am,4,0


In [68]:
# Class balance of the target: how many songs are labelled 0 vs. 1
track_freq['favorite_song'].value_counts()

favorite_song
0    6993
1    1771
Name: count, dtype: int64

In [69]:
# Attach each song's play count and favorite label to the main dataframe
# (personal_tracks); both frames share the same key column names.
personal_tracks = personal_tracks.merge(track_freq, on=['track_name', 'artist_name'])
personal_tracks.head()

Unnamed: 0,track_name,artist_name,album_name,spotify_track_uri,skipped,date,time,min_played,id,name,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artist,track_freq,favorite_song
0,Sail,AWOLNATION,Megalithic Symphony,spotify:track:7ueP5u2qkdZbIPN2YA6LR0,True,9/6/2012,19:48:20,0.041016667,7ueP5u2qkdZbIPN2YA6LR0,Sail,...,0.0558,0.441,0.615,0.0964,0.272,119.051,4,AWOLNATION,72,1
1,Sail,AWOLNATION,Megalithic Symphony,spotify:track:7ueP5u2qkdZbIPN2YA6LR0,False,9/7/2012,14:24:48,4.318216667,7ueP5u2qkdZbIPN2YA6LR0,Sail,...,0.0558,0.441,0.615,0.0964,0.272,119.051,4,AWOLNATION,72,1
2,Sail,AWOLNATION,Megalithic Symphony,spotify:track:7ueP5u2qkdZbIPN2YA6LR0,True,9/13/2012,0:27:31,2.23815,7ueP5u2qkdZbIPN2YA6LR0,Sail,...,0.0558,0.441,0.615,0.0964,0.272,119.051,4,AWOLNATION,72,1
3,Sail,AWOLNATION,Megalithic Symphony,spotify:track:7ueP5u2qkdZbIPN2YA6LR0,False,9/13/2012,21:09:01,2.09695,7ueP5u2qkdZbIPN2YA6LR0,Sail,...,0.0558,0.441,0.615,0.0964,0.272,119.051,4,AWOLNATION,72,1
4,Sail,AWOLNATION,Megalithic Symphony,spotify:track:7ueP5u2qkdZbIPN2YA6LR0,False,9/21/2012,19:03:49,4.318216667,7ueP5u2qkdZbIPN2YA6LR0,Sail,...,0.0558,0.441,0.615,0.0964,0.272,119.051,4,AWOLNATION,72,1


In [70]:
# Remove the columns that are not carried into the final table
personal_tracks = personal_tracks.drop(
    columns=[
        'spotify_track_uri',
        'id',
        'id_artists',
        'min_played',
        'explicit',
        'skipped',
        'date',
        'time',
        'name',        # equal to track_name after the inner join on these keys
        'artist',      # equal to artist_name after the inner join on these keys
        'track_freq',  # only needed to derive favorite_song
    ]
)

In [71]:
# Show a single row to check which columns remain after the drop
personal_tracks.head(1)

Unnamed: 0,track_name,artist_name,album_name,popularity,duration_ms,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,favorite_song
0,Sail,AWOLNATION,Megalithic Symphony,74,259093,2011-03-15,0.826,0.436,1,-9.583,1,0.0558,0.441,0.615,0.0964,0.272,119.051,4,1


In [72]:
# Play frequency no longer needs to be tracked, so keep the first row for each
# (track, artist) pair and renumber the index from 0.
personal_tracks = (
    personal_tracks
    .drop_duplicates(subset=['track_name', 'artist_name'])
    .reset_index(drop=True)
)

In [77]:
# Shape check: after de-duplication there is one row per (track, artist) pair,
# i.e. 8,764 unique songs listened to in the recorded run.
personal_tracks.shape

(8764, 19)

In [74]:
# Preview the first rows of the de-duplicated table
personal_tracks.head()

Unnamed: 0,track_name,artist_name,album_name,popularity,duration_ms,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,favorite_song
0,Sail,AWOLNATION,Megalithic Symphony,74,259093,2011-03-15,0.826,0.436,1,-9.583,1,0.0558,0.441,0.615,0.0964,0.272,119.051,4,1
1,Remember the Name (feat. Styles of Beyond),Fort Minor,Petrified / Remember the Name,67,230493,2005-11-15,0.688,0.835,8,-4.162,1,0.0911,0.0583,3e-06,0.0795,0.88,84.858,4,1
2,Around the World - Radio Edit [Radio Edit],Daft Punk,"Musique, Vol. 1",45,241467,1997-04-11,0.934,0.76,7,-5.549,1,0.148,0.00417,0.906,0.0656,0.864,121.302,4,0
3,Club Foot,Kasabian,Kasabian,62,214373,2004-01-30,0.238,0.919,0,-6.893,1,0.0816,0.0257,2.3e-05,0.0914,0.123,101.752,4,1
4,I Can't Hold Back,Survivor,Survivor Greatest Hits,54,238800,1984-01-01,0.509,0.681,4,-10.659,1,0.0306,0.499,5.4e-05,0.1,0.33,143.645,4,1


In [78]:
# Write the result to the intermediate data folder, leaving out the index column
personal_tracks.to_csv(
    '../data/intermediate/personal_tracks.csv',
    index=False,
)