In [1]:
import spotipy
import spotipy.oauth2 as oauth2
import configparser
import pandas as pd
from dateutil.parser import parse as parse_date
import numpy as np
import seaborn as sns
import ast
from typing import List
from os import listdir

# Getting Streaming History 
### making it into something more palatable

In [2]:
def get_streamings(path: str = 'MyData') -> List[dict]:
    
    files = ['MyData/' + x for x in listdir(path)
             if x.split('.')[0][:-1] == 'StreamingHistory']
    
    all_streamings = []
    
    for file in files: 
        with open(file, 'r', encoding='UTF-8') as f:
            new_streamings = ast.literal_eval(f.read())
            all_streamings += [streaming for streaming 
                               in new_streamings]
    return all_streamings

In [3]:
streamings = get_streamings()

In [4]:
import requests
def get_id(track_name, token):
    headers = {
    'Accept': 'application/json',
    'Content-Type': 'application/json',
    'Authorization': f'Bearer ' + token,
        }
    params = [
    ('q', track_name),
    ('type', 'track'),
    ]
    try:
        response = requests.get('https://api.spotify.com/v1/search', 
                    headers = headers, params = params, timeout = 5)
        json = response.json()
        first_result = json['tracks']['items'][0]
        track_id = first_result['id']
        return track_id
    except:
        return None

In [5]:
def get_features(track_id, token):
    sp = spotipy.Spotify(auth=token)
    try:
        features = sp.audio_features([track_id])
        return features[0]
    except:
        return None

In [29]:
def get_recommendations(track_id_list, token):
    sp = spotipy.Spotify(auth=token)
    try:
        recommend = sp.recommendations(seed_tracks = track_id_list)
        return recommend
    except:
        return None

In [22]:
config = configparser.ConfigParser()
config.read('/Users/kunal/Projects/spotify/config.cfg')
client_id = config.get('SPOTIFY', 'CLIENT_ID')
client_secret = config.get('SPOTIFY', 'CLIENT_SECRET')


auth = oauth2.SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret
)

token = auth.get_access_token()
spotify = spotipy.Spotify(auth=token)
sp = spotipy.Spotify(client_credentials_manager=auth)

  if sys.path[0] == '':


In [7]:
unique_tracks = list(set([streaming['trackName'] for streaming in streamings]))

all_features = {}
for track in unique_tracks:
    track_id = get_id(track, token)
    features = get_features(track_id, token)
    if features:
        all_features[track] = features
        
with_features = []
for track_name, features in all_features.items():
    with_features.append({'name': track_name, **features})

In [8]:
from collections import Counter
allsongs = list([streaming['trackName'] for streaming in streamings])
dist = Counter(allsongs)

In [9]:
frequency_song = pd.DataFrame.from_dict(dist, orient='index').reset_index()
frequency_song = frequency_song.rename(columns={'index':'name', 0:'count'})

In [13]:
artists = []
songname = []
for streaming in streamings:
    artists.append(streaming['artistName'])
    songname.append(streaming['trackName'])
artist_track = {'artist': artists, 'name': songname}
track_mapping = pd.DataFrame(artist_track)

In [14]:
dfp = pd.merge(df, frequency_song, on='name')
dfp = pd.merge(dfp, track_mapping, on='name')

In [15]:
dfp = dfp.drop_duplicates()

In [69]:
dfp.head()

Unnamed: 0,name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,count,artist
0,Waves,0.557,0.586,10,-4.57,1,0.0382,0.0528,7e-06,0.339,...,96.091,audio_features,3nAq2hCr1oWsIU54tS98pL,spotify:track:3nAq2hCr1oWsIU54tS98pL,https://api.spotify.com/v1/tracks/3nAq2hCr1oWs...,https://api.spotify.com/v1/audio-analysis/3nAq...,181573,4,7,Opus Monik
7,Psycho! (Remix) (feat. Trippie Redd),0.871,0.525,7,-4.957,1,0.0512,0.297,0.0,0.184,...,115.01,audio_features,7j0He3HOaCTWLG7IRQr33p,spotify:track:7j0He3HOaCTWLG7IRQr33p,https://api.spotify.com/v1/tracks/7j0He3HOaCTW...,https://api.spotify.com/v1/audio-analysis/7j0H...,158719,4,1,MASN
8,Faneto,0.849,0.755,1,-7.399,0,0.363,0.346,0.0,0.0564,...,144.186,audio_features,6J1SjVyNDzkZ3sF85Ndd3B,spotify:track:6J1SjVyNDzkZ3sF85Ndd3B,https://api.spotify.com/v1/tracks/6J1SjVyNDzkZ...,https://api.spotify.com/v1/audio-analysis/6J1S...,206655,4,2,Chief Keef
10,Outta Control,0.674,0.63,6,-6.915,0,0.101,0.0534,3e-06,0.0948,...,180.022,audio_features,3E3f3Po59cqs9gS3vexHjB,spotify:track:3E3f3Po59cqs9gS3vexHjB,https://api.spotify.com/v1/tracks/3E3f3Po59cqs...,https://api.spotify.com/v1/audio-analysis/3E3f...,245293,4,2,Jessie G
12,Love Undivided,0.787,0.585,11,-11.147,1,0.186,0.654,0.00324,0.144,...,109.001,audio_features,1UQj2iyjrLLzEUi80uP98k,spotify:track:1UQj2iyjrLLzEUi80uP98k,https://api.spotify.com/v1/tracks/1UQj2iyjrLLz...,https://api.spotify.com/v1/audio-analysis/1UQj...,270000,4,4,Zuice ZE


In [16]:
# dfp.to_csv('streaming_history.csv')

In [None]:
# look at the distribution of how many times songs are played
sns.set(color_codes = True)
x = dfp['count']
filtered = x[(x >= 0) & (x < 50)]
sns.distplot(filtered, bins = 100)

In [None]:
favs = dfp[dfp['count']>=10]

In [None]:
favs.head()

In [None]:
favs['count_bins'] = pd.cut(favs['count'], bins=10)

In [None]:
favs.head()

In [None]:
sns.boxplot(x=favs.count_bins,y=favs.danceability)

In [None]:
# was curious as to what song I played so many times
# study music needs to be removed
topsongalltime = dfp[dfp['count']>=150]
topsongalltime.head()

In [None]:
favs.drop(favs.loc[favs['count']>=150].index, inplace=True)

In [None]:
# luckily I just listened to that one song on loop 
favs[favs['count']>50]

## Getting genres from LastFM 

In [34]:
config.read('/Users/kunal/Projects/spotify/config.cfg')
lastfm_key = config.get('LASTFM', 'API_KEY')
lastfm_secret = config.get('LASTFM', 'SECRET')

In [35]:
import pylast
network = pylast.LastFMNetwork(api_key=lastfm_key, api_secret=lastfm_secret)
# username=username, password_hash=password_hash

In [63]:
top_tags = network.get_track("Don Toliver", "No Idea").get_top_tags()
for item in top_tags:
    print(item.item.get_name())

Hip-Hop
rap
hiphop
rnb
don toliver


In [70]:
songs_mapped = dict(zip(dfp.artist, dfp.name))

In [73]:
def last_fm_genres(artist_song):
    """
    takes map of artists and songs and retrives top tags from LastFM
    """
    genres = []
    network = pylast.LastFMNetwork(api_key=lastfm_key, api_secret=lastfm_secret)
    for artist, song_name in artist_song.items():
        try:
            top_tags = network.get_track(artist, song_name).get_top_tags()
            tags = []
            for tag in top_tags:
                tags.append(tag.item.get_name())
            genres.append(tags)
        except:
            genres.append(['song_unknown'])
        
    return genres

In [74]:
potential_genres = last_fm_genres(songs_mapped)

# Kaggle 160k Dataset 

In [17]:
kaggle = pd.read_csv("./Kaggle160kData/data.csv")

In [18]:
# Out of personal preference I only want to consider songs released after 1990
kaggle = kaggle[kaggle['year'] >= 1990]

In [19]:
kaggle.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
5054,0.775,['Paul Simon'],0.656,303507,0.644,0,1RxlMG0bJhJdR5H6wL2oJf,0.416,6,0.12,-12.878,0,She Moves On,35,1990-10-16,0.0377,120.942,0.943,1990
5055,0.185,['Concrete Blonde'],0.275,308112,0.598,0,30q9tmV1f3vHli0MhHtZR9,1.1e-05,3,0.149,-6.757,1,"Tomorrow, Wendy",35,1990-09-16,0.0305,180.13,0.198,1990
5056,5e-06,['Napalm Death'],0.289,260933,0.887,0,5oIHVkR1P4y277ftTPLpWj,0.861,1,0.293,-14.387,1,Suffer The Children,40,1990,0.0963,148.655,0.478,1990
5057,0.975,"['Roberta Flack', 'Donny Hathaway']",0.229,219600,0.0723,0,0vB3z9Jp1OFTV9gZDYkVG1,0.000176,9,0.087,-17.46,1,For All We Know,35,1990-04-03,0.0402,79.303,0.0738,1990
5058,0.559,['Benny Hinn'],0.12,239133,0.419,0,2w3PjEG8DcqXADd51KwqtJ,0.0,0,0.486,-11.881,1,Holy Spirit Thou Art Welcome,37,1990-05-12,0.0427,73.289,0.284,1990


## Spotify Genres from MusicOrganizer

In [80]:
spot_genres = pd.read_csv("./organize_music_spotify.csv")

In [81]:
spot_genres.head()

Unnamed: 0,title,artist,top genre,year,added,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
0,Fortune Days,The Glitch Mob,edm,2010,2015‑03‑03,105,67,60,-9,8,19,383,38,10,56
1,Tighten Up,The Black Keys,alternative rock,2010,2015‑02‑22,109,71,50,-6,45,57,211,0,7,63
2,Corinne,Metronomy,alternative dance,2011,2018‑04‑02,140,53,91,-9,12,82,196,24,5,51
3,The Look,Metronomy,alternative dance,2011,2018‑04‑02,85,54,64,-8,8,50,278,62,4,70
4,The Bay,Metronomy,alternative dance,2011,2018‑04‑02,121,47,80,-7,7,89,290,59,4,64


In [82]:
len(spot_genres)

2750

In [83]:
spot_genres.drop_duplicates()

Unnamed: 0,title,artist,top genre,year,added,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
0,Fortune Days,The Glitch Mob,edm,2010,2015‑03‑03,105,67,60,-9,8,19,383,38,10,56
1,Tighten Up,The Black Keys,alternative rock,2010,2015‑02‑22,109,71,50,-6,45,57,211,0,7,63
2,Corinne,Metronomy,alternative dance,2011,2018‑04‑02,140,53,91,-9,12,82,196,24,5,51
3,The Look,Metronomy,alternative dance,2011,2018‑04‑02,85,54,64,-8,8,50,278,62,4,70
4,The Bay,Metronomy,alternative dance,2011,2018‑04‑02,121,47,80,-7,7,89,290,59,4,64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2287,I Don't See,Stalley,alternative hip hop,2019,2019‑02‑01,147,67,73,-9,38,63,239,3,23,7
2292,Streaming Services,Taylor Bennett,chicago rap,2019,2019‑01‑30,155,27,57,-13,17,29,260,0,9,24
2294,Cherish 2,Delivery Boys,,2019,2019‑01‑30,124,75,65,-5,18,50,249,44,5,14
2303,FLUSH,Abhi The Nomad,desi hip hop,2019,2019‑01‑22,94,64,84,-7,8,90,143,21,36,31


In [84]:
spot_genres[spot_genres['artist'] == "Machine Gun Kelly"]

Unnamed: 0,title,artist,top genre,year,added,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
694,why are you here,Machine Gun Kelly,ohio hip hop,2019,2020‑04‑10,129,89,56,-4,48,63,176,1,4,72
695,I Think I'm OKAY (with YUNGBLUD & Travis Barker),Machine Gun Kelly,ohio hip hop,2019,2020‑04‑10,120,74,63,-5,31,28,169,3,4,80
842,I Think I'm OKAY (with YUNGBLUD & Travis Barker),Machine Gun Kelly,ohio hip hop,2019,2020‑04‑10,120,74,63,-5,31,28,169,3,4,80
902,why are you here,Machine Gun Kelly,ohio hip hop,2019,2020‑04‑10,129,89,56,-4,48,63,176,1,4,72
969,Misery Business,Machine Gun Kelly,ohio hip hop,2020,2020‑04‑10,170,87,46,-4,37,72,201,0,15,66
1013,"Bullets With Names (feat. Young Thug, RJMrLA &...",Machine Gun Kelly,ohio hip hop,2020,2020‑04‑10,148,90,88,-2,13,73,169,11,15,61
1238,Misery Business,Machine Gun Kelly,ohio hip hop,2020,2020‑04‑10,170,87,46,-4,37,72,201,0,15,66
1485,"Bullets With Names (feat. Young Thug, RJMrLA &...",Machine Gun Kelly,ohio hip hop,2020,2020‑04‑10,148,90,88,-2,13,73,169,11,15,61
1527,Misery Business,Machine Gun Kelly,ohio hip hop,2020,2020‑04‑10,170,87,46,-4,37,72,201,0,15,66
1556,why are you here,Machine Gun Kelly,ohio hip hop,2019,2020‑04‑10,129,89,56,-4,48,63,176,1,4,72


### Spotify Genres are seemingly still done by Artist rather than track
This means that they are useless for songs by artists that move across genres, i.e. MGK who was hip hop but now has moved to more rock-ish songs (see "I think I'm OKAY with Travis Barker)


# Playlist Analysis

In [None]:
sourcePlaylist = sp.user_playlist("kunalr", "3F20w43v5tF4DZShDF00j1")
tracks = sourcePlaylist["tracks"]
songs = tracks["items"] 
while tracks['next']:
    tracks = sp.next(tracks)
    for item in tracks["items"]:
        songs.append(item)

In [None]:
playlist = sp.user_playlist("kunalr", "3F20w43v5tF4DZShDF00j1")
tracks = playlist['tracks']['items']
next_uri = playlist['tracks']['next']
for _ in range(int(playlist['tracks']['total'] / playlist['tracks']['limit'])):
    response = sp._get(next_uri)
    tracks += response['items']
    next_uri = response['next']

tracks_df = pd.DataFrame([(track['track']['id'],
                           track['track']['artists'][0]['name'],
                           track['track']['name'],
                           parse_date(track['track']['album']['release_date']) if track['track']['album']['release_date'] else None,
                           parse_date(track['added_at']))
                          for track in playlist['tracks']['items']],
                         columns=['id', 'artist', 'name', 'release_date', 'added_at'] )

In [None]:
tracks_df.head()

In [None]:
tracks_df \
    .groupby('artist') \
    .count()['id'] \
    .reset_index() \
    .sort_values('id', ascending=False) \
    .rename(columns={'id': 'amount'}) \
    .head(10)

In [None]:
counted_year_df = tracks_df \
    .assign(year_added=tracks_df.added_at.dt.year) \
    .groupby(['artist', 'year_added']) \
    .count()['id'] \
    .reset_index() \
    .rename(columns={'id': 'amount'}) \
    .sort_values('amount', ascending=False)

in_top_5_year_artist = counted_year_df \
    .groupby('year_added') \
    .head(5) \
    .artist \
    .unique()

counted_year_df \
    [counted_year_df.artist.isin(in_top_5_year_artist)] \
    .pivot('artist', 'year_added', 'amount') \
    .fillna(0) \
    .style.background_gradient()

In [None]:
features = []
for n, chunk_series in tracks_df.groupby(np.arange(len(tracks_df)) // 50).id:
    features += sp.audio_features([*map(str, chunk_series)])
features_df = pd.DataFrame.from_dict(filter(None, features))
tracks_with_features_df = tracks_df.merge(features_df, on=['id'], how='inner')

In [None]:
tracks_with_features_df.head()

In [None]:
sns.boxplot(x=tracks_with_features_df.added_at.dt.year,
            y=tracks_with_features_df.danceability)

In [None]:
sns.boxplot(x=tracks_with_features_df.added_at.dt.month,
            y=tracks_with_features_df.valence)

In [None]:
encode_fields = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
]

def encode(row):
    return np.array([
        (row[k] - tracks_with_features_df[k].min())
        / (tracks_with_features_df[k].max() - tracks_with_features_df[k].min())
        for k in encode_fields])

tracks_with_features_encoded_df = tracks_with_features_df.assign(
    encoded=tracks_with_features_df.apply(encode, axis=1))

In [None]:
tracks_with_features_encoded_product_df = tracks_with_features_encoded_df \
    .assign(temp=0) \
    .merge(tracks_with_features_encoded_df.assign(temp=0), on='temp', how='left') \
    .drop(columns='temp')
tracks_with_features_encoded_product_df = tracks_with_features_encoded_product_df[
    tracks_with_features_encoded_product_df.id_x != tracks_with_features_encoded_product_df.id_y
]
tracks_with_features_encoded_product_df['merge_id'] = tracks_with_features_encoded_product_df \
    .apply(lambda row: ''.join(sorted([row['id_x'], row['id_y']])), axis=1)
tracks_with_features_encoded_product_df['distance'] = tracks_with_features_encoded_product_df \
    .apply(lambda row: np.linalg.norm(row['encoded_x'] - row['encoded_y']), axis=1)

In [None]:
# most similar songs
tracks_with_features_encoded_product_df \
    .sort_values('distance') \
    .drop_duplicates('merge_id') \
    [['artist_x', 'name_x', 'release_date_x', 'artist_y', 'name_y', 'release_date_y', 'distance']] \
    .head(10)

In [None]:
# average songs
tracks_with_features_encoded_product_df \
    .groupby(['artist_x', 'name_x', 'release_date_x']) \
    .sum()['distance'] \
    .reset_index() \
    .sort_values('distance') \
    .head(10)

In [None]:
# outlier songs
tracks_with_features_encoded_product_df \
    .groupby(['artist_x', 'name_x', 'release_date_x']) \
    .sum()['distance'] \
    .reset_index() \
    .sort_values('distance', ascending=False) \
    .head(10)