# Combine Datasets

Combine the Spotify Million Playlist and Kaggle Tracks Dataset

In [1]:
import pandas as pd
import os
import json
import copy
import datetime

## Load Pickle Files

In [2]:
tracks_dir = os.path.join('..','..','datasets','spotify_tracks_dataset')
tracks_file_path = os.path.join(tracks_dir,'table.pkl')
tracks_df = pd.read_pickle(tracks_file_path)
tracks_df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [3]:
mp_dir = os.path.join('..','..','datasets','spotify_million_playlist_dataset','pkl')
mp_df_list = []
file_count = 0
start_time = datetime.datetime.now()
for filename in os.listdir(mp_dir):
    if((file_count*100/1000) % 10 == 0):
        print(f"{file_count}, {(file_count*100/1000)}%, {datetime.datetime.now()-start_time}")
    file_path = os.path.join(mp_dir, filename)
    if os.path.isfile(file_path):  # skip subdirectories
        df = pd.read_pickle(file_path)
        mp_df_list.append(df)
        file_count += 1

0, 0.0%, 0:00:00.115187
100, 10.0%, 0:00:06.000006
200, 20.0%, 0:00:11.918566
300, 30.0%, 0:00:18.249380
400, 40.0%, 0:00:27.128622
500, 50.0%, 0:00:34.468184
600, 60.0%, 0:00:40.959696
700, 70.0%, 0:00:49.103033
800, 80.0%, 0:00:57.190943
900, 90.0%, 0:01:04.085259


In [4]:
mp_df_list[0].head()

Unnamed: 0,track_name,artist_name,album_name,playlist_name,track_uri,artist_uri
0,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott,The Cookbook,Throwbacks,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk
1,Toxic,Britney Spears,In The Zone,Throwbacks,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4
2,Crazy In Love,Beyoncé,Dangerously In Love (Alben für die Ewigkeit),Throwbacks,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m
3,Rock Your Body,Justin Timberlake,Justified,Throwbacks,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7
4,It Wasn't Me,Shaggy,Hot Shot,Throwbacks,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij


In [5]:
mp_df_list[1].head()

Unnamed: 0,track_name,artist_name,album_name,playlist_name,track_uri,artist_uri
0,Fathoms Below - Broadway Cast Recording,Original Broadway Cast - The Little Mermaid,The Little Mermaid: Original Broadway Cast Rec...,disney,spotify:track:5IbCV9Icebx8rR6wAp5hhP,spotify:artist:3TymzPhJTMyupk7P5xkahM
1,Daughters Of Triton - Broadway Cast Recording,Original Broadway Cast - The Little Mermaid,The Little Mermaid: Original Broadway Cast Rec...,disney,spotify:track:6rKVAvjHcxAzZ1BHtwh5yC,spotify:artist:3TymzPhJTMyupk7P5xkahM
2,The World Above - Broadway Cast Recording,Original Broadway Cast - The Little Mermaid,The Little Mermaid: Original Broadway Cast Rec...,disney,spotify:track:6Jlkb1Wh08RYHstWScsTvg,spotify:artist:3TymzPhJTMyupk7P5xkahM
3,Human Stuff - Broadway Cast Recording,Original Broadway Cast - The Little Mermaid,The Little Mermaid: Original Broadway Cast Rec...,disney,spotify:track:0XhC8bfStML9ygBmfOt1JJ,spotify:artist:3TymzPhJTMyupk7P5xkahM
4,I Want the Good Times Back - Broadway Cast Rec...,Original Broadway Cast - The Little Mermaid,The Little Mermaid: Original Broadway Cast Rec...,disney,spotify:track:0ABxAcsRWlqckkyONsfP67,spotify:artist:3TymzPhJTMyupk7P5xkahM


## Evaluate Overlap between Datasets
 
 - How are multi-artist track artists listed?
 - How many (track_name, artist_name) are shared between the two?
 - Is `track_id` in Tracks Dataset the same as `track_uri` in Million Playlist Dataset?
 - If so, what's the intersection count for `tracks.track_id == mp.track_uri`

In [6]:
tracks_df[tracks_df['artists'] == 'Britney Spears']

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
20652,6ic8OlLUNEATToEFU3xmaH,Britney Spears,Blackout,Gimme More,79,251240,False,0.788,0.844,2,-3.131,1,0.0334,0.25,0.000678,0.0723,0.382,113.324,4,dance
20709,6T689Jvh5KrSXyaPtUWZtQ,Britney Spears,Femme Fatale (Deluxe Version),Criminal,72,225080,False,0.696,0.734,7,-5.294,1,0.0298,0.0485,0.0,0.183,0.711,107.987,4,dance


In [7]:
# search for matches in mp_df_list
for mp_df in mp_df_list:
    filter = mp_df['track_name'] == 'Gimme More'
    if len(mp_df[filter]) > 0:
        print('Million Playlist Dataset Entry')
        print(mp_df[filter])
        break


Million Playlist Dataset Entry
       track_name     artist_name album_name  playlist_name  \
52579  Gimme More  Britney Spears   Blackout  Main Playlist   
66928  Gimme More  Britney Spears   Blackout      Michelle    

                                  track_uri  \
52579  spotify:track:6ic8OlLUNEATToEFU3xmaH   
66928  spotify:track:6ic8OlLUNEATToEFU3xmaH   

                                  artist_uri  
52579  spotify:artist:26dSoYclwsYLMAKD3tpOr4  
66928  spotify:artist:26dSoYclwsYLMAKD3tpOr4  


In [8]:
track_id = '6ic8OlLUNEATToEFU3xmaH'
track_uri = f'spotify:track:{track_id}'

# search for matches in mp_df_list
for mp_df in mp_df_list:
    filter = mp_df['track_uri'] == track_uri
    if len(mp_df[filter]) > 0:
        print('Million Playlist Dataset Entry')
        print(mp_df[filter])
        break

Million Playlist Dataset Entry
       track_name     artist_name album_name  playlist_name  \
52579  Gimme More  Britney Spears   Blackout  Main Playlist   
66928  Gimme More  Britney Spears   Blackout      Michelle    

                                  track_uri  \
52579  spotify:track:6ic8OlLUNEATToEFU3xmaH   
66928  spotify:track:6ic8OlLUNEATToEFU3xmaH   

                                  artist_uri  
52579  spotify:artist:26dSoYclwsYLMAKD3tpOr4  
66928  spotify:artist:26dSoYclwsYLMAKD3tpOr4  


In [12]:
# Run this as a separate python script in terminal
    
track_ids = []
total_rows = len(tracks_df['track_id'].unique())


with open("match_track_ids.txt", "a") as f:  # "a" = append mode
    start_time = datetime.datetime.now()
    for i,track_id in enumerate(tracks_df['track_id'].unique()):
        
        if (i*100/total_rows) % 10 == 0:
            print(f"{datetime.datetime.now() - start_time}: Row# {i}, Progress = {i*100/total_rows:.2f}%, Matches found = {len(track_ids)}")
        
        # get track_id
        track_id = tracks_df.iloc[i]['track_id']
        track_uri = f'spotify:track:{track_id}'

        # search for matches in mp_df_list
        for mp_df in mp_df_list:
            filter = mp_df['track_uri'] == track_uri
            if len(mp_df[filter]) > 0:
                
                track_ids.append(track_id)

                # Write a new line in a text file with the track_id
                f.write(track_id + "\n")
                print(f"{datetime.datetime.now() - start_time}: Row# {i}, Progress = {i*100/total_rows:.2f}%, Matches found = {len(track_ids)}")
                break

0:00:00.015895: Row# 0, Progress = 0.00%, Matches found = 0
0:00:34.209512: Row# 9, Progress = 0.01%, Matches found = 1
0:00:56.769897: Row# 15, Progress = 0.02%, Matches found = 2
0:01:04.528047: Row# 18, Progress = 0.02%, Matches found = 3
0:01:08.512986: Row# 20, Progress = 0.02%, Matches found = 4
0:01:12.586089: Row# 22, Progress = 0.02%, Matches found = 5
0:03:05.046584: Row# 52, Progress = 0.06%, Matches found = 6
0:03:05.047966: Row# 53, Progress = 0.06%, Matches found = 7
0:03:09.000541: Row# 55, Progress = 0.06%, Matches found = 8
0:03:09.004259: Row# 56, Progress = 0.06%, Matches found = 9
0:03:12.925853: Row# 58, Progress = 0.06%, Matches found = 10
0:03:20.993686: Row# 61, Progress = 0.07%, Matches found = 11
0:03:40.875494: Row# 67, Progress = 0.07%, Matches found = 12
0:03:40.880611: Row# 68, Progress = 0.08%, Matches found = 13
0:03:40.919161: Row# 69, Progress = 0.08%, Matches found = 14
0:03:40.923706: Row# 70, Progress = 0.08%, Matches found = 15
0:03:44.874396: Row#

KeyboardInterrupt: 

In [None]:
len(track_ids)

In [None]:
len(track_ids)/total_rows 

## Expand Columns of Million Playlist Dataset

Likely easiest to match based on `track_id/track_uri`.

`null` values if the `track_id/uri` isn't found.

In [None]:
# Calculate percentage of rows without `null` values