In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import os
import glob
import json
import itertools as it
import json
import datetime as dt

import nltk
from nltk.cluster import KMeansClusterer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
import seaborn as sns
import s3fs
import boto3
import string

In [2]:
!python --version

Python 3.7.6


In [3]:
!conda --version

conda 4.8.2


# Last.FM Tag Merging

Welcome! This notebook is Step 2 in creating the backbone that underlies lucyd. This is functional one-shot code, so please forgive the lack of modularity. 

This notebook is run on an AWS EC2 instance with:
    + Python 3.7.6
    + conda 4.8.2
    + EC2 size = t3.xlarge
    + OS = Amazon Linux AMI, release 2018.03


We'll walk you through the steps we took carefully below, but, in general, the steps are as follows:
   1) Download Last.FM tag data from:
        a) http://millionsongdataset.com/sites/default/files/lastfm/lastfm_train.zip and
        b) http://millionsongdataset.com/sites/default/files/lastfm/lastfm_test.zip
   2) Iterate through all .h5 files contained therein, forming a pandas Dataframe
   3) Clean the tags up a bit.
   4) Merge with the metadata file formed in the preceding notebook
   5) Upload to S3

In [4]:
to_df = {'last_fm_tags': [], 'last_fm_tag_count': [], 'track_id': []}

In [9]:
#iterate through all files in the unzipped directory
rootdir = r'../last_fm/unzipped/'
for root,dirs,files in os.walk(rootdir):
    files = glob.glob(os.path.join(root,"*"+'json'))
    for i, f in enumerate(files):
        with open(f) as foo:
            test = json.load(foo)
            #retrieve just these couple columns
            #last.fm tags
            to_df['last_fm_tags'].append([x[0] for x in test['tags']])
            #weights of tags
            to_df['last_fm_tag_count'].append([x[1] for x in test['tags']])
            #track id for merging
            to_df['track_id'].append(test['track_id'])
last_fm_tags = pd.DataFrame.from_dict(to_df)
last_fm_tags.head()

Unnamed: 0,last_fm_tags,last_fm_tag_count,track_id
0,[],[],TRCCCFZ128F4283A22
1,[],[],TRCCCGY128F92EFB51
2,[],[],TRCCCFM12903CE2CB4
3,[],[],TRCCCLP128F426106F
4,[Akinyele],[100],TRCCCSL128F4260C90


In [10]:
#do some light cleaning here getting rid of most punctuation
last_fm_tags['last_fm_tags'] = last_fm_tags['last_fm_tags'].astype(str).str.replace('[\[\]\-://\"\'"]','').str.lower()
last_fm_tags.reset_index(inplace = True, drop = True)
last_fm_tags.head()

Unnamed: 0,last_fm_tags,last_fm_tag_count,track_id
0,,[],TRCCCFZ128F4283A22
1,,[],TRCCCGY128F92EFB51
2,,[],TRCCCFM12903CE2CB4
3,,[],TRCCCLP128F426106F
4,akinyele,[100],TRCCCSL128F4260C90


In [29]:
#read in flat summary created in preceding notebook
flat_summary = pd.read_csv(r'../flat_summary.csv', names = ['song_id', 'track_id', 'song_hotness', 'artist_familiarity',
       '7digital_id', 'title', 'artist', 'mode', 'tempo','key', 'artist_id',
       'all_terms', 'spotify_uri_final'])

for clm in ['last_fm_tag_count','last_fm_tags']:
    try:
        flat_summary.drop(clm, inplace = True)
    except:
        pass
    
#add tags to metadata
flat_summary = pd.merge(flat_summary,last_fm_tags,on = 'track_id', how = 'left')

#change all np.NaN to the missing string or 0 as appropriate
flat_summary['all_terms'] = np.where(flat_summary['all_terms'].isnull(),'',flat_summary['all_terms'])
flat_summary['last_fm_tags'] = np.where(flat_summary['last_fm_tags'].isnull(),'',flat_summary['last_fm_tags'])
flat_summary['song_hotness'] = np.where(flat_summary['song_hotness'].isnull(),0,flat_summary['song_hotness'])
flat_summary['artist_familiarity'] = np.where(flat_summary['artist_familiarity'].isnull(),0,flat_summary['artist_familiarity'])

#augment tags with artist level tags, name of the song, and the artist name
flat_summary['last_fm_tags'] = flat_summary['all_terms'] + flat_summary['last_fm_tags']
flat_summary['last_fm_tags'] = flat_summary['last_fm_tags'] + "," + flat_summary['artist'].str.lower()
flat_summary['last_fm_tags'] = flat_summary['last_fm_tags'] + "," + flat_summary['title'].str.lower()

#strip out anything that is not alphanumeric or the comma delimiter
flat_summary['last_fm_tags'] = flat_summary['last_fm_tags'].str.replace('[^a-zA-Z0-9,]','')
flat_summary.drop(['all_terms','last_fm_tag_count'], axis = 1, inplace = True)

flat_summary.head()

Unnamed: 0,song_id,track_id,song_hotness,artist_familiarity,7digital_id,title,artist,mode,tempo,key,artist_id,spotify_uri_final,last_fm_tags,last_fm_tag_count
0,SOQMMHC12AB0180CB8,TRMMMYQ128F932D901,0.542899,0.649822,7032331,Silent Night,Faster Pussy cat,0,87.002,10,ARYZTJS1187B98C555,,"heavymetal,industrialmetal,hardrock,glammetal,...","[100, 66, 66, 33, 33, 33, 33]"
1,SOVFVAK12A8C1350D9,TRMMMKD128F425225D,0.299877,0.439604,1514808,Tanssi vaan,Karkkiautomaatti,1,150.778,9,ARMVN3U1187FB3A1EB,spotify:track:6DOmOjeTc3btomrfFfPgy8,"poprock,indierock,chillout,rock,alternativeroc...",[]
2,SOGTUKN12AB017F4F1,TRMMMRX128F93187D9,0.617871,0.643681,6945353,No One Could Ever,Hudson Mohawke,1,177.768,7,ARGEKB01187FB50750,spotify:track:41RpZW2lxAdnqDd2nMBzLQ,"brokenbeat,hiphop,triphop,glitch,ghettotech,ro...","[100, 75, 50, 50, 25, 25, 25, 25, 25, 25, 0, 0]"
3,SOBNYVR12A8C13558C,TRMMMCH128F425532C,0.0,0.448501,2168257,Si Vos Querés,Yerba Brava,1,87.433,7,ARNWYLR1187B9B2F9C,spotify:track:7z4BZV7eZO1bqVKwAeTmou,"cumbia,italiandisco,losangeles,electronic,coun...",[]
4,SOHSBXH12A8C13B0DF,TRMMMWA128F426B589,0.0,0.0,2264873,Tangle Of Aspens,Der Mystic,0,140.035,5,AREQDTE1269FB37231,spotify:track:2poHURuOfVNbzZdivAwtOH,"hardtrance,darkpop,trance,electronica,dub,elec...",[]


In [37]:
#upload to S3 for later training
flat_summary.to_csv(r'../flat_summary.csv', index = False, header = False)
ACCESS_KEY = 'ENTER YOUR ACCESS KEY HERE'
SECRET_KEY = 'ENTER YOUR SECRET KEY HERE'
s3 = boto3.Session(aws_access_key_id=ACCESS_KEY,aws_secret_access_key=SECRET_KEY).resource('s3')
bucket = s3.Bucket('sagemaker-msdsubset')
bucket.upload_file(r'../flat_summary.csv', Key = 'flat_summary_04_09_20.csv')