In [15]:
import pandas as pd
import numpy as np
import sagemaker
from sagemaker import get_execution_role
from sklearn.feature_extraction.text import CountVectorizer
import scipy
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Load the *preprocessed* flat summary table from S3.
# NOTE: bucket, data_key, prefix and role are assigned here but are not
# referenced by any other cell visible in this notebook.
bucket = 'sagemaker-msdsubset'
data_key = 'flat_summary'
prefix = 'sagemaker/preprocessed_lda'
role = get_execution_role()

# header=None: no header row is parsed, so the columns are labelled by
# integer position; a later cell renames the positions that are used.
data_location = 's3://sagemaker-msdsubset/flat_summary_04_09_20.csv'
flat = pd.read_csv(data_location, header=None)

print(flat.shape)
flat.head(20)

(1009330, 13)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,SOQMMHC12AB0180CB8,TRMMMYQ128F932D901,0.542899,0.649822,7032331,Silent Night,Faster Pussy cat,0,87.002,10,ARYZTJS1187B98C555,,"heavymetal,industrialmetal,hardrock,glammetal,..."
1,SOVFVAK12A8C1350D9,TRMMMKD128F425225D,0.299877,0.439604,1514808,Tanssi vaan,Karkkiautomaatti,1,150.778,9,ARMVN3U1187FB3A1EB,spotify:track:6DOmOjeTc3btomrfFfPgy8,"poprock,indierock,chillout,rock,alternativeroc..."
2,SOGTUKN12AB017F4F1,TRMMMRX128F93187D9,0.617871,0.643681,6945353,No One Could Ever,Hudson Mohawke,1,177.768,7,ARGEKB01187FB50750,spotify:track:41RpZW2lxAdnqDd2nMBzLQ,"brokenbeat,hiphop,triphop,glitch,ghettotech,ro..."
3,SOBNYVR12A8C13558C,TRMMMCH128F425532C,0.0,0.448501,2168257,Si Vos Querés,Yerba Brava,1,87.433,7,ARNWYLR1187B9B2F9C,spotify:track:7z4BZV7eZO1bqVKwAeTmou,"cumbia,italiandisco,losangeles,electronic,coun..."
4,SOHSBXH12A8C13B0DF,TRMMMWA128F426B589,0.0,0.0,2264873,Tangle Of Aspens,Der Mystic,0,140.035,5,AREQDTE1269FB37231,spotify:track:2poHURuOfVNbzZdivAwtOH,"hardtrance,darkpop,trance,electronica,dub,elec..."
5,SOZVAPQ12A8C13B63C,TRMMMXN128F42936A5,0.0,0.361287,3360982,"Symphony No. 1 G minor ""Sinfonie Serieuse""/All...",David Montgomery,1,90.689,10,AR2NS5Y1187FB5879D,,"ragtime,jazz,electronic,experimental,american,..."
6,SOQVRHI12A6D4FB2D7,TRMMMLR128F1494097,0.0,0.692923,552626,We Have Got Love,Sasha / Turbulence,0,101.45,3,ARO41T51187FB397AB,spotify:track:5zvuyMMCl5TQrEefdMSERe,"progressivehouse,progressivetrance,techhouse,b..."
7,SOEYRFT12AB018936C,TRMMMBB12903CB7D21,0.0,0.588156,6435649,2 Da Beat Ch'yall,Kris Kross,0,98.02,11,AR3Z9WY1187FB4CDC2,spotify:track:3GsS8jzoixpCnp4jDWCEvb,"poprap,hiphop,oldschoolhiphop,breakbeat,tripho..."
8,SOPMIYT12A6D4F851E,TRMMMHY12903CB53F1,0.0,0.408465,8376489,Goodbye,Joseph Locke,1,115.427,5,ARA04401187B991E6E,,"folkrock,folk,country,world,irish,classical,ro..."
9,SOJCFMH12A8C13B0C2,TRMMMML128F4280EE9,0.0,0.419941,1043208,Mama_ mama can't you see ?,The Sun Harbor's Chorus-Documentary Recordings,0,124.339,4,ARCVMYS12454A51E6E,spotify:track:4U2ryP1lJ09IeWA5tBpq3R,"patriotic,upbeat,classic,beautiful,unitedstate..."


In [29]:
# Give readable names to the positional columns that are used downstream.
# No columns are dropped by this step; it only relabels them.
flat = flat.rename(
    columns={
        5: "title",
        6: "artist",
        11: "spotify_uri",
        12: "last_fm_tags",
    }
)

In [30]:
# Drop rows (songs) that cannot appear in our recommendations: rows whose tags
# are missing or the literal string "[]", and rows without a Spotify URI.
# Only the four columns needed downstream are kept.
# (note that we could potentially add a pagerank-esque teleportation factor to
# include these untagged or new songs at some point down the line)
flat = flat.loc[
    flat['last_fm_tags'].notnull()
    & (flat['last_fm_tags'].astype(str) != "[]")
    & flat['spotify_uri'].notnull(),
    ['title', 'artist', 'spotify_uri', 'last_fm_tags'],
]

flat.shape

(813317, 4)

In [35]:
# Tag-level vectorizer: its output feeds the tag-to-tag cosine-similarity (CS)
# matrix. Each row's tag string is split on "," into tokens, and min_df=50
# keeps only tags that occur in at least 50 rows.
tag_count_vectorizer = CountVectorizer(
    tokenizer=lambda x: x.split(","),
    strip_accents='unicode',
    lowercase=True,
    stop_words='english',
    min_df=50,
)
tag_sparse_matrix = tag_count_vectorizer.fit_transform(flat['last_fm_tags'])

In [36]:
# Number of distinct tags kept by the tag-level vectorizer.
# `vocabulary_` maps each retained term to its column index, so its length is
# the feature count. It is used instead of get_feature_names(), which was
# removed in scikit-learn 1.2, so this cell works on old and new versions.
print(len(tag_count_vectorizer.vocabulary_))

25180


In [34]:
# Compute the "inverted" (tag-level) cosine-similarity matrix: transposing the
# song x tag count matrix makes every tag a row, so entry (i, j) is the cosine
# similarity between tag i and tag j.
# Y is omitted on purpose: cosine_similarity compares X with itself when Y is
# None, which gives the same values as passing the transpose twice but
# validates and normalises the matrix only once.
# NOTE(review): with the default dense_output=True the result is a dense
# (n_tags x n_tags) array, which is several GB at ~25k tags - confirm the
# instance has enough memory.
full_tag_cs = cosine_similarity(tag_sparse_matrix.T)

In [14]:
# Save the tag vocabulary (the "inverted index"): entry i is the tag that
# corresponds to row/column i of the tag similarity matrix.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# when available and fall back to the old method on older installs.
try:
    tag_names = tag_count_vectorizer.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0
    tag_names = tag_count_vectorizer.get_feature_names()
tags = pd.Series(tag_names)
# NOTE(review): the Series holds Python strings, so this is presumably written
# as a pickled object array - loaders likely need np.load(..., allow_pickle=True).
np.save('inverted_full_index.npy', tags)

# Save the tag similarity matrix in sparse (CSR) form.
tag_csr = csr_matrix(full_tag_cs)
scipy.sparse.save_npz('inverted_full_cs_matrix.npz', tag_csr)

In [37]:
# Song-level vectorizer: produces the song x tag count matrix used for the
# song-level cosine-similarity (CS) computation. Each row's tag string is split
# on "," into tokens, and min_df=500 keeps only tags that occur in at least
# 500 rows.
song_count_vectorizer = CountVectorizer(
    tokenizer=lambda x: x.split(","),
    strip_accents='unicode',
    lowercase=True,
    stop_words='english',
    min_df=500,
)
song_sparse_matrix = song_count_vectorizer.fit_transform(flat['last_fm_tags'])

In [38]:
# Number of distinct tags kept by the song-level vectorizer.
# `vocabulary_` maps each retained term to its column index, so its length is
# the feature count. It is used instead of get_feature_names(), which was
# removed in scikit-learn 1.2, so this cell works on old and new versions.
print(len(song_count_vectorizer.vocabulary_))

2990


In [40]:
# Save the column index of the song matrix. Despite the variable name, `songs`
# holds the vectorizer's feature (tag) names: entry i labels column i of
# song_sparse_matrix.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# when available; .tolist() keeps `songs` a plain list of str, as before, so
# the saved array has the same format on every version.
try:
    songs = song_count_vectorizer.get_feature_names_out().tolist()
except AttributeError:  # scikit-learn < 1.0
    songs = song_count_vectorizer.get_feature_names()
np.save('full_index.npy', songs)

# Save per-song info (artist and Spotify URI; titles are left out). Row i
# describes the song in row i of the song matrix.
flat_without_titles = flat[['artist', 'spotify_uri']]
np.save('full_song_info.npy', flat_without_titles)

# Save the CountVectorizer output (song x tag counts) in sparse CSR form.
song_csr = csr_matrix(song_sparse_matrix)
scipy.sparse.save_npz('full_vectorized_song_matrix.npz', song_csr)