# Data prep:  Content Based filtering using audio features, album_uri and artist_uri

In [1]:
import import_ipynb

In [2]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import scipy.sparse as sps

from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from category_encoders import TargetEncoder
from time import time

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#own functions
from evaluation import DCG
from evaluation import nDCG
from evaluation import R_Precision


%matplotlib inline

importing Jupyter notebook from evaluation.ipynb
DCG = 0.5
IDCG = 1.0
nDCG = 0.5


# Data transformation, PCA and merging 

In [3]:
with open('../data-processed/full-data/track_descriptions.json') as json_file:
    D_desc = json.load(json_file)
    
D_desc['spotify:track:0UaMYEvWZi0ZqiDOoHU3YI']

['Lose Control (feat. Ciara & Fat Man Scoop)', 'Missy Elliott', 'The Cookbook']

In [4]:
with open('../data-processed/full-data/track_artist_album.json') as json_file:
    D_album_artist = json.load(json_file)
    
D_album_artist['spotify:track:0UaMYEvWZi0ZqiDOoHU3YI']

['spotify:artist:2wIVse2owClT7go1WT98tk',
 'spotify:album:6vV5UrXcfyQD1wu4Qo2I9K']

In [5]:
path = '../data-processed/full-data/audio-features-combined.csv'
data = pd.read_csv(path)

In [6]:
data = data.reset_index()
data = data.rename(columns = {'index':'track_id'})
data['artist_uri'] = data.apply(lambda x:D_album_artist[x.uri][0], axis = 1)
data['album_uri'] = data.apply(lambda x:D_album_artist[x.uri][1], axis = 1)

In [7]:
# data.head()

In [8]:
D_track_uri_to_id = data.groupby('uri')['track_id'].min().to_dict()
D_track_id_to_uri = data.groupby('track_id')['uri'].min().to_dict()

In [9]:
len(D_track_uri_to_id), len(D_track_id_to_uri)

(2262190, 2262190)

In [10]:
data.head()

Unnamed: 0,track_id,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artist_uri,album_uri
0,0,spotify:track:5OSaZxhNj307YpTh7Qp8Xy,0.716,0.421,7,-11.56,0,0.0275,0.649,0.00207,0.0936,0.207,99.078,378440,4,spotify:artist:1vSHzGHsVOCrgPSCmKNimP,spotify:album:0oA1WetYmmrxkyjxz92yJg
1,1,spotify:track:4zytmsfZ7HtdXj3VLECcu2,0.141,0.441,5,-12.893,1,0.0892,0.7,0.687,0.128,0.0725,76.044,331000,4,spotify:artist:6gHYtzSvIIWvoUY2q2V3Rj,spotify:album:3c1gpGKfahKOFXtM7eIcFO
2,2,spotify:track:6lOWUS7iBVEw6ieJqxH17B,0.799,0.893,8,-5.496,1,0.0635,1.8e-05,0.914,0.362,0.529,128.009,469397,4,spotify:artist:7kxOVclB0zQamtBR0syCrg,spotify:album:0mX83KZvGWqBOvI4nIZ23H
3,3,spotify:track:3OSL6hJ9DoRGwr9OSEVrRS,0.789,0.548,4,-7.167,1,0.0418,0.707,0.000934,0.0616,0.593,117.713,153893,4,spotify:artist:2sxmKe3CUrWnx7eoXMhOlW,spotify:album:4hXCM8vqLJnlFcuHoH3zVP
4,4,spotify:track:6x0bgGOKckFsesVf7yPWJq,0.728,0.67,3,-7.912,0,0.102,0.157,0.000797,0.108,0.0647,119.963,196000,4,spotify:artist:6PyeXqjH8OMGnt1IOhWgrQ,spotify:album:1pjNAADvPDurRS42fqxN4k


In [11]:
##
genres = pd.read_csv('../data-processed/full-data/genres_by_artist.csv')

MemoryError: Unable to allocate 4.52 GiB for an array with shape (4967, 122142) and data type float64

In [None]:
##
genres.head()

In [None]:
##
##data_genres = data.merge(genres, how='left', left_on='artist_uri', right_on='artist_uri')

In [None]:
# data_genres.shape

# PCA on genres

In [None]:
X_genres = genres.iloc[:,1:]

In [None]:
X_genres.shape

In [None]:
X_genres.values

In [None]:
pca = PCA(n_components=800)
pca = pca.fit(X_genres.values)
print(sum(pca.explained_variance_ratio_))

In [None]:
X_genres_pca = pca.transform(X_genres)

In [None]:
genres_pca_df = pd.DataFrame(X_genres_pca)

In [None]:
genres_pca_df.shape

In [None]:
genres_pca_df.shape, genres.shape

In [None]:
genres_pca_df['artist_uri'] = genres['artist_uri']

In [None]:
new_cols = ['pca_genre_'+ str(el) for el in list(genres_pca_df.columns)]

In [None]:
genres_pca_df.columns = new_cols

In [None]:
genres_pca_df.head()

In [None]:
genres_pca_df.to_csv('../data-processed/transformation-matrices/cb_genres_pca_df.csv', index = None)

# Plot data

In [None]:
# data.iloc[:,1:-1].hist(figsize=(14,9))
# uncomment
# plt.show()

In [None]:
# fig, ax = plt.subplots(ncols=4, nrows=3, figsize=(14,9))

# column = data.columns[2:14]

#uncomment
# for i in range(3):
#     for j in range(4):
#         sns.boxplot(data=data[column[i*4+j]], palette="Set1", ax=ax[i,j]).set_title(column[i*4+j])

# Standardize data 

In [None]:
X = data.iloc[:,2:13]

In [None]:
#column orders
data.iloc[:,2:13].columns

In [None]:
scaler = StandardScaler()
transformer = scaler.fit(X)
X_transformed = transformer.transform(X)

In [None]:
# uncomment
# pd.DataFrame(X_transformed).hist(figsize=(14,9))
# plt.show()

# Target encode album_uri and artist_uri

In [None]:
X_transformed.shape

In [None]:
df_X_transformed = pd.DataFrame(X_transformed, columns=data.iloc[:,2:13].columns)
# df_X_transformed.head()

In [None]:
df_X_transformed.shape

In [None]:
df_X_transformed.columns

# Encode album_uri

In [None]:
cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

In [None]:
df_X_transformed['album_uri'] = data.album_uri
X = data['album_uri'].values.reshape(-1,1)

In [None]:
enc = TargetEncoder()
for col in cols:
    y = df_X_transformed[col].values.reshape(-1,1)  
    df_X_transformed[f'album_uri_{col}_enc'] = enc.fit_transform(X, y)

In [None]:
# df_X_transformed.head()

# Encode artist_uri

In [None]:
cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

In [None]:
df_X_transformed['artist_uri'] = data.artist_uri
X = data['artist_uri'].values.reshape(-1,1)

In [None]:
enc = TargetEncoder()
for col in cols:
    y = df_X_transformed[col].values.reshape(-1,1) 
    df_X_transformed[f'artist_uri_{col}_enc'] = enc.fit_transform(X, y)

In [None]:
# df_X_transformed.head()

In [None]:
#X_transformed = df_X_transformed.drop(columns=['artist_uri','album_uri']).to_numpy()

In [None]:
df_X_transformed.shape

In [None]:
df_X_transformed.head()

In [None]:
df_X_transformed.to_csv('../data-processed/transformation-matrices/cb_df_X_transformed.csv', index = None)

# Add Genres PCA

In [None]:
df_X_transformed.columns

In [None]:
df_X_transformed

In [None]:
df_merged = df_X_transformed.merge(genres_pca_df, how = 'left', left_on = 'artist_uri', right_on ='pca_genre_artist_uri' )

In [None]:
df_merged.shape

In [None]:
df_merged.head()

In [None]:
#df_merged = df_merged.fillna(0)

In [None]:
df_merged.to_csv('../data-processed/transformation-matrices/cb_df_merged.csv', index = None)

In [None]:
X_transformed = df_merged.drop(columns=['artist_uri','album_uri']).to_numpy()