In [1]:
import os
import re

import numpy as np
import pandas as pd
import tables

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, r2_score, mean_absolute_error

import surprise

In [29]:
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import SVD, SVDpp, NMF, accuracy
from surprise import KNNBasic

In [4]:
taste_profile_df = pd.read_csv('/home/016709732/dataset/train_triplets.txt', sep='\t', header=None, names = ['user_id','song_id','play_count'], nrows = 2000000)
taste_profile_df.shape

(2000000, 3)

In [5]:
taste_profile_df.head(8)

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
5,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBNZDC12A6D4FC103,1
6,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBSUJE12A6D4F8CF5,2
7,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBVFZR12A6D4F8AE3,1


In [6]:
taste_profile_df.isnull().sum()

user_id       0
song_id       0
play_count    0
dtype: int64

In [7]:
taste_profile_df.duplicated().sum()

0

In [8]:
song_data_df = tables.open_file("/home/016709732/dataset/msd_summary_file.h5", mode="r")
song_data_table = song_data_df.root.metadata.songs
metadata_table_df = pd.DataFrame.from_records(song_data_table[:])
metadata_table_df.shape

(1000000, 20)

In [9]:
metadata_table_df.drop(columns =['analyzer_version', 'artist_7digitalid','artist_id', 'idx_artist_terms','idx_similar_artists','release_7digitalid','artist_location','artist_longitude','artist_latitude','artist_playmeid','track_7digitalid','artist_mbid'],inplace=True)
metadata_table_df.replace({b'': np.nan}, inplace=True)
print(metadata_table_df.isna().sum())
metadata_table_df.drop(columns =['genre'],inplace=True)

artist_familiarity        185
artist_hotttnesss          12
artist_name                 0
genre                 1000000
release                     0
song_hotttnesss        418035
song_id                     0
title                      15
dtype: int64


In [10]:
metadata_table_df = metadata_table_df.applymap(lambda x: x.decode('utf-8').strip('\'"') if isinstance(x, bytes) else x)
metadata_table_df.head(8)

Unnamed: 0,artist_familiarity,artist_hotttnesss,artist_name,release,song_hotttnesss,song_id,title
0,0.649822,0.394032,Faster Pussy cat,Monster Ballads X-Mas,0.542899,SOQMMHC12AB0180CB8,Silent Night
1,0.439604,0.356992,Karkkiautomaatti,Karkuteillä,0.299877,SOVFVAK12A8C1350D9,Tanssi vaan
2,0.643681,0.437504,Hudson Mohawke,Butter,0.617871,SOGTUKN12AB017F4F1,No One Could Ever
3,0.448501,0.372349,Yerba Brava,De Culo,,SOBNYVR12A8C13558C,Si Vos Querés
4,0.0,0.0,Der Mystic,Rene Ablaze Presents Winter Sessions,,SOHSBXH12A8C13B0DF,Tangle Of Aspens
5,0.361287,0.109626,David Montgomery,Berwald: Symphonies Nos. 1/2/3/4,,SOZVAPQ12A8C13B63C,"Symphony No. 1 G minor ""Sinfonie Serieuse""/All..."
6,0.692923,0.453732,Sasha / Turbulence,Strictly The Best Vol. 34,,SOQVRHI12A6D4FB2D7,We Have Got Love
7,0.588156,0.401092,Kris Kross,Da Bomb,,SOEYRFT12AB018936C,2 Da Beat Ch'yall


In [11]:
track_metadata = pd.read_csv('/home/016709732/dataset/track_metadata.csv')
track_metadata.shape

(1000000, 14)

In [12]:
track_metadata.drop(columns =['track_id', 'title','release', 'artist_id','artist_mbid','artist_name','artist_familiarity','artist_hotttnesss','year', 'track_7digitalid','shs_perf','shs_work'], inplace=True)

In [13]:
track_metadata = track_metadata.drop(track_metadata[track_metadata.duration > 480].index)
track_metadata = track_metadata.drop(track_metadata[track_metadata.duration < 60].index)

In [15]:
track_metadata.drop_duplicates(inplace=True)
track_metadata.shape

(942370, 2)

In [16]:
track_metadata.head(8)

Unnamed: 0,song_id,duration
0,SOQMMHC12AB0180CB8,252.05506
1,SOVFVAK12A8C1350D9,156.55138
2,SOGTUKN12AB017F4F1,138.97098
3,SOBNYVR12A8C13558C,145.05751
6,SOQVRHI12A6D4FB2D7,212.37506
7,SOEYRFT12AB018936C,221.20444
8,SOPMIYT12A6D4F851E,139.17995
9,SOJCFMH12A8C13B0C2,104.48934


In [17]:
track_merge = pd.merge(track_metadata, taste_profile_df, on='song_id', how="right")
song_data = pd.merge(metadata_table_df, track_merge, on='song_id', how="right")
song_data.shape

(2128559, 10)

In [18]:
song_data = song_data.dropna()
song_data.drop_duplicates(inplace=True)
song_data.shape

(1776948, 10)

In [19]:
print("Columns and the datatypes in the data: \n", song_data.dtypes)
print("\n\nSummary of column statistics: \n", song_data.describe())

Columns and the datatypes in the data: 
 artist_familiarity    float64
artist_hotttnesss     float64
artist_name            object
release                object
song_hotttnesss       float64
song_id                object
title                  object
duration              float64
user_id                object
play_count              int64
dtype: object


Summary of column statistics: 
        artist_familiarity  artist_hotttnesss  song_hotttnesss      duration  \
count        1.776948e+06       1.776948e+06     1.776948e+06  1.776948e+06   
mean         7.422984e-01       5.570766e-01     6.906530e-01  2.393463e+02   
std          1.247892e-01       1.398934e-01     1.953554e-01  6.601397e+01   
min          0.000000e+00       0.000000e+00     0.000000e+00  6.002893e+01   
25%          6.639805e-01       4.646150e-01     5.911614e-01  1.990526e+02   
50%          7.653570e-01       5.400078e-01     7.146505e-01  2.320975e+02   
75%          8.384095e-01       6.118629e-01     8.222856e

In [20]:
def remove_outliers(df, column_name):
    q1 = df[column_name].quantile(0.25)
    q3 = df[column_name].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    
    df = df[(df[column_name] >= lower) & (df[column_name] <= upper)]
    return df

song_data = remove_outliers(song_data, 'play_count')
song_data = remove_outliers(song_data, 'artist_familiarity')
song_data = remove_outliers(song_data, 'artist_hotttnesss')
song_data = remove_outliers(song_data, 'song_hotttnesss')
song_data = remove_outliers(song_data, 'duration')
song_data.shape

(1428056, 10)

In [21]:
song_data = song_data.reset_index(drop=True)
song_data.head(5)

Unnamed: 0,artist_familiarity,artist_hotttnesss,artist_name,release,song_hotttnesss,song_id,title,duration,user_id,play_count
0,0.832012,0.677482,Jack Johnson,Thicker Than Water,0.649006,SOAKIMP12A8C130995,The Cove,112.63955,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
1,0.580555,0.482492,Billy Preston,To Die For,0.826375,SOAPDEY12A81C210A9,Nothing from Nothing,153.59955,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
2,0.633119,0.417718,Paco De Lucia,Flamenco Para Niños,0.735295,SOBBMDR12A8C13253B,Entre Dos Aguas,358.24281,b80344d063b5ccb3212f76538f3d9e43d87dca9e,2
3,0.650281,0.451749,The Dead 60s,Nick & Norah's Infinite Playlist - Original Mo...,0.754628,SOBFOVM12A58A7D494,Riot Radio (Soundtrack Version),141.42649,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
4,0.576127,0.436974,Amset,Amset,0.454193,SOBNZDC12A6D4FC103,Sin límites (I),190.53669,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1


In [22]:
song_data['unencoded_sid'] = song_data['song_id']
song_data['unencoded_uid'] = song_data['user_id']

In [23]:
data_encoder = LabelEncoder()
song_data['song_id'] = data_encoder.fit_transform(song_data['song_id'])
song_data['user_id'] = data_encoder.fit_transform(song_data['user_id'])

In [24]:
song_dict = dict(zip(song_data['unencoded_sid'], song_data['song_id']))
user_dict = dict(zip(song_data['unencoded_uid'], song_data['user_id']))
song_data.drop(columns =['unencoded_sid'], inplace=True)
song_data.drop(columns =['unencoded_uid'], inplace=True)
song_data.head(8)

Unnamed: 0,artist_familiarity,artist_hotttnesss,artist_name,release,song_hotttnesss,song_id,title,duration,user_id,play_count
0,0.832012,0.677482,Jack Johnson,Thicker Than Water,0.649006,2078,The Cove,112.63955,29941,1
1,0.580555,0.482492,Billy Preston,To Die For,0.826375,3051,Nothing from Nothing,153.59955,29941,1
2,0.633119,0.417718,Paco De Lucia,Flamenco Para Niños,0.735295,5461,Entre Dos Aguas,358.24281,29941,2
3,0.650281,0.451749,The Dead 60s,Nick & Norah's Infinite Playlist - Original Mo...,0.754628,6263,Riot Radio (Soundtrack Version),141.42649,29941,1
4,0.576127,0.436974,Amset,Amset,0.454193,7956,Sin límites (I),190.53669,29941,1
5,0.641037,0.412069,Jorge Drexler,10 + Downloaded,0.265861,8982,12 segundos de oscuridad,246.83057,29941,2
6,0.675339,0.499641,Josh Rouse,Under Cold Blue Stars,0.645846,9466,Ears To The Ground (Album Version),171.17995,29941,1
7,0.806208,0.498077,Eric Hutchinson,Sounds Like This,0.345802,9823,Food Chain (Album Version),221.20444,29941,1


In [25]:
scaler = MinMaxScaler()
track_metadata['duration'] = scaler.fit_transform(track_metadata[['duration']])

In [26]:
reader = Reader(rating_scale=(1, 7))
songdata = Dataset.load_from_df(song_data[['user_id', 'song_id', 'play_count']], reader)

In [27]:
from surprise.model_selection import train_test_split
train_set, test_set = train_test_split(songdata, test_size=0.1, random_state=42)
svd = SVD(n_factors= 2, n_epochs=20, lr_all= 0.007, reg_all= 0.1, init_mean= 0, init_std_dev= 0.001)
svd.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f323c259520>

In [28]:
svd_predictions = svd.test(test_set)
accuracy.rmse(svd_predictions)

RMSE: 1.1865


1.1864640154771293

In [31]:
svd_predictions = svd.test(test_set)
accuracy.mae(svd_predictions)

MAE:  0.8531


0.8531247437496093

In [30]:
svdpp = SVDpp(n_factors= 2, n_epochs=20, lr_all= 0.007, reg_all= 0.1, init_mean= 0, init_std_dev= 0.001)
svdpp.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f30e5206340>

In [32]:
svdpp_predictions = svdpp.test(test_set)
accuracy.rmse(svdpp_predictions)

RMSE: 1.1840


1.1840424599487298

In [33]:
svdpp_predictions = svdpp.test(test_set)
accuracy.mae(svdpp_predictions)

MAE:  0.8509


0.8509362373075864

In [34]:
nmf = NMF()
nmf.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f30e5206b80>

In [35]:
nmf_predictions = nmf.test(test_set)
accuracy.rmse(nmf_predictions)

RMSE: 1.2690


1.2689799692250405

In [36]:
nmf_predictions = nmf.test(test_set)
accuracy.mae(nmf_predictions)

MAE:  0.8265


0.8265017467990579