In [1]:
import json, os, sys
from itertools import combinations

from utils import *

import numpy as np

In [2]:
with open('../rawdata/msd_subset_audio_features.json', 'r') as f:
    merged_features_raw = json.load(f)

with open('../rawdata/msd_taste_profile_no-lyrics.json', 'r') as f:
    taste_profile_raw = json.load(f)

In [3]:
len(merged_features_raw), len(taste_profile_raw)

(10000, 404103)

In [4]:
# Normalize content features to [0, 1]
max_features = {
    'duration': -np.inf,
    'key': -np.inf,
    'key_confidence': -np.inf,
    'mode': -np.inf,
    'mode_confidence': -np.inf,
    'loudness': -np.inf,
    'tempo': -np.inf,
    'time_signature': -np.inf,
    'time_signature_confidence': -np.inf,
}

min_features = {
    'duration': np.inf,
    'key': np.inf,
    'key_confidence': np.inf,
    'mode': np.inf,
    'mode_confidence': np.inf,
    'loudness': np.inf,
    'tempo': np.inf,
    'time_signature': np.inf,
    'time_signature_confidence': np.inf,
}

for _, features in merged_features_raw.items():
    for feature, value in features.items():
        if feature in max_features:
            max_features[feature] = max(max_features[feature], value)
            min_features[feature] = min(min_features[feature], value)
            
max_features, min_features

({'duration': 1819.76771,
  'key': 11.0,
  'key_confidence': 1.0,
  'mode': 1.0,
  'mode_confidence': 1.0,
  'loudness': 0.566,
  'tempo': 262.828,
  'time_signature': 7.0,
  'time_signature_confidence': 1.0},
 {'duration': 1.04444,
  'key': 0.0,
  'key_confidence': 0.0,
  'mode': 0.0,
  'mode_confidence': 0.0,
  'loudness': -51.643,
  'tempo': 0.0,
  'time_signature': 0.0,
  'time_signature_confidence': 0.0})

In [5]:
def process_features(features):
    # Filter out unrecognized features
    feature_subset = { feature_name: features[feature_name] for feature_name in max_features }
    
    # Normalize recognized features
    for feature, value in feature_subset.items():
        if feature in max_features:
            feature_subset[feature] = (value - min_features[feature]) / (max_features[feature] - min_features[feature])
            
    return feature_subset

In [6]:
merged_features_processed = { track_id: process_features(features) for track_id, features in merged_features_raw.items() }

In [None]:
user_profiles, user_setlists = generate_user_profile(taste_profile_raw, merged_features_processed)

In [8]:
average_user_profiles = {}
user_setlists = {}

for user_id, tracklist in taste_profile_raw.items():
    initial = {
        'duration': 0,
        'key': 0,
        'key_confidence': 0,
        'mode': 0,
        'mode_confidence': 0,
        'loudness': 0,
        'tempo': 0,
        'time_signature': 0,
        'time_signature_confidence': 0,
    }
    user_setlists[user_id] = {}
    
    # Generate weighted average of features
    total_track_counts = 0
    
    for track in tracklist:
        track_id, count = track['track'], track['count']
        if track_id not in merged_features_processed:
            raise 'AAAAA'
        
        total_track_counts += count
        user_setlists[user_id][track_id] = count
        
        track_features = merged_features_processed[track_id]
        for feature, value in track_features.items():
            initial[feature] += value * count
            
    for feature in initial:
        initial[feature] /= total_track_counts
    
    average_user_profiles[user_id] = initial

len(average_user_profiles)

404103

In [9]:
def dict_euclidean_distance(a, b):
    return np.linalg.norm(np.array(list(a.values())) - np.array(list(b.values())))

def dict_manhtattan_distance(a, b):
    return np.sum(np.abs(np.array(list(a.values())) - np.array(list(b.values()))))

# Define a user's rating of a track as the number of times they listened to it
# If they haven't listened to it, define rating as the similarity (i.e. inverse distance) between the user's average profile and the track's features
def user_rating(user_id, track_id, distance_func: callable=dict_euclidean_distance):
    if track_id in user_setlists[user_id]:
        return user_setlists[user_id][track_id]
    else:
        user_avg_features = average_user_profiles[user_id]
        track_features = merged_features_processed[track_id]
        
        return 1 / (1 + distance_func(user_avg_features, track_features))        

In [10]:
def pearson_user_sim(user_id1, user_id2):
    combined_songs = set([*user_setlists[user_id1].keys(), *user_setlists[user_id2].keys()])
    n_songs = len(combined_songs)
    
    user1_ratings = np.zeros(n_songs)
    user2_ratings = np.zeros(n_songs)
    
    for i, song in enumerate(combined_songs):
        user1_ratings[i] = user_rating(user_id1, song)
        user2_ratings[i] = user_rating(user_id2, song)
    
    user1_ratings[:] -= np.mean(user1_ratings)
    user2_ratings[:] -= np.mean(user2_ratings)
    
    num = np.dot(user1_ratings, user2_ratings)
    den = np.linalg.norm(user1_ratings) * np.linalg.norm(user2_ratings)
    
    if num == 0 and den == 0:
        return 1
    
    return num / den

def cosine_user_sim(user_id1, user_id2):
    combined_songs = set([*user_setlists[user_id1].keys(), *user_setlists[user_id2].keys()])
    n_songs = len(combined_songs)
    
    user1_ratings = np.zeros(n_songs)
    user2_ratings = np.zeros(n_songs)
    
    for i, song in enumerate(combined_songs):
        user1_ratings[i] = user_rating(user_id1, song)
        user2_ratings[i] = user_rating(user_id2, song)
    
    num = np.dot(user1_ratings, user2_ratings)
    den = np.linalg.norm(user1_ratings) * np.linalg.norm(user2_ratings)
    
    if num == 0 and den == 0:
        return 0
    
    return num / den

In [11]:
# Generate user ID to index mapping
userid_to_index = { user_id: i for i, user_id in enumerate(average_user_profiles.keys()) }
index_to_userid = { v: k for k, v in userid_to_index.items() }

In [12]:
RAND_SEED = 221
rand = np.random.default_rng(RAND_SEED)

In [42]:
# Generate random user subset
subset_size = 5000
user_subset = rand.choice(len(average_user_profiles), subset_size, replace=False)
index_subset_mapping = { i: index_to_userid[user_id] for i, user_id in enumerate(user_subset) }
subset_index_mapping = { v: k for k, v in index_subset_mapping.items() }

In [None]:
user_similarity = np.full((subset_size, subset_size), -np.inf)
total_sims = (subset_size * (subset_size - 1)) // 2

for i, pair in enumerate(combinations(index_subset_mapping, 2)):
    if i % 2500 == 0:
        print(f'{i=}/{total_sims} | {(i/total_sims * 100):.3f}%', end='\r')
    
    user1, user2 = pair
    sim = cosine_user_sim(index_subset_mapping[user1], index_subset_mapping[user2])
    
    user_similarity[user1][user2] = sim
    user_similarity[user2][user1] = sim

i=12495000 / 12497500, 99.980%

In [None]:
def run_neighborhood_analysis(neighborhood_size):
    user_neighborhoods = {}
    user_neighborhoods[user]
    
    # for i, user in enumerate(user_subset):
    #     if i % 250 == 0:
    #         print(f'{i=}/{subset_size}', end='\r')
        
    #     user_neighborhoods[user] = np.argsort(user_similarity[i])[-neighborhood_size:]

In [30]:
# neigbhorhood_size = 100


In [None]:
neighborhood_sizes = [3, 5, 10, 25, 50, 100, 250, 500, 1000]
errors = [run_neighborhood_analysis(size) for size in neighborhood_sizes]

TypeError: run_neighborhood_analysis.<locals>.predict_rating() missing 1 required positional argument: 'neighborhoods'

In [31]:
run_count = 0
error = 0
for user in user_neighborhoods:
    # print(f'User {user}:')
    for track in taste_profile_raw[user]:
        track_id = track['track']
        
        prediction = predict_rating(user, track_id)
        actual = track['count']
        
        # print(f'Prediction on track {track_id}: {prediction} | Actual: {actual}')
        
        error += (actual - prediction) ** 2
        run_count += 1
        
print(f'MSE: {error / run_count}')

MSE: 29.72595568405758


todo:
- training process:
  - for each of user's songs:
    - generate recommendations using other 

combining both hybrid feature and interaction models:
- convert msd taste profile into piki-like interactions
  - run interaction model using said data
  - run feature model similarly
  - weighted average of both models' outputs
- 