In [2]:
import json, os, sys
import numpy as np
from utils import *

In [3]:
# Load the two raw JSON inputs. The encoding is given explicitly so the result
# does not depend on the platform's locale default.
with open('../rawdata/msd_mxm_merged_features.json', 'r', encoding='utf-8') as f:
    merged_features_raw = json.load(f)

with open('../rawdata/msd_taste_profile_filtered.json', 'r', encoding='utf-8') as f:
    taste_profile_raw = json.load(f)

In [4]:
# Sanity check: number of top-level entries loaded from each JSON file.
len(merged_features_raw), len(taste_profile_raw)

(2202, 225495)

In [5]:
# Remove the 'all_lyrics' entry from the raw feature dict; later cells iterate
# over merged_features_raw and treat every remaining value as a feature record.
# The membership guard keeps this cell idempotent: re-running it in the same
# kernel no longer raises KeyError, and all_lyrics keeps its earlier value.
if 'all_lyrics' in merged_features_raw:
    all_lyrics = merged_features_raw.pop('all_lyrics')
len(all_lyrics)

5000

In [6]:
# Record the observed range of each scalar content feature; the bounds are used
# afterwards to rescale the features to [0, 1].
max_features = dict.fromkeys(
    [
        'duration',
        'key',
        'key_confidence',
        'mode',
        'mode_confidence',
        'loudness',
        'tempo',
        'time_signature',
        'time_signature_confidence',
    ],
    -np.inf,
)

# Same keys, in the same order, starting from the opposite extreme.
min_features = {name: np.inf for name in max_features}

for track_record in merged_features_raw.values():
    for name, observed in track_record.items():
        # Anything that is not one of the tracked features (e.g. 'lyrics') is skipped.
        if name not in max_features:
            continue
        max_features[name] = max(max_features[name], observed)
        min_features[name] = min(min_features[name], observed)

max_features, min_features

({'duration': 978.442,
  'key': 11.0,
  'key_confidence': 1.0,
  'mode': 1.0,
  'mode_confidence': 1.0,
  'loudness': -1.674,
  'tempo': 253.357,
  'time_signature': 7.0,
  'time_signature_confidence': 1.0},
 {'duration': 1.04444,
  'key': 0.0,
  'key_confidence': 0.0,
  'mode': 0.0,
  'mode_confidence': 0.0,
  'loudness': -32.742,
  'tempo': 0.0,
  'time_signature': 0.0,
  'time_signature_confidence': 0.0})

In [7]:
def process_features(features, feature_min=None, feature_max=None, vocab_size=None):
    """Convert one raw track record into scaled features plus a lyric count vector.

    Parameters
    ----------
    features : dict
        Raw track record. It must contain every key of ``feature_max`` and a
        ``'lyrics'`` mapping from word index (anything ``int()`` accepts) to
        a count. Any other keys are ignored.
    feature_min, feature_max : dict, optional
        Per-feature lower / upper bounds used for min-max scaling. When omitted,
        the notebook-level ``min_features`` / ``max_features`` are used.
    vocab_size : int, optional
        Length of the lyric vector. When omitted, ``len(all_lyrics)`` is used.

    Returns
    -------
    dict
        One entry per key of ``feature_max`` holding
        ``(value - min) / (max - min)``, plus ``'lyrics'``: an integer numpy
        array of length ``vocab_size`` with ``count`` stored at each word index.

    Raises
    ------
    KeyError
        If ``features`` lacks one of the scaled features or ``'lyrics'``.
    IndexError
        If a word index does not fit in the lyric vector.
    """
    # Fall back to the notebook globals only when no explicit bounds are given,
    # so the function can also be used (and tested) without hidden state.
    if feature_min is None:
        feature_min = min_features
    if feature_max is None:
        feature_max = max_features
    if vocab_size is None:
        vocab_size = len(all_lyrics)

    # Keep only the recognized features, min-max scaled.
    feature_subset = {}
    for name in feature_max:
        low = feature_min[name]
        span = feature_max[name] - low
        # A feature with identical min and max has zero span; map it to 0.0
        # rather than dividing by zero.
        feature_subset[name] = (features[name] - low) / span if span else 0.0

    # Vectorize lyrics: position = word index, value = count.
    lyric_vector = np.zeros(vocab_size, dtype=int)
    for word, count in features['lyrics'].items():
        lyric_vector[int(word)] = count

    feature_subset['lyrics'] = lyric_vector

    return feature_subset

In [8]:
# Run process_features over every track record, keyed by the same track ids.
merged_features_processed = dict(
    zip(merged_features_raw, map(process_features, merged_features_raw.values()))
)

In [5]:
# Total number of track entries summed over every track list in taste_profile_raw.
track_count = sum(len(tracks) for tracks in taste_profile_raw.values())
track_count

326620

In [23]:
# Build, for every user, a play-count-weighted average of the processed track
# features (scalar features in average_user_profiles, lyric counts as one row of
# lyric_vectors), and record each user's {track_id: count} setlist.
average_user_profiles = {}
user_setlists = {}
user_id_mapping = {}  # user_id -> row index into lyric_vectors
user_count = 0

lyrics_size = len(all_lyrics)
# NOTE(review): this is a dense float64 array of n_users x lyrics_size. With the
# shape printed by this cell (225495 x 5000) that is about 9 GB once every row
# has been written -- confirm it fits in memory, or consider float32 / a sparse
# representation.
lyric_vectors = np.zeros((len(taste_profile_raw), lyrics_size))
print(lyric_vectors.shape)
for i, (user_id, tracklist) in enumerate(taste_profile_raw.items()):
    if i % 500 == 0:
        print(i, end='\r')

    user_index = user_count
    user_id_mapping[user_id] = user_index
    user_count += 1

    # One running weighted sum per scalar feature, same keys as max_features.
    feature_sums = dict.fromkeys(max_features, 0.0)

    setlist = user_setlists[user_id] = {}

    # Row view: in-place updates below write straight into lyric_vectors.
    lyric_row = lyric_vectors[user_index]

    # Generate weighted average of features
    total_track_counts = 0

    for track in tracklist:
        track_id, count = track['track'], track['count']
        if track_id not in merged_features_processed:
            # Raising a plain string is itself a TypeError in Python 3; raise a
            # real exception that names the offending track instead.
            raise KeyError(
                'track {!r} (user {!r}) has no processed features'.format(track_id, user_id)
            )

        total_track_counts += count
        setlist[track_id] = count

        for feature, value in merged_features_processed[track_id].items():
            if feature == 'lyrics':
                lyric_row += value * count
            else:
                feature_sums[feature] += value * count

    # A user whose play counts sum to zero keeps an all-zero profile instead of
    # triggering a division by zero.
    if total_track_counts:
        for feature in feature_sums:
            feature_sums[feature] /= total_track_counts
        lyric_row /= total_track_counts

    average_user_profiles[user_id] = feature_sums

(225495, 5000)
123000

: 

In [13]:
# Scratch check: one row of a (len(average_user_profiles), lyrics_size) zero
# array has shape (lyrics_size,).
np.zeros((len(average_user_profiles), lyrics_size))[0].shape

(5000,)

In [None]:
# Number of top-level entries in the taste profile (iterated as user_id keys above).
len(taste_profile_raw)

225495

In [None]:
# Scratch: numpy arithmetic is elementwise, so this is n * 8 / 3 per element.
# No seed is set in the visible cells, so the displayed values change per run.
n = np.random.rand(5)
n + (n * 5) / 3

array([2.11256306, 0.21351618, 0.65725724, 2.63948283, 0.23207305])

TODO:
- Generate average user profiles
- Write similarity scores for songs
- Do it all