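Takes in artist similarity scores and artist feature representations. For each scored pair of artists, generates a feature vector (the element-wise absolute difference between the two artists' feature representations) paired with that similarity score. Then splits the resulting features and scores into training and test sets with no artist overlap between the two sets.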

In [11]:
import numpy as np
from util import load_filtered
from util import load_similarity
from collections import defaultdict

In [2]:
# Load the filtered lyrics dataset via the project helper.
# NOTE(review): presumably a mapping keyed by artist (its keys become artist_list below) — confirm in util.load_filtered
data = load_filtered('data/lyrics_380k.csv')

154781


In [3]:
# Sorted keys of `data`; an artist's position in this list serves as its index
artist_list = sorted(data.keys())
num_artists = len(artist_list)
print(num_artists)

2944


In [4]:
# Pairwise similarity data: list of (artist1, artist2, similarity score) tuples
artist_scores = load_similarity('data/similarity/good_filtered.csv')
num_scores = len(artist_scores)
print(num_scores)

13912


In [12]:
# Load the precomputed per-artist feature matrix from disk.
# NOTE(review): rows are expected to be in the same order as artist_list — confirm against the script that wrote this file
artist_features = np.load('data/artist_agg_features.npy') # Numpy array of artist features

# Sanity check: print the shape of the loaded array
print(artist_features.shape)

(2944, 395)


In [13]:
def get_similarity_features(artist_list, artist_features, artist_scores):
    """Build one feature vector and one target score per scored artist pair.

    The feature vector for a pair is the element-wise absolute difference
    between the two artists' rows in ``artist_features``.

    Args:
        artist_list: Sequence of artists in the same order as the rows of
            ``artist_features``; used to translate an artist into a row
            index. If an artist appears more than once, its first position
            is used.
        artist_features: 2-D NumPy array of artist feature representations,
            one row per artist.
        artist_scores: Sequence of ``(artist1, artist2, score)`` tuples.

    Returns:
        A tuple ``(similarity_features, similarity_scores, artist_indices)``:

        * ``similarity_features``: float array of shape
          ``(len(artist_scores), artist_features.shape[1])``.
        * ``similarity_scores``: float array of shape
          ``(len(artist_scores),)``; entry ``i`` is the score for row ``i``
          of ``similarity_features``.
        * ``artist_indices``: ``defaultdict(list)`` mapping each artist to
          the row indices (into the two arrays above) of every pair that
          artist takes part in.

    Raises:
        ValueError: If a scored artist does not appear in ``artist_list``.
    """
    # Resolve artist -> row once up front. Calling artist_list.index() for
    # every pair rescans the whole list each time. setdefault keeps the first
    # position of a duplicated artist, which is what list.index() returns.
    row_of = {}
    for row, artist in enumerate(artist_list):
        row_of.setdefault(artist, row)

    num_scores = len(artist_scores)
    similarity_features = np.zeros((num_scores, artist_features.shape[1]))
    similarity_scores = np.zeros(num_scores)
    artist_indices = defaultdict(list)

    for i, (artist1, artist2, score) in enumerate(artist_scores):
        try:
            row1 = row_of[artist1]
            row2 = row_of[artist2]
        except KeyError as err:
            # Keep the exception type list.index() raised for unknown artists.
            raise ValueError(
                '{!r} is not in artist_list'.format(err.args[0])
            ) from err

        similarity_features[i] = np.absolute(
            artist_features[row1] - artist_features[row2]
        )
        similarity_scores[i] = score

        # Record the pair under both artists so it can be found from either.
        artist_indices[artist1].append(i)
        artist_indices[artist2].append(i)

    return similarity_features, similarity_scores, artist_indices

In [14]:
# Build the per-pair feature matrix, the matching score vector, and the artist -> row-indices lookup
similarity_features, similarity_scores, artist_indices = get_similarity_features(artist_list, artist_features, artist_scores)

# Sanity check: array shapes and the number of artists that take part in at least one scored pair
print(similarity_features.shape, similarity_scores.shape, len(artist_indices))

(13912, 395) (13912,) 2944


In [15]:
def split_train_test(artist_list, similarity_features, similarity_scores,
                     artist_indices, train_split, seed=None):
    """Split similarity rows into train/test sets with no artist overlap.

    Artists (not rows) are shuffled and divided into a training group and a
    test group. A row goes to the training set only if it involves training
    artists exclusively, and likewise for the test set. Rows that pair a
    training artist with a test artist are dropped from both sets, so the
    share of rows on each side generally differs from ``train_split``.

    Args:
        artist_list: Sequence of artists to shuffle and divide.
        similarity_features: NumPy array of similarity features, one row per
            scored pair.
        similarity_scores: NumPy array of similarity scores; entry ``i``
            corresponds to row ``i`` of ``similarity_features``.
        artist_indices: Mapping from artist to the list of row indices (into
            the two arrays above) that artist is involved in. Artists missing
            from the mapping are treated as having no rows. The mapping is
            not modified.
        train_split: Fraction (0 to 1) of the *artists* to place in the
            training group.
        seed: Optional seed for the shuffle. When ``None`` (the default) the
            global NumPy random state is used.

    Returns:
        A tuple ``(train_features, train_scores, test_features, test_scores)``
        where each ``*_scores`` array lines up row-for-row with its
        ``*_features`` array. Rows keep their original relative order.
    """
    # Shuffle a copy so the caller's list keeps its order.
    shuffled_artists = list(artist_list)
    if seed is None:
        np.random.shuffle(shuffled_artists)
    else:
        np.random.RandomState(seed).shuffle(shuffled_artists)

    split_ind = int(round(train_split * len(shuffled_artists)))

    def _rows_involving(artists):
        # .get() instead of [] so a defaultdict passed by the caller does not
        # grow an empty entry for every artist that has no scored pairs.
        rows = set()
        for artist in artists:
            rows.update(artist_indices.get(artist, ()))
        return rows

    train_rows = _rows_involving(shuffled_artists[:split_ind])
    test_rows = _rows_involving(shuffled_artists[split_ind:])

    # A row in both sets pairs a training artist with a test artist; keeping
    # it on either side would leak an artist across the split.
    straddling_rows = train_rows & test_rows

    # Explicit integer dtype keeps indexing well-defined when a side is empty.
    train_inds = np.array(sorted(train_rows - straddling_rows), dtype=np.intp)
    test_inds = np.array(sorted(test_rows - straddling_rows), dtype=np.intp)

    train_features = similarity_features[train_inds]
    train_scores = similarity_scores[train_inds]

    test_features = similarity_features[test_inds]
    test_scores = similarity_scores[test_inds]

    return train_features, train_scores, test_features, test_scores

In [16]:
# Assign 70% of the artists to the training group and the rest to the test group
train_features, train_scores, test_features, test_scores = split_train_test(
    artist_list, similarity_features, similarity_scores, artist_indices, 0.7
)

# Report the resulting sizes. Pairs that straddle the two artist groups are
# discarded by split_train_test, so the total can be smaller than the number
# of input scores and the row ratio need not match the artist ratio.
print(train_features.shape, train_scores.shape, test_features.shape, test_scores.shape)
num_final_scores = len(train_features) + len(test_features)
print('Num Final Scores:', num_final_scores)
print('% of Train Scores:', len(train_features) / num_final_scores)
print('% of Test Scores:', len(test_features) / num_final_scores)

(6905, 395) (6905,) (1211, 395) (1211,)
Num Final Scores: 8116
% of Train Scores: 0.8507885657959586
% of Test Scores: 0.1492114342040414


In [17]:
# Write the four split arrays to disk as .npy files (features and scores kept in separate files)
np.save('data/similarity/train_sim_features.npy', train_features)
np.save('data/similarity/train_sim_scores.npy', train_scores)
np.save('data/similarity/test_sim_features.npy', test_features)
np.save('data/similarity/test_sim_scores.npy', test_scores)