In [1]:
# Feature extraction
import musicbrainzngs

# Data processing
import numpy as np
import csv
# import matplotlib.pyplot as plt

# Models
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import BaggingRegressor

# PCA and k-fold validation
# from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

# Data Processing

In [3]:
train_file = '../musicrec-data/train.csv'
test_file  = '../musicrec-data/test.csv'
# change this to reflect different solutions
soln_file  = 'user_median.csv'

# Load the training data.
train_data = {}
Y_actual = {} # for testing
id = 1
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)
    for row in train_csv:
        user   = row[0]
        artist = row[1]
        plays  = int(row[2])
        Y_actual[id] = plays
        id += 1
        if not user in train_data:
            train_data[user] = {}
        
        train_data[user][artist] = plays

In [44]:
soln_file = 'global_median.csv'

# globalmedian.py

In [45]:
# Predict via the median number of plays.

# Compute the global median.
plays_array = []
for user, user_data in train_data.items():
    for artist, plays in user_data.items():
        plays_array.append(plays)
global_median = np.median(np.array(plays_array))
print("global median:", global_median)

global median: 118.0


In [46]:
# Write out test solutions.
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            id     = row[0]
            user   = row[1]
            artist = row[2]

            soln_csv.writerow([id, global_median])

# usermedian.py

In [4]:
# Predict via the user-specific median.
# If the user has no data, use the global median.

# Compute the global median and per-user median.
plays_array  = []
user_medians = {}
for user, user_data in train_data.items():
    user_plays = []
    for artist, plays in user_data.items():
        plays_array.append(plays)
        user_plays.append(plays)

    user_medians[user] = np.median(np.array(user_plays))
global_median = np.median(np.array(plays_array))

In [60]:
# Write out test solutions.
Y_hat = {}
line_id = 1
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            id     = row[0]
            user   = row[1]
            artist = row[2]

            if user in user_medians:
                soln_csv.writerow([id, user_medians[user]])
                Y_hat[line_id] = user_medians[user]
            else:
                print("User", id, "not in training data.")
                soln_csv.writerow([id, global_median])
                Y_hat[line_id] = global_median
            
            line_id += 1

# Feature Extraction

## Playing around with musicbrainzngs

In [43]:
# must set user agent before any requests are made
musicbrainzngs.set_useragent(1,1)

## Artist information, searching, and browsing

In [99]:
musicbrainzngs.search_artists("radiohead")
#musicbrainzngs.search_recordings("bennie and the jets")

radiohead_id = "a74b1b7f-71a5-4011-9441-d0b5e4122711"
radiohead_result = musicbrainzngs.get_artist_by_id(radiohead_id,includes=["releases","ratings"])
radiohead_artist = radiohead_result["artist"]

print("name:\t\t%s" % radiohead_artist["name"])
print("rating:\t\t%s" % radiohead_artist["rating"])
print("release 0:\t%s" % radiohead_artist["release-list"][0])

# dictionary of recordings for radiohead
# musicbrainzngs.browse_recordings(radiohead_id)

# "artist" can include the following information:
# recordings, releases, release-groups, works, various-artists,
# discids, media, isrcs, aliases, annotation, area-rels, artist-rels,
# label-rels, place-rels, event-rels, recording-rels, release-rels,
# release-group-rels, series-rels, url-rels, work-rels, instrument-rels,
# tags, user-tags, ratings, user-ratings

name:		Radiohead
rating:		{'rating': '4.45', 'votes-count': '59'}
release 0:	{'status': 'Official', 'release-event-count': 1, 'title': 'Pablo Honey', 'country': 'XE', 'barcode': '077778140924', 'release-event-list': [{'date': '1993', 'area': {'sort-name': 'Europe', 'iso-3166-1-code-list': ['XE'], 'id': '89a675c2-3e37-3518-b83c-418bad59a85a', 'name': 'Europe'}}], 'packaging': 'Jewel Case', 'text-representation': {'language': 'eng', 'script': 'Latn'}, 'date': '1993', 'quality': 'normal', 'id': '06487940-c7d4-4af3-976d-0be796d686ce'}


## Cover art information

In [42]:
# gets the list of cover art associated with a release
musicbrainzngs.get_image_list("081e7033-5282-4bd7-9963-e64edf8c693a")

# downloads the front cover art of a release
# musicbrainzngs.get_image_front("081e7033-5282-4bd7-9963-e64edf8c693a")

{u'images': [{u'approved': True,
   u'back': False,
   u'comment': u'',
   u'edit': 35299266,
   u'front': True,
   u'id': 11628918400,
   u'image': u'http://coverartarchive.org/release/081e7033-5282-4bd7-9963-e64edf8c693a/11628918400.jpg',
   u'thumbnails': {u'large': u'http://coverartarchive.org/release/081e7033-5282-4bd7-9963-e64edf8c693a/11628918400-500.jpg',
    u'small': u'http://coverartarchive.org/release/081e7033-5282-4bd7-9963-e64edf8c693a/11628918400-250.jpg'},
   u'types': [u'Front']}],
 u'release': u'https://musicbrainz.org/release/081e7033-5282-4bd7-9963-e64edf8c693a'}

# Training

In [12]:
# Function for k-fold cross validation using MAE (mean absolute error)
def kfold(k, model, data):
    kf = KFold(n_splits=k)
    maes = []
    for train_fold_index, validate_fold_index in kf.split(data):
        train_fold = data[train_fold_index]
        test_fold = data[validate_fold_index]
        X_train_fold = train_fold[:, :-1]
        Y_train_fold = train_fold[:, -1]
        X_test_fold = test_fold[:, :-1]
        Y_test_fold = test_fold[:, -1]
        model.fit(X_train_fold, Y_train_fold)
        Y_hat = model.predict(X_test_fold)
        mae = np.mean([abs(Y_hat[i] - Y_test_fold[i]) for i in range(len(Y_test_fold))])
        maes.append(mae)
        print "MAEs: ", maes
    return np.mean(maes)

# Making Predictions

In [76]:
# Make predictions on training data (using user median criteria)
Y_hat = {}
line_id = 1
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)
    
    for row in train_csv:
        user = row[0]
        artist = row[1]
        
        # modify this part if using different criteria
        if user in user_medians:
            Y_hat[line_id] = user_medians[user]
        else:
            break
            print("User", id, "not in training data.")
            Y_hat[line_id] = global_median
        line_id += 1

In [78]:
np.mean([abs(Y_hat[i] - Y_actual[i]) for i in range(1, len(Y_hat))])

129.34241791969438

In [77]:
# c = 0
# for k, v in Y_hat.items():
#     c += 1
#     if (c > 8):
#         break
#     print(k)
#     print(v)