In [2]:
# Standard library
import csv

# Data processing
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

# Feature extraction
import musicbrainzngs

# Models
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import BaggingRegressor

# PCA and k-fold validation
# from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

# Data Processing

In [3]:
# Locations of the raw training data and of the two files it is split into.
fulltrain_file = '../musicrec-data/train.csv'
ktrain_file = '../musicrec-data/ktrain.csv'
kvalid_file = '../musicrec-data/kval.csv'

# change this to reflect different solutions
soln_file  = 'user_median.csv'

# One explicit switch selects the run mode, instead of relying on a later
# assignment silently overriding an earlier one.
#   True  -> validation run: train on ktrain_file, evaluate on kvalid_file
#   False -> full run: train on fulltrain_file, predict for the real test file
USE_VALIDATION_SPLIT = True

if USE_VALIDATION_SPLIT:
    # for kfold validation
    train_file = ktrain_file
    test_file = kvalid_file
else:
    # for full data, non kfold
    train_file = fulltrain_file
    test_file  = '../musicrec-data/test.csv'

In [6]:
# Count the data rows of the full training file (the first row is skipped
# and therefore not counted).
with open(fulltrain_file, 'r') as fulltrain_fh:
    fulltrain_csv = csv.reader(fulltrain_fh, delimiter=',', quotechar='"')
    next(fulltrain_csv, None)

    # Every remaining row is materialised in `data`; its length is the count.
    data = [row for row in fulltrain_csv]
    rowsoftrain = len(data)

# Bare last expression so the notebook displays the count.
rowsoftrain

4154804

In [10]:
# Separate fulltrain_file into training set and validation set (for kfold).
# The first 4/5 of the data rows go to ktrain_file, the remainder to kvalid_file.
#
# The header row is copied into both outputs: the cells that read these files
# discard their first row as a header, so without it the first data row of
# each split would be silently dropped.
c = 0
split_at = rowsoftrain * 4 / 5  # loop-invariant boundary between the two splits

# newline='' is what the csv module asks for on files given to reader/writer.
with open(fulltrain_file, 'r', newline='') as fulltrain_fh, \
        open(ktrain_file, 'w', newline='') as ktrain_fh, \
        open(kvalid_file, 'w', newline='') as kvalid_fh:
    fulltrain_csv = csv.reader(fulltrain_fh, delimiter=',', quotechar='"')
    ktrain_csv = csv.writer(ktrain_fh,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
    kvalid_csv = csv.writer(kvalid_fh,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)

    # Carry the header over to both splits (nothing to copy if the file is empty).
    header = next(fulltrain_csv, None)
    if header is not None:
        ktrain_csv.writerow(header)
        kvalid_csv.writerow(header)

    for row in fulltrain_csv:
        if c < split_at:
            ktrain_csv.writerow(row)
        else:
            kvalid_csv.writerow(row)
        c += 1

In [3]:
# Read train_file into a nested mapping: train_data[user][artist] -> plays.
# Y_actual keeps the play count of every data row under its 1-based position,
# for testing at the end.
train_data = {}
Y_actual = {}
id = 1
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)  # the first row is skipped
    for row in train_csv:
        user = row[0]
        artist = row[1]
        plays = int(row[2])

        Y_actual[id] = plays
        id += 1

        # Create the per-user dict on first sight, then record the play count.
        train_data.setdefault(user, {})[artist] = plays

# globalmedian.py

In [45]:
# Predict via the median number of plays.

# Pool the play counts of every user into one list, then take the median of
# the pooled values.
plays_array = []
for user_data in train_data.values():
    plays_array.extend(user_data.values())
global_median = np.median(np.array(plays_array))
print("global median:", global_median)

global median: 118.0


In [46]:
# Write out test solutions: every row of test_file is predicted to be the
# global median. Y_hat maps the 1-based position of each row to its prediction.
# NOTE(review): the first column of test_file is treated as the row Id —
# confirm that this matches the file test_file currently points to.
Y_hat = {} # add this Y_hat line_id stuff to every solution
line_id = 1

# newline='' is what the csv module asks for on files given to reader/writer.
with open(test_file, 'r', newline='') as test_fh, \
        open(soln_file, 'w', newline='') as soln_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)  # the first row is skipped

    soln_csv = csv.writer(soln_fh,
                          delimiter=',',
                          quotechar='"',
                          quoting=csv.QUOTE_MINIMAL)
    soln_csv.writerow(['Id', 'plays'])

    for row in test_csv:
        # Only the first column is needed; it is echoed back as the solution Id.
        # (Named row_id so the builtin id() is not shadowed.)
        row_id = row[0]

        soln_csv.writerow([row_id, global_median])
        Y_hat[line_id] = global_median
        line_id += 1

# usermedian.py

In [4]:
# Predict via the user-specific median.
# If the user has no data, use the global median.

# One pass over train_data yields both the median of each user's play counts
# and the pooled list from which the global median is taken.
plays_array  = []
user_medians = {}
for user, user_data in train_data.items():
    user_plays = list(user_data.values())
    plays_array.extend(user_plays)
    user_medians[user] = np.median(np.array(user_plays))

global_median = np.median(np.array(plays_array))

In [60]:
# Write out test solutions: each row of test_file is predicted to be the
# median of its user, falling back to the global median for users that have
# no entry in user_medians. Y_hat maps the 1-based position of each row to
# its prediction.
# NOTE(review): columns 0 and 1 of test_file are treated as row Id and user —
# confirm that this matches the file test_file currently points to.
Y_hat = {} # add this Y_hat line_id stuff to every solution
line_id = 1

# newline='' is what the csv module asks for on files given to reader/writer.
with open(test_file, 'r', newline='') as test_fh, \
        open(soln_file, 'w', newline='') as soln_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)  # the first row is skipped

    soln_csv = csv.writer(soln_fh,
                          delimiter=',',
                          quotechar='"',
                          quoting=csv.QUOTE_MINIMAL)
    soln_csv.writerow(['Id', 'plays'])

    for row in test_csv:
        # Named row_id so the builtin id() is not shadowed.
        row_id = row[0]
        user   = row[1]

        if user in user_medians:
            prediction = user_medians[user]
        else:
            print("User", row_id, "not in training data.")
            prediction = global_median

        # Single write path keeps the file and Y_hat in agreement.
        soln_csv.writerow([row_id, prediction])
        Y_hat[line_id] = prediction
        line_id += 1

# Feature Extraction

In [43]:
# must set user agent before any requests are made
# A single call is sufficient: a second set_useragent call replaces whatever
# the first one set, so the earlier set_useragent(1,1) had no lasting effect.
# TODO: MusicBrainz asks clients to identify themselves meaningfully; replace
# these placeholder values with a real application name, version and contact.
musicbrainzngs.set_useragent(2,1)

In [None]:
# function for getting the mode tag for an artist (alphabetical order)
def mode_tag(artist_id):
    """Return the name of the highest-count tag attached to an artist.

    Args:
        artist_id: artist identifier handed to musicbrainzngs.get_artist_by_id.

    Returns:
        The 'name' of the tag whose 'count' is largest, or None when the
        artist's tag list is empty. On ties the tag that appears first in
        the returned list wins (presumably alphabetical, per the note above;
        not verified here).

    Raises:
        KeyError: if the response carries no 'tag-list' for the artist.
    """
    result = musicbrainzngs.get_artist_by_id(artist_id, includes=['tags'])
    tags = result['artist']['tag-list']

    best_name = None
    best_count = None
    for tag_dict in tags:
        # Convert before comparing so counts are ordered numerically even if
        # they arrive as strings ('9' vs '10'); TODO confirm the API's type.
        count = int(tag_dict['count'])
        # Explicit None check: ordering a number against None is a TypeError
        # on Python 3.
        if best_count is None or count > best_count:
            best_count = count
            best_name = tag_dict['name']
    return best_name

# mode_tag('a74b1b7f-71a5-4011-9441-d0b5e4122711')

In [None]:
# read in artist information
# (relies on pandas being imported as `pd` in the notebook's import cell;
# without that import this cell raises NameError on a fresh kernel)
artists = pd.read_csv('../musicrec-data/artists.csv', sep=',')
# Values of the 'artist' column, as an array of artist ids.
artist_ids = artists['artist'].values

In [None]:
def build_artist_genre_dict(artists):
    """Map every artist id to the genre reported by mode_tag.

    Args:
        artists: iterable of artist ids; each one is passed to mode_tag.

    Returns:
        dict from artist id to the value mode_tag returned, or to None when
        mode_tag raised KeyError or musicbrainzngs.ResponseError for that id.
    """
    artist_genre_dict = {}
    for artist_id in artists:
        print(artist_id)  # progress trace, one line per artist
        # update artist_genre dict
        try:
            artist_genre_dict[artist_id] = mode_tag(artist_id)
        except (KeyError, musicbrainzngs.ResponseError):
            # No usable tag data for this artist: record it as "no genre"
            # rather than aborting the whole run.
            artist_genre_dict[artist_id] = None
    return artist_genre_dict
            
# Build the artist -> genre table for every id in artist_ids. Each id goes
# through mode_tag, which calls musicbrainzngs.get_artist_by_id once per artist.
artist_genre_dict = build_artist_genre_dict(artist_ids)

In [None]:
def build_user_genre_dict(train_data):
    """Aggregate each user's plays by genre.

    Reads the module-level artist_genre_dict to find the genre of an artist.

    Args:
        train_data: dict mapping user -> {artist: plays}.

    Returns:
        dict mapping user -> {genre: (total_plays, artist_count)}, where
        total_plays sums the plays of the user's artists in that genre and
        artist_count is how many of the user's artists fall in it. Artists
        whose genre is None/empty, or that are absent from artist_genre_dict,
        are left out.
    """
    user_genre_dict = {}
    for user, artist_info in train_data.items():
        print(user)  # progress trace, one line per user
        genre_dict = {}
        for artist, plays in artist_info.items():
            # .get: an artist missing from the genre table is treated the
            # same as one without a genre instead of raising KeyError.
            genre = artist_genre_dict.get(artist)
            # only proceed if genre is not None
            if not genre:
                continue
            # Start from (0, 0) the first time the user meets this genre,
            # otherwise add to the running totals.
            num_genre_plays, num_genre_artists = genre_dict.get(genre, (0, 0))
            genre_dict[genre] = (num_genre_plays + plays, num_genre_artists + 1)
        user_genre_dict[user] = genre_dict
    return user_genre_dict

# Build the per-user genre totals from the loaded training data; the function
# reads the module-level artist_genre_dict, so that table must exist first.
user_genre_dict = build_user_genre_dict(train_data)

In [None]:
# Sanity check: show one entry and the size of each lookup table.
# dict.items() replaces the Python 2-only iteritems(), which does not exist
# on Python 3 dicts.
for k, v in artist_genre_dict.items():
    print(k)
    print(v)
    break  # one sample entry is enough

print(len(artist_genre_dict))

for k, v in user_genre_dict.items():
    print(k)
    print(v)
    break  # one sample entry is enough

print(len(user_genre_dict))

In [None]:
# SAVE TO CSV
# csv.writer needs a text-mode file on Python 3 (binary 'wb' makes writerow
# raise TypeError); newline='' is what the csv module asks for on such files.
with open('artist_genre_dict.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in artist_genre_dict.items():
        if value:
            # Non-ASCII characters are dropped from the genre name.
            writer.writerow([key, value.encode('ascii', 'ignore').decode('ascii')])
        else:
            writer.writerow([key, value])

# SAVE AS CSV
# Each row holds the user and the string form of that user's genre dict.
with open('user_genre_dict.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in user_genre_dict.items():
        writer.writerow([key, value])

## Artist information, searching, and browsing

In [1]:
# Free-text artist search; the response is not stored or displayed.
musicbrainzngs.search_artists("radiohead")
#musicbrainzngs.search_recordings("bennie and the jets")

# Direct lookup by id, additionally requesting releases and ratings.
radiohead_id = "a74b1b7f-71a5-4011-9441-d0b5e4122711"
radiohead_result = musicbrainzngs.get_artist_by_id(
    radiohead_id,
    includes=["releases", "ratings"],
)
radiohead_artist = radiohead_result["artist"]

# Show a few fields of the returned artist record, one per line.
print("name:\t\t%s" % (radiohead_artist["name"],))
print("rating:\t\t%s" % (radiohead_artist["rating"],))
print("release 0:\t%s" % (radiohead_artist["release-list"][0],))

# dictionary of recordings for radiohead
# musicbrainzngs.browse_recordings(radiohead_id)

# "artist" can include the following information:
# recordings, releases, release-groups, works, various-artists,
# discids, media, isrcs, aliases, annotation, area-rels, artist-rels,
# label-rels, place-rels, event-rels, recording-rels, release-rels,
# release-group-rels, series-rels, url-rels, work-rels, instrument-rels,
# tags, user-tags, ratings, user-ratings

NameError: name 'musicbrainzngs' is not defined

## Cover art information

In [42]:
# Release whose cover art is looked up in this cell.
cover_release_id = "081e7033-5282-4bd7-9963-e64edf8c693a"

# downloads the front cover art of a release
# musicbrainzngs.get_image_front(cover_release_id)

# gets the list of cover art associated with a release
# (bare last expression, so the notebook displays the returned value)
musicbrainzngs.get_image_list(cover_release_id)

{u'images': [{u'approved': True,
   u'back': False,
   u'comment': u'',
   u'edit': 35299266,
   u'front': True,
   u'id': 11628918400,
   u'image': u'http://coverartarchive.org/release/081e7033-5282-4bd7-9963-e64edf8c693a/11628918400.jpg',
   u'thumbnails': {u'large': u'http://coverartarchive.org/release/081e7033-5282-4bd7-9963-e64edf8c693a/11628918400-500.jpg',
    u'small': u'http://coverartarchive.org/release/081e7033-5282-4bd7-9963-e64edf8c693a/11628918400-250.jpg'},
   u'types': [u'Front']}],
 u'release': u'https://musicbrainz.org/release/081e7033-5282-4bd7-9963-e64edf8c693a'}

# Training

In [12]:
# Function for k-fold cross validation using MAE (mean absolute error)
def kfold(k, model, data):
    """Estimate a model's mean absolute error with k-fold cross validation.

    Args:
        k: number of folds, passed to KFold as n_splits.
        model: object exposing fit(X, Y) and predict(X); the same object is
            re-fitted on every fold.
        data: 2-D array-like whose last column is the target and whose other
            columns are the features. It is converted with np.asarray, so a
            plain list of rows works as well as an ndarray.

    Returns:
        The mean of the per-fold MAE values. The running list of fold MAEs is
        printed after each fold.
    """
    data = np.asarray(data)
    kf = KFold(n_splits=k)
    maes = []
    for train_fold_index, validate_fold_index in kf.split(data):
        train_fold = data[train_fold_index]
        test_fold = data[validate_fold_index]
        X_train_fold = train_fold[:, :-1]
        Y_train_fold = train_fold[:, -1]
        X_test_fold = test_fold[:, :-1]
        Y_test_fold = test_fold[:, -1]

        model.fit(X_train_fold, Y_train_fold)
        predictions = model.predict(X_test_fold)

        # Vectorised MAE; ravel aligns an (n, 1) prediction column with the
        # 1-D target so the subtraction stays element-wise.
        mae = np.mean(np.abs(np.ravel(predictions) - np.ravel(Y_test_fold)))
        maes.append(mae)
        print("MAEs: ", maes)
    return np.mean(maes)

# Making Predictions

In [76]:
# For kfold: make predictions on partial training data (using user median criteria)
# Y_hat maps the 1-based position of each data row of train_file to its
# prediction, the same keying Y_actual uses for the true play counts.
Y_hat = {}
line_id = 1
with open(train_file, 'r', newline='') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)  # the first row is skipped

    for row in train_csv:
        user = row[0]

        # modify this part if using different criteria
        if user in user_medians:
            Y_hat[line_id] = user_medians[user]
        else:
            # Fall back to the global median. A stray `break` used to sit
            # here: it made this fallback unreachable and stopped predicting
            # at the first unknown user. The message now names the user of
            # the current row instead of a stale `id` from another cell.
            print("User", user, "not in training data.")
            Y_hat[line_id] = global_median
        line_id += 1

In [78]:
# Mean absolute error over every predicted row. Y_hat is keyed 1..N, so the
# keys themselves are iterated; range(1, len(Y_hat)) stopped at N - 1 and left
# the last row out of the score.
np.mean([abs(Y_hat[i] - Y_actual[i]) for i in Y_hat])

129.34241791969438

In [77]:
# c = 0
# for k, v in Y_hat.items():
#     c += 1
#     if (c > 8):
#         break
#     print(k)
#     print(v)