In [1]:
# Feature extraction
import musicbrainzngs

# Data processing
import numpy as np
import csv
import pandas as pd
# import matplotlib.pyplot as plt

# Models
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import BaggingRegressor

# PCA and k-fold validation
# from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

# Data Processing

In [2]:
train_file = 'musicrec-data/train.csv'
test_file  = 'musicrec-data/test.csv'
# change this to reflect different solutions
soln_file  = 'sex_median.csv'

# Load the training data.
train_data = {}
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)
    for row in train_csv:
        user   = row[0]
        artist = row[1]
        plays  = int(row[2])
    
        if not user in train_data:
            train_data[user] = {}
        
        train_data[user][artist] = plays

In [8]:
# Load the test data.
test_data = {}
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)
    for row in test_csv:
        user   = row[0]
        artist = row[1]
    
        if not user in test_data:
            test_data[user] = {}
        
        test_data[user][artist] = plays

## Artists overlap

In [3]:
artists_df = pd.read_csv('musicrec-data/artists.csv')

In [4]:
artists_df.head()

Unnamed: 0,artist,name
0,03098741-08b3-4dd7-b3f6-1b0bfa2c879c,Liars
1,69c4cc43-8163-41c5-ac81-30946d27bb69,CunninLynguists
2,7a2e6b55-f149-4e74-be6a-30a1b1a387bb,The Desert Sessions
3,7002bf88-1269-4965-a772-4ba1e7a91eaa,Glenn Gould
4,dbf7c761-e332-467b-b4d9-aafe06bbcf8f,G. Love & Special Sauce


In [5]:
train_artists = []
for user, artist_dict in train_data.iteritems():
    for artist in artist_dict.keys():
        train_artists.append(artist)

In [6]:
train_artists = set(train_artists)

In [24]:
test_artists = []
for user, artist_dict in test_data.iteritems():
    for artist in artist_dict.keys():
        test_artists.append(artist)

In [26]:
test_artists = set(test_artists)

In [40]:
len(set.intersection(test_artists, set(artists_df['artist'].values)))

0

In [27]:
len(test_artists)

233286

In [29]:
len(train_artists)

2000

## Append column giving user sex

In [45]:
user_df = pd.read_csv('musicrec-data/profiles.csv')

In [48]:
user_df.head()

Unnamed: 0,user,sex,age,country
0,fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,f,25.0,Sweden
1,5909125332c108365a26ccf0ee62636eee08215c,m,29.0,Iceland
2,d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,m,30.0,United States
3,63268cce0d68127729890c1691f62d5be5abd87c,m,21.0,Germany
4,02871cd952d607ba69b64e2e107773012c708113,m,24.0,Netherlands


In [79]:
sex_dict = {row['user']:row['sex'] for (ind, row) in user_df.iterrows()}

In [103]:
male_plays = []
female_plays = []
for user, user_data in train_data.iteritems():
    try:
        sex = sex_dict[user]
    except KeyError:
        sex = 'u'
    for artist, plays in user_data.iteritems():
        if sex == 'm':
            male_plays.append(plays)
        elif sex == 'f':
            female_plays.append(plays)

In [104]:
male_median = np.median(np.array(male_plays))
female_median = np.median(np.array(female_plays))
nan_median = np.median(np.array(nan_plays))
print 'Male median:', np.median(np.array(male_plays))
print 'Female median:', np.median(np.array(female_plays))

Male median: 126.0
Female median: 103.0
NaN median: nan


In [105]:
print nan_plays

[]


# globalmedian.py

## Prediction wrapper function

In [2]:
def genre_weight(predict_file, alpha, user_genre_dict, artist_genre_dict, global_median, user_medians, test_flag=False, soln_file='dummy.csv'):
    with open(predict_file, 'r') as predict_fh:
        predict_csv = csv.reader(predict_fh, delimiter=',', quotechar='"')
        next(predict_csv, None)
        
        with open(soln_file, 'w') as soln_fh:
            soln_csv = csv.writer(soln_fh,
                                  delimiter=',',
                                  quotechar='"',
                                  quoting=csv.QUOTE_MINIMAL)
            soln_csv.writerow(['Id', 'plays'])
            predictions = {} # dictionary of id -> prediction
            for row in predict_csv:
                id     = row[0]
                user   = row[1]
                artist = row[2]
                genre = artist_genre_dict[artist]
                user_genre_avg_plays = user_genre_dict[user][genre][0] / user_genre_dict[user]

                # If artist has a genre
                if genre:
                    if user in user_medians:
                        predictions[id] = alpha * (user_genre_avg_plays) + (1 - alpha) * user_medians[user]
                    else
                        predictions[id] = alpha * (user_genre_avg_plays) + (1 - alpha) * global_median
                else:
                    if user in user_medians:
                        predictions[id] = user_medians[user]
                    else:
                        predictions[id] = global_median
                # If test flag, write prediction to soln_csv
                if test_flag:
                    soln_csv.writerow([id, predictions[id]])
            return predictions
            
            

In [5]:
# Predict via the median number of plays.

# Compute the global median.
plays_array = []
for user, user_data in train_data.iteritems():
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
global_median = np.median(np.array(plays_array))
print "global median:", global_median

# Write out test solutions.
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            id     = row[0]
            user   = row[1]
            artist = row[2]

            soln_csv.writerow([id, global_median])

global median: 118.0


# usermedian.py

In [88]:
# Predict via the user-specific median.
# If the user has no data, use the global median.
# Compute the global median and per-user median.
plays_array  = []
user_medians = {}
for user, user_data in train_data.iteritems():
    user_plays = []
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
        user_plays.append(plays)
    user_medians[user] = np.median(np.array(user_plays))
global_median = np.median(np.array(plays_array))

soln_file = 'user_median.csv'


In [None]:
# Write out test solutions.
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            id     = row[0]
            user   = row[1]
            artist = row[2]

            if user in user_medians:
                soln_csv.writerow([id, user_medians[user]])
            else:
                print "User", id, "not in training data."
                soln_csv.writerow([id, global_median])

## sexmedian

In [106]:
# Write out test solutions.
soln_file  = 'sex_median.csv'
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            id     = row[0]
            user   = row[1]
            artist = row[2]
            
            try:
                sex = sex_dict[user]
                if sex == 'm':
                    soln_csv.writerow([id, 0.5*(user_medians[user] + male_median)])
                elif sex == 'f':
                    soln_csv.writerow([id, 0.5*(user_medians[user] + female_median)])
                else:
                    soln_csv.writerow([id, 0.5*(user_medians[user] + global_median)])
            except KeyError:
                soln_csv.writerow([id, global_median])

In [107]:
check = pd.read_csv('sex_median.csv')

In [108]:
check.shape

(4154804, 2)

In [101]:
print np.unique(sex_dict.values())

['f' 'm' 'nan']


# Feature Extraction

## Playing around with musicbrainzngs

In [43]:
# must set user agent before any requests are made
musicbrainzngs.set_useragent(1,1)

## Artist information, searching, and browsing

In [99]:
musicbrainzngs.search_artists("radiohead")
#musicbrainzngs.search_recordings("bennie and the jets")

radiohead_id = "a74b1b7f-71a5-4011-9441-d0b5e4122711"
radiohead_result = musicbrainzngs.get_artist_by_id(radiohead_id,includes=["releases","ratings"])
radiohead_artist = radiohead_result["artist"]

print("name:\t\t%s" % radiohead_artist["name"])
print("rating:\t\t%s" % radiohead_artist["rating"])
print("release 0:\t%s" % radiohead_artist["release-list"][0])

# dictionary of recordings for radiohead
# musicbrainzngs.browse_recordings(radiohead_id)

# "artist" can include the following information:
# recordings, releases, release-groups, works, various-artists,
# discids, media, isrcs, aliases, annotation, area-rels, artist-rels,
# label-rels, place-rels, event-rels, recording-rels, release-rels,
# release-group-rels, series-rels, url-rels, work-rels, instrument-rels,
# tags, user-tags, ratings, user-ratings

name:		Radiohead
rating:		{'rating': '4.45', 'votes-count': '59'}
release 0:	{'status': 'Official', 'release-event-count': 1, 'title': 'Pablo Honey', 'country': 'XE', 'barcode': '077778140924', 'release-event-list': [{'date': '1993', 'area': {'sort-name': 'Europe', 'iso-3166-1-code-list': ['XE'], 'id': '89a675c2-3e37-3518-b83c-418bad59a85a', 'name': 'Europe'}}], 'packaging': 'Jewel Case', 'text-representation': {'language': 'eng', 'script': 'Latn'}, 'date': '1993', 'quality': 'normal', 'id': '06487940-c7d4-4af3-976d-0be796d686ce'}


## Cover art information

In [42]:
# gets the list of cover art associated with a release
musicbrainzngs.get_image_list("081e7033-5282-4bd7-9963-e64edf8c693a")

# downloads the front cover art of a release
# musicbrainzngs.get_image_front("081e7033-5282-4bd7-9963-e64edf8c693a")

{u'images': [{u'approved': True,
   u'back': False,
   u'comment': u'',
   u'edit': 35299266,
   u'front': True,
   u'id': 11628918400,
   u'image': u'http://coverartarchive.org/release/081e7033-5282-4bd7-9963-e64edf8c693a/11628918400.jpg',
   u'thumbnails': {u'large': u'http://coverartarchive.org/release/081e7033-5282-4bd7-9963-e64edf8c693a/11628918400-500.jpg',
    u'small': u'http://coverartarchive.org/release/081e7033-5282-4bd7-9963-e64edf8c693a/11628918400-250.jpg'},
   u'types': [u'Front']}],
 u'release': u'https://musicbrainz.org/release/081e7033-5282-4bd7-9963-e64edf8c693a'}

# Training

In [12]:
# Function for k-fold cross validation using MAE (mean absolute error)
def kfold(k, model, data):
    kf = KFold(n_splits=k)
    maes = []
    for train_fold_index, validate_fold_index in kf.split(data):
        train_fold = data[train_fold_index]
        test_fold = data[validate_fold_index]
        X_train_fold = train_fold[:, :-1]
        Y_train_fold = train_fold[:, -1]
        X_test_fold = test_fold[:, :-1]
        Y_test_fold = test_fold[:, -1]
        model.fit(X_train_fold, Y_train_fold)
        Y_hat = model.predict(X_test_fold)
        mae = np.mean([abs(Y_hat[i] - Y_test_fold[i]) for i in range(len(Y_test_fold))])
        maes.append(mae)
        print "MAEs: ", maes
    return np.mean(maes)

# Making Predictions