In [97]:
import requests as req
import pandas as pd
import xmltodict
from incf.countryutils import transformations
import numpy as np
from collections import defaultdict
import time
import pickle

In [2]:
# Fetch one sample artist record (with tags and ratings) from the MusicBrainz
# web service.  The timeout keeps a stalled connection from hanging the kernel,
# and raise_for_status() surfaces an HTTP error here rather than as a confusing
# parse/KeyError failure when the body is parsed in the next cell.
resp = req.get('https://musicbrainz.org/ws/2/artist/cc197bad-dc9c-440d-a5b5-d52ba2e14234?inc=tags%20ratings',
               timeout=30)
resp.raise_for_status()

In [3]:
# Parse the XML response body into nested dicts and keep only the `artist`
# element found under the top-level `metadata` element.
artist = xmltodict.parse(resp.text)['metadata']['artist']

In [4]:
artist

OrderedDict([(u'@type-id', u'e431f5f6-b5d2-343d-8b36-72607fffb74b'),
             (u'@type', u'Group'),
             (u'@id', u'cc197bad-dc9c-440d-a5b5-d52ba2e14234'),
             (u'name', u'Coldplay'),
             (u'sort-name', u'Coldplay'),
             (u'isni-list', OrderedDict([(u'isni', u'000000011551394X')])),
             (u'country', u'GB'),
             (u'area',
              OrderedDict([(u'@id', u'8a754a16-0027-3a29-b6d7-2b40ea0481ed'),
                           (u'name', u'United Kingdom'),
                           (u'sort-name', u'United Kingdom'),
                           (u'iso-3166-1-code-list',
                            OrderedDict([(u'iso-3166-1-code', u'GB')]))])),
             (u'begin-area',
              OrderedDict([(u'@id', u'f03d09b3-39dc-4083-afd6-159e3f0d462f'),
                           (u'name', u'London'),
                           (u'sort-name', u'London')])),
             (u'life-span', OrderedDict([(u'begin', u'1996-09')])),
             

### Features for Artists
- Number of users who've listened to them
- Total plays
- Plays per listener
- Average age of listeners
- Other artists shared on playlists 
- Counts of users per country or region
- Genre (MB)
- Last year active (MB)
- Country (MB)
- Type (MB)
- Rating (MB)

In [5]:
# Load the input tables from the working directory.
# NOTE(review): artists.csv is not used in the visible cells -- presumably it
# holds per-artist metadata; confirm.
artists_data = pd.read_csv('artists.csv')
# Listener profiles; read below via the `user`, `age`, `sex` and `country` columns.
profiles_data = pd.read_csv('profiles.csv')
# Play counts; read below via the `user`, `artist` and `plays` columns.
train = pd.read_csv('train.csv')

In [6]:
# Hand-maintained country-name -> region overrides that take precedence over
# ``transformations.cn_to_ctn``.  Built once at definition time instead of on
# every call, since get_region() is invoked once per listener row.
# NOTE(review): presumably these are names the library cannot resolve (or
# resolves undesirably) -- confirm.  Iran and North Korea map to their own
# labels rather than to a continent.
_REGION_OVERRIDES = {
    'Antarctica': 'Other',
    'Bouvet Island': 'Other',
    'British Indian Ocean Territory': 'Other',
    'Congo, the Democratic Republic of the': 'Africa',
    "Cote D'Ivoire": 'Africa',
    'Heard Island and Mcdonald Islands': 'Oceania',
    'Iran, Islamic Republic of': 'Iran',
    "Korea, Democratic People's Republic of": 'North Korea',
    'Korea, Republic of': 'Asia',
    'Kyrgyzstan': 'Asia',
    'Micronesia, Federated States of': 'Oceania',
    'Palestinian Territory, Occupied': 'Asia',
    'Pitcairn': 'Other',
    'Slovakia': 'Europe',
    'Svalbard and Jan Mayen': 'Europe',
    'Tanzania, United Republic of': 'Africa',
    'United Kingdom': 'Europe',
    'United States': 'North America',
    'Viet Nam': 'Asia',
    'Virgin Islands, British': 'North America',
    'Virgin Islands, U.s.': 'North America',
}


def get_region(country):
    """Return the region label used to bucket listeners from ``country``.

    Parameters
    ----------
    country : str
        Country name as it appears in the profile data.

    Returns
    -------
    The matching value from the override table when ``country`` is listed
    there; otherwise whatever ``transformations.cn_to_ctn(country)`` returns.
    Any exception raised by that library call propagates to the caller.
    """
    # TODO Map with more granularity. Especially countries in Asia that are
    # really middle eastern along with edge cases like South Africa
    if country in _REGION_OVERRIDES:
        return _REGION_OVERRIDES[country]
    return transformations.cn_to_ctn(country)

In [103]:
# Region labels reported for every artist, even when no listener falls into
# them (their share is then 0.0).
_BASELINE_REGIONS = ('Africa', 'Antarctica', 'Asia', 'Europe', 'North America',
                     'North Korea', 'Oceania', 'Other', 'South America')


def _share(part, whole):
    """Return ``part / whole`` as a float, or NaN when ``whole`` is zero."""
    return part / float(whole) if whole else float('nan')


def extract_artistic_features(data):
    """Aggregate per-artist listener statistics from a play-count table.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with ``user``, ``artist`` and ``plays`` columns.

    Returns
    -------
    defaultdict
        Maps each artist id to a ``defaultdict(int)`` holding:

        * ``count`` -- number of rows (listeners) for the artist
        * ``total_plays`` -- sum of ``plays`` over those rows
        * ``age_count`` / ``sum_of_ages`` / ``average_age`` -- listeners with
          a known age, the sum of their ages, and the mean (NaN if none)
        * ``gender_count`` plus one tally per observed ``sex`` value, and
          ``prop_female`` / ``prop_male`` computed from the ``'f'`` / ``'m'``
          tallies (NaN if no listener has a known sex)
        * one entry per region label: the share of the artist's listeners in
          that region.  Every label in ``_BASELINE_REGIONS`` is present, plus
          any other label ``get_region`` returned for this artist.

    Notes
    -----
    Reads the module-level ``profiles_data`` frame (columns ``user``, ``age``,
    ``sex``, ``country``) and calls ``get_region``.  Only the first profile
    row of each user is used.  A user missing from ``profiles_data`` raises
    ``KeyError``.  Prints the elapsed wall-clock seconds before returning.
    """
    # time.clock() no longer exists on current Python 3; time.time() is
    # available on both Python 2 and 3.
    started = time.time()

    artistic_features = defaultdict(dict)

    # user id -> index labels of that user's profile rows.  Because the index
    # is (age, sex, country), each label is such a tuple.
    users = profiles_data.set_index(['age', 'sex', 'country']).groupby('user').groups
    # artist id -> (user, plays) index labels of the rows for that artist.
    # Grouping by the scalar column name (as for 'user' above) keeps the keys
    # plain artist ids.
    fan_groups = data.set_index(['user', 'plays']).groupby('artist').groups

    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for artist, fans in fan_groups.items():

        feats = artistic_features[artist] = defaultdict(int)
        regions_seen = set()

        for fan, plays in fans:

            age, sex, cn = users[fan][0]

            # Tally of number of listeners in each major region
            region = get_region(cn)
            regions_seen.add(region)
            feats[region] += 1

            # Listeners by gender.  NaN != NaN, so `x == x` is False exactly
            # when the value is missing.
            if sex == sex:
                feats[sex] += 1
                feats['gender_count'] += 1

            # To calculate average age of listeners
            if age == age:
                feats['age_count'] += 1
                feats['sum_of_ages'] += age

            # Total listeners
            feats['count'] += 1

            # Total plays
            feats['total_plays'] += plays

        # _share() yields NaN instead of raising ZeroDivisionError when no
        # listener of this artist has a known age / sex.
        feats['average_age'] = _share(feats['sum_of_ages'], feats['age_count'])
        feats['prop_female'] = _share(feats['f'], feats['gender_count'])
        feats['prop_male'] = _share(feats['m'], feats['gender_count'])

        # Turn region tallies into proportions.  Labels outside the baseline
        # list (get_region can return e.g. 'Iran') are normalised too, so no
        # raw count is left mixed in with the shares.
        listeners = float(feats['count'])
        for region in regions_seen.union(_BASELINE_REGIONS):
            feats[region] = feats[region] / listeners

    print(time.time() - started)
    return artistic_features

In [104]:
# Build the per-artist listener features from the training play counts.
# The function prints its elapsed time, which is the number shown below.
artistic_features = extract_artistic_features(train)

20.98387


In [105]:
# Persist the computed features to disk.  pickle expects a binary file
# object, and the `with` block guarantees the handle is flushed and closed
# even if serialisation fails part-way.
with open('artistic_features.pkl', 'wb') as features_file:
    pickle.dump(artistic_features, features_file)

In [106]:
artistic_features

defaultdict(dict,
            {'23a03e33-a603-404e-bcbf-2c00159d7067': defaultdict(int,
                         {'Africa': 0.005048943843379701,
                          'Antarctica': 0.00010303967027305513,
                          'Asia': 0.02555383822771767,
                          'Europe': 0.6963420917053066,
                          'North America': 0.16609994848016488,
                          'North Korea': 0.00010303967027305513,
                          'Oceania': 0.031117980422462648,
                          'Other': 0.000824317362184441,
                          'South America': 0.07480680061823802,
                          'age_count': 8254,
                          'average_age': 22.035740247152894,
                          'count': 9705,
                          'f': 1631,
                          'gender_count': 9080,
                          'm': 7449,
                          'prop_female': 0.17962555066079294,
                          'prop_male': 