In [97]:
import requests as req
import pandas as pd
import xmltodict
from incf.countryutils import transformations
import numpy as np
from collections import defaultdict
import time
import pickle

In [2]:
resp = req.get('https://musicbrainz.org/ws/2/artist/cc197bad-dc9c-440d-a5b5-d52ba2e14234?inc=tags%20ratings')

In [3]:
artist = xmltodict.parse(resp.text)['metadata']['artist']

In [4]:
artist

OrderedDict([(u'@type-id', u'e431f5f6-b5d2-343d-8b36-72607fffb74b'),
             (u'@type', u'Group'),
             (u'@id', u'cc197bad-dc9c-440d-a5b5-d52ba2e14234'),
             (u'name', u'Coldplay'),
             (u'sort-name', u'Coldplay'),
             (u'isni-list', OrderedDict([(u'isni', u'000000011551394X')])),
             (u'country', u'GB'),
             (u'area',
              OrderedDict([(u'@id', u'8a754a16-0027-3a29-b6d7-2b40ea0481ed'),
                           (u'name', u'United Kingdom'),
                           (u'sort-name', u'United Kingdom'),
                           (u'iso-3166-1-code-list',
                            OrderedDict([(u'iso-3166-1-code', u'GB')]))])),
             (u'begin-area',
              OrderedDict([(u'@id', u'f03d09b3-39dc-4083-afd6-159e3f0d462f'),
                           (u'name', u'London'),
                           (u'sort-name', u'London')])),
             (u'life-span', OrderedDict([(u'begin', u'1996-09')])),
             

### Features for Artists
- Number of users who've listened to them
- Total plays
- Plays per listener
- Average age of listeners
- Other artists shared on playlists 
- Counts of users per country or region
- Genre (MB)
- Last year active (MB)
- Country (MB)
- Type (MB)
- Rating (MB)

In [5]:
artists_data = pd.read_csv('artists.csv')
profiles_data = pd.read_csv('profiles.csv')
train = pd.read_csv('train.csv')

In [6]:
def get_region(country):
    
    # TODO Map with more granularity. Especially countries in Asia that are 
    # really middle eastern along with edge cases like South Africa

    cn_to_ctn = {
        'Antarctica': 'Other',
        'Bouvet Island': 'Other',
        'British Indian Ocean Territory': 'Other',
        'Congo, the Democratic Republic of the': 'Africa',
        "Cote D'Ivoire": 'Africa',
        'Heard Island and Mcdonald Islands': 'Oceania',
        'Iran, Islamic Republic of': 'Asia',
        "Korea, Democratic People's Republic of": 'North Korea',
        'Korea, Republic of': 'Asia',
        'Kyrgyzstan': 'Asia',
        'Micronesia, Federated States of': 'Oceania',
        'Palestinian Territory, Occupied': 'Asia',
        'Pitcairn': 'Other',
        'Slovakia': 'Europe',
        'Svalbard and Jan Mayen': 'Europe',
        'Tanzania, United Republic of': 'Africa',
        'United Kingdom': 'Europe',
        'United States': 'North America',
        'Viet Nam': 'Asia',
        'Virgin Islands, British': 'North America',
        'Virgin Islands, U.s.': 'North America',
    }
    
    return cn_to_ctn[country] if country in cn_to_ctn else transformations.cn_to_ctn(country)

In [103]:
def extract_artistic_features(data):
    
    start = time.clock()
    
    artistic_features = defaultdict(dict)
    iter = 1
    
    users = profiles_data.set_index(['age', 'sex', 'country']).groupby('user').groups
    fan_groups = data.set_index(['user', 'plays']).groupby(['artist']).groups
    
    for artist, fans in fan_groups.iteritems():
        
        artistic_features[artist] = defaultdict(int)        
        for fan, plays in fans:
            
            age, sex, cn = users[fan][0]
            
            # Tally of number of listeners in each major region
            region = get_region(cn)
            artistic_features[artist][region] += 1

            # Listeners by gender
            if sex == sex:  # Sex is defined
                artistic_features[artist][sex] += 1
                artistic_features[artist]['gender_count'] += 1

            # To calculate average age of listeners 
            if age == age: 
                artistic_features[artist]['age_count'] += 1
                artistic_features[artist]['sum_of_ages'] += age

            # Total listeners
            artistic_features[artist]['count'] += 1

            # Total plays
            artistic_features[artist]['total_plays'] += plays  
            
        artistic_features[artist]['average_age'] = artistic_features[artist]['sum_of_ages'] / float(artistic_features[artist]['age_count'])
        artistic_features[artist]['prop_female'] = artistic_features[artist]['f'] / float(artistic_features[artist]['gender_count'])
        artistic_features[artist]['prop_male'] = artistic_features[artist]['m'] / float(artistic_features[artist]['gender_count'])
        for r in ['Africa', 'Antarctica', 'Asia', 'Europe', 'North America', 
                  'North Korea', 'Oceania', 'Other', 'South America']:
            artistic_features[artist][r] = artistic_features[artist][r] / float(artistic_features[artist]['count'])
        
    print(time.clock() - start)
    return artistic_features

In [104]:
artistic_features = extract_artistic_features(train)

20.98387


In [105]:
pickle.dump(artistic_features, open('artistic_features.pkl', 'w'))

### TODO
- Representation of whether or not an artist is 'popular' (maybe just by number of play?) 
- Should the above be conditioning on characteristics of the user?

#### Feature interactions 
- Artist is similar by Genre to other artists the user listens to 
- Artist appears on playlists with other songs the user listens to how many times
- User is close to the average age of the listeners for the artist 
- User is in majority gender for the artist 
- User is in same region as artist... Not sure we have this data.
- User typically listens to artists in same region

In [107]:
pd.DataFrame(artistic_features)

Unnamed: 0,000d90ec-d64c-48a1-b775-e726fd240e9f,000fc734-b7e1-4a01-92d1-f544261b43f5,0019749d-ee29-4a5f-ab17-6bfa11deb969,0039c7ae-e1a7-4a7d-9b49-0cbc716821a6,004e5eed-e267-46ea-b504-54526f1f377d,00565b31-14a3-4913-bd22-385eb40dd13c,00a9f935-ba93-4fc8-a33a-993abe9c936b,00eeed6b-5897-4359-8347-b8cd28375331,0103c1cc-4a09-4a5d-a344-56ad99a77193,0110e63e-0a9b-4818-af8e-41e180c20b9a,...,ff6e677f-91dd-4986-a174-8db0474b1799,ff7f80cd-05c2-4068-a00e-fbfbd453d049,ff865aa0-4603-4f79-ae8b-8735332e2cfa,ff95eb47-41c4-4f7f-a104-cdc30f02e872,ff9deaae-da4f-42b7-a19e-36fedd3fc706,ffb18e19-64a4-4a65-b4ce-979e00c3c69d,ffb2d3e3-a4cc-48cf-8fb0-f2f846e9d7b9,ffb390b8-8df4-4b72-97d1-7b2fc008a452,ffe16bba-4d84-409b-8f22-5242c60b930f,ffe9ec08-6b6b-4993-9394-e280b429dbfd
Africa,0.001961,0.00267,0.002899,0.00487904,0.001899,0.005155,0.005475628,0.003842,0.008381503,0.00754539,...,0.007291952,0.002273,0.012203,0.006423,0.003683,0.0059,0.002220577,0.00206,0.0,0.010152
Antarctica,0.0,0.0,0.0,0.0002710578,0.0,0.0,0.0005368263,0.000384,0.0001445087,0.0004715869,...,0.000455747,0.0,0.00061,0.0,0.000921,0.000492,0.0003700962,0.0,0.0,0.0
Asia,0.017647,0.054458,0.037681,0.03896456,0.075949,0.076031,0.04176508,0.057626,0.06560694,0.04197123,...,0.04119953,0.003409,0.075046,0.039342,0.036832,0.07473,0.02516654,0.028846,0.0373,0.075417
Europe,0.868627,0.532835,0.736232,0.4034018,0.686709,0.606959,0.7614344,0.801767,0.6367052,0.4656921,...,0.5230152,0.971591,0.352654,0.45845,0.292818,0.444444,0.9137676,0.326236,0.179396,0.651922
Iran,,,,2.0,3.0,,8.0,2.0,6.0,1.0,...,2.0,,2.0,,,,,,,1.0
North America,0.078431,0.312867,0.15942,0.4668293,0.109494,0.203608,0.1091905,0.063388,0.1635838,0.3864655,...,0.3152858,0.0125,0.440513,0.431955,0.540516,0.427729,0.04219097,0.482143,0.740675,0.218274
North Korea,0.0,0.000534,0.0,6.776445e-05,0.0,0.0,0.0,0.0,0.0001445087,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Oceania,0.017647,0.029899,0.04058,0.0367961,0.012658,0.018041,0.01793,0.011525,0.02485549,0.03088894,...,0.03099079,0.002273,0.052471,0.038137,0.064457,0.022124,0.004071058,0.054945,0.035524,0.032632
Other,0.001961,0.001068,0.0,0.0008131734,0.001266,0.001289,0.0006441915,0.001153,0.0005780347,0.001414761,...,0.001367241,0.0,0.00122,0.000401,0.000921,0.00295,0.0003700962,0.0,0.001776,0.00145
South America,0.013725,0.06567,0.023188,0.0478417,0.110127,0.088918,0.06216448,0.059547,0.09913295,0.06531478,...,0.08021147,0.007955,0.064063,0.025291,0.059853,0.021632,0.01184308,0.105769,0.005329,0.009427
