In [1]:
import requests as req
import pandas as pd
import xmltodict
from incf.countryutils import transformations
import numpy as np
from collections import defaultdict
import time
import pickle

In [2]:
# Load the three CSV inputs (paths are relative to the working directory).
artists_data, profiles_data, train = (
    pd.read_csv(csv_name)
    for csv_name in ('artists.csv', 'profiles.csv', 'train.csv')
)

In [3]:
# Country names that are resolved locally instead of through
# incf.countryutils. Built once at definition time rather than on every call,
# because get_region() is invoked once per listener row.
# TODO Map with more granularity. Especially countries in Asia that are
# really middle eastern along with edge cases like South Africa
_REGION_OVERRIDES = {
    'Antarctica': 'Other',
    'Bouvet Island': 'Other',
    'British Indian Ocean Territory': 'Other',
    'Congo, the Democratic Republic of the': 'Africa',
    "Cote D'Ivoire": 'Africa',
    'Heard Island and Mcdonald Islands': 'Oceania',
    'Iran, Islamic Republic of': 'Asia',
    "Korea, Democratic People's Republic of": 'Other',
    'Korea, Republic of': 'Asia',
    'Kyrgyzstan': 'Asia',
    'Micronesia, Federated States of': 'Oceania',
    'Palestinian Territory, Occupied': 'Asia',
    'Pitcairn': 'Other',
    'Slovakia': 'Europe',
    'Svalbard and Jan Mayen': 'Europe',
    'Tanzania, United Republic of': 'Africa',
    'United Kingdom': 'Europe',
    'United States': 'North America',
    'Viet Nam': 'Asia',
    'Virgin Islands, British': 'North America',
    'Virgin Islands, U.s.': 'North America',
}


def get_region(country):
    """Map a country name to the coarse region used for listener tallies.

    Names listed in ``_REGION_OVERRIDES`` are resolved from that table; any
    other name is passed to ``transformations.cn_to_ctn``.

    Parameters
    ----------
    country : str
        Country name as it appears in the profile data.

    Returns
    -------
    str
        The region label. ``'Antarctica'`` is never returned: it is folded
        into ``'Other'``.

    Raises
    ------
    Whatever ``transformations.cn_to_ctn`` raises for a name it cannot
    resolve; this function does not catch it.
    """
    region = _REGION_OVERRIDES.get(country)
    if region is None:
        # Not overridden locally: defer to the library's country -> continent map.
        region = transformations.cn_to_ctn(country)

    # The library lookup can apparently answer 'Antarctica' for names that are
    # not in the override table (callers used to patch this up themselves).
    # The override table already treats Antarctica as 'Other', so normalise it
    # here to keep the set of returned regions closed.
    if region == 'Antarctica':
        region = 'Other'
    return region

In [12]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or NaN if the denominator is 0."""
    if not denominator:
        return float('nan')
    return numerator / float(denominator)


def extract_artistic_features(data, profiles=None):
    """Aggregate listener demographics and play counts per artist.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per (user, artist) pair; the columns ``'user'``, ``'artist'``
        and ``'plays'`` are read.
    profiles : pandas.DataFrame, optional
        User profiles; the columns ``'user'``, ``'age'``, ``'sex'`` and
        ``'country'`` are read. Defaults to the notebook-level
        ``profiles_data`` so existing single-argument calls keep working.
        If a user appears more than once, the first row is used.

    Returns
    -------
    collections.defaultdict
        Maps each artist to a ``defaultdict(int)`` with these entries:

        * ``'count'``: number of rows (listeners) for the artist
        * ``'total_plays'``: sum of ``plays``
        * ``'age_count'``, ``'sum_of_ages'``, ``'average_age'``
        * ``'gender_count'``, one tally per observed ``sex`` value, and
          ``'prop_female'`` / ``'prop_male'`` (computed from the ``'f'`` and
          ``'m'`` tallies)
        * one entry per region name in the list below, holding the *fraction*
          of the artist's listeners from that region

        ``'average_age'``, ``'prop_female'`` and ``'prop_male'`` are NaN when
        no listener of the artist has the corresponding field filled in
        (previously this raised ZeroDivisionError).

    Raises
    ------
    KeyError
        If ``data`` references a user that is absent from ``profiles``.

    Notes
    -----
    Prints the elapsed wall-clock time in seconds as a side effect.
    ``time.time()`` is used because ``time.clock()`` no longer exists on
    Python 3.8+.
    """
    if profiles is None:
        profiles = profiles_data

    regions = ['Africa', 'Asia', 'Europe', 'North America', 'Oceania',
               'Other', 'South America']

    start = time.time()

    # user -> (age, sex, country); setdefault keeps the first profile row
    # seen for a user.
    users = {}
    for user, age, sex, cn in zip(profiles['user'], profiles['age'],
                                  profiles['sex'], profiles['country']):
        users.setdefault(user, (age, sex, cn))

    artistic_features = defaultdict(dict)

    for artist, rows in data.groupby('artist'):

        features = defaultdict(int)
        for fan, plays in zip(rows['user'], rows['plays']):

            age, sex, cn = users[fan]

            # Tally of number of listeners in each major region
            region = get_region(cn)
            if region == 'Antarctica':
                # get_region has been seen to return this; bucket it as 'Other'
                # so it is covered by the normalisation over `regions` below.
                region = 'Other'
            features[region] += 1

            # Listeners by gender (x == x is False only for NaN, i.e. missing)
            if sex == sex:
                features[sex] += 1
                features['gender_count'] += 1

            # To calculate average age of listeners
            if age == age:
                features['age_count'] += 1
                features['sum_of_ages'] += age

            # Total listeners
            features['count'] += 1

            # Total plays
            features['total_plays'] += plays

        features['average_age'] = _safe_ratio(features['sum_of_ages'],
                                              features['age_count'])
        features['prop_female'] = _safe_ratio(features['f'],
                                              features['gender_count'])
        features['prop_male'] = _safe_ratio(features['m'],
                                            features['gender_count'])

        # Convert the raw regional tallies into fractions of all listeners.
        for r in regions:
            features[r] = _safe_ratio(features[r], features['count'])

        artistic_features[artist] = features

    print(time.time() - start)
    return artistic_features

In [14]:
# Build the per-artist listener features from the rows loaded from train.csv.
# The call also prints its own elapsed time in seconds (the number shown below).
artistic_features = extract_artistic_features(train)

21.27849


In [16]:
# Persist the features for later notebooks. The file is opened in binary mode
# (pickle writes bytes on Python 3, and text mode can mangle the stream on
# Windows), and the `with` block guarantees the handle is flushed and closed.
with open('artistic_features.pkl', 'wb') as pkl_file:
    pickle.dump(artistic_features, pkl_file)