In [12]:
import numpy as np
import csv
from sklearn.cluster import KMeans
import pandas as pd
import pickle
from sklearn import preprocessing
from collections import Counter, defaultdict
import time
from incf.countryutils import transformations

In [2]:
# Raw inputs: profiles.csv supplies the user/age/sex/country columns and
# train.csv the user/artist/plays columns that extract_user_features reads.
profiles_data = pd.read_csv(filepath_or_buffer='profiles.csv')
train = pd.read_csv(filepath_or_buffer='train.csv')

In [8]:
# NOTE(security): unpickling can execute arbitrary code -- only load
# artist_lookup.pkl when it comes from a trusted source.
# Pickle streams are bytes, so open in binary mode; the context manager closes
# the handle deterministically instead of leaving it to garbage collection.
with open('artist_lookup.pkl', 'rb') as artist_file:
    artist_lookup = pickle.load(artist_file)

In [6]:
def get_region(country):
    """Return the region label for a country name.

    Names present in the local override table are answered from that table.
    Any other name is delegated to ``transformations.cn_to_ctn`` (presumably
    returning a continent name -- confirm against incf.countryutils).
    """
    # TODO Map with more granularity. Especially countries in Asia that are
    # really middle eastern along with edge cases like South Africa

    # Hand-maintained overrides, grouped by the region they resolve to.
    region_overrides = {
        # Africa
        'Congo, the Democratic Republic of the': 'Africa',
        "Cote D'Ivoire": 'Africa',
        'Tanzania, United Republic of': 'Africa',
        # Asia
        'Korea, Republic of': 'Asia',
        'Kyrgyzstan': 'Asia',
        'Palestinian Territory, Occupied': 'Asia',
        'Viet Nam': 'Asia',
        # Europe
        'Slovakia': 'Europe',
        'Svalbard and Jan Mayen': 'Europe',
        'United Kingdom': 'Europe',
        # North America
        'United States': 'North America',
        'Virgin Islands, British': 'North America',
        'Virgin Islands, U.s.': 'North America',
        # Oceania
        'Heard Island and Mcdonald Islands': 'Oceania',
        'Micronesia, Federated States of': 'Oceania',
        # Other
        'Antarctica': 'Other',
        'Bouvet Island': 'Other',
        'British Indian Ocean Territory': 'Other',
        'Iran, Islamic Republic of': 'Other',
        "Korea, Democratic People's Republic of": 'Other',
        'Pitcairn': 'Other',
    }

    if country not in region_overrides:
        return transformations.cn_to_ctn(country)
    return region_overrides[country]

In [19]:
def extract_user_features(data, artist_lookup, profiles=None):
    """Build a per-user feature dictionary from listening records.

    Parameters
    ----------
    data : pandas.DataFrame
        Listening records with ``user``, ``artist`` and ``plays`` columns.
    artist_lookup : mapping
        Maps every artist found in ``data`` to a dict providing an iterable
        under ``'genres'`` and a number under ``'popularity'``.
    profiles : pandas.DataFrame, optional
        Demographics with ``user``, ``age``, ``sex`` and ``country`` columns.
        When omitted, the notebook-level ``profiles_data`` frame is used.

    Returns
    -------
    collections.defaultdict
        ``user -> features``. Each features dict holds ``sex`` (``'u'`` when
        missing), ``age`` (``0`` when missing), ``region``,
        ``average_popularity``, ``num_artists`` (number of listening rows for
        the user), ``total_plays`` and ``favorite_genres`` (a ``Counter`` of
        how often each genre occurs across the user's artists).

    Raises
    ------
    KeyError
        If a user in ``data`` has no row in ``profiles`` or an artist is
        missing from ``artist_lookup``.

    Side effect: prints the elapsed wall-clock seconds.
    """
    # time.clock() no longer exists on Python >= 3.8; time.time() is portable.
    start = time.time()

    if profiles is None:
        # Backward-compatible fallback to the frame loaded by the notebook.
        profiles = profiles_data

    # With the demographic columns moved into the index, `.groups` yields for
    # each user the (age, sex, country) index labels of that user's rows.
    demographic_lookup = profiles.set_index(['age', 'sex', 'country']).groupby('user').groups
    # Same approach for the listening rows: user -> (artist, plays) labels.
    # Grouping by the scalar column name keeps the keys plain user ids.
    playlists = data.set_index(['artist', 'plays']).groupby('user').groups

    user_lookup = defaultdict(dict)
    for user, playlist in playlists.items():
        # Only the first profile row of a user is consulted.
        age, sex, cn = demographic_lookup[user][0]

        region = get_region(cn)
        if region == 'Antarctica':
            region = 'Other'

        favorite_genres = Counter()
        total_plays = 0
        total_popularity = 0
        for artist, plays in playlist:
            artist_info = artist_lookup[artist]
            total_plays += plays
            total_popularity += artist_info['popularity']
            for genre in artist_info['genres']:
                favorite_genres[genre] += 1

        num_artists = len(playlist)
        user_lookup[user] = {
            # `x == x` is False only for NaN, i.e. a missing value.
            'sex': sex if sex == sex else 'u',
            'age': age if age == age else 0,
            'region': region,
            # float() keeps this a true division on Python 2 as well.
            'average_popularity': total_popularity / float(num_artists),
            'num_artists': num_artists,
            'total_plays': total_plays,
            'favorite_genres': favorite_genres,
        }

    print(time.time() - start)
    return user_lookup

In [25]:
# Build the per-user feature table from the training listening records.
user_lookup = extract_user_features(data=train, artist_lookup=artist_lookup)

33.639834


In [28]:
# Persist the feature table. Pickle streams are bytes, so write in binary
# mode; the context manager guarantees the file is flushed and closed.
with open('user_lookup.pkl', 'wb') as user_file:
    pickle.dump(user_lookup, user_file)