In [8]:
import numpy as np
import pandas as pd
import pickle
from collections import Counter, defaultdict
import time
import util

In [2]:
# Raw inputs, both read from the working directory:
#   train.csv    -> listening records (grouped by user/artist/plays further down)
#   profiles.csv -> per-user age / sex / country
train = pd.read_csv('train.csv')
profiles_data = pd.read_csv('profiles.csv')

In [3]:
# Load the precomputed artist table (indexed by artist id further down).
# SECURITY NOTE: unpickling can execute arbitrary code -- only load pickle
# files that were produced by this project.
# Pickle streams are binary, so open with 'rb'; the context manager makes
# sure the handle is closed once the object has been read.
with open('artist_lookup.pkl', 'rb') as artist_file:
    artist_lookup = pickle.load(artist_file)

In [27]:
def extract_user_features(data, artist_lookup, profiles=None,
                          average_age=None, average_plays=None):
    """Build a per-user feature dictionary from listening records.

    Parameters
    ----------
    data : pandas.DataFrame
        Listening records with ``user``, ``artist`` and ``plays`` columns.
    artist_lookup : mapping
        Maps an artist id to a dict holding ``'genres'`` (an iterable of
        genre names) and ``'popularity'`` (a number).
    profiles : pandas.DataFrame, optional
        User profiles with ``user``, ``age``, ``sex`` and ``country``
        columns.  Defaults to the notebook-level ``profiles_data``.
    average_age : number, optional
        Replacement for a missing age or one outside the 15-80 range.
        Defaults to ``util.AVERAGE_AGE``.
    average_plays : number, optional
        Denominator for the ``user_avg_sub_global_avg`` ratio.
        Defaults to ``util.AVERAGE_PLAYS``.

    Returns
    -------
    collections.defaultdict
        ``user -> feature dict`` with the keys ``sex``, ``age``, ``cn``,
        ``num_artists``, ``total_plays``, ``average_plays``,
        ``average_popularity``, ``favorite_genres`` (a ``Counter`` of how
        many of the user's artists carry each genre) and
        ``user_avg_sub_global_avg``.

    Raises
    ------
    KeyError
        If a user in ``data`` has no row in ``profiles`` or an artist is
        missing from ``artist_lookup``.

    Side effects
    ------------
    Prints the elapsed time in seconds.
    """
    # Resolve defaults lazily so existing two-argument calls keep working
    # while callers (and tests) can pass everything explicitly.
    if profiles is None:
        profiles = profiles_data
    if average_age is None:
        average_age = util.AVERAGE_AGE
    if average_plays is None:
        average_plays = util.AVERAGE_PLAYS

    # time.clock() was removed in Python 3.8 and reports CPU time on Unix;
    # time.time() gives elapsed time on every supported version.
    start = time.time()

    # user -> index labels, where each label is an (age, sex, country) tuple.
    demographic_lookup = (
        profiles.set_index(['age', 'sex', 'country']).groupby('user').groups
    )
    user_lookup = defaultdict(dict)

    # user -> index labels, where each label is an (artist, plays) tuple.
    playlists = data.set_index(['artist', 'plays']).groupby('user').groups

    # .items() works on both Python 2 and Python 3 (iteritems is Py2-only).
    for user, playlist in playlists.items():
        # Only the first profile row of a user is used.
        age, sex, cn = demographic_lookup[user][0]
        num_artists = len(playlist)

        # pd.isnull covers NaN as well as None.
        if pd.isnull(age) or age > 80 or age < 15:
            age = average_age
        if pd.isnull(sex):
            sex = 'u'

        favorite_genres = Counter()
        total_plays = 0
        total_popularity = 0

        for artist, plays in playlist:
            artist_info = artist_lookup[artist]
            total_plays += plays
            total_popularity += artist_info['popularity']
            # Each artist contributes one count per genre (not play-weighted).
            for genre in artist_info['genres']:
                favorite_genres[genre] += 1

        user_average_plays = total_plays / float(num_artists)

        user_lookup[user] = {
            'sex': sex,
            'age': age,
            'cn': cn,
            'num_artists': num_artists,
            'total_plays': total_plays,
            'average_plays': user_average_plays,
            'average_popularity': total_popularity / float(num_artists),
            'favorite_genres': favorite_genres,
            # Despite the "sub" in the key, this is a ratio (user average
            # divided by the global average); the key is kept unchanged so
            # existing consumers of the dictionary keep working.
            'user_avg_sub_global_avg': user_average_plays / float(average_plays),
        }

    print(time.time() - start)
    return user_lookup

In [30]:
# Compute the feature dictionary for every user in the full training set.
user_lookup = extract_user_features(data=train, artist_lookup=artist_lookup)

34.735833


In [31]:
# Quick preview: run the extractor on the first ten training rows and
# render the result as a table with one column per user.
pd.DataFrame(data=extract_user_features(train.head(10), artist_lookup))

0.197341


Unnamed: 0,056d5d2467dc63c4520963323e2ebf9576b58229,44ce793a6cd9d20f13f4a576a818ef983314bb5d,5641e1e6f04868a61dc29f7227e34f4640163e9b,8fa49ab25d425edcf05d44bfc1d5aea895287d81,9f748976d303db79f61bf570d9549d6335b11b0e,b85fcaef67d2669cd99b334b5e8c8705263db2cf,cbb86d88a8d2d0bab8956807c6c45cd0c752324b,da9cf3f557161d54b76f24db64be9cc76db008e3,eb1c57ddc9e0e2d005169d3a1a96e8dd95e3af03,feed7a0dc74c5251283a1505adf453a2061d08f7
age,20,23,24.5,23,21,24.5,25,24.5,25,23
average_plays,7,81,305,265,705,220,127,708,554,2113
average_popularity,65,50,46,40,63,75,75,77,62,66
cn,United Kingdom,United Kingdom,Belgium,United Kingdom,Antarctica,Netherlands,Ukraine,Netherlands,Sweden,Canada
favorite_genres,"{u'alternative rock': 1, u'garage rock': 1, u'...","{u'alternative dance': 1, u'madchester': 1, u'...","{u'funk rock': 1, u'industrial metal': 1, u'sp...","{u'alternative dance': 1, u'indie rock': 1, u'...","{u'pop christmas': 1, u'australian pop': 1, u'...","{u'pop rock': 1, u'permanent wave': 1, u'mello...","{u'alternative rock': 1, u'garage rock': 1, u'...","{u'alternative rock': 1, u'glam metal': 1, u'r...","{u'pop rock': 1, u'pop': 1, u'synthpop': 1, u'...","{u'alternative rock': 1, u'funk rock': 1, u'po..."
num_artists,1,1,1,1,1,1,1,1,1,1
sex,m,f,m,m,f,u,m,m,m,f
total_plays,7,81,305,265,705,220,127,708,554,2113
user_avg_sub_global_avg,0.0276661,0.320137,1.20545,1.04736,2.78638,0.869507,0.501943,2.79823,2.18958,8.35122


In [32]:
# Persist the per-user features for later use.
# Pickle streams are binary, so open with 'wb'; the context manager
# guarantees the data is flushed and the handle closed before moving on
# (the bare open() call relied on garbage collection to do that).
with open('user_lookup.pkl', 'wb') as user_file:
    pickle.dump(user_lookup, user_file)