In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import time
import pickle

In [2]:
import util

In [5]:
# Read the three raw CSV inputs from the working directory in one pass.
artists_data, profiles_data, train = (
    pd.read_csv(csv_name)
    for csv_name in ('artists.csv', 'profiles.csv', 'train.csv')
)

In [6]:
# Dataset-wide totals used to normalise the per-artist features.
TOTAL_USERS = profiles_data.shape[0]  # one row per user profile
# Age substituted for listeners whose profile has no age value.
AVERAGE_AGE = 24.5
TOTAL_PLAYS = train['plays'].sum()
# Mean play count per training row.
AVERAGE_PLAYS = TOTAL_PLAYS / float(train.shape[0])

In [10]:
def extract_from_training_data(data, profiles_data):
    """Aggregate per-artist listener statistics from play-count records.

    Parameters
    ----------
    data : pandas.DataFrame
        Play records; the 'user', 'artist' and 'plays' columns are read.
    profiles_data : pandas.DataFrame
        User profiles; the 'user', 'age', 'sex' and 'country' columns are read.

    Returns
    -------
    defaultdict
        Maps each artist to a ``defaultdict(int)`` with these entries:
        'total_plays', 'total_listeners', 'prop_plays', 'avg_plays',
        'fan_cns' (Counter of listener countries), 'fan_regions' (Counter of
        ``util.get_region(country)`` values), 'average_age', 'f', 'm', 'u'
        (listener counts divided by the artist's listener count),
        'prop_listeners', 'artist_avg_sub_global_avg' and 'median_plays'.

    Raises
    ------
    KeyError
        If a user appearing in ``data`` has no row in ``profiles_data``.

    Notes
    -----
    Reads the module-level constants TOTAL_PLAYS, TOTAL_USERS, AVERAGE_PLAYS
    and AVERAGE_AGE, and prints the elapsed wall-clock seconds before
    returning.
    """
    # time.time() instead of time.clock(): clock() reported CPU time on Unix
    # and no longer exists in Python 3.8+.
    start = time.time()

    artistic_features = defaultdict(dict)

    # user -> index labels, where each label is an (age, sex, country) tuple.
    users = profiles_data.set_index(['age', 'sex', 'country']).groupby('user').groups
    # artist -> index labels, where each label is a (user, plays) tuple.
    # Grouping by the scalar key keeps the .groups keys as plain artist values;
    # a one-element list key may yield 1-tuple keys on newer pandas releases.
    fan_groups = data.set_index(['user', 'plays']).groupby('artist').groups

    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for artist, fans in fan_groups.items():

        features = defaultdict(int)
        artistic_features[artist] = features

        fan_cns = Counter()
        fan_regions = Counter()
        sum_of_ages = 0
        total_listeners = 0
        total_plays = 0
        play_counts = []

        for fan, plays in fans:

            # Only the first profile row of the user is consulted.
            age, sex, cn = users[fan][0]

            fan_cns[cn] += 1
            fan_regions[util.get_region(cn)] += 1

            # Listeners by gender; NaN != NaN, so the self-comparison is
            # False exactly when the value is missing.
            if sex == sex:
                features[sex] += 1
            else:
                features['u'] += 1  # Unknown gender

            # Missing ages fall back to AVERAGE_AGE so the mean stays defined.
            sum_of_ages += age if age == age else AVERAGE_AGE

            total_listeners += 1
            total_plays += plays
            play_counts.append(plays)

        listeners = float(total_listeners)

        features['total_plays'] = total_plays
        features['total_listeners'] = total_listeners
        features['prop_plays'] = float(total_plays) / TOTAL_PLAYS
        features['avg_plays'] = float(total_plays) / total_listeners
        features['fan_cns'] = fan_cns
        features['fan_regions'] = fan_regions
        features['average_age'] = sum_of_ages / listeners
        # Turn the gender counts into fractions; reading a key that was never
        # counted yields 0 from the defaultdict, so all three always exist.
        for gender in ('f', 'm', 'u'):
            features[gender] = features[gender] / listeners
        features['prop_listeners'] = listeners / TOTAL_USERS
        features['artist_avg_sub_global_avg'] = features['avg_plays'] - AVERAGE_PLAYS
        features['median_plays'] = np.median(play_counts)

    print(time.time() - start)
    return artistic_features

In [11]:
# Build the per-artist feature table from the full training set.
artistic_features = extract_from_training_data(
    data=train,
    profiles_data=profiles_data,
)

24.474331


In [12]:
# Tabulate the features: one column per artist, one row per feature name.
pd.DataFrame(data=artistic_features)

Unnamed: 0,000d90ec-d64c-48a1-b775-e726fd240e9f,000fc734-b7e1-4a01-92d1-f544261b43f5,0019749d-ee29-4a5f-ab17-6bfa11deb969,0039c7ae-e1a7-4a7d-9b49-0cbc716821a6,004e5eed-e267-46ea-b504-54526f1f377d,00565b31-14a3-4913-bd22-385eb40dd13c,00a9f935-ba93-4fc8-a33a-993abe9c936b,00eeed6b-5897-4359-8347-b8cd28375331,0103c1cc-4a09-4a5d-a344-56ad99a77193,0110e63e-0a9b-4818-af8e-41e180c20b9a,...,ff6e677f-91dd-4986-a174-8db0474b1799,ff7f80cd-05c2-4068-a00e-fbfbd453d049,ff865aa0-4603-4f79-ae8b-8735332e2cfa,ff95eb47-41c4-4f7f-a104-cdc30f02e872,ff9deaae-da4f-42b7-a19e-36fedd3fc706,ffb18e19-64a4-4a65-b4ce-979e00c3c69d,ffb2d3e3-a4cc-48cf-8fb0-f2f846e9d7b9,ffb390b8-8df4-4b72-97d1-7b2fc008a452,ffe16bba-4d84-409b-8f22-5242c60b930f,ffe9ec08-6b6b-4993-9394-e280b429dbfd
artist_avg_sub_global_avg,-56.7207,35.8711,-107.565,88.7817,117.548,104.834,181.174,81.1138,17.9675,7.79174,...,26.3923,202.015,5.83677,2.76401,-58.1862,5.19264,241.166,66.6686,16.1555,25.9498
average_age,23.2373,27.6223,27.2319,23.8569,25.9111,25.3718,23.1036,24.0338,22.4299,25.0144,...,24.6486,22.7159,22.3984,26.506,24.7247,24.7257,24.1834,19.6494,22.2558,24.2785
avg_plays,196.296,288.888,145.452,341.798,370.565,357.851,434.19,334.131,270.984,260.809,...,279.409,455.032,258.854,255.781,194.831,258.209,494.182,319.685,269.172,278.967
f,0.303922,0.247731,0.153623,0.352782,0.241772,0.0940722,0.229762,0.114483,0.382803,0.371846,...,0.287029,0.2625,0.383771,0.143717,0.445672,0.256146,0.30607,0.588599,0.452931,0.158086
fan_cns,"{u'Brazil': 7, u'Turkey': 1, u'Italy': 3, u'Cz...","{u'Canada': 33, u'Brazil': 63, u'Hungary': 3, ...","{u'Canada': 18, u'Turkey': 7, u'Italy': 12, u'...","{u'Canada': 619, u'Libyan Arab Jamahiriya': 1,...","{u'Turkey': 89, u'Brazil': 70, u'Italy': 39, u...","{u'Canada': 20, u'Turkey': 43, u'Sao Tome and ...","{u'Canada': 180, u'Saint Helena': 2, u'Montene...","{u'Canada': 20, u'Brazil': 103, u'Italy': 95, ...","{u'Canada': 160, u'Turkmenistan': 1, u'Saint H...","{u'Canada': 142, u'Sao Tome and Principe': 2, ...",...,"{u'Canada': 369, u'Sao Tome and Principe': 2, ...","{u'Brazil': 5, u'Canada': 1, u'Italy': 3, u'Cz...","{u'Canada': 60, u'Lithuania': 6, u'Argentina':...","{u'Canada': 82, u'Saint Helena': 1, u'Lithuani...","{u'Canada': 39, u'Brazil': 55, u'Hungary': 1, ...","{u'Canada': 89, u'Libyan Arab Jamahiriya': 1, ...","{u'Brazil': 14, u'Canada': 18, u'Hungary': 3, ...","{u'Canada': 50, u'Turkey': 3, u'Italy': 5, u'C...","{u'Canada': 27, u'Turkey': 1, u'Italy': 1, u'K...","{u'Canada': 40, u'Montenegro': 1, u'Lithuania'..."
fan_regions,"{u'Europe': 443, u'Oceania': 9, u'Africa': 1, ...","{u'Europe': 998, u'Oceania': 56, u'Africa': 5,...","{u'Europe': 508, u'Oceania': 28, u'Africa': 2,...","{u'Europe': 5953, u'Oceania': 543, u'Africa': ...","{u'Europe': 1085, u'Oceania': 20, u'Africa': 3...","{u'Europe': 471, u'Oceania': 14, u'Africa': 4,...","{u'Europe': 7092, u'Oceania': 167, u'Africa': ...","{u'Europe': 2087, u'Oceania': 30, u'Africa': 1...","{u'Europe': 4406, u'Oceania': 172, u'Africa': ...","{u'Europe': 1975, u'Oceania': 131, u'Africa': ...",...,"{u'Europe': 5738, u'Oceania': 340, u'Africa': ...","{u'Europe': 855, u'Oceania': 2, u'Africa': 2, ...","{u'Europe': 578, u'Oceania': 86, u'Africa': 20...","{u'Europe': 1142, u'Oceania': 95, u'Africa': 1...","{u'Europe': 318, u'Oceania': 70, u'Africa': 4,...","{u'Europe': 904, u'Oceania': 45, u'Africa': 12...","{u'Europe': 2469, u'Oceania': 11, u'Africa': 6...","{u'Europe': 475, u'Oceania': 80, u'Africa': 3,...","{u'Europe': 101, u'Oceania': 20, u'Asia': 21, ...","{u'Europe': 899, u'Oceania': 45, u'Africa': 14..."
m,0.641176,0.640149,0.733333,0.568883,0.703797,0.831186,0.710651,0.824818,0.558382,0.518981,...,0.623188,0.686364,0.559487,0.732637,0.477901,0.656834,0.594004,0.349588,0.483126,0.757796
median_plays,121.5,137,69.5,176,164,159,163,132,112,153,...,138,201.5,118,131,118,145,193,146,132,142
prop_listeners,0.00218616,0.00802877,0.00295774,0.0632571,0.0067728,0.00332639,0.0399252,0.011158,0.0296632,0.0181794,...,0.0470281,0.00377219,0.00702571,0.0106779,0.00465523,0.00871891,0.0115823,0.00624127,0.00241335,0.0059112
prop_plays,9.52318e-05,0.000514715,9.54705e-05,0.00479809,0.000556957,0.000264158,0.00384695,0.000827352,0.00178382,0.00105218,...,0.002916,0.000380912,0.000403583,0.000606097,0.000201274,0.000499601,0.0012702,0.000442776,0.000144158,0.000365946


In [13]:
# Persist the extracted features. Pickles are byte streams, so the file is
# opened in binary mode, and the context manager guarantees the handle is
# flushed and closed even if dumping fails.
with open('features_from_training_data.pkl', 'wb') as features_file:
    pickle.dump(artistic_features, features_file)