In [39]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
import numpy as np
import pickle
import time

In [4]:
train = pd.read_csv('train.csv')
artist_lookup = pickle.load(open('artist_lookup.pkl'))
user_lookup = pickle.load(open('user_lookup.pkl'))

### Features

#### User
* sex [m, f, u]
* age [int]
* region [one_hot]
* total plays
* size of playlist (num artists)
* average popularity

#### Artist
* region of majority listeners
* popularity (can also add num followers from spotify in addition)
* average_age of listeners
* prop_male listeners
* num_listeners (count)
* num_plays
* POTENTIAL: clusters over genres

#### Interaction
* shares region with artist
* is within 2 years of average listener age
* genre overlap 
* popularity match (i.e. hipster listeners, artists)
* gender match
* POTENTIAL: appears on playlists with other songs user has listened to

In [82]:
def feature_extraction(training_data, num_examples=None):
    
    start = time.clock()
    if not num_examples:
        num_examples = len(training_data)
    
    list_of_features = np.empty(num_examples, dtype=dict)

    example_i = 0
    for _, row in train[:num_examples].iterrows():
        
        if not example_i % 100000:
            print('Iter: {}'.format(example_i))
        
        features = {}

        # Just the user
        user_features = user_lookup[row['user']]
        
        user_sex = user_features['sex']
        features[user_sex] = 1
        
        user_region = user_features['region']
        features['user_region_{}'.format(user_region)] = 1
        
        user_age = user_features['age']
        if not user_age:
            user_age = 24.51  # Average of all the ages
        features['user_age'] = user_age
        
        features['user_play_count'] = user_features['total_plays']
        features['num_user_artists'] = user_features['num_artists']
        
        user_popularity = user_features['average_popularity']
        features['user_avg_popularity'] = user_popularity
        
        # Just the artist
        artist_features = artist_lookup[row['artist']]
        
        artist_region = max([(r, artist_features[r]) for r in ['Africa', 
                                                        'Asia', 
                                                        'Europe', 
                                                        'North America', 
                                                        'Oceania', 
                                                        'Other', 
                                                        'South America']
                     ], key=lambda x: x[1])[0]
        
        features['artist_region_{}'.format(artist_region)] = 1
        
        artist_popularity = artist_features['popularity']
        features['artist_popularity'] = artist_popularity
        
        artist_age = artist_features['average_age']
        features['average_listener_age'] = artist_age
        
        prop_male = artist_features['prop_male']
        features['prop_male_listeners'] = prop_male
        features['num_listeners'] = artist_features['count']
        features['num_plays'] = artist_features['total_plays']
        
        # Modeling the interaction
        if user_region == artist_region:
            features['share_region'] = 1
            
        if abs(user_age - artist_age) < 2:
            features['share_age'] = 1
            
        fav_user_genres = set([genre for genre, _ in user_features['favorite_genres'].most_common(5)])
        artist_genres = set(artist_features['genres'])
        if len(fav_user_genres & artist_genres) > 0:
            features['share_genre'] = 1
            
        if abs(user_popularity - artist_popularity) < 10:
            features['share_popularity'] = 1
            
        if (prop_male > .5 and user_sex == 'm') or (prop_male <= .5 and user_sex == 'f'):
            features['share_sex'] = 1
            
        list_of_features[example_i] = features
        example_i += 1
        
    print(time.clock() - start)
    return list_of_features

In [83]:
feature_list = feature_extraction(train)

Iter: 0
Iter: 100000
Iter: 200000
Iter: 300000
Iter: 400000
Iter: 500000
Iter: 600000
Iter: 700000
Iter: 800000
Iter: 900000
Iter: 1000000
Iter: 1100000
Iter: 1200000
Iter: 1300000
Iter: 1400000
Iter: 1500000
Iter: 1600000
Iter: 1700000
Iter: 1800000
Iter: 1900000
Iter: 2000000
Iter: 2100000
Iter: 2200000
Iter: 2300000
Iter: 2400000
Iter: 2500000
Iter: 2600000
Iter: 2700000
Iter: 2800000
Iter: 2900000
Iter: 3000000
Iter: 3100000
Iter: 3200000
Iter: 3300000
Iter: 3400000
Iter: 3500000
Iter: 3600000
Iter: 3700000
Iter: 3800000
Iter: 3900000
Iter: 4000000
Iter: 4100000
612.309793


In [84]:
start = time.clock()
pickle.dump(feature_list, open('X.pkl', 'w'))
print(time.clock() - start)

261.735069


In [85]:
pickle.dump(np.array(train['plays']), open('Y.pkl', 'w'))

In [71]:
start = time.clock()
v = DictVectorizer(sparse=True)
X = v.fit_transform(feature_list)
print(time.clock() - start)

1.723496


In [None]:
start = time.clock()
Y = np.array(train['plays'])
clf = RandomForestRegressor(n_estimators=200)
clf.fit(X, Y[:100000])
print(time.clock() - start)