In [3]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
import numpy as np
import pickle
import time

In [4]:
# Load the precomputed per-artist and per-user lookup tables that
# feature_extraction indexes by the 'artist' and 'user' columns of the
# training data.
# NOTE(security): pickle.load can execute arbitrary code -- only load lookup
# files that were produced locally by a trusted process.
# Pickles must be opened in binary mode on Python 3; the `with` blocks also
# close the file handles as soon as loading finishes.
with open('artist_lookup.pkl', 'rb') as artist_file:
    artist_lookup = pickle.load(artist_file)
with open('user_lookup.pkl', 'rb') as user_file:
    user_lookup = pickle.load(user_file)

In [7]:
# Load the training examples; later cells read its 'user', 'artist' and
# 'plays' columns.
train = pd.read_csv('train.csv')

### Features

#### User
* sex [m, f, u]
* age [int]
* region [one_hot]
* total plays
* size of playlist (num artists)
* average popularity

#### Artist
* region of majority listeners
* popularity (could also add the number of followers from Spotify)
* average_age of listeners
* prop_male listeners
* num_listeners (count)
* num_plays
* POTENTIAL: clusters over genres

#### Interaction
* shares region with artist
* is within 2 years of average listener age
* genre overlap 
* popularity match (e.g. hipster listeners paired with hipster artists)
* gender match
* POTENTIAL: appears on playlists with other songs user has listened to

In [169]:
# Region keys read from every artist record. Order matters: on a tie the
# earliest region in this tuple wins.
_ARTIST_REGIONS = ('Africa', 'Asia', 'Europe', 'North America',
                   'Oceania', 'Other', 'South America')
_DEFAULT_USER_AGE = 24.51  # Average of all the ages
_EPSILON = 0.0000001  # keeps the inverse-difference features finite


def _row_features(user_features, artist_features):
    """Build the sparse feature dict for a single (user, artist) pair."""
    features = {}

    # Just the user
    user_sex = user_features['sex']
    features['user_sex_{}'.format(user_sex)] = 1

    user_region = user_features['region']
    features['user_region_{}'.format(user_region)] = 1

    user_age = user_features['age']
    # Missing (None/NaN) or implausible ages are replaced by the average.
    # NaN must be tested explicitly: every ordering comparison against NaN is
    # False, so a plain range check would let it through.
    if pd.isnull(user_age) or not 15 <= user_age <= 80:
        user_age = _DEFAULT_USER_AGE
    features['user_age'] = user_age

    features['user_avg_play_count'] = (
        user_features['total_plays'] / float(user_features['num_artists']))
    features['user_num_artists'] = user_features['num_artists']

    user_popularity = user_features['average_popularity']
    features['user_avg_popularity'] = user_popularity

    # Just the artist: one-hot the region with the largest value.
    artist_region = max(_ARTIST_REGIONS, key=lambda region: artist_features[region])
    features['artist_region_{}'.format(artist_region)] = 1

    artist_popularity = artist_features['popularity']
    features['artist_popularity'] = artist_popularity

    artist_age = artist_features['average_age']
    features['artist_average_listener_age'] = artist_age

    prop_male = artist_features['prop_male']
    features['artist_prop_male_listeners'] = prop_male
    artist_num_listeners = artist_features['count']
    features['artist_num_listeners'] = artist_num_listeners
    features['artist_avg_num_plays'] = (
        artist_features['total_plays'] / float(artist_num_listeners))

    # Modeling the interaction. Indicator keys are only set when they hold,
    # so an absent key means 0 once the dicts are vectorized.
    if user_region == artist_region:
        features['share_region'] = 1

    age_gap = abs(user_age - artist_age)
    if age_gap < 2:
        features['share_age'] = 1
    # Despite the name this is an inverse distance: larger means closer.
    features['diff_age'] = 1.0 / (age_gap + _EPSILON)

    fav_user_genres = set(
        genre for genre, _ in user_features['favorite_genres'].most_common(3))
    if fav_user_genres & set(artist_features['genres']):
        features['share_genre'] = 1

    popularity_gap = abs(user_popularity - artist_popularity)
    if popularity_gap < 5:
        features['share_popularity'] = 1
    # Inverse distance as well: larger means a closer popularity match.
    features['diff_popularity'] = 1.0 / (popularity_gap + _EPSILON)

    if (prop_male > .5 and user_sex == 'm') or (prop_male <= .5 and user_sex == 'f'):
        features['share_sex'] = 1

    return features


def feature_extraction(training_data, num_examples=None, users=None, artists=None):
    """Turn (user, artist) training rows into an array of feature dicts.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Must provide 'user' and 'artist' columns whose values are keys of the
        user and artist lookups.
    num_examples : int, optional
        Number of leading rows to process. ``None`` (or 0) processes every
        row; values larger than the frame are capped at its length so the
        result never contains unfilled slots.
    users, artists : mapping, optional
        Per-user and per-artist records. They default to the notebook-level
        ``user_lookup`` and ``artist_lookup``.

    Returns
    -------
    numpy.ndarray
        Object array with one feature dict per processed row, suitable for
        ``DictVectorizer``.

    Prints the elapsed extraction time in seconds as a side effect.
    """
    if users is None:
        users = user_lookup
    if artists is None:
        artists = artist_lookup

    # time.clock() was removed in Python 3.8; perf_counter() is its replacement.
    start = time.perf_counter()
    if not num_examples:
        num_examples = len(training_data)

    examples = training_data[:num_examples]
    # Size by the rows actually available, not by the requested count.
    list_of_features = np.empty(len(examples), dtype=dict)

    # Only two columns are needed, so iterate over them directly instead of
    # materialising a Series per row with iterrows().
    id_pairs = zip(examples['user'], examples['artist'])
    for example_i, (user_id, artist_id) in enumerate(id_pairs):
        list_of_features[example_i] = _row_features(users[user_id], artists[artist_id])

    print(time.perf_counter() - start)
    return list_of_features

In [181]:
# Prototype on the first 20,000 training rows: slice once, then derive both
# the feature dicts and the regression target from the same rows.
head_rows = train[:20000]
feature_list = feature_extraction(head_rows)
Y = np.array(head_rows['plays'])

3.790873


In [182]:
# Sanity check: display the feature dict built for the first training example.
feature_list[0]

{'artist_average_listener_age': 24.6387987012987,
 'artist_avg_num_plays': 152.11037234042553,
 'artist_num_listeners': 1504,
 'artist_popularity': 62,
 'artist_prop_male_listeners': 0.6431686046511628,
 'artist_region_Europe': 1,
 'diff_age': 2.768538559361902,
 'diff_popularity': 0.12790697510816657,
 'share_age': 1,
 'share_genre': 1,
 'share_region': 1,
 'share_sex': 1,
 'user_age': 25.0,
 'user_avg_play_count': 732.5,
 'user_avg_popularity': 54.18181818181818,
 'user_num_artists': 22,
 'user_region_Europe': 1,
 'user_sex_m': 1}

In [183]:
# Turn the feature dicts into a sparse design matrix and report how long the
# vectorization takes. time.clock() was removed in Python 3.8, so the timing
# uses time.perf_counter() instead.
start = time.perf_counter()
v = DictVectorizer(sparse=True)
X = v.fit_transform(feature_list)
print(time.perf_counter() - start)

0.415132


In [184]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
import time
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import maxabs_scale
from sklearn.linear_model import Ridge

In [185]:
# Hold out a third of the examples for evaluation. The fixed random_state
# makes the split, and therefore the reported error, reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=0)

In [186]:
# X_train = maxabs_scale(X_train)
# X_test = maxabs_scale(X_test)

In [187]:
# LinearRegression's `normalize` argument was removed in scikit-learn 1.2.
# Scaling is now an explicit pipeline step; with_mean=False keeps the sparse
# design matrix sparse (centering would densify it).
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

clf = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)
# Mean absolute error on the held-out rows, displayed as the cell result.
mean_absolute_error(y_test, y_hat)

158.4256659325861