In [1]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
import numpy as np
import pickle
import time

In [2]:
# Per-artist feature lookup; indexed by the `artist` column of the training data
# further down.  Pickles are binary, so open in 'rb' mode (required on Python 3,
# and avoids newline translation on Windows) and close the handle promptly.
# NOTE: unpickling can execute arbitrary code -- only load files you produced
# yourself or otherwise trust.
with open('artist_lookup.pkl', 'rb') as artist_lookup_file:
    artist_lookup = pickle.load(artist_lookup_file)

In [3]:
# Per-user feature lookup; indexed by the `user` column of the training data
# further down.  Pickles are binary, so open in 'rb' mode (required on Python 3,
# and avoids newline translation on Windows) and close the handle promptly.
# NOTE: unpickling can execute arbitrary code -- only load files you produced
# yourself or otherwise trust.
with open('user_lookup.pkl', 'rb') as user_lookup_file:
    user_lookup = pickle.load(user_lookup_file)

In [9]:
# Raw training examples; later cells read the `user`, `artist` and `plays` columns.
train = pd.read_csv(filepath_or_buffer='train.csv')

### Features

#### User
* sex [m, f, u]
* age [int]
* region [one_hot]
* total plays
* size of playlist (num artists)
* average popularity

#### Artist
* region of majority listeners
* popularity (can also add num followers from spotify in addition)
* average_age of listeners
* prop_male listeners
* num_listeners (count)
* num_plays
* POTENTIAL: clusters over genres

#### Interaction
* shares region with artist
* user's age is within 2 years of the artist's average listener age
* genre overlap 
* popularity match (i.e. hipster listeners, artists)
* gender match
* TODO: Number of times previously listened to artist 
* POTENTIAL: appears on playlists with other songs user has listened to

In [50]:
# Thresholds / constants used by the interaction features below.
_AGE_MATCH_YEARS = 2          # |user age - artist avg listener age| below this => share_age
_POPULARITY_MATCH_POINTS = 5  # |user avg popularity - artist popularity| below this => share_popularity
_INVERSE_EPSILON = 0.0000001  # keeps the 1/|difference| features finite when the difference is 0
_TOP_FAN_COUNTRIES = 5        # how many leading entries of artist 'fan_cns' count for share_cn
_TOP_USER_GENRES = 4          # how many of the user's most common genres count for share_genre


def _pair_features(user_features, artist_features):
    """Build the feature dict for a single (user, artist) pair.

    Parameters
    ----------
    user_features : dict
        One entry of ``user_lookup``.  Keys read here: 'sex', 'cn', 'age',
        'average_plays', 'num_artists', 'average_popularity',
        'user_avg_sub_global_avg' and 'favorite_genres' (must provide
        ``most_common``, e.g. a ``collections.Counter``).
    artist_features : dict
        One entry of ``artist_lookup``.  Keys read here: 'popularity',
        'artist_avg_sub_global_avg', 'average_age', 'total_listeners',
        'avg_plays', 'genre_id', 'fan_cns' (sequence of (country, _) pairs),
        'genres', plus one key per possible value of the user's 'sex'.

    Returns
    -------
    dict
        Feature name -> numeric value.  'share_age' and 'share_popularity'
        are only present when the condition holds.
    """
    features = {}

    # ---- User-only features -------------------------------------------
    # One-hot encode sex by baking the value into the feature name.
    features['user_sex_{}'.format(user_features['sex'])] = 1

    # The user's country is only used for the share_cn interaction below;
    # no per-country feature is emitted for it.
    user_cn = user_features['cn']
    user_age = user_features['age']
    user_popularity = user_features['average_popularity']

    features['user_age'] = user_age
    features['user_avg_plays'] = user_features['average_plays']
    features['user_num_artists'] = user_features['num_artists']
    features['user_avg_popularity'] = user_popularity
    features['user_avg_sub_global_avg'] = user_features['user_avg_sub_global_avg']

    # ---- Artist-only features -----------------------------------------
    artist_popularity = artist_features['popularity']
    artist_age = artist_features['average_age']

    features['artist_spotify_popularity'] = artist_popularity
    features['artist_avg_sub_global_avg'] = artist_features['artist_avg_sub_global_avg']
    features['artist_average_listener_age'] = artist_age
    features['artist_num_listeners'] = artist_features['total_listeners']
    features['artist_avg_plays'] = artist_features['avg_plays']
    # One-hot encode the artist's genre id the same way as user sex.
    features['genre_{}'.format(artist_features['genre_id'])] = 1

    # ---- User x artist interaction features ---------------------------
    top_fan_cns = [cn for cn, _ in artist_features['fan_cns'][:_TOP_FAN_COUNTRIES]]
    features['share_cn'] = 1 if user_cn in top_fan_cns else 0

    age_gap = abs(user_age - artist_age)
    if age_gap < _AGE_MATCH_YEARS:
        features['share_age'] = 1
    # Inverse distance: larger means closer; the epsilon avoids division by zero.
    features['diff_age'] = 1.0 / (age_gap + _INVERSE_EPSILON)

    fav_user_genres = set(
        genre for genre, _ in user_features['favorite_genres'].most_common(_TOP_USER_GENRES)
    )
    shared_genres = fav_user_genres & set(artist_features['genres'])
    features['share_genre'] = 1 if shared_genres else 0

    popularity_gap = abs(user_popularity - artist_popularity)
    if popularity_gap < _POPULARITY_MATCH_POINTS:
        features['share_popularity'] = 1
    features['diff_popularity'] = 1.0 / (popularity_gap + _INVERSE_EPSILON)

    # The artist record is indexed by the user's sex value; the stored value
    # is used as-is as the feature.
    features['share_prob_user_listens_by_gender'] = artist_features[user_features['sex']]

    return features


def feature_extraction(training_data, num_examples=None):
    """Turn the first ``num_examples`` rows of ``training_data`` into feature dicts.

    Each row's ``user`` and ``artist`` values are looked up in the
    module-level ``user_lookup`` and ``artist_lookup`` dicts and combined
    into one feature dict per row (see ``_pair_features``).

    Parameters
    ----------
    training_data : pandas.DataFrame
        Must have ``user`` and ``artist`` columns.
    num_examples : int, optional
        Number of leading rows to process.  ``None`` (the default) means all
        rows.  Values larger than ``len(training_data)`` are clamped so the
        result never contains unfilled slots.

    Returns
    -------
    numpy.ndarray
        1-D object array of feature dicts, one per processed row, in row
        order.

    Notes
    -----
    Prints the elapsed wall-clock seconds as a side effect.  Raises
    ``KeyError`` if a user or artist id is missing from its lookup.
    """
    # time.clock() was removed in Python 3.8; time.time() exists everywhere.
    start = time.time()

    total_rows = len(training_data)
    if num_examples is None:
        num_examples = total_rows
    else:
        # Never allocate more slots than there are rows to fill them.
        num_examples = min(num_examples, total_rows)

    subset = training_data[:num_examples]
    list_of_features = np.empty(num_examples, dtype=object)

    # Iterate over just the two id columns instead of iterrows(), which
    # builds a full Series per row only to read two fields from it.
    id_pairs = zip(subset['user'], subset['artist'])
    for example_i, (user_id, artist_id) in enumerate(id_pairs):
        list_of_features[example_i] = _pair_features(
            user_lookup[user_id], artist_lookup[artist_id]
        )

    print(time.time() - start)
    return list_of_features

In [62]:
# Work on a fixed-size prefix of the training data.  Slice once and reuse the
# same subset for both features and targets so the two can never get out of step.
N_TRAIN_EXAMPLES = 400000
train_subset = train[:N_TRAIN_EXAMPLES]

feature_list = feature_extraction(train_subset)
Y = np.array(train_subset['plays'])

66.728112


In [91]:
# Convert the per-example feature dicts into a single feature matrix X.
# time.clock() was removed in Python 3.8; time.time() is available everywhere
# and is precise enough for a step that takes seconds.
start = time.time()
v = DictVectorizer()
X = v.fit_transform(feature_list)
print(time.time() - start)

8.81502


In [96]:
# Feature names known to the fitted vectorizer.  get_feature_names() was
# removed in scikit-learn 1.2 in favour of get_feature_names_out(), so use
# whichever this installation provides.
if hasattr(v, 'get_feature_names_out'):
    available_features = list(v.get_feature_names_out())
else:
    available_features = v.get_feature_names()

# Hand-picked subset of features to keep: a few scalar features plus every
# genre one-hot column genre_0 .. genre_19.
desired_features = {
    'artist_avg_sub_global_avg',
    'share_cn',
    'share_genre',
    'share_popularity',
    'share_prob_user_listens_by_gender',
    'user_avg_plays',
} | {'genre_{}'.format(genre_id) for genre_id in range(20)}

# 0/1 mask aligned with available_features: 1 where the feature is desired.
support = [1 if feature in desired_features else 0 for feature in available_features]

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
import time
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import maxabs_scale
from sklearn.linear_model import Ridge

In [71]:
# Hold out a third of the examples for evaluation.  Fix the seed so the split,
# and therefore the reported error, is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

In [74]:
# Use this for KNN, but nothing else...
# Learn the per-feature scale from the training split only and apply that same
# scale to the test split.  Scaling each split by its own maxima would put the
# two on different scales (and lets test-set statistics shape the features).
from sklearn.preprocessing import MaxAbsScaler

scaler = MaxAbsScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [73]:
# Baseline: fit a linear regression on the training split and report the mean
# absolute error of its predictions on the held-out split.
# To try k-nearest neighbours instead, swap in:
#     from sklearn.neighbors import KNeighborsRegressor
#     clf = KNeighborsRegressor()
clf = LinearRegression().fit(X_train, y_train)
y_hat = clf.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_hat)

162.03318080310186