In [114]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import maxabs_scale, normalize
import numpy as np
import cPickle as pickle
import time

In [115]:
artist_lookup = pickle.load(open('artist_lookup.pkl'))

In [116]:
user_lookup = pickle.load(open('user_lookup.pkl'))

In [117]:
train = pd.read_csv('train.csv')

In [118]:
profiles_data = pd.read_csv('profiles.csv')
user_id_lookup = profiles_data.groupby('user').groups
def get_user_id(user_hash):
    return user_id_lookup[user_hash][0]

In [119]:
artists_data = pd.read_csv('artists.csv')
artist_id_lookup = artists_data.groupby('artist').groups
def get_artist_id(artist_hash):
    return artist_id_lookup[artist_hash][0]

### Features

#### User
* sex [m, f, u]
* age [int]
* region [one_hot]
* total plays
* size of playlist (num artists)
* average popularity

#### Artist
* region of majority listeners
* popularity (can also add num followers from spotify in addition)
* average_age of listeners
* prop_male listeners
* num_listeners (count)
* num_plays
* POTENTIAL: clusters over genres

#### Interaction
* shares region with artist
* is within 2 years of average listener age
* genre overlap 
* popularity match (i.e. hipster listeners, artists)
* gender match
* TODO: Number of times previously listened to artist 
* POTENTIAL: appears on playlists with other songs user has listened to

In [127]:
def feature_extraction(training_data, num_examples=None):
        
    start = time.clock()
    if not num_examples:
        num_examples = len(training_data)
    
    list_of_features = np.empty(num_examples, dtype=dict)

    example_i = 0
    for _, row in training_data[:num_examples].iterrows():
        
        features = {}

        # ************************************************
        # Features for the user, independent of the artist
        user_features = user_lookup[row['user']]
        
        features['user_id_{}'.format(get_user_id(row['user']))] = 1
        
        features['user_sex_{}'.format(user_features['sex'])] = 1

        # Seems unlikely to improve model if included raw
        user_cn = user_features['cn']
        user_region = user_features['region']
        
        user_age = user_features['age']
        features['user_age'] = user_age
        
        features['user_avg_plays'] = user_features['average_plays']
        features['user_num_artists'] = user_features['num_artists']
        
        user_popularity = user_features['average_popularity']
        features['user_avg_popularity'] = user_popularity
        
        features['user_avg_sub_global_avg'] = user_features['user_avg_sub_global_avg']
        
        # ****************************************
        # Just the artist, independent of the user
        artist_features = artist_lookup[row['artist']]
        
        features['artist_id_{}'.format(get_artist_id(row['artist']))] = 1
        
        artist_popularity = artist_features['popularity']
        features['artist_spotify_popularity'] = artist_popularity
        
        features['artist_avg_sub_global_avg'] = artist_features['artist_avg_sub_global_avg']
        
        artist_age = artist_features['average_age']
        features['artist_average_listener_age'] = artist_age
        
        prop_male, prop_female, prop_unknown = artist_features['m'], artist_features['f'], artist_features['u']

        artist_num_listeners = artist_features['total_listeners']
        features['artist_num_listeners'] = artist_num_listeners
        
        features['artist_avg_plays'] = artist_features['avg_plays'] 
        
        features['genre_{}'.format(artist_features['genre_id'])] = 1
        
        
        # **********************************************
        # Features dependent on both the artist and user
        features['share_cn'] = 1 if user_cn in [cn for cn, _ in artist_features['fan_cns'].most_common(5)] else 0
        features['share_prob_user_from_region'] = artist_features['fan_cns'][user_cn] / np.sum(artist_features['fan_cns'].values())
        features['share_prob_user_from_country'] = artist_features['fan_regions'][user_region] / np.sum(artist_features['fan_regions'].values())
        
        # TODO: Get percentage of fans in same region, and percentage in same country.
            
        if abs(user_age - artist_age) < 2:
            features['share_age'] = 1
            
        features['diff_age'] = abs(user_age - artist_age)  
            
        fav_user_genres = set([genre for genre, _ in user_features['favorite_genres'].most_common(4)])
        artist_genres = set(artist_features['genres'])
        shared_genres = fav_user_genres & artist_genres
#         g_count = 0
#         for g in shared_genres:  
#             g_count += user_features['favorite_genres'][g]
#         features['share_genre'] = g_count
        features['share_genre'] = 1 if len(shared_genres) > 0 else 0
        
        if abs(user_popularity - artist_popularity) < 5:
            features['share_popularity'] = 1
            
        features['diff_popularity'] = abs(user_popularity - artist_popularity)   

        features['share_prob_user_listens_by_gender_{}'.format(user_features['sex'])] = artist_features[user_features['sex']]
            
        list_of_features[example_i] = features
        example_i += 1
        
    print(time.clock() - start)
    return list_of_features

In [132]:
feature_list = feature_extraction(train[:100000])
Y = np.array(train[:100000]['plays'])

28.276508


In [133]:
v = DictVectorizer()
X = v.fit(feature_list)

In [134]:
available_features = v.get_feature_names()
support = [1 for _ in range(len(available_features))]
# available_features

In [49]:
desired_features = {
    
#     'artist_average_listener_age',
#     'artist_avg_plays',
#     'artist_avg_sub_global_avg',
#     'artist_num_listeners',
#     'artist_spotify_popularity',
#     'diff_age',
#     'diff_popularity',
#     'genre_0',
#     'genre_1',
#     'genre_10',
#     'genre_11',
#     'genre_12',
#     'genre_13',
#     'genre_14',
#     'genre_15',
#     'genre_16',
#     'genre_17',
#     'genre_18',
#     'genre_19',
#     'genre_2',
#     'genre_3',
#     'genre_4',
#     'genre_5',
#     'genre_6',
#     'genre_7',
#     'genre_8',
#     'genre_9',
#     'share_age',
#     'share_cn',
#     'share_genre',
#     'share_popularity',
#     'share_prob_user_listens_by_gender',
#     'user_age',
#     'user_avg_plays',
#     'user_avg_popularity',
#     'user_avg_sub_global_avg',
#     'user_num_artists',
#     'user_sex_f',
#     'user_sex_m',
#     'user_sex_u',
    
    'artist_avg_sub_global_avg',
    'artist_avg_plays',
    'artist_average_listener_age',
    'genre_0',
    'genre_1',
    'genre_10',
    'genre_11',
    'genre_12',
    'genre_13',
    'genre_14',
    'genre_15',
    'genre_16',
    'genre_17',
    'genre_18',
    'genre_19',
    'genre_2',
    'genre_3',
    'genre_4',
    'genre_5',
    'genre_6',
    'genre_7',
    'genre_8',
    'genre_9',
    'share_cn',
    'share_age',
    'share_genre',
    'share_popularity',
    'share_prob_user_listens_by_gender',
    'user_avg_plays',
}
support = [0 for _ in range(len(available_features))]
for i, feature in enumerate(available_features):
    if feature in desired_features:
        support[i] = 1

In [135]:
v.restrict(support)
X_restricted = v.transform(feature_list)

In [136]:
X_restricted.shape

(100000, 83220)

In [137]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Linear SVR, RandomForest seem promising.

In [111]:
def KNN(X, Y, samples):
    
    # For D < 20 use KDTree
    
    start = time.clock()
    
    X_train, X_test, y_train, y_test = train_test_split(X[:samples], Y[:samples], test_size=0.33)
    
    X_train = maxabs_scale(X_train)
    X_test = maxabs_scale(X_test)
    
    clf = KNeighborsRegressor(n_neighbors=3, n_jobs=-1, algorithm='ball_tree', leaf_size=30)  # 'ball_tree' for higher D
    clf.fit(X_train, y_train)
    y_hat = clf.predict(X_test)
    mae = mean_absolute_error(y_test, y_hat)
    print(mae)
    
    print(time.clock() - start)

In [113]:
KNN(X_restricted, Y, 10000)

240.556969697
1.956768


In [138]:
def RF(X, Y, samples):
    
    start = time.clock()
    
    X_train, X_test, y_train, y_test = train_test_split(X[:samples], Y[:samples], test_size=0.33)
    
    X_train = maxabs_scale(X_train)
    X_test = maxabs_scale(X_test)
    
    clf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    clf.fit(X_train, y_train)
    y_hat = clf.predict(X_test)
    mae = mean_absolute_error(y_test, y_hat)
    print(mae)
    
    print(time.clock() - start)

In [140]:
RF(X_restricted, Y, 10000)

125.221321212
1115.305561


In [101]:
def LR(X, Y, samples):
    
    start = time.clock()
    
    X_train, X_test, y_train, y_test = train_test_split(X[:samples], Y[:samples], test_size=0.33)
    
    X_train = maxabs_scale(X_train)
    X_test = maxabs_scale(X_test)
    
    clf = LinearRegression(n_jobs=-1)
    clf.fit(X_train, y_train)
    y_hat = clf.predict(X_test)
    mae = mean_absolute_error(y_test, y_hat)
    print(mae)
    
    print(time.clock() - start)

In [110]:
LR(X_restricted, Y, 400000)

293.292537955
1.77469
