In [1]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import maxabs_scale, normalize
from sklearn.linear_model import LinearRegression
import numpy as np
import cPickle as pickle
import time

In [2]:
artist_lookup = pickle.load(open('artist_lookup.pkl'))

In [3]:
user_lookup = pickle.load(open('user_lookup.pkl'))

In [4]:
train = pd.read_csv('train.csv')

In [5]:
profiles_data = pd.read_csv('profiles.csv')
user_id_lookup = profiles_data.groupby('user').groups
def get_user_id(user_hash):
    return user_id_lookup[user_hash][0]

In [6]:
artists_data = pd.read_csv('artists.csv')
artist_id_lookup = artists_data.groupby('artist').groups
def get_artist_id(artist_hash):
    return artist_id_lookup[artist_hash][0]

In [7]:
def feature_extraction(training_data, num_examples=None):
        
    start = time.clock()
    if not num_examples:
        num_examples = len(training_data)
    
    list_of_features = np.empty(num_examples, dtype=dict)

    example_i = 0
    for _, row in training_data[:num_examples].iterrows():
        
        features = {}

        # ************************************************
        # Features for the user, independent of the artist
        user_features = user_lookup[row['user']]
        
#         features['user_id_{}'.format(get_user_id(row['user']))] = 1
        
        features['user_sex_{}'.format(user_features['sex'])] = 1

        # Seems unlikely to improve model if included raw
        user_cn = user_features['cn']
        user_region = user_features['region']
        
        user_age = user_features['age']
        features['user_age'] = user_age
        
        features['user_avg_plays'] = user_features['average_plays']
        features['user_num_artists'] = user_features['num_artists']
        features['user_median_plays'] = user_features['user_median_plays']
        features['user_avg_log_plays'] = user_features['average_log_plays']
        
        user_popularity = user_features['average_popularity']
        features['user_avg_popularity'] = user_popularity
        
        features['user_avg_sub_global_avg'] = user_features['user_avg_sub_global_avg']
        
#         features['user_fav_genre_{}'.format(user_features['favorite_genres'].most_common(1)[0][0])] = 1
        
        # ****************************************
        # Just the artist, independent of the user
        artist_features = artist_lookup[row['artist']]
        
#         features['artist_id_{}'.format(get_artist_id(row['artist']))] = 1
        
        artist_popularity = artist_features['popularity']
        features['artist_spotify_popularity'] = artist_popularity
        
        features['artist_avg_sub_global_avg'] = artist_features['artist_avg_sub_global_avg']
        
        artist_age = artist_features['average_age']
        features['artist_average_listener_age'] = artist_age
        
        prop_male, prop_female, prop_unknown = artist_features['m'], artist_features['f'], artist_features['u']

        artist_num_listeners = artist_features['total_listeners']
        features['artist_num_listeners'] = artist_num_listeners
        
        features['artist_avg_plays'] = artist_features['avg_plays'] 
        features['artist_median_plays'] = artist_features['median_plays']
        features['artist_avg_log_plays'] = artist_features['log_average_plays']
        
        features['artist_genre_{}'.format(artist_features['genre_id'])] = 1
        
        
        # **********************************************
        # Features dependent on both the artist and user
        features['share_cn'] = 1 if user_cn in [cn for cn, _ in artist_features['fan_cns'].most_common(5)] else 0
        features['share_prob_user_from_region'] = artist_features['fan_cns'][user_cn] / np.sum(artist_features['fan_cns'].values())
        features['share_prob_user_from_country'] = artist_features['fan_regions'][user_region] / np.sum(artist_features['fan_regions'].values())
        
        # TODO: Get percentage of fans in same region, and percentage in same country.
            
        if abs(user_age - artist_age) < 2:
            features['share_age'] = 1
            
        features['diff_age'] = abs(user_age - artist_age)  
            
        fav_user_genres = set([genre for genre, _ in user_features['favorite_genres'].most_common(4)])
        artist_genres = set(artist_features['genres'])
        shared_genres = fav_user_genres & artist_genres
        g_count = 0
        for g in shared_genres:  
            g_count += user_features['favorite_genres'][g]
        features['share_genre'] = g_count
        # TODO Maybe also encode the actual genres? 
#         features['share_genre'] = 1 if len(shared_genres) > 0 else 0
        
        if abs(user_popularity - artist_popularity) < 5:
            features['share_popularity'] = 1
            
        features['diff_popularity'] = abs(user_popularity - artist_popularity)   

        features['share_prob_user_listens_by_gender_{}'.format(user_features['sex'])] = artist_features[user_features['sex']]
            
        list_of_features[example_i] = features
        example_i += 1
        
    print(time.clock() - start)
    return list_of_features

In [31]:
train_n = 4000000
feature_list = feature_extraction(train[:train_n])
Y = np.array(train[:train_n]['plays'])

1437.083219


In [32]:
pickle.dump(feature_list, open('trainer.pkl', 'wb'))

In [33]:
v = DictVectorizer()
X = v.fit(feature_list)

In [34]:
available_features = v.get_feature_names()
support = [1 for _ in range(len(available_features))]

In [47]:
len(available_features)

48

In [35]:
possible_features = set(available_features)

undesired_features = {
    'artist_average_listener_age',
    'artist_avg_plays',
    'artist_avg_sub_global_avg',
    'share_age',
    'share_popularity',
    'user_age',
    'user_avg_plays',
    'user_avg_sub_global_avg',
}

actual_features = possible_features - undesired_features

support = [0 for _ in range(len(possible_features))]
for i, feature in enumerate(possible_features):
    if feature in actual_features:
        support[i] = 1

In [36]:
v.restrict(support)
X_restricted = v.transform(feature_list)

In [37]:
X_restricted.shape

(4000000, 40)

In [43]:
def LR(X, Y, samples):
    
    start = time.clock()
    
    X_train, X_test, y_train, y_test = train_test_split(X[:samples], Y[:samples], test_size=2)
    
    X_train = maxabs_scale(X_train)
    X_test = maxabs_scale(X_test)
    
    clf = LinearRegression(n_jobs=-1)
    clf.fit(X_train, np.log(y_train))
    
    y_hat = clf.predict(X_test)
    y_hat_train = clf.predict(X_train)
    
    test_mae = mean_absolute_error(y_test, np.exp(y_hat))
    train_mae = mean_absolute_error(y_train, np.exp(y_hat_train))
    
    print('Testing:')
    print(test_mae)
    
    print('Training:')
    print(train_mae)
    
    print('Time:')
    print(time.clock() - start)
    
    return clf

In [44]:
slf_to_submit = LR(X_restricted, Y, 4000000)

Testing:
6559.90448321
Training:
130.589052999
Time:
34.582461


In [45]:
pickle.dump(slf_to_submit, open('lr_final.pkl', 'w'))

In [32]:
def RF(X, Y, samples, depth=None):
    
    start = time.clock()
    
    X_train, X_test, y_train, y_test = train_test_split(X[:samples], Y[:samples], test_size=0.33)
    
    X_train = maxabs_scale(X_train)
    X_test = maxabs_scale(X_test)
    
    clf = RandomForestRegressor(n_estimators=40, n_jobs=-1, max_depth=depth)
    clf.fit(X_train, np.log(y_train))
    
    y_hat = clf.predict(X_test)
    y_hat_train = clf.predict(X_train)
    
    test_mae = mean_absolute_error(y_test, np.exp(y_hat))
    train_mae = mean_absolute_error(y_train, np.exp(y_hat_train))
    
    print('Testing:')
    print(test_mae)
    
    print('Training:')
    print(train_mae)
    
    print('Time:')
    print(time.clock() - start)

In [36]:
RF(X_restricted, Y, 30000)

Testing:
197.695324592
Training:
62.1124184893
Time:
121.641689


In [41]:
RF(X_restricted, Y, 50000, depth=100)

Testing:
138.068898838
Training:
60.7693671304
Time:
296.966475


In [213]:
RF(X_restricted, Y, 30000, depth=30)

Testing:
207.044377488
Training:
62.8956469081
Time:
116.359481


In [101]:
def LR(X, Y, samples):
    
    start = time.clock()
    
    X_train, X_test, y_train, y_test = train_test_split(X[:samples], Y[:samples], test_size=0.33)
    
    X_train = maxabs_scale(X_train)
    X_test = maxabs_scale(X_test)
    
    clf = LinearRegression(n_jobs=-1)
    clf.fit(X_train, y_train)
    y_hat = clf.predict(X_test)
    mae = mean_absolute_error(y_test, y_hat)
    print(mae)
    
    print(time.clock() - start)

In [110]:
LR(X_restricted, Y, 400000)

293.292537955
1.77469
