from https://www.kaggle.com/opanichev/xgb-baseline

In [1]:
import numpy as np
import pandas as pd
import pickle
from collections import Counter

from scipy.io import mmread
from scipy import sparse
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import roc_auc_score

from tqdm import tqdm
#import xgboost as xgb
from xgboost import XGBRegressor # sklearn-style API
import utils



In [2]:
dfs = {}

dfs['train'] = pd.read_csv('data/postdiamond/train_full.csv')
dfs['test'] = pd.read_csv('data/postdiamond/test.csv')
df_songs = pd.read_csv('data/raw/songs.csv')
df_members = pd.read_csv('data/raw/members.csv')
song_map = pickle.load(open('data/processed/song_map.p', 'rb'))
X_genre = mmread('data/processed/X_genre.mtx') # shape is (n_songs, n_genres)

Filter to known songs

In [3]:
for x in dfs:
    dfs[x]['song_id'] = dfs[x]['song_id'].astype(str)
    dfs[x] = dfs[x][dfs[x]['song_id'].isin(df_songs['song_id'])]
    dfs[x]['song_idx'] = dfs[x]['song_id'].apply(lambda x: song_map[x])

# Genres and songs

In [4]:
df_genres = pd.DataFrame(X_genre.todense())
old_names = range(df_genres.shape[1])
new_names = ['genre%d' % x for x in old_names]
df_genres.rename(columns=dict(zip(old_names, new_names)), inplace=True)
df_genres.reset_index(inplace=True)
df_genres.rename(columns={'index': 'song_idx'}, inplace=True)
df_genres.sample(n=3)

Unnamed: 0,song_idx,genre0,genre1,genre2,genre3,genre4,genre5,genre6,genre7,genre8,...,genre182,genre183,genre184,genre185,genre186,genre187,genre188,genre189,genre190,genre191
598141,598141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
519512,519512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116579,116579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
genres = {}
for x in 

In [8]:
sparse.csr_matrix(X_genre.todense()[[598141, 519512], :])

<2x192 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [5]:
for x in dfs:
    dfs[x] = pd.merge(dfs[x], df_genres, how='left', on='song_idx')

In [6]:
del X_genre, df_genres

Add song features

In [7]:
df_songs.sample(n=3)

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
734012,tuutCvYaSKBeBBz80qy0uGgohnRkwlEEXZTYJC7hWQs=,287834,1609,James Grant & Jody Wisternoff,,,52.0
1292145,oiXX5XOD0zjcXMdXiDlAdX3vLflYSmdUXZoc133TrPE=,201734,465,Tim Be Told,Timothy Ouyang,,52.0
50939,OkuUwIeDyoxHQIJkC8AFHLjh+uUdV38V+/93PB8tMSA=,307571,465,張衛健 (Dicky Cheung),Peter Lai,Poon Wai Yuen,24.0


In [8]:
df_songs['language'] = df_songs['language'].astype(str)
song_features = ['song_id', 'song_length', 'artist_name', 'composer', 'lyricist', 'language']
for x in dfs:
    dfs[x] = pd.merge(dfs[x], df_songs[song_features], how='left', on='song_id')

In [9]:
del df_songs

# Members

In [10]:
df_members.sample(n=3)

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,expiration_date
30461,E9285C+V22kokpkOauwK02uzWGRMWCnzBG3xVE4YhgU=,1,0,,3,20130123,20130206
22403,2SE2+gpMhsI8VdSEcDj5g+JRFhr+RPmCQ5wi00stzIw=,22,23,,3,20140427,20170920
25918,5Fbbr04b/KbERvpAc5gnNqF6wBy7WxnIcOMT53m8A0g=,18,22,male,7,20130831,20171002


In [11]:
# Preprocess members data
for x in ['registration_init_time', 'expiration_date']:
    df_members[x] = df_members[x].astype(int)
df_members['registered_via'] = df_members['registered_via'].astype(str)

for x in dfs:
    dfs[x] = pd.merge(dfs[x], df_members, on='msno', how='left')

In [12]:
del df_members

# Preprocess df_train, df_test

* tqdm is a decorator on an iterator that adds a progress bar
* Label encoder seems like the wrong thing here: xgb will treat it as an int!

In [19]:
dfs['train'][[c for c in dfs['train'].columns if 'genre' not in c]].isnull().sum()

Unnamed: 0                      0
msno                            0
song_id                         0
source_system_tab           12767
source_screen_name         260863
source_type                 14299
target                          0
role                            0
row_index                       0
intercept                       0
diamond_prediction              0
song_idx                        0
song_length                     0
artist_name                     0
composer                  1106799
lyricist                  2096641
language                        0
city                            0
bd                              0
gender                    1949868
registered_via                  0
registration_init_time          0
expiration_date                 0
dtype: int64

In [20]:
[c for c, d in zip(dfs['train'].columns, dfs['train'].dtypes) \
 if d == 'object']

['msno',
 'song_id',
 'source_system_tab',
 'source_screen_name',
 'source_type',
 'role',
 'artist_name',
 'composer',
 'lyricist',
 'language',
 'gender',
 'registered_via']

In [21]:
# Preprocess dataset
for x in ['source_system_tab', 'source_screen_name', 'source_type']:
    for tt in dfs:
        dfs[tt][x] = dfs[tt][x].astype(str)

cols_to_drop = ['target', 'msno', 'song_id', 'song_idx', 'role',
                'Unnamed: 0', 'diamond_prediction', 'intercept', 'row_index']
cols = set(list(dfs['train'].columns)) - set(cols_to_drop)
cols = list(cols)
print([c for c in cols if
      dfs['train'].dtypes[c] == 'object'])

['registered_via', 'lyricist', 'source_system_tab', 'source_type', 'source_screen_name', 'language', 'composer', 'artist_name', 'gender']


In [22]:
for col in cols:
    if dfs['train'][col].dtype == 'object':
        print('building label encoder for `%s`' % col)
        le = LabelEncoder()
        unique_values = set()
        for x in dfs:
            dfs[x][col] = dfs[x][col].apply(str)
            unique_values = unique_values.union(set(dfs[x][col].unique()))
        le.fit(list(unique_values))
        for x in dfs:
            dfs[x][col] = le.transform(dfs[x][col])

building label encoder for `registered_via`
building label encoder for `lyricist`
building label encoder for `source_system_tab`
building label encoder for `source_type`
building label encoder for `source_screen_name`
building label encoder for `language`
building label encoder for `composer`
building label encoder for `artist_name`
building label encoder for `gender`


In [23]:
[(c, d) for c, d in zip(dfs['train'].columns, dfs['train'].dtypes) \
 if 'genre' not in c]

[('Unnamed: 0', dtype('int64')),
 ('msno', dtype('O')),
 ('song_id', dtype('O')),
 ('source_system_tab', dtype('int64')),
 ('source_screen_name', dtype('int64')),
 ('source_type', dtype('int64')),
 ('target', dtype('int64')),
 ('role', dtype('O')),
 ('row_index', dtype('int64')),
 ('intercept', dtype('float64')),
 ('diamond_prediction', dtype('float64')),
 ('song_idx', dtype('int64')),
 ('song_length', dtype('int64')),
 ('artist_name', dtype('int64')),
 ('composer', dtype('int64')),
 ('lyricist', dtype('int64')),
 ('language', dtype('int64')),
 ('city', dtype('int64')),
 ('bd', dtype('int64')),
 ('gender', dtype('int64')),
 ('registered_via', dtype('int64')),
 ('registration_init_time', dtype('int64')),
 ('expiration_date', dtype('int64'))]

In [27]:
dfs['train'].sample(n=3).drop(cols_to_drop, axis=1).T

Unnamed: 0,2474408,536259,870492
source_system_tab,0.0,3.0,3.0
source_screen_name,5.0,8.0,8.0
source_type,0.0,3.0,3.0
genre0,0.0,0.0,0.0
genre1,0.0,0.0,0.0
genre2,0.0,0.0,0.0
genre3,0.0,0.0,0.0
genre4,0.0,0.0,0.0
genre5,0.0,0.0,0.0
genre6,0.0,0.0,0.0


In [43]:
# label encoding -> one hot encoding
hot_encodings = {}
ohe_cols = ['source_system_tab', 'source_screen_name', 'source_type', 
            'artist_name', 'composer', 'lyricist', 'language', 'city',
           'gender', 'registered_via']
for x in ohe_cols:
    ohe = OneHotEncoder()
    hot_encodings[x] = ohe.fit_transform(dfs['train'][[x]].as_matrix())

In [54]:
X_train = sparse.hstack([v for k, v in hot_encodings.items()] +
                       [sparse.csr_matrix(dfs['train'][[x for x in dfs['train'].columns \
                                                       if 'genre' in x]].as_matrix())] +
                        [dfs['train'][['song_length', 'bd', 'registration_init_time',
                                       'expiration_date']].as_matrix()])

In [None]:
y_train = y_train.target - y_train.diamond_prediction

In [None]:
y_train = dfs['train']['target'] - dfs['train']['diamond_prediction']

In [56]:
del dfs['train']

In [55]:
model = XGBRegressor(max_depth=4,
                    learning_rate=0.1,
                    n_estimators=200,
                    nthread=6)

In [56]:
model.fit(X['train'], y['train'])

In [19]:

# df_train model, evaluate and make predictions
watchlist = [(D['train'], 'train'), (D['val'], 'val')]
params = {}
params['objective'] = 'reg:linear'
params['max_depth'] = 4
params['silent'] = 1
MAX_ITS = 200


model = xgb.train(params, D['train'], MAX_ITS, watchlist, early_stopping_rounds=10, verbose_eval=1)
#model.save('model/xgboost_postdiamond_%d' % MAX_ITS)

[0]	train-rmse:0.56525	val-rmse:0.661395
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 10 rounds.
[1]	train-rmse:0.504811	val-rmse:0.590195
[2]	train-rmse:0.472358	val-rmse:0.548613
[3]	train-rmse:0.455573	val-rmse:0.524506
[4]	train-rmse:0.447043	val-rmse:0.510399
[5]	train-rmse:0.442767	val-rmse:0.502121
[6]	train-rmse:0.440593	val-rmse:0.497007
[7]	train-rmse:0.439501	val-rmse:0.493842
[8]	train-rmse:0.438938	val-rmse:0.491808
[9]	train-rmse:0.438648	val-rmse:0.490567
[10]	train-rmse:0.438484	val-rmse:0.489682
[11]	train-rmse:0.438354	val-rmse:0.489024
[12]	train-rmse:0.438293	val-rmse:0.488604
[13]	train-rmse:0.438238	val-rmse:0.488307
[14]	train-rmse:0.438193	val-rmse:0.488105
[15]	train-rmse:0.438162	val-rmse:0.487991
[16]	train-rmse:0.438099	val-rmse:0.487845
[17]	train-rmse:0.438057	val-rmse:0.487702
[18]	train-rmse:0.438014	val-rmse:0.487613
[19]	train-rmse:0.437987	val-rmse:0.487594
[20]	train

In [20]:
model.save_model('models/xgboost_postdiamond_%d' % MAX_ITS)

Evaluate on validation data

In [21]:
xgb_preds = model.predict(D['val'])
roc_auc_score(dfs['val']['target'],
              dfs['val']['diamond_prediction'] + xgb_preds)

0.6517490247447284

In [22]:
model.attributes()

{'best_iteration': '103',
 'best_msg': '[103]\ttrain-rmse:0.436809\tval-rmse:0.48686',
 'best_score': '0.48686'}

In [23]:
model.best_ntree_limit

104