In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import datetime

### Loading the data

In [2]:
print('Loading data...')
data_path = 'input/'

# Column dtypes shared by the train and test interaction files.
_interaction_dtypes = {
    'msno': 'category',
    'source_system_tab': 'category',
    'source_screen_name': 'category',
    'source_type': 'category',
    'song_id': 'category',
}
# train.csv additionally carries the 'target' label column.
train = pd.read_csv(data_path + 'train.csv',
                    dtype=dict(_interaction_dtypes, target=np.uint8))
test = pd.read_csv(data_path + 'test.csv', dtype=_interaction_dtypes)

# Every explicitly typed song column is read as a categorical.
_song_dtypes = {
    'genre_ids': 'category',
    'language': 'category',
    'artist_name': 'category',
    'composer': 'category',
    'lyricist': 'category',
    'song_id': 'category',
}
songs = pd.read_csv(data_path + 'songs.csv', dtype=_song_dtypes)

_member_dtypes = {
    'city': 'category',
    'bd': np.uint8,
    'gender': 'category',
    'registered_via': 'category',
}
# The two date columns are parsed to datetimes so that the .dt accessors work later.
members = pd.read_csv(data_path + 'members.csv',
                      dtype=_member_dtypes,
                      parse_dates=['registration_init_time', 'expiration_date'])

songs_extra = pd.read_csv(data_path + 'song_extra_info.csv')
print('Done loading...')

Loading data...
Done loading...


### Merging song metadata into the train and test datasets

In [3]:
print('Data merging...')

# Left-join the song metadata onto every train/test row, keyed on song_id.
train = pd.merge(train, songs, on='song_id', how='left')
test = pd.merge(test, songs, on='song_id', how='left')

Data merging...


### Creating new features 

In [4]:
# Grab both timestamp columns up front; 'expiration_date' is overwritten
# further down, so every feature is derived from these references.
_registered_on = members['registration_init_time']
_expires_on = members['expiration_date']

# Whole days between registration and expiration.
members['membership_days'] = (_expires_on - _registered_on).dt.days.astype(int)

# Calendar parts of the registration timestamp
# ('registration_date' holds the day of the month).
members['registration_year'] = _registered_on.dt.year
members['registration_month'] = _registered_on.dt.month
members['registration_date'] = _registered_on.dt.day

# Calendar parts of the expiration timestamp.
members['expiration_year'] = _expires_on.dt.year
members['expiration_month'] = _expires_on.dt.month
# From here on 'expiration_date' contains only the day of the month.
members['expiration_date'] = _expires_on.dt.day
members = members.drop(columns=['registration_init_time'])

### Getting the year of each song from its ISRC code

In [5]:
def isrc_to_year(isrc, cutoff=17):
    """Return the four-digit year encoded in an ISRC string.

    The two characters at positions 5-6 of the code are read as a two-digit
    year. Values greater than ``cutoff`` are placed in the 1900s, all other
    values in the 2000s.

    Parameters
    ----------
    isrc : str or any
        The ISRC code. Anything that is not a string (e.g. a NaN from a
        missing value) yields NaN.
    cutoff : int, optional
        Largest two-digit year still mapped to the 2000s (default 17).

    Returns
    -------
    int or float
        The year as an int, or ``np.nan`` when ``isrc`` is not a string or
        its year field is not exactly two decimal digits.
    """
    if not isinstance(isrc, str):
        return np.nan

    year_field = isrc[5:7]
    # Short or malformed codes previously raised ValueError (or were parsed
    # from a single digit); treat them as missing instead.
    if len(year_field) != 2 or not year_field.isdecimal():
        return np.nan

    two_digit_year = int(year_field)
    century = 1900 if two_digit_year > cutoff else 2000
    return century + two_digit_year

In [6]:
# Derive the year from the ISRC code, then discard the raw 'isrc' and 'name' columns.
songs_extra['song_year'] = songs_extra['isrc'].map(isrc_to_year)
songs_extra = songs_extra.drop(columns=['isrc', 'name'])

### Merging the remaining datasets

In [7]:
# Attach the member attributes to every train/test row.
train = train.merge(members, on='msno', how='left')
test = test.merge(members, on='msno', how='left')

# Attach the per-song extras (song_year) and tidy up the song columns.
train = train.merge(songs_extra, on = 'song_id', how = 'left')
# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute: that form is chained assignment, which pandas warns about and
# which no longer updates the frame under Copy-on-Write.
# Rows without a song length get the placeholder value 200000.
train['song_length'] = train['song_length'].fillna(200000).astype(np.uint32)
train['song_id'] = train['song_id'].astype('category')


test = test.merge(songs_extra, on = 'song_id', how = 'left')
test['song_length'] = test['song_length'].fillna(200000).astype(np.uint32)
test['song_id'] = test['song_id'].astype('category')

print('Done merging...')

Done merging...


In [8]:
def genre_id_count(x):
    """Count the genre ids in a '|'-separated string.

    The placeholder 'no_genre_id' counts as zero genres; any other value
    counts as one more than its number of '|' separators.
    """
    return 0 if x == 'no_genre_id' else 1 + x.count('|')

# Register the placeholder as a valid category first; a categorical column
# cannot be filled with a value that is not among its categories.
train['genre_ids'] = train['genre_ids'].cat.add_categories(['no_genre_id'])
test['genre_ids'] = test['genre_ids'].cat.add_categories(['no_genre_id'])
# Assign the result back rather than using fillna(inplace=True) on a selected
# column: that is chained assignment and does not update the frame under
# pandas Copy-on-Write.
train['genre_ids'] = train['genre_ids'].fillna('no_genre_id')
test['genre_ids'] = test['genre_ids'].fillna('no_genre_id')

In [9]:
# Same feature on both frames: number of genre ids listed for the song.
for frame in (train, test):
    frame['genre_ids_count'] = frame['genre_ids'].apply(genre_id_count)

In [10]:
def lyricist_count(x):
    """Count the lyricists named in a credit string.

    Names are taken to be separated by '|', '/', '\\' or ';'. The placeholder
    'no_lyricist' counts as zero; any other value counts as one more than the
    total number of separators it contains.
    """
    if x == 'no_lyricist':
        return 0
    # The original had a second, unreachable return after the if/else; removed.
    return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

# Register the placeholder category before filling the missing lyricists.
train['lyricist'] = train['lyricist'].cat.add_categories(['no_lyricist'])
test['lyricist'] = test['lyricist'].cat.add_categories(['no_lyricist'])

# Assign the result back rather than using fillna(inplace=True) on a selected
# column: that is chained assignment and does not update the frame under
# pandas Copy-on-Write.
train['lyricist'] = train['lyricist'].fillna('no_lyricist')
test['lyricist'] = test['lyricist'].fillna('no_lyricist')
train['lyricists_count'] = train['lyricist'].apply(lyricist_count).astype(np.int8)
test['lyricists_count'] = test['lyricist'].apply(lyricist_count).astype(np.int8)



In [11]:
def composer_count(x):
    """Count the composers named in a credit string.

    The placeholder 'no_composer' gives 0. Otherwise the result is one more
    than the total number of '|', '/', '\\' and ';' characters in the string.
    """
    if x == 'no_composer':
        return 0
    separators = ('|', '/', '\\', ';')
    return 1 + sum(x.count(sep) for sep in separators)

# Register the placeholder category before filling the missing composers.
train['composer'] = train['composer'].cat.add_categories(['no_composer'])
test['composer'] = test['composer'].cat.add_categories(['no_composer'])

# Assign the result back rather than using fillna(inplace=True) on a selected
# column: that is chained assignment and does not update the frame under
# pandas Copy-on-Write.
train['composer'] = train['composer'].fillna('no_composer')
test['composer'] = test['composer'].fillna('no_composer')
train['composer_count'] = train['composer'].apply(composer_count).astype(np.int8)
test['composer_count'] = test['composer'].apply(composer_count).astype(np.int8)

In [12]:
def is_featured(x):
    """Return 1 when the text form of ``x`` contains 'feat', otherwise 0."""
    return int('feat' in str(x))


# Register the placeholder category before filling the missing artist names.
train['artist_name'] = train['artist_name'].cat.add_categories(['no_artist'])
test['artist_name'] = test['artist_name'].cat.add_categories(['no_artist'])

# Assign the result back rather than using fillna(inplace=True) on a selected
# column: that is chained assignment and does not update the frame under
# pandas Copy-on-Write.
train['artist_name'] = train['artist_name'].fillna('no_artist')
test['artist_name'] = test['artist_name'].fillna('no_artist')
train['is_featured'] = train['artist_name'].apply(is_featured).astype(np.int8)
test['is_featured'] = test['artist_name'].apply(is_featured).astype(np.int8)


In [13]:
def artist_count(x):
    """Count the artist-separator tokens in an artist string.

    The placeholder 'no_artist' gives 0. Otherwise the result is the total
    number of occurrences of 'and', ',', 'feat' and '&' as substrings, so a
    name containing none of these tokens also gives 0.
    """
    if x == 'no_artist':
        return 0
    return sum(x.count(token) for token in ('and', ',', 'feat', '&'))

train['artist_count'] = train['artist_name'].apply(artist_count).astype(np.int8)
test['artist_count'] = test['artist_name'].apply(artist_count).astype(np.int8)


# if artist is same as composer
# Both columns are categoricals with different category sets, and pandas
# refuses to compare those directly (TypeError). Compare the underlying
# values as plain objects instead.
train['artist_composer'] = (train['artist_name'].astype(object) == train['composer'].astype(object)).astype(np.int8)
test['artist_composer'] = (test['artist_name'].astype(object) == test['composer'].astype(object)).astype(np.int8)

In [43]:
# NOTE(review): diagnostic cell. This direct comparison raises the TypeError
# recorded below because the two categorical columns do not share the same
# categories; converting both sides with .astype(object) avoids it.
train['artist_name'] == train['composer']

TypeError: Categoricals can only be compared if 'categories' are the same. Categories are different lengths

print ("Adding new features")

# if artist, lyricist and composer are all three same
# The three columns are categoricals with different category sets, which
# pandas cannot compare directly (TypeError), so compare plain object values.
# artist == composer and artist == lyricist already imply composer == lyricist,
# so the third comparison is not needed.
for frame in (train, test):
    artist = frame['artist_name'].astype(object)
    composer = frame['composer'].astype(object)
    lyricist = frame['lyricist'].astype(object)
    frame['artist_composer_lyricist'] = ((artist == composer) & (artist == lyricist)).astype(np.int8)

# is song language 17 or 45. 
def song_lang_boolean(x):
    """Return 1 when the text form of ``x`` contains '17.0' or '45.0', else 0."""
    text = str(x)
    return int('17.0' in text or '45.0' in text)

# Same language flag on both frames, stored as int8.
for frame in (train, test):
    frame['song_lang_boolean'] = frame['language'].apply(song_lang_boolean).astype(np.int8)


# Threshold: mean song length over the training rows only; the same value is
# used for the test rows.
_mean_song_length = np.mean(train['song_length'])
def smaller_song(x):
    """Return 1 when ``x`` is strictly below the mean training song length, else 0."""
    return int(x < _mean_song_length)

for frame in (train, test):
    frame['smaller_song'] = frame['song_length'].apply(smaller_song).astype(np.int8)

# number of times a song has been played before
# Series.iteritems() was removed in pandas 2.0; Series.items() is the
# equivalent that exists in both old and new versions.
# value_counts() on a categorical column also lists unobserved categories
# with a count of 0; those are dropped so that they fall through to the test
# counts the same way a missing key does.
_dict_count_song_played_train = {k: v for k, v in train['song_id'].value_counts().items() if v > 0}
_dict_count_song_played_test = {k: v for k, v in test['song_id'].value_counts().items() if v > 0}
def count_song_played(x):
    """Return how often song ``x`` occurs in train, else in test, else 0."""
    train_count = _dict_count_song_played_train.get(x)
    if train_count is not None:
        return train_count
    return _dict_count_song_played_test.get(x, 0)


train['count_song_played'] = train['song_id'].apply(count_song_played).astype(np.int64)
test['count_song_played'] = test['song_id'].apply(count_song_played).astype(np.int64)

# number of times the artist has been played
# Series.iteritems() was removed in pandas 2.0; Series.items() is the
# equivalent that exists in both old and new versions.
# value_counts() on a categorical column also lists unobserved categories
# with a count of 0. Left in the train dict, such an entry would return 0 and
# the fallback to the test counts would never be reached, so zero counts are
# dropped here.
_dict_count_artist_played_train = {k: v for k, v in train['artist_name'].value_counts().items() if v > 0}
_dict_count_artist_played_test = {k: v for k, v in test['artist_name'].value_counts().items() if v > 0}
def count_artist_played(x):
    """Return how often artist ``x`` occurs in train, else in test, else 0."""
    train_count = _dict_count_artist_played_train.get(x)
    if train_count is not None:
        return train_count
    return _dict_count_artist_played_test.get(x, 0)

train['count_artist_played'] = train['artist_name'].apply(count_artist_played).astype(np.int64)
test['count_artist_played'] = test['artist_name'].apply(count_artist_played).astype(np.int64)


print("Done adding features")

In [14]:
print ("Train test and validation sets")
# Any column still stored as plain Python objects in train is converted to a
# categorical in both frames.
for name in [c for c in train.columns if train[c].dtype == object]:
    train[name] = train[name].astype('category')
    test[name] = test[name].astype('category')


# Separate the label from the training features.
y_train = train['target'].values
X_train = train.drop(columns=['target'])


# Keep the submission ids aside and remove them from the test features.
ids = test['id'].values
X_test = test.drop(columns=['id'])


# NOTE: the watchlist is built from the very same X_train / y_train as the
# training set, so the metric reported during training is measured on the
# training data, not on a held-out split.
d_train_final = lgb.Dataset(X_train, y_train)
watchlist_final = lgb.Dataset(X_train, y_train)
print('Processed data...')

Train test and validation sets
Processed data...


### Model 1

In [15]:
params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'learning_rate': 0.3 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 10,
        'num_rounds': 200,
        'metric' : 'auc'
    }

%time model_f1 = lgb.train(params, train_set=d_train_final,  valid_sets=watchlist_final, verbose_eval=5)



[5]	valid_0's auc: 0.720951
[10]	valid_0's auc: 0.735892
[15]	valid_0's auc: 0.743342
[20]	valid_0's auc: 0.748352
[25]	valid_0's auc: 0.752787
[30]	valid_0's auc: 0.755986
[35]	valid_0's auc: 0.758878
[40]	valid_0's auc: 0.761747
[45]	valid_0's auc: 0.764294
[50]	valid_0's auc: 0.766708
[55]	valid_0's auc: 0.768616
[60]	valid_0's auc: 0.770675
[65]	valid_0's auc: 0.772564
[70]	valid_0's auc: 0.77455
[75]	valid_0's auc: 0.776301
[80]	valid_0's auc: 0.777782
[85]	valid_0's auc: 0.77895
[90]	valid_0's auc: 0.780336
[95]	valid_0's auc: 0.781561
[100]	valid_0's auc: 0.782774
[105]	valid_0's auc: 0.783975
[110]	valid_0's auc: 0.785055
[115]	valid_0's auc: 0.786174
[120]	valid_0's auc: 0.787242
[125]	valid_0's auc: 0.78871
[130]	valid_0's auc: 0.789762
[135]	valid_0's auc: 0.791192
[140]	valid_0's auc: 0.792167
[145]	valid_0's auc: 0.79291
[150]	valid_0's auc: 0.79449
[155]	valid_0's auc: 0.795391
[160]	valid_0's auc: 0.796227
[165]	valid_0's auc: 0.797608
[170]	valid_0's auc: 0.798361
[175]

### Model 2

In [16]:
params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'dart',
        'learning_rate': 0.3 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 10,
        'num_rounds': 200,
        'metric' : 'auc'
    }

%time model_f2 = lgb.train(params, train_set=d_train_final,  valid_sets=watchlist_final, verbose_eval=5)

[5]	valid_0's auc: 0.720951
[10]	valid_0's auc: 0.734028
[15]	valid_0's auc: 0.740547
[20]	valid_0's auc: 0.745759
[25]	valid_0's auc: 0.750017
[30]	valid_0's auc: 0.752448
[35]	valid_0's auc: 0.754569
[40]	valid_0's auc: 0.757201
[45]	valid_0's auc: 0.759645
[50]	valid_0's auc: 0.76078
[55]	valid_0's auc: 0.76421
[60]	valid_0's auc: 0.765976
[65]	valid_0's auc: 0.767258
[70]	valid_0's auc: 0.769022
[75]	valid_0's auc: 0.770298
[80]	valid_0's auc: 0.771426
[85]	valid_0's auc: 0.771402
[90]	valid_0's auc: 0.771706
[95]	valid_0's auc: 0.772173
[100]	valid_0's auc: 0.772576
[105]	valid_0's auc: 0.772691
[110]	valid_0's auc: 0.772866
[115]	valid_0's auc: 0.774864
[120]	valid_0's auc: 0.775801
[125]	valid_0's auc: 0.776462
[130]	valid_0's auc: 0.777561
[135]	valid_0's auc: 0.778437
[140]	valid_0's auc: 0.779192
[145]	valid_0's auc: 0.779845
[150]	valid_0's auc: 0.780467
[155]	valid_0's auc: 0.780515
[160]	valid_0's auc: 0.779834
[165]	valid_0's auc: 0.780753
[170]	valid_0's auc: 0.781726
[1

### Model 3

In [17]:
params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'learning_rate': 0.4 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 10,
        'num_rounds': 250,
        'metric' : 'auc'
    }

%time model_f3 = lgb.train(params, train_set=d_train_final,  valid_sets=watchlist_final, verbose_eval=5)

[5]	valid_0's auc: 0.723729
[10]	valid_0's auc: 0.739205
[15]	valid_0's auc: 0.746846
[20]	valid_0's auc: 0.752161
[25]	valid_0's auc: 0.757114
[30]	valid_0's auc: 0.760884
[35]	valid_0's auc: 0.764447
[40]	valid_0's auc: 0.767514
[45]	valid_0's auc: 0.770833
[50]	valid_0's auc: 0.773266
[55]	valid_0's auc: 0.775311
[60]	valid_0's auc: 0.77757
[65]	valid_0's auc: 0.779417
[70]	valid_0's auc: 0.781149
[75]	valid_0's auc: 0.782635
[80]	valid_0's auc: 0.784085
[85]	valid_0's auc: 0.786266
[90]	valid_0's auc: 0.78741
[95]	valid_0's auc: 0.788687
[100]	valid_0's auc: 0.789852
[105]	valid_0's auc: 0.791132
[110]	valid_0's auc: 0.792272
[115]	valid_0's auc: 0.793334
[120]	valid_0's auc: 0.79478
[125]	valid_0's auc: 0.796126
[130]	valid_0's auc: 0.797103
[135]	valid_0's auc: 0.798319
[140]	valid_0's auc: 0.799268
[145]	valid_0's auc: 0.800811
[150]	valid_0's auc: 0.802055
[155]	valid_0's auc: 0.80265
[160]	valid_0's auc: 0.803421
[165]	valid_0's auc: 0.804306
[170]	valid_0's auc: 0.805373
[175

In [16]:
print('Making predictions')
# Score the test rows with the first two models.
p_test_1 = model_f1.predict(X_test)
p_test_2 = model_f2.predict(X_test)
# Element-wise mean of the two prediction vectors.
p_test_avg = np.stack([p_test_1, p_test_2]).mean(axis=0)


print('Done making predictions')

Making predictions
Done making predictions


In [17]:
# Score the test rows with the third model; kept separate from the two-model average.
p_test_3 = model_f3.predict(X_test)

In [18]:
# Submission for the third model alone: one row per test id, scores written
# with five decimal places.
subm = pd.DataFrame({'id': ids, 'target': p_test_3})
subm.to_csv('submissions/' + 'submission_lgbm_new_parameters.csv', index=False, float_format = '%.5f')


In [18]:
print ('Saving predictions Model model of gbdt')

# Submission for the average of the first two models, scores written with
# five decimal places.
subm = pd.DataFrame({'id': ids, 'target': p_test_avg})
subm.to_csv('submissions/' + 'submission_lgbm_avg.csv', index=False, float_format = '%.5f')

print('Done!')

Saving predictions Model model of gbdt
Done!


In [3]:
# Reload the two saved submission files and average their scores row by row.
d1 = pd.read_csv('submissions/submission_lgbm_new_parameters.csv')
d2 = pd.read_csv('submissions/submission_lgbm_avg.csv')

p_test_avg_3 = np.stack([d1['target'].values, d2['target'].values]).mean(axis=0)

In [4]:
# Blended submission frame: ids taken from the first file, averaged scores as target.
s = pd.DataFrame({'id': d1['id'], 'target': p_test_avg_3})

In [5]:
# Write the blended submission without the index column, scores formatted to five decimals.
s.to_csv('submissions/'+'submission_lgbm_avg_between_others.csv', index=False, float_format = '%.5f')