In [14]:
import keras
import pandas as pd
import numpy as np
from keras.layers import Dense, Activation, Embedding, Input, Concatenate, Flatten
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_auc_score
import pickle
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

In [6]:
train = pd.read_csv('../Data/train.csv')
test = pd.read_csv('../Data/test.csv')
songs = pd.read_csv('../Data/songs.csv', usecols = ['song_id', 'artist_name']).astype(str)
train = train.merge(songs, on='song_id', how='left')
test = test.merge(songs, on='song_id', how='left')

In [9]:
cols = ['msno', 'song_id', 'source_screen_name', 'source_type']

for col in tqdm(cols):
    if train[col].dtype == 'object':
        train[col] = train[col].apply(str)
        test[col] = test[col].apply(str)

        le = LabelEncoder()
        train_vals = list(train[col].unique())
        test_vals = list(test[col].unique())
        le.fit(train_vals + test_vals)
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

100%|██████████| 4/4 [01:33<00:00, 21.58s/it]


In [10]:
#Case 1: User missing in training but song is present (in training)
msno_song = test[~test.msno.isin(train.msno) & test.song_id.isin(train.song_id)]

In [33]:
song_embedding_size = 64
user_embedding_size = 64
other_embedding_size = 16
source_embedding_size = 10
extra_dense = 128
batch_size = 32768
num_epochs = 100
save_path = '../Models/only_song_var2.h5'

Case 1: User missing in training but song is present (in training)

In [12]:
# Song Stats used in embedding
# number of times an artist has been played 
_dict_count_artist_played_train = {k: v for k, v in train['artist_name'].value_counts().iteritems()}
def count_artist_played(x):
    try:
        return _dict_count_artist_played_train[x]
    except KeyError:
        return 0
        
# number of times a song has been played
_dict_count_song_played_train = {k: v for k, v in train['song_id'].value_counts().iteritems()}
def count_song_played(x):
    try:
        return _dict_count_song_played_train[x]
    except KeyError:
        return 0

repeated_songs_dict = train[train.target == 1].groupby(['song_id'])['target'].count().to_dict()
def repeated_songs(x):
    try:
        return repeated_songs_dict[x]
    except KeyError:
        return 0

#has anyone listened to the artist again?
repeated_artists_dict = train[train.target == 1].groupby(['artist_name'])['target'].count().to_dict()
def repeated_artists(x):
    try:
        return repeated_artists_dict[x]
    except KeyError:
        return 0

In [24]:
train['count_artist_played'] = train['artist_name'].map(str).apply(count_artist_played)
train['count_song_played'] = train['song_id'].apply(count_song_played)
train['repeated_song'] = train['song_id'].apply(repeated_songs)
train['repeated_artist'] = train['artist_name'].map(str).apply(repeated_artists)

In [26]:
msno_song['count_artist_played'] = msno_song['artist_name'].map(str).apply(count_artist_played)
msno_song['count_song_played'] = msno_song['song_id'].apply(count_song_played)
msno_song['repeated_song'] = msno_song['song_id'].apply(repeated_songs)
msno_song['repeated_artist'] = msno_song['artist_name'].map(str).apply(repeated_artists)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [28]:
input_sizes = {
    'song_id': max(train.song_id.max(), test.song_id.max()) +1,
    'source_type': max(train.source_type.max(), test.source_type.max()) +1,
    'source_screen_name': max(train.source_screen_name.max(), test.source_screen_name.max()) +1,
    'count_artist_played': train.count_artist_played.max() +1,
    'count_song_played': train.count_song_played.max() +1,
    'repeated_song': train.repeated_song.max() +1,
    'repeated_artist': train.repeated_artist.max() +1, 
}

In [29]:
song_input = Input(shape = (1, ))
#user_input = Input(shape = (1, ))
count_artist_input = Input(shape = (1, ))
count_song_input = Input(shape = (1, ))
repeated_song_input = Input(shape = (1, ))
repeated_artist_input = Input(shape = (1, ))
s_scr_name_input = Input(shape = (1, ))
s_type_input = Input(shape = (1, ))

In [34]:
song_emb = Flatten()(Embedding(output_dim = song_embedding_size, input_dim=input_sizes['song_id'], embeddings_regularizer=l2(1e-4), embeddings_initializer='glorot_uniform')(song_input))
#user_emb = Flatten()(Embedding(output_dim = user_embedding_size, input_dim=input_sizes['msno'], embeddings_regularizer=l2(1e-4), embeddings_initializer='glorot_uniform')(user_input))
count_artist_emb = Flatten()(Embedding(output_dim = other_embedding_size, input_dim=input_sizes['count_artist_played'], embeddings_initializer='glorot_uniform')(count_artist_input))
count_song_emb = Flatten()(Embedding(output_dim = other_embedding_size, input_dim=input_sizes['count_song_played'], embeddings_initializer='glorot_uniform')(count_song_input))
repeated_song_emb = Flatten()(Embedding(output_dim = other_embedding_size, input_dim=input_sizes['repeated_song'], embeddings_initializer='glorot_uniform')(repeated_song_input))
repeated_artist_emb = Flatten()(Embedding(output_dim = other_embedding_size, input_dim=input_sizes['repeated_artist'], embeddings_initializer='glorot_uniform')(repeated_artist_input))
s_scr_name_emb = Flatten()(Embedding(output_dim = source_embedding_size, input_dim=input_sizes['source_screen_name'], embeddings_initializer='glorot_uniform')(s_scr_name_input))
s_type_emb = Flatten()(Embedding(output_dim = source_embedding_size, input_dim=input_sizes['source_type'], embeddings_initializer='glorot_uniform')(s_type_input))

In [35]:
embedding_layer = Concatenate(axis=-1)([song_emb, count_artist_emb, count_song_emb, repeated_song_emb, repeated_artist_emb, s_scr_name_emb, s_type_emb])
embedding_layer = keras.layers.Dropout(0.5)(Dense(extra_dense, activation = 'relu', kernel_initializer = 'glorot_normal')(embedding_layer))
prediction = Dense(1, activation='sigmoid')(embedding_layer)

In [36]:
model = keras.models.Model(inputs=[song_input, count_artist_input, count_song_input, repeated_song_input, repeated_artist_input, s_scr_name_input, s_type_input],
                           outputs = [prediction])
model.summary()
model.compile(optimizer=keras.optimizers.Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
input_3 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
input_4 (InputLayer)             (None, 1)             0                                            
___________________________________________________________________________________________

In [None]:
early_stopping = EarlyStopping(monitor='val_acc', patience = 5)
model_checkpoint = ModelCheckpoint(save_path, save_best_only = True, save_weights_only=False)

model.fit([train.song_id, train.count_artist_played, train.count_song_played, train.repeated_song, train.repeated_artist, train.source_screen_name, train.source_type],
          [train.target], epochs = num_epochs, batch_size = batch_size, verbose=1,
          validation_split=0.2, validation_data=None, shuffle=True,
          callbacks = [early_stopping, model_checkpoint])

model.save(save_path)

Train on 5901934 samples, validate on 1475484 samples
Epoch 1/100

In [None]:
predicted = model.predict([msno_song.song_id, msno_song.count_artist_played, msno_song.count_song_played, msno_song.repeated_song, msno_song.repeated_artist, msno_song.source_screen_name, msno_song.source_type], batch_size=batch_size, verbose=2)
new_test = pd.DataFrame({'id': msno_song.id, 'target': predicted.ravel()})
new_test.to_csv('../Test/submission_only_song_var2.csv', index=False)