In [1]:
import keras
import pandas as pd
import numpy as np
from keras.layers import Dense, Activation, Embedding, Input, Concatenate, Flatten
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_auc_score
import pickle
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [41]:
train = pd.read_csv('../Data/train.csv')
test = pd.read_csv('../Data/test.csv')
members = pd.read_csv('../Data/members.csv',dtype={'city' : 'category',
                                                      'bd' : np.uint8,
                                                      'gender' : 'category',
                                                      'registered_via' : 'category'},
                             parse_dates=['registration_init_time','expiration_date'])

members['membership_days'] = members['expiration_date'].subtract(members['registration_init_time']).dt.days.astype(int)
members = members.drop(['bd', 'gender','registration_init_time','expiration_date'], axis=1)
train = train.merge(members, on='msno', how='left')
test = test.merge(members, on='msno', how='left')

train['source_screen_name'].fillna('none',inplace=True)
train['source_type'].fillna('none', inplace=True)
test['source_screen_name'].fillna('none',inplace=True)
test['source_type'].fillna('none',inplace=True)

In [42]:
cols = ['msno']

for col in tqdm(cols):
    if train[col].dtype == 'object':
        train[col] = train[col].apply(str)
        test[col] = test[col].apply(str)

        le = LabelEncoder()
        train_vals = list(train[col].unique())
        test_vals = list(test[col].unique())
        le.fit(train_vals + test_vals)
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

100%|██████████| 1/1 [00:23<00:00, 23.04s/it]


In [43]:
#Case 2: song missing in training but User exists
song_msno = test[~test.song_id.isin(train.song_id) & test.msno.isin(train.msno)]

In [44]:
user_embedding_size = 64
other_embedding_size = 32
source_embedding_size = 10
member_embedding_size = 10
extra_dense = 128
batch_size = 32768
num_epochs = 100
save_path = '../Models/only_user_var2.h5'

Case 2: Song missing in training but user is present (in training)

In [45]:
# User Stats used in embedding
#how frequently has the user repeated songs?
repeated_dict = train[train.target == 1].groupby(['msno'])['target'].count().to_dict()
def user_repeated_songs(x):
    try:
        return repeated_dict[x]
    except KeyError:
        return 0

# number of times source_screen_name has contributed to target=1
source_screen_dict = train[train.target == 1].groupby(['source_screen_name'])['target'].count().to_dict()
def source_screen_repeated(x):
    if x == 'none':
        return 0
    try:
        return source_screen_dict[x]
    except KeyError:
        return 0

# number of times source_type has contributed to target=1
source_type_dict = train[train.target == 1].groupby(['source_type'])['target'].count().to_dict()
def source_type_repeated(x):
    if x == 'none':
        return 0
    try:
        return source_type_dict[x]
    except KeyError:
        return 0

In [53]:
song_msno['user_repeated'] = song_msno['msno'].apply(user_repeated_songs).astype(np.int64)
song_msno['s_screen_repeat'] = song_msno['source_screen_name'].map(str).apply(source_screen_repeated).astype(np.int64)
song_msno['s_type_repeat'] = song_msno['source_type'].map(str).apply(source_type_repeated).astype(np.int64)

train['user_repeated'] = train['msno'].apply(user_repeated_songs).astype(np.int64)
train['s_screen_repeat'] = train['source_screen_name'].map(str).apply(source_screen_repeated).astype(np.int64)
train['s_type_repeat'] = train['source_type'].map(str).apply(source_type_repeated).astype(np.int64)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [54]:
input_sizes = {
    'msno': max(train.msno.max(), song_msno.msno.max()) +1,
    'user_repeated': train.user_repeated.max() +1,
    's_screen_repeat': train.s_screen_repeat.max() +1,
    's_type_repeat': train.s_type_repeat.max() +1,
    'membership_days': max(train.membership_days.max(), song_msno.membership_days.max()) +1,
}

In [78]:
train['membership_days'] = train['membership_days'].apply(lambda x: 0 if x < 0 else x)

Empty DataFrame
Columns: [msno, song_id, source_system_tab, source_screen_name, source_type, target, city, registered_via, membership_days, user_repeated, s_screen_repeat, s_type_repeat]
Index: []


In [55]:
user_input = Input(shape = (1, ))
user_repeated_input = Input(shape = (1, ))
s_screen_repeat_input = Input(shape = (1, ))
s_type_repeat_input = Input(shape = (1, ))
membership_days_input = Input(shape = (1, ))

In [79]:
user_emb = Flatten()(Embedding(output_dim = user_embedding_size, input_dim=input_sizes['msno'], embeddings_regularizer=l2(1e-4), embeddings_initializer='glorot_uniform')(user_input))
user_repeated_emb = Flatten()(Embedding(output_dim = other_embedding_size, input_dim=input_sizes['user_repeated'], embeddings_initializer='glorot_uniform')(user_repeated_input))
s_screen_repeat_emb = Flatten()(Embedding(output_dim = source_embedding_size, input_dim=input_sizes['s_screen_repeat'], embeddings_initializer='glorot_uniform')(s_screen_repeat_input))
s_type_repeat_emb = Flatten()(Embedding(output_dim = source_embedding_size, input_dim=input_sizes['s_type_repeat'], embeddings_initializer='glorot_uniform')(s_type_repeat_input))
membership_days_emb = Flatten()(Embedding(output_dim = member_embedding_size, input_dim=input_sizes['membership_days'], embeddings_initializer='glorot_uniform')(membership_days_input))

In [80]:
embedding_layer = Concatenate(axis=-1)([user_emb, user_repeated_emb, s_screen_repeat_emb, s_type_repeat_emb, membership_days_emb])
embedding_layer = keras.layers.Dropout(0.5)(Dense(extra_dense, activation = 'relu', kernel_initializer = 'glorot_normal')(embedding_layer))
prediction = Dense(1, activation='sigmoid')(embedding_layer)

In [81]:
model = keras.models.Model(inputs=[user_input, user_repeated_input, s_screen_repeat_input, s_type_repeat_input, membership_days_input],
                           outputs = [prediction])
#model.summary()
model.compile(optimizer=keras.optimizers.Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
early_stopping = EarlyStopping(monitor='val_acc', patience = 5)
model_checkpoint = ModelCheckpoint(save_path, save_best_only = True, save_weights_only=False)

model.fit([train.msno, train.user_repeated,  train.s_screen_repeat, train.s_type_repeat, train.membership_days],
          [train.target], epochs = num_epochs, batch_size = batch_size, verbose=1,
          validation_split=0.2, validation_data=None, shuffle=True,
          callbacks = [early_stopping, model_checkpoint])

model.save(save_path)

Train on 5901934 samples, validate on 1475484 samples
Epoch 1/100

In [None]:
predicted = model.predict([song_msno.msno, song_msno.user_repeated, song_msno.s_screen_repeat, song_msno.s_type_repeat, song_msno.membership_days], batch_size=batch_size, verbose=2)
new_test = pd.DataFrame({'id': song_msno.id, 'target': predicted.ravel()})
new_test.to_csv('../Test/submission_only_user_var2.csv', index=False)