In [1]:
import gc
import numpy as np
import math
import pandas as pd
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import LearningRateScheduler

Using TensorFlow backend.


In [18]:
# Song metadata, with every column read as text.
# dtype=str is passed to read_csv (instead of calling .astype(str) on the
# parsed frame) because .astype(str) converts missing values into the literal
# string 'nan'.  Those strings are not null, so the fillna('no_genre_id') /
# fillna('no_lyricist') / fillna('no_composer') calls made on the merged
# frames would skip them and the derived count features would report 1
# instead of 0 for songs with no metadata.  With dtype=str empty fields stay
# NaN.
songs = pd.read_csv('../Data/songs.csv', dtype=str)

# Member table: low-cardinality fields are loaded as categoricals and the two
# date columns are parsed so that date arithmetic can be done on them.
members = pd.read_csv(
    '../Data/members.csv',
    dtype={
        'city': 'category',
        'bd': np.uint8,
        'gender': 'category',
        'registered_via': 'category',
    },
    parse_dates=['registration_init_time', 'expiration_date'],
)

In [19]:
# Load the interaction tables and attach the song metadata to every row
# (left join: rows whose song_id is absent from `songs` get NaN metadata).
train = pd.read_csv('../Data/train.csv')
test = pd.read_csv('../Data/test.csv')
train = pd.merge(train, songs, how='left', on='song_id')
test = pd.merge(test, songs, how='left', on='song_id')

# Membership features: length of the membership in whole days plus the
# calendar years of registration and expiration.
members['membership_days'] = (
    (members['expiration_date'] - members['registration_init_time'])
    .dt.days
    .astype(int)
)
members['registration_year'] = members['registration_init_time'].dt.year
members['expiration_year'] = members['expiration_date'].dt.year

# The raw date columns, age and gender are not used as model inputs.
unused_member_cols = ['bd', 'gender', 'registration_init_time', 'expiration_date']
members = members.drop(unused_member_cols, axis=1)

# Attach the member features to every interaction row.
train = pd.merge(train, members, how='left', on='msno')
test = pd.merge(test, members, how='left', on='msno')

In [4]:
def genre_id_count(x):
    """Return the number of genre ids in a '|'-separated string.

    The sentinel 'no_genre_id' yields 0; any other string yields the number
    of '|' separators plus one.
    """
    return 0 if x == 'no_genre_id' else 1 + x.count('|')

# Replace missing genre information with an explicit sentinel.  The filled
# column is assigned back to the frame: calling fillna(inplace=True) on a
# column selection operates on an intermediate object and, with pandas
# Copy-on-Write, leaves the parent frame unchanged.
train['genre_ids'] = train['genre_ids'].fillna('no_genre_id')
test['genre_ids'] = test['genre_ids'].fillna('no_genre_id')

# Number of genres attached to each row's song (0 for the sentinel).
train['genre_ids_count'] = train['genre_ids'].apply(genre_id_count).astype(np.int8)
test['genre_ids_count'] = test['genre_ids'].apply(genre_id_count).astype(np.int8)

In [5]:
def artist_count(x):
    """Count the artist-separator tokens found in an artist name.

    Returns 0 for the sentinel 'no_artist'.  Otherwise returns the total
    number of occurrences of the substrings 'and', ',', 'feat' and '&'.
    The matches are plain substring counts (so 'and' inside a longer word is
    counted) and, unlike the lyricist/composer helpers, no +1 is added.
    """
    if x == 'no_artist':
        return 0
    separator_tokens = ('and', ',', 'feat', '&')
    return sum(x.count(token) for token in separator_tokens)

# Separator-token count per row.  Names are passed through str() first, so a
# missing artist name is scored as the text of its string form.
train['artist_count'] = (
    train['artist_name']
    .map(str)
    .apply(artist_count)
    .astype(np.int8)
)
test['artist_count'] = (
    test['artist_name']
    .map(str)
    .apply(artist_count)
    .astype(np.int8)
)

In [6]:
def lyricist_count(x):
    """Return the number of lyricists named in a separator-delimited string.

    The sentinel 'no_lyricist' yields 0.  Any other string yields the number
    of separator characters ('|', '/', backslash, ';') plus one.

    The original body ended with a second ``return`` placed after an
    exhaustive if/else; it could never execute and has been removed.
    """
    if x == 'no_lyricist':
        return 0
    return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

# Replace missing lyricist information with an explicit sentinel.  The filled
# column is assigned back to the frame: calling fillna(inplace=True) on a
# column selection operates on an intermediate object and, with pandas
# Copy-on-Write, leaves the parent frame unchanged.
train['lyricist'] = train['lyricist'].fillna('no_lyricist')
test['lyricist'] = test['lyricist'].fillna('no_lyricist')

# Number of lyricists credited on each row's song (0 for the sentinel).
train['lyricists_count'] = train['lyricist'].map(str).apply(lyricist_count).astype(np.int8)
test['lyricists_count'] = test['lyricist'].map(str).apply(lyricist_count).astype(np.int8)

In [7]:
def composer_count(x):
    """Return the number of composers named in a separator-delimited string.

    The sentinel 'no_composer' yields 0.  Any other string yields one plus
    the number of separator characters ('|', '/', backslash, ';') it holds.
    """
    if x == 'no_composer':
        return 0
    total = 1
    for separator in ('|', '/', '\\', ';'):
        total += x.count(separator)
    return total

# Replace missing composer information with an explicit sentinel.  The filled
# column is assigned back to the frame: calling fillna(inplace=True) on a
# column selection operates on an intermediate object and, with pandas
# Copy-on-Write, leaves the parent frame unchanged.
train['composer'] = train['composer'].fillna('no_composer')
test['composer'] = test['composer'].fillna('no_composer')

# Number of composers credited on each row's song (0 for the sentinel).
train['composer_count'] = train['composer'].map(str).apply(composer_count).astype(np.int8)
test['composer_count'] = test['composer'].map(str).apply(composer_count).astype(np.int8)

In [8]:
# Number of rows in which each song_id appears, tallied separately for the
# train and the test frame.  Series.to_dict() is used because
# Series.iteritems() was removed in pandas 2.0.
_dict_count_song_played_train = train['song_id'].value_counts().to_dict()
_dict_count_song_played_test = test['song_id'].value_counts().to_dict()


def count_song_played(x):
    """Return how many rows reference song id ``x``.

    The train tally is used when ``x`` occurs in train; otherwise the test
    tally is used; ids seen in neither frame yield 0.
    """
    if x in _dict_count_song_played_train:
        return _dict_count_song_played_train[x]
    return _dict_count_song_played_test.get(x, 0)


train['count_song_played'] = train['song_id'].map(str).apply(count_song_played).astype(np.int64)
test['count_song_played'] = test['song_id'].map(str).apply(count_song_played).astype(np.int64)

In [9]:
# Number of rows in which each artist_name appears, tallied separately for
# the train and the test frame.  Series.to_dict() is used because
# Series.iteritems() was removed in pandas 2.0.
_dict_count_artist_played_train = train['artist_name'].value_counts().to_dict()
_dict_count_artist_played_test = test['artist_name'].value_counts().to_dict()


def count_artist_played(x):
    """Return how many rows reference artist name ``x``.

    The train tally is used when ``x`` occurs in train; otherwise the test
    tally is used; names seen in neither frame yield 0.
    """
    if x in _dict_count_artist_played_train:
        return _dict_count_artist_played_train[x]
    return _dict_count_artist_played_test.get(x, 0)


train['count_artist_played'] = train['artist_name'].map(str).apply(count_artist_played).astype(np.int64)
test['count_artist_played'] = test['artist_name'].map(str).apply(count_artist_played).astype(np.int64)

In [10]:
# Every train column except the label is a candidate feature.
cols = list(train.columns)
cols.remove('target')

# Integer-encode each object-dtype column.  One encoder per column is fitted
# on the values observed in train and test together, so both frames share the
# same value -> code mapping.
for col in tqdm(cols):
    if train[col].dtype != 'object':
        continue

    # Stringify first so every value in the column is a comparable str.
    train[col] = train[col].apply(str)
    test[col] = test[col].apply(str)

    le = LabelEncoder()
    le.fit(list(train[col].unique()) + list(test[col].unique()))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

100%|██████████| 22/22 [04:28<00:00, 12.21s/it]


In [20]:
# Feature columns are selected by name, in train's column order, for BOTH
# frames.  The scaling statistics computed from X are later applied to X_test
# position by position, so the two matrices must have identical column
# layout; dropping 'target' from one frame and 'id' from the other only gives
# that when the two frames happen to be ordered the same way.
feature_cols = [c for c in train.columns if c != 'target']

# Training features and binary labels.
X = np.array(train[feature_cols], dtype=np.int32)
Y = train['target'].values

# Test features; Y_test holds the row ids of the test frame (not labels).
X_test = np.array(test[feature_cols], dtype=np.int32)
Y_test = test['id'].values

In [12]:
cols_name = train.columns.tolist()

# X_new starts out as another name for the training matrix X.
X_new = X

# Per-column statistics of the training matrix, reused to scale both the
# training and the test features.  (.T on these 1-D results is a no-op.)
max_vals = np.max(X_new, axis=0).T
min_vals = np.min(X_new, axis=0).T
mean_vals = X_new.mean(axis=0).T

In [14]:
# training set: centre every column on its mean and scale by its range
value_range = max_vals - min_vals
# A column that is constant in the training matrix has a zero range; dividing
# by it would give 0/0 = NaN.  Divide by 1 instead so such a column becomes 0.
safe_range = np.where(value_range == 0, 1, value_range)
X_new = (X_new - mean_vals) / safe_range
X_new = np.around(X_new, decimals=2)

In [17]:
# testing set: scaled with the mean / range computed from the training matrix
value_range = max_vals - min_vals
# A column that is constant in the training matrix has a zero range; dividing
# by it would give NaN or inf.  Divide by 1 instead.
safe_range = np.where(value_range == 0, 1, value_range)
X_new_test = (X_test - mean_vals) / safe_range
X_new_test = np.around(X_new_test, decimals=2)

In [18]:
# Drop the names of the unscaled training matrix and of the source frames,
# then ask the garbage collector to run (trailing ';' hides the return value
# in the notebook output).
del X, members, songs, train, test
gc.collect();

In [19]:
# True: fit the network and save it.  False: load previously saved weights.
training = True

# Fully connected binary classifier: four ReLU hidden layers (1024, 512, 128
# and 64 units) with dropout after the second, and one sigmoid output unit.
model = Sequential()
model.add(Dense(units=1024, kernel_initializer='uniform', input_dim=X_new.shape[1], activation='relu'))
model.add(Dense(units=512, kernel_initializer='uniform', activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(128, kernel_initializer='uniform', activation='relu'))
model.add(Dense(64, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

if not training:
    weights = '../Models/dnn_new_feats.h5'
    model.load_weights(weights)
else:
    # validation_split=0.1 asks Keras to hold out 10% of the samples.
    model.fit(
        X_new,
        Y,
        batch_size=256,
        epochs=20,
        validation_split=0.1,
        validation_data=None,
        shuffle=True,
    )
    model.save('../Models/dnn_new_feats.h5')

Train on 18000 samples, validate on 2000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Score the normalised test matrix with the network (silent, 256 rows per batch).
predicted = model.predict(
    X_new_test,
    batch_size=256,
    verbose=0,
)

In [None]:
# Columns of the output file, in order.
headers = ['id', 'prob', 'target']

# One row per test id with its predicted probability.  The prediction array
# is flattened so that a (n_samples, 1) model output becomes one scalar per
# row instead of being assigned as a 2-D block.
new_test = pd.DataFrame({
    'id': Y_test,
    'prob': np.asarray(predicted).ravel(),
})

# Keep only the first row for any repeated id.  .copy() makes the filtered
# frame independent, so adding a column below does not write to a slice of
# the unfiltered frame (SettingWithCopyWarning).
duplicated_idx = new_test.duplicated(subset='id', keep='first')
new_test = new_test.loc[~duplicated_idx].copy()

# Hard label: 1 when the probability is strictly above 0.5, otherwise 0.
new_test['target'] = (new_test['prob'] > 0.5).astype(int)

new_test = new_test[headers]
new_test.to_csv('../Test/submission_dnn_new_feat.csv', index=False, header=headers)