In [1]:
import gc
import numpy as np
import math
import pandas as pd
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import LearningRateScheduler

Using TensorFlow backend.


In [2]:
# Song metadata: every column is read as text.
# dtype=str is used instead of a post-hoc .astype(str) because astype(str)
# converts missing cells into the literal string 'nan'; that would make the
# later fillna('no_genre_id' / 'no_lyricist' / 'no_composer') calls no-ops for
# songs that exist in songs.csv but lack those fields. With dtype=str the
# missing cells stay NaN and the sentinels are applied as intended.
songs = pd.read_csv('../Data/songs.csv', dtype=str)

# Member metadata: low-cardinality fields become categoricals, 'bd' is stored
# as an unsigned byte, and both date columns are parsed to datetimes so the
# membership features can be derived from them.
members = pd.read_csv('../Data/members.csv',dtype={'city' : 'category',
                                                      'bd' : np.uint8,
                                                      'gender' : 'category',
                                                      'registered_via' : 'category'},
                             parse_dates=['registration_init_time','expiration_date'])

In [3]:
# Load the interaction tables and attach the song metadata to every row.
# Left joins keep interactions whose song_id is absent from `songs`.
train = pd.read_csv('../Data/train.csv').merge(songs, on='song_id', how='left')
test = pd.read_csv('../Data/test.csv').merge(songs, on='song_id', how='left')

# Derive membership features from the two parsed date columns.
members['membership_days'] = (
    (members['expiration_date'] - members['registration_init_time'])
    .dt.days
    .astype(int)
)
members['registration_month'] = members['registration_init_time'].dt.month
members['expiration_year'] = members['expiration_date'].dt.year

# The raw dates, 'bd' and 'gender' are not used as features beyond this point.
members = members.drop(
    ['bd', 'gender', 'registration_init_time', 'expiration_date'],
    axis=1,
)

# Attach the member features to every interaction row (left join on user id).
train = train.merge(members, on='msno', how='left')
test = test.merge(members, on='msno', how='left')

In [None]:
def genre_id_count(x):
    """Return how many genre ids are packed into the '|'-separated string ``x``.

    The sentinel ``'no_genre_id'`` yields 0; any other string yields the
    number of '|' separators plus one.
    """
    return 0 if x == 'no_genre_id' else 1 + x.count('|')

# Replace missing genre strings with the sentinel that genre_id_count maps to 0.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas and,
# with Copy-on-Write enabled, does not update the parent frame at all.
train['genre_ids'] = train['genre_ids'].fillna('no_genre_id')
test['genre_ids'] = test['genre_ids'].fillna('no_genre_id')

# Number of genres per row, stored compactly as int8.
train['genre_ids_count'] = train['genre_ids'].apply(genre_id_count).astype(np.int8)
test['genre_ids_count'] = test['genre_ids'].apply(genre_id_count).astype(np.int8)

In [None]:
def artist_count(x):
    """Count collaboration markers in the artist string ``x``.

    Returns 0 for the sentinel ``'no_artist'``; otherwise the total number of
    occurrences of the substrings 'and', ',', 'feat' and '&'.

    NOTE(review): matching is by plain substring, so a name such as 'Brandy'
    contributes one 'and'. Unlike the lyricist/composer counters no +1 is
    added, so a single artist yields 0 -- confirm this is intended.
    """
    if x == 'no_artist':
        return 0
    return sum(x.count(marker) for marker in ('and', ',', 'feat', '&'))

# Collaboration-marker count per row. map(str) runs first so that missing
# artist names (NaN) become text before artist_count calls .count() on them.
train['artist_count'] = (
    train['artist_name']
    .map(str)
    .apply(artist_count)
    .astype(np.int8)
)
test['artist_count'] = (
    test['artist_name']
    .map(str)
    .apply(artist_count)
    .astype(np.int8)
)

In [None]:
def lyricist_count(x):
    """Return the number of lyricists encoded in the string ``x``.

    The sentinel ``'no_lyricist'`` yields 0. Any other string yields the
    number of separator characters ('|', '/', backslash, ';') plus one.
    """
    if x == 'no_lyricist':
        return 0
    # Each separator splits off one more name, hence the +1.
    # (An unreachable duplicate return without the +1 used to follow the
    # if/else; it has been removed.)
    return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

# Replace missing lyricists with the sentinel that lyricist_count maps to 0.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas and,
# with Copy-on-Write enabled, does not update the parent frame at all.
train['lyricist'] = train['lyricist'].fillna('no_lyricist')
test['lyricist'] = test['lyricist'].fillna('no_lyricist')

# Number of lyricists per row, stored compactly as int8.
train['lyricists_count'] = train['lyricist'].map(str).apply(lyricist_count).astype(np.int8)
test['lyricists_count'] = test['lyricist'].map(str).apply(lyricist_count).astype(np.int8)

In [None]:
def composer_count(x):
    """Return the number of composers encoded in the string ``x``.

    The sentinel ``'no_composer'`` yields 0. Any other string yields one more
    than the number of separator characters ('|', '/', backslash, ';').
    """
    if x == 'no_composer':
        return 0
    separators = ('|', '/', '\\', ';')
    return 1 + sum(x.count(sep) for sep in separators)

# Replace missing composers with the sentinel that composer_count maps to 0.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas and,
# with Copy-on-Write enabled, does not update the parent frame at all.
train['composer'] = train['composer'].fillna('no_composer')
test['composer'] = test['composer'].fillna('no_composer')

# Number of composers per row, stored compactly as int8.
train['composer_count'] = train['composer'].map(str).apply(composer_count).astype(np.int8)
test['composer_count'] = test['composer'].map(str).apply(composer_count).astype(np.int8)

In [None]:
# number of times a song has been played before
# value_counts() gives rows-per-song_id for each split. to_dict() replaces the
# former dict comprehension over Series.iteritems(), which was removed in
# pandas 2.0 and raises AttributeError there.
_dict_count_song_played_train = train['song_id'].value_counts().to_dict()
_dict_count_song_played_test = test['song_id'].value_counts().to_dict()


def count_song_played(x):
    """Return the number of rows in which song id ``x`` appears.

    The train counts take precedence; the test counts are consulted only for
    songs that never occur in train. Songs found in neither split yield 0.
    """
    if x in _dict_count_song_played_train:
        return _dict_count_song_played_train[x]
    return _dict_count_song_played_test.get(x, 0)
    

# Attach the per-song row count to both splits (int64).
train['count_song_played'] = (
    train['song_id']
    .map(str)
    .apply(count_song_played)
    .astype(np.int64)
)
test['count_song_played'] = (
    test['song_id']
    .map(str)
    .apply(count_song_played)
    .astype(np.int64)
)

In [None]:
# number of times an artist has been played
# value_counts() gives rows-per-artist_name for each split (NaN names are not
# counted). to_dict() replaces the former dict comprehension over
# Series.iteritems(), which was removed in pandas 2.0 and raises
# AttributeError there.
_dict_count_artist_played_train = train['artist_name'].value_counts().to_dict()
_dict_count_artist_played_test = test['artist_name'].value_counts().to_dict()


def count_artist_played(x):
    """Return the number of rows in which artist name ``x`` appears.

    The train counts take precedence; the test counts are consulted only for
    artists that never occur in train. Unknown artists yield 0.
    """
    if x in _dict_count_artist_played_train:
        return _dict_count_artist_played_train[x]
    return _dict_count_artist_played_test.get(x, 0)

# Attach the per-artist row count to both splits (int64). map(str) turns a
# missing artist name into text before the dictionary lookup.
train['count_artist_played'] = (
    train['artist_name']
    .map(str)
    .apply(count_artist_played)
    .astype(np.int64)
)
test['count_artist_played'] = (
    test['artist_name']
    .map(str)
    .apply(count_artist_played)
    .astype(np.int64)
)

In [None]:
#how frequently has the user repeated songs?
# Per-user number of train rows whose target is 1.
# NOTE(review): this is derived from the training labels, so when it is
# applied back to the training rows it may leak the target -- confirm.
repeated_dict = (
    train[train.target == 1]
    .groupby(['msno'])['target']
    .count()
    .to_dict()
)


def user_repeated_songs(x):
    """Bucket user ``x`` by their number of target==1 rows in train.

    Returns 0 when the user has no such rows (or is unknown), 1 when they
    have fewer than 100, and 2 otherwise.
    """
    hits = repeated_dict.get(x, 0)
    if hits == 0:
        return 0
    return 1 if hits < 100 else 2

# Attach the per-user repeat bucket (0 / 1 / 2) to both splits as int8.
train['user_repeated'] = (
    train['msno']
    .map(str)
    .apply(user_repeated_songs)
    .astype(np.int8)
)
test['user_repeated'] = (
    test['msno']
    .map(str)
    .apply(user_repeated_songs)
    .astype(np.int8)
)

In [None]:
#has anyone listened to the song again?
# Per-song number of train rows whose target is 1.
# NOTE(review): this is derived from the training labels, so when it is
# applied back to the training rows it may leak the target -- confirm.
repeated_songs_dict = (
    train[train.target == 1]
    .groupby(['song_id'])['target']
    .count()
    .to_dict()
)


def repeated_songs(x):
    """Bucket song ``x`` by its number of target==1 rows in train.

    Returns 0 when the song has no such rows (or is unknown), 1 when it has
    fewer than 100, and 2 otherwise.
    """
    hits = repeated_songs_dict.get(x, 0)
    if hits == 0:
        return 0
    return 1 if hits < 100 else 2

# Attach the per-song repeat bucket (0 / 1 / 2) to both splits as int8.
train['repeated_song'] = (
    train['song_id']
    .map(str)
    .apply(repeated_songs)
    .astype(np.int8)
)
test['repeated_song'] = (
    test['song_id']
    .map(str)
    .apply(repeated_songs)
    .astype(np.int8)
)

In [None]:
# Every train column except the label is a candidate for encoding.
cols = list(train.columns)
cols.remove('target')

for col in tqdm(cols):
    # Only object-dtype (string-like) columns are label-encoded; all other
    # columns are left untouched.
    if train[col].dtype != 'object':
        continue

    # Force plain strings so missing values become one ordinary category.
    train[col] = train[col].apply(str)
    test[col] = test[col].apply(str)

    # Fit on the values of BOTH splits so that transform() never meets a
    # label in test that the encoder has not seen.
    train_vals = list(train[col].unique())
    test_vals = list(test[col].unique())
    le = LabelEncoder()
    le.fit(train_vals + test_vals)

    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [None]:
# Training features (every column but the label) as an int32 matrix, plus the
# label vector.
X = np.array(train.drop('target', axis=1), dtype=np.int32)
Y = train['target'].values

# Test features (every column but the row id) as an int32 matrix.
# NOTE(review): this relies on train-without-'target' and test-without-'id'
# having the same column order -- confirm.
X_test = np.array(test.drop('id', axis=1), dtype=np.int32)
# Despite its name, Y_test holds the test row ids, not labels.
Y_test = test['id'].values

In [None]:
# Column names of the train frame (this list still includes 'target').
cols_name = list(train.columns)
# X_new starts out as another name for X; no copy is made here.
X_new = X

# Per-feature statistics of the TRAINING matrix, reused to scale both splits.
max_vals = X_new.max(axis=0)
min_vals = X_new.min(axis=0)
mean_vals = X_new.mean(axis=0)

In [None]:
#training set
# Mean-centre each feature, scale by its (max - min) range, then round to two
# decimals.
# NOTE(review): a feature that is constant in train has a zero range and would
# produce inf/NaN here; re-running this cell also rescales X_new again.
X_new = np.around((X_new - mean_vals) / (max_vals - min_vals), decimals=2)

In [None]:
#testing set
# Apply the same transformation to the test matrix, using the statistics
# computed on the training matrix: centre, scale by range, round to 2 decimals.
X_new_test = np.around((X_test - mean_vals) / (max_vals - min_vals), decimals=2)

In [None]:
# Drop the names bound to the large intermediates (only the scaled matrices
# and the label / id vectors are used from here on), then force a collection.
del X
del members, songs
del train, test
gc.collect();

In [None]:
# True: fit the network and save it; False: load previously saved weights.
training = True


def scheduler(epoch):
    """Step-decay learning rate for the given 0-based ``epoch``.

    Starts at 0.001 and is halved once for every 10 completed epochs, i.e.
    rate = 0.001 * 0.5 ** floor((1 + epoch) / 10).
    """
    base_rate = 0.001
    decay_factor = 0.5
    epochs_per_step = 10.0
    steps_taken = math.floor((1 + epoch) / epochs_per_step)
    return base_rate * math.pow(decay_factor, steps_taken)

# Callback that applies scheduler() at the start of every epoch.
lrate = LearningRateScheduler(scheduler)

# Fully connected binary classifier:
# 1024 -> 512 -> dropout(0.25) -> 128 -> 64 -> 1 (sigmoid).
model = Sequential()
model.add(Dense(units=1024, kernel_initializer='uniform', input_dim=X_new.shape[1], activation='relu'))
model.add(Dense(units=512, kernel_initializer='uniform', activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(128, kernel_initializer='uniform', activation='relu'))
model.add(Dense(64, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

if not training:
    # Reuse the weights stored by an earlier training run.
    weights = '../Models/dnn_third_var.h5'
    model.load_weights(weights)
else:
    # Hold out 10% of the rows for validation, then persist the fitted model.
    model.fit(
        X_new,
        Y,
        batch_size=256,
        epochs=20,
        validation_split=0.1,
        validation_data=None,
        shuffle=True,
        callbacks=[lrate],
    )
    model.save('../Models/dnn_third_var.h5')

In [None]:
# Score the scaled test matrix in batches of 256, without progress output.
predicted = model.predict(
    X_new_test,
    batch_size=256,
    verbose=0,
)

In [None]:
# Assemble the submission table: one row per test id with its predicted value.
headers = ['id', 'target']
new_test = pd.DataFrame(columns=headers)
new_test['id'] = Y_test
new_test['target'] = predicted

# If an id occurs more than once, only its first row is kept.
duplicated_idx = new_test.duplicated(subset='id', keep='first')
new_test = new_test.loc[~duplicated_idx]

# Written without the index column.
new_test.to_csv('submitted_dnn_songs_user.csv', index=False)