In [None]:
import datetime
import gc
import math

import numpy as np
import pandas as pd
from keras.callbacks import LearningRateScheduler
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import Sequential
from keras.models import load_model
from keras.utils import np_utils
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from tqdm import tqdm

In [None]:
# Song metadata: every column is converted to a string after loading.
songs = pd.read_csv('../Data/songs.csv')
songs = songs.astype(str)

# Member profiles: the two date columns are parsed as datetimes and the
# listed columns are read with explicit dtypes.
members = pd.read_csv(
    '../Data/members.csv',
    parse_dates=['registration_init_time', 'expiration_date'],
    dtype=dict(city='category', bd=np.uint8, gender='category', registered_via='category'),
)

In [None]:
# Load the interaction tables and attach song metadata; the left join keeps
# every interaction row even when the song has no metadata.
train = pd.read_csv('../Data/train.csv').merge(songs, how='left', on='song_id')
test = pd.read_csv('../Data/test.csv').merge(songs, how='left', on='song_id')

# Derive the membership features, then drop the raw columns they came from
# (along with `bd` and `gender`).
members = members.assign(
    membership_days=lambda m: (m['expiration_date'] - m['registration_init_time']).dt.days.astype(int),
    registration_month=lambda m: m['registration_init_time'].dt.month,
    expiration_year=lambda m: m['expiration_date'].dt.year,
).drop(columns=['bd', 'gender', 'registration_init_time', 'expiration_date'])

# Attach the member features to both interaction tables.
train = train.merge(members, how='left', on='msno')
test = test.merge(members, how='left', on='msno')

In [None]:
# Identifier / source columns to replace with integer labels.
cols = ['msno', 'song_id', 'source_system_tab', 'source_screen_name', 'source_type']

for col in tqdm(cols):
    # Only columns that are still object-typed in `train` are encoded.
    if train[col].dtype != 'object':
        continue

    # Stringify first so missing values become an ordinary label too.
    train[col] = train[col].map(str)
    test[col] = test[col].map(str)

    # Fit on the values seen in either table so test-only values get a label.
    train_vals = list(train[col].unique())
    test_vals = list(test[col].unique())
    le = LabelEncoder()
    le.fit(train_vals + test_vals)

    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [None]:
# Number of times each song appears in the training set.
# Keys are stringified on purpose: an earlier cell label-encodes `song_id` to
# integers (when it is an object column), while the lookups below stringify
# the ids first. With integer keys every lookup would miss and the feature
# would be 0 for every row.
# `Series.items()` is used because `Series.iteritems()` was removed in pandas 2.0.
_dict_count_song_played_train = {
    str(song): plays for song, plays in train['song_id'].value_counts().items()
}


def count_song_played(x):
    """Return how many training rows contain song ``x``.

    Parameters
    ----------
    x : str
        Stringified song id, matching the keys of
        ``_dict_count_song_played_train``.

    Returns
    -------
    int
        Row count in ``train`` for that song, or 0 when it never occurs there.
    """
    return _dict_count_song_played_train.get(x, 0)


train['count_song_played'] = train['song_id'].map(str).apply(count_song_played).astype(np.int64)
test['count_song_played'] = test['song_id'].map(str).apply(count_song_played).astype(np.int64)

In [None]:
# Number of times each artist appears in the training set.
# `Series.items()` is used because `Series.iteritems()` was removed in pandas 2.0.
# Keys are stringified so they always match the `.map(str)` lookups below.
_dict_count_artist_played_train = {
    str(artist): plays for artist, plays in train['artist_name'].value_counts().items()
}


def count_artist_played(x):
    """Return how many training rows contain artist ``x``.

    Parameters
    ----------
    x : str
        Stringified artist name, matching the keys of
        ``_dict_count_artist_played_train``.

    Returns
    -------
    int
        Row count in ``train`` for that artist, or 0 when it never occurs there.
    """
    return _dict_count_artist_played_train.get(x, 0)


train['count_artist_played'] = train['artist_name'].map(str).apply(count_artist_played).astype(np.int64)
test['count_artist_played'] = test['artist_name'].map(str).apply(count_artist_played).astype(np.int64)

In [None]:
# Per-user count of training rows with target == 1.
# Keys are stringified on purpose: an earlier cell label-encodes `msno` to
# integers (when it is an object column), while the callers look values up
# after `.map(str)`. With integer keys every lookup would miss and the
# feature would be 0 for every row.
repeated_dict = {
    str(user): hits
    for user, hits in train[train.target == 1].groupby(['msno'])['target'].count().items()
}
def user_repeated_songs(x):
    """Return the number of target == 1 training rows for user ``x``.

    Parameters
    ----------
    x : str
        Stringified user id, matching the keys of ``repeated_dict``.

    Returns
    -------
    int
        The count, or 0 when the user has no target == 1 row in ``train``.
    """
    return repeated_dict.get(x, 0)
    
# Per-song count of training rows with target == 1.
# Keys are stringified on purpose: an earlier cell label-encodes `song_id` to
# integers (when it is an object column), while the callers look values up
# after `.map(str)`. With integer keys every lookup would miss and the
# feature would be 0 for every row.
repeated_songs_dict = {
    str(song): hits
    for song, hits in train[train.target == 1].groupby(['song_id'])['target'].count().items()
}
def repeated_songs(x):
    """Return the number of target == 1 training rows for song ``x``.

    Parameters
    ----------
    x : str
        Stringified song id, matching the keys of ``repeated_songs_dict``.

    Returns
    -------
    int
        The count, or 0 when the song has no target == 1 row in ``train``.
    """
    return repeated_songs_dict.get(x, 0)

# Has anyone listened to the artist again?
# Per-artist count of training rows with target == 1.
repeated_artists_dict = (
    train.loc[train.target == 1]
    .groupby(['artist_name'])['target']
    .count()
    .to_dict()
)
def repeated_artists(x):
    """Return the number of target == 1 training rows for artist ``x`` (0 if none)."""
    return repeated_artists_dict.get(x, 0)
    
# How often a language shows up among the target == 1 training rows.
repeated_lang_dict = (
    train.loc[train.target == 1]
    .groupby(['language'])['target']
    .count()
    .to_dict()
)
def repeated_lang(x):
    """Return the number of target == 1 training rows for language ``x`` (0 if none)."""
    if x in repeated_lang_dict:
        return repeated_lang_dict[x]
    return 0
    
# Number of times each artist appears in the training set.
# NOTE(review): this repeats the dictionary and function already defined in an
# earlier cell; the later definition silently replaces the earlier one.
# `Series.items()` is used because `Series.iteritems()` was removed in pandas 2.0.
# Keys are stringified so they always match the callers' `.map(str)` lookups.
_dict_count_artist_played_train = {
    str(artist): plays for artist, plays in train['artist_name'].value_counts().items()
}
def count_artist_played(x):
    """Return how many training rows contain artist ``x`` (0 if it never occurs).

    ``x`` must be the stringified artist name, matching the keys of
    ``_dict_count_artist_played_train``.
    """
    return _dict_count_artist_played_train.get(x, 0)
        
# Number of times each song appears in the training set.
# NOTE(review): this repeats the dictionary and function already defined in an
# earlier cell; the later definition silently replaces the earlier one.
# Keys are stringified on purpose: an earlier cell label-encodes `song_id` to
# integers (when it is an object column), while the callers look values up
# after `.map(str)`. With integer keys every lookup would miss and the
# feature would be 0 for every row.
# `Series.items()` is used because `Series.iteritems()` was removed in pandas 2.0.
_dict_count_song_played_train = {
    str(song): plays for song, plays in train['song_id'].value_counts().items()
}
def count_song_played(x):
    """Return how many training rows contain song ``x`` (0 if it never occurs).

    ``x`` must be the stringified song id, matching the keys of
    ``_dict_count_song_played_train``.
    """
    return _dict_count_song_played_train.get(x, 0)

In [None]:
# Attach the lookup-based features to the training frame. Each source column
# is stringified before the lookup. The list order is kept as-is because new
# columns are appended to the frame in the order they are assigned.
for new_col, src_col, lookup, out_dtype in [
    ('user_repeated', 'msno', user_repeated_songs, np.int32),
    ('repeated_song', 'song_id', repeated_songs, np.int32),
    ('count_artist_played', 'artist_name', count_artist_played, np.int64),
    ('count_song_played', 'song_id', count_song_played, np.int64),
    ('repeated_artist', 'artist_name', repeated_artists, np.int32),
    ('repeated_lang', 'language', repeated_lang, np.int32),
]:
    train[new_col] = train[src_col].map(str).apply(lookup).astype(out_dtype)

In [None]:
# Attach the same lookup-based features to the test frame. The lookups use
# dictionaries built from `train`, so values unseen there map to 0.
# The list order is kept as-is because new columns are appended in this order.
for new_col, src_col, lookup, out_dtype in [
    ('user_repeated', 'msno', user_repeated_songs, np.int32),
    ('repeated_song', 'song_id', repeated_songs, np.int32),
    ('count_artist_played', 'artist_name', count_artist_played, np.int64),
    ('count_song_played', 'song_id', count_song_played, np.int64),
    ('repeated_artist', 'artist_name', repeated_artists, np.int32),
    ('repeated_lang', 'language', repeated_lang, np.int32),
]:
    test[new_col] = test[src_col].map(str).apply(lookup).astype(out_dtype)

In [None]:
# Number of training rows with target == 1 for each source_system_tab value.
# NOTE(review): an earlier cell label-encodes source_system_tab to integers
# (when it is an object column), but the visible callers look values up after
# `.map(str)`. With integer keys those string lookups would always miss and
# the function would always return 0 - confirm and align the key type.
source_system_dict = train[train.target == 1].groupby(['source_system_tab'])['target'].count().to_dict()
def source_system_repeated(x):
    """Return ``binning(count)`` for source_system_tab value ``x``, or 0 if ``x`` is not a key.

    NOTE(review): ``binning`` is not defined in any visible cell of this
    notebook; it must exist in the kernel before this function can take the
    non-zero path - confirm where it comes from.
    """
    try:
        count = source_system_dict[x]
        return binning(count)
    except KeyError:
        # Unknown value (this would also catch a KeyError raised inside binning).
        return 0    

# Number of training rows with target == 1 for each source_screen_name value.
# NOTE(review): an earlier cell label-encodes source_screen_name to integers
# (when it is an object column), but the visible callers look values up after
# `.map(str)`. With integer keys those string lookups would always miss and
# the function would always return 0 - confirm and align the key type.
source_screen_dict = train[train.target == 1].groupby(['source_screen_name'])['target'].count().to_dict()
def source_screen_repeated(x):
    """Return ``binning(count)`` for source_screen_name value ``x``, or 0 if ``x`` is not a key.

    NOTE(review): ``binning`` is not defined in any visible cell of this
    notebook; it must exist in the kernel before this function can take the
    non-zero path - confirm where it comes from.
    """
    try:
        count = source_screen_dict[x]
        return binning(count)
    except KeyError:
        # Unknown value (this would also catch a KeyError raised inside binning).
        return 0

# Number of training rows with target == 1 for each source_type value.
# NOTE(review): an earlier cell label-encodes source_type to integers (when it
# is an object column), but the visible callers look values up after
# `.map(str)`. With integer keys those string lookups would always miss and
# the function would always return 0 - confirm and align the key type.
source_type_dict = train[train.target == 1].groupby(['source_type'])['target'].count().to_dict()
def source_type_repeated(x):
    """Return ``binning(count)`` for source_type value ``x``, or 0 if ``x`` is not a key.

    NOTE(review): ``binning`` is not defined in any visible cell of this
    notebook; it must exist in the kernel before this function can take the
    non-zero path - confirm where it comes from.
    """
    try:
        count = source_type_dict[x]
        return binning(count)
    except KeyError:
        # Unknown value (this would also catch a KeyError raised inside binning).
        return 0

In [None]:
# Attach the source-based features to the training frame; each source column
# is stringified before the lookup. Columns are appended in the listed order.
for new_col, src_col, lookup in [
    ('s_system_repeat', 'source_system_tab', source_system_repeated),
    ('s_screen_repeat', 'source_screen_name', source_screen_repeated),
    ('s_type_repeat', 'source_type', source_type_repeated),
]:
    train[new_col] = train[src_col].map(str).apply(lookup).astype(np.int32)

In [None]:
# Attach the same source-based features to the test frame; the lookups use
# dictionaries built from `train`. Columns are appended in the listed order.
for new_col, src_col, lookup in [
    ('s_system_repeat', 'source_system_tab', source_system_repeated),
    ('s_screen_repeat', 'source_screen_name', source_screen_repeated),
    ('s_type_repeat', 'source_type', source_type_repeated),
]:
    test[new_col] = test[src_col].map(str).apply(lookup).astype(np.int32)

In [None]:
# Feature selection: fit an L1-penalised linear SVM on every column except the
# target, then let SelectFromModel pick columns from the fitted model.
# NOTE(review): songs.csv is loaded with `.astype(str)` and merged into `train`,
# and the visible cells only label-encode five id/source columns. Confirm the
# remaining string columns are dropped or encoded before this cell, otherwise
# the int32 cast below cannot succeed on non-numeric text.
X_prev = np.array(train.drop(columns=['target']), dtype=np.int32)
lsvc = LinearSVC(penalty="l1", C=0.01, dual=False)
lsvc.fit(X_prev, train.target)
model = SelectFromModel(lsvc, prefit=True)
X = model.transform(X_prev)

In [None]:
# Build the test matrix from every column except `id` and reduce it with the
# fitted selector. `model` must still be the SelectFromModel instance here;
# a later cell rebinds the same name to the neural network.
X_test_prev = np.array(test.drop(columns=['id']), dtype=np.int32)
X_test = model.transform(X_test_prev)

In [None]:
# When True, the network defined below is fitted and saved.
training = True


def scheduler(epoch):
    """Step-decay learning rate: start at 0.001 and halve it every 10 epochs.

    The number of halvings is ``floor((1 + epoch) / 10)``, so with a 0-based
    epoch index the first drop happens at index 9.

    Parameters
    ----------
    epoch : int
        Epoch index.

    Returns
    -------
    float
        Learning rate to use for that epoch.
    """
    base_rate, decay_factor, epochs_per_step = 0.001, 0.5, 10.0
    steps_done = math.floor((1 + epoch) / epochs_per_step)
    return base_rate * math.pow(decay_factor, steps_done)

# Keras callback that applies `scheduler` during training.
lrate = LearningRateScheduler(scheduler)

# Fully connected binary classifier over the selected features.
# NOTE: `model` previously referred to the SelectFromModel selector; from here
# on it is the neural network, so cells that need the selector must be run
# before this one.
model = Sequential([
        Dense(units=1024, kernel_initializer='uniform', input_dim=X.shape[1], activation='relu'),
        Dense(units=512, kernel_initializer='uniform', activation='relu'),
        Dropout(0.25),
        Dense(128, kernel_initializer='uniform', activation='relu'),
        Dense(64, kernel_initializer='uniform', activation='relu'),
        Dense(1, kernel_initializer='uniform', activation='sigmoid')
    ])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

if training:
    # Hold out the last 20% of the rows for validation and save the fitted model.
    model.fit(X, train.target, batch_size=32768, epochs=30, validation_split=0.2, shuffle=True, callbacks=[lrate])
    model.save('../Models/dnn_lasso_var.h5')
else:
    # Without this branch the freshly initialised (untrained) network would be
    # used for prediction; reuse the checkpoint written by the training branch.
    model = load_model('../Models/dnn_lasso_var.h5')

In [None]:
# Score the test set and write the submission file.
predicted = model.predict(X_test, batch_size=32768, verbose=0)
headers = ['id', 'target']

# `X_test` is built row-for-row from `test`, so `test['id']` lines up with the
# predictions. The previous code read the ids from `Y_test`, which no visible
# cell defines. `predict` output is flattened so it is stored as a plain
# one-dimensional column.
new_test = pd.DataFrame(
    {'id': test['id'].values, 'target': np.ravel(predicted)},
    columns=headers,
)

# Keep only the first prediction for each id.
new_test = new_test.drop_duplicates(subset='id', keep='first')
new_test.to_csv('submission_dnn_lasso.csv', index=False)