In [None]:
import gc
import math
import re

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [None]:
# Song metadata, with every column read as text.
# dtype=str keeps empty fields as real NaN values; the previous
# read_csv(...).astype(str) turned them into the literal string 'nan', which
# fillna() cannot detect and which genre_id_count would score as one genre.
songs = pd.read_csv('../Data/songs.csv', dtype=str)

# Member profiles: low-cardinality fields are loaded as categoricals and the two
# date columns are parsed so they can be subtracted from each other.
members = pd.read_csv('../Data/members.csv',dtype={'city' : 'category',
                                                      'bd' : np.uint8,
                                                      'gender' : 'category',
                                                      'registered_via' : 'category'},
                             parse_dates=['registration_init_time','expiration_date'])

In [None]:
# Interaction logs for the two splits.
train = pd.read_csv('../Data/train.csv')
test = pd.read_csv('../Data/test.csv')

# Attach song metadata; a left join keeps rows whose song_id has no match in
# `songs` (their song columns are NaN).
train = pd.merge(train, songs, how='left', on='song_id')
test = pd.merge(test, songs, how='left', on='song_id')

# Member-level features: whole days between registration and expiration, plus
# the expiration year. The columns listed in the drop are then removed.
members['membership_days'] = (
    members['expiration_date'] - members['registration_init_time']
).dt.days.astype(int)
#members['registration_month'] = members['registration_init_time'].dt.month
members['expiration_year'] = members['expiration_date'].dt.year
members = members.drop(
    labels=['bd', 'gender', 'registration_init_time', 'expiration_date'],
    axis='columns',
)

# Attach the member features to every interaction row.
train = pd.merge(train, members, how='left', on='msno')
test = pd.merge(test, members, how='left', on='msno')

In [None]:
def genre_id_count(x):
    """Return how many genre ids a '|'-separated genre string contains.

    The sentinel 'no_genre_id' yields 0; any other string yields the number of
    '|' separators plus one.
    """
    return 0 if x == 'no_genre_id' else 1 + x.count('|')

# Replace missing genre strings with the sentinel that genre_id_count maps to 0.
# The filled column is assigned back rather than calling
# train['genre_ids'].fillna(..., inplace=True): that chained inplace form acts on
# a temporary Series under pandas Copy-on-Write and leaves the frame unchanged,
# after which genre_id_count would fail on the remaining NaN values.
train['genre_ids'] = train['genre_ids'].fillna('no_genre_id')
test['genre_ids'] = test['genre_ids'].fillna('no_genre_id')

# Number of genres per row, stored compactly as int8.
train['genre_ids_count'] = train['genre_ids'].apply(genre_id_count).astype(np.int8)
test['genre_ids_count'] = test['genre_ids'].apply(genre_id_count).astype(np.int8)

In [None]:
# Tokens that separate collaborating artists inside one artist_name string.
# 'and' and 'feat'/'featuring' must be whole words so that names such as
# "Brandy", "Alexander" or "defeat" are not counted as collaborations.
_ARTIST_SEPARATOR_RE = re.compile(r'\band\b|\bfeat(?:uring)?\b|,|&')


def artist_count(x):
    """Count the artist separators found in an artist-name string.

    Parameters
    ----------
    x : str
        Artist name text. The sentinel 'no_artist' means "no artist known".

    Returns
    -------
    int
        0 for the sentinel; otherwise the number of separators (',', '&',
        the word 'and', the word 'feat' or 'featuring'). A single artist
        therefore also yields 0, as before.
    """
    if x == 'no_artist':
        return 0
    return len(_ARTIST_SEPARATOR_RE.findall(x))

# Per-row artist separator tally. Names are passed through str() first, so a
# missing name reaches artist_count as the text 'nan'.
train['artist_count'] = train['artist_name'].map(str).map(artist_count).astype(np.int8)
test['artist_count'] = test['artist_name'].map(str).map(artist_count).astype(np.int8)

In [None]:
# number of times a song has been played before
# value_counts().to_dict() builds the same song_id -> row-count mapping as the
# former comprehension over Series.iteritems(), which was removed in pandas 2.0.
_dict_count_song_played_train = train['song_id'].value_counts().to_dict()
_dict_count_song_played_test = test['song_id'].value_counts().to_dict()


def count_song_played(x):
    """Return the number of rows in which song id `x` appears.

    The train count is used when the song occurs in train; otherwise the test
    count is used; a song found in neither gets 0. The two counts are never
    added together.
    """
    if x in _dict_count_song_played_train:
        return _dict_count_song_played_train[x]
    return _dict_count_song_played_test.get(x, 0)


train['count_song_played'] = train['song_id'].map(str).apply(count_song_played).astype(np.int64)
test['count_song_played'] = test['song_id'].map(str).apply(count_song_played).astype(np.int64)

In [None]:
# number of times an artist has been played
# value_counts().to_dict() builds the same artist_name -> row-count mapping as
# the former comprehension over Series.iteritems(), which was removed in
# pandas 2.0. value_counts() skips NaN, so missing names have no entry.
_dict_count_artist_played_train = train['artist_name'].value_counts().to_dict()
_dict_count_artist_played_test = test['artist_name'].value_counts().to_dict()


def count_artist_played(x):
    """Return the number of rows in which artist name `x` appears.

    The train count is used when the artist occurs in train; otherwise the
    test count is used; an artist found in neither gets 0. The two counts are
    never added together.
    """
    if x in _dict_count_artist_played_train:
        return _dict_count_artist_played_train[x]
    return _dict_count_artist_played_test.get(x, 0)


train['count_artist_played'] = train['artist_name'].map(str).apply(count_artist_played).astype(np.int64)
test['count_artist_played'] = test['artist_name'].map(str).apply(count_artist_played).astype(np.int64)

In [None]:
#how frequently has the user repeated songs?
# msno -> number of train rows with target == 1 for that user.
# NOTE(review): this lookup is built from train targets and then applied to the
# train rows themselves, so the feature carries label information into
# training - confirm that this is intended.
repeated_dict = train.loc[train.target == 1].groupby(['msno'])['target'].count().to_dict()


def user_repeated_songs(x):
    """Bucket a user's positive-target row count into 0, 1 or 2.

    Returns 0 when the user has no entry in `repeated_dict` (or a count of 0),
    1 when the count is below 100, and 2 otherwise.
    """
    counts = repeated_dict.get(x, 0)
    if counts == 0:
        return 0
    return 1 if counts < 100 else 2


train['user_repeated'] = train['msno'].map(str).apply(user_repeated_songs).astype(np.int8)
test['user_repeated'] = test['msno'].map(str).apply(user_repeated_songs).astype(np.int8)

In [None]:
#has anyone listened to the song again?
# song_id -> number of train rows with target == 1 for that song.
# NOTE(review): this lookup is built from train targets and then applied to the
# train rows themselves, so the feature carries label information into
# training - confirm that this is intended.
repeated_songs_dict = train.loc[train.target == 1].groupby(['song_id'])['target'].count().to_dict()


def repeated_songs(x):
    """Bucket a song's positive-target row count into 0, 1 or 2.

    Returns 0 when the song has no entry in `repeated_songs_dict` (or a count
    of 0), 1 when the count is below 100, and 2 otherwise.
    """
    count = repeated_songs_dict.get(x, 0)
    if count == 0:
        return 0
    return 1 if count < 100 else 2


train['repeated_song'] = train['song_id'].map(str).apply(repeated_songs).astype(np.int8)
test['repeated_song'] = test['song_id'].map(str).apply(repeated_songs).astype(np.int8)

In [None]:
# Label-encode every object-dtype column of train, and the same column of test.
cols = list(train.columns)
# print() with parentheses: the statement form `print cols` is a SyntaxError on
# Python 3, while this call prints the same text on Python 2.
print(cols)
# The label itself must not be encoded.
cols.remove('target')

for col in tqdm(cols):
    if train[col].dtype == 'object':
        # Stringify first so missing values become the text 'nan' and can be
        # encoded like any other value.
        train[col] = train[col].apply(str)
        test[col] = test[col].apply(str)

        # Fit on the values of both splits so transform() never meets a value
        # it has not seen.
        le = LabelEncoder()
        train_vals = list(train[col].unique())
        test_vals = list(test[col].unique())
        le.fit(train_vals + test_vals)
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

In [None]:
# Training labels, and the remaining train columns as an int32 feature matrix.
Y = train['target'].values
X = np.array(train.drop('target', axis=1), dtype=np.int32)

# For the test split, Y_test holds the row ids (not labels); they are carried
# along so predictions can be written next to them.
Y_test = test['id'].values
X_test = np.array(test.drop('id', axis=1), dtype=np.int32)

In [None]:
# Release the source frames and ask the collector to reclaim their memory.
# The trailing ';' keeps the notebook from echoing gc.collect()'s return value.
del members, songs, train, test
gc.collect();

In [None]:
# LightGBM training parameters passed to lgb.train.
params = {
    'learning_rate': 0.2,
    'application': 'binary',
    'max_depth': 8,
    'num_leaves': 2 ** 8,
    'verbosity': 0,
    'metric': 'auc',
}

In [None]:
# Hold out 25% of the rows for validation; the fixed random_state makes the
# split repeatable.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.25, random_state=40)

# Train on the training split only. The previous code referenced an undefined
# name (X_new) and paired it with the full label vector Y, which would raise a
# NameError, or, with any full-size matrix, train on the validation rows too.
lgb_train = lgb.Dataset(X_train, Y_train, free_raw_data=False)
# The validation set reuses the training Dataset as its reference.
lgb_eval = lgb.Dataset(X_val, Y_val, reference=lgb_train, free_raw_data=False)

In [None]:
# Boost for 40 rounds on lgb_train, evaluating on lgb_eval.
model = lgb.train(
    params,
    train_set=lgb_train,
    valid_sets=lgb_eval,
    num_boost_round=40,
    verbose_eval=10,
)

In [None]:
# Feature names as reported by the trained booster.
print('Feature names:', model.feature_name())

# Per-feature importance values from the trained booster, converted to a plain
# list for printing.
print('Feature importances:', list(model.feature_importance()))

In [None]:
# Score the test matrix and write one row per test id.
predicted = model.predict(X_test)

# 'target' is the boolean result of thresholding the prediction at 0.5.
# NOTE(review): the model is tuned on AUC, which ranks continuous scores -
# confirm that the consumer of this file wants hard True/False values rather
# than the raw predictions.
subm = pd.DataFrame({'id': Y_test, 'target': predicted > 0.5}, columns=['id', 'target'])
subm.to_csv('../Test/submission_lgbm.csv', index=False)