In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb

## Lendo informações dos usuários e das músicas

In [2]:
# Load the member table: city, gender and registered_via are read as
# categoricals, bd as uint8, and both date columns are parsed to datetimes.
member_dtypes = {
    'city': 'category',
    'bd': np.uint8,
    'gender': 'category',
    'registered_via': 'category',
}
member_date_columns = ['registration_init_time', 'expiration_date']
member = pd.read_csv(
    "../input/members.csv",
    dtype=member_dtypes,
    parse_dates=member_date_columns,
)

In [3]:
# Load the song table, reading every listed column as a categorical.
song_categorical_columns = [
    'genre_ids',
    'language',
    'artist_name',
    'composer',
    'lyricist',
    'song_id',
]
song = pd.read_csv(
    "../input/songs.csv",
    dtype={column: 'category' for column in song_categorical_columns},
)

In [4]:
# Load the extra per-song information with pandas' default dtypes.
song_extra = pd.read_csv("../input/song_extra_info.csv")

Obtendo os nomes das colunas das tabelas de usuários e de músicas

In [5]:
# Keep the column names of each lookup table; the merge cells select by them.
columns_member = list(member.columns)
columns_song = list(song.columns)
columns_song_extra = list(song_extra.columns)

for column_names in (columns_member, columns_song, columns_song_extra):
    print(column_names)

['msno', 'city', 'bd', 'gender', 'registered_via', 'registration_init_time', 'expiration_date']
['song_id', 'song_length', 'genre_ids', 'artist_name', 'composer', 'lyricist', 'language']
['song_id', 'name', 'isrc']


## Lendo base de dados de treinamento

In [6]:
# Load the training interactions: id and source columns as categoricals,
# the binary target as uint8.
train_dtypes = {
    'msno': 'category',
    'source_system_tab': 'category',
    'source_screen_name': 'category',
    'source_type': 'category',
    'target': np.uint8,
    'song_id': 'category',
}
train = pd.read_csv("../input/train.csv", dtype=train_dtypes)

## Lendo base de dados de teste

In [7]:
# Load the test interactions with the same categorical columns as the
# training file (there is no target column here).
test_categorical_columns = [
    'msno',
    'source_system_tab',
    'source_screen_name',
    'source_type',
    'song_id',
]
test = pd.read_csv(
    "../input/test.csv",
    dtype={column: 'category' for column in test_categorical_columns},
)

### Criando conjuntos de treinamento e teste

Conjunto de treinamento

In [8]:

# Attach member, song and extra song attributes to every training row.
# Left joins keep all training rows even when a lookup table has no match.
ds_train = (
    train
    .merge(member[columns_member], on='msno', how='left')
    .merge(song[columns_song], on='song_id', how='left')
    .merge(song_extra[columns_song_extra], on='song_id', how='left')
)

Conjunto de teste

In [9]:
# Attach member, song and extra song attributes to every test row.
# Left joins keep all test rows even when a lookup table has no match.
ds_test = (
    test
    .merge(member[columns_member], on='msno', how='left')
    .merge(song[columns_song], on='song_id', how='left')
    .merge(song_extra[columns_song_extra], on='song_id', how='left')
)

# 2. Adição de features

tempo de assinatura

In [10]:
# Membership length in whole days: expiration date minus registration date.
for frame in (ds_train, ds_test):
    membership_span = frame['expiration_date'] - frame['registration_init_time']
    frame['membership_days'] = membership_span.dt.days.astype(int)

Indicador de que artista e compositor são o mesmo

In [11]:
# Flag rows whose artist name equals the composer string (1) or not (0).
# Comparing plain arrays sidesteps category-vs-category comparison rules.
for frame in (ds_train, ds_test):
    artist_names = np.asarray(frame['artist_name'])
    composer_names = np.asarray(frame['composer'])
    frame['artist_composer'] = (artist_names == composer_names).astype(np.int8)

Quantidade de gêneros, compositores e letristas

In [12]:
def count_vals(x):
    """Count the '|'-separated entries held in a single cell value.

    Parameters
    ----------
    x : object
        A cell value. Strings may hold multiple entries joined by '|'.

    Returns
    -------
    int
        ``1 + number of '|' characters`` when ``x`` is a string; ``1`` for
        any non-string value (for example a missing value stored as NaN).
    """
    # isinstance, unlike an exact type comparison, also recognises str
    # subclasses (e.g. numpy.str_), which would otherwise always count as 1.
    if not isinstance(x, str):
        return 1
    return 1 + x.count('|')

In [13]:
# Build the "number of entries" features from the '|'-separated columns.
# Maps each new feature name to the column it is counted from.
count_features = {
    'number_of_genres': 'genre_ids',
    'number_of_composers': 'composer',
    'number_of_lyricists': 'lyricist',
}
for frame in (ds_train, ds_test):
    for feature_name, source_column in count_features.items():
        frame[feature_name] = frame[source_column].apply(count_vals)

data de registro na plataforma

In [14]:
def _split_registration_date(frame):
    """Add registration year/month/day columns and drop the raw timestamp.

    Uses the vectorised ``.dt`` accessor instead of converting every
    timestamp to a string and slicing it row by row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a datetime column ``registration_init_time``.
        New columns are added to this frame in place.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``registration_init_time``.
    """
    registered = frame['registration_init_time']
    # Cast to int64 so the dtype matches what the per-row int() produced.
    frame['registration_year'] = registered.dt.year.astype('int64')
    frame['registration_month'] = registered.dt.month.astype('int64')
    # NOTE: despite its name, 'registration_date' holds the day of the month.
    frame['registration_date'] = registered.dt.day.astype('int64')
    return frame.drop(['registration_init_time'], axis=1)


ds_train = _split_registration_date(ds_train)
ds_test = _split_registration_date(ds_test)

data de expiração do registro na plataforma

In [15]:
def _split_expiration_date(frame):
    """Split ``expiration_date`` into year, month and day-of-month, in place.

    Uses the vectorised ``.dt`` accessor instead of per-row string slicing.
    If the column is no longer a datetime (the cell was already run), the
    accessor raises before any column is written, so a re-run cannot
    silently store garbage in ``expiration_year``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a datetime column ``expiration_date``.
    """
    expires = frame['expiration_date']
    # Cast to int64 so the dtype matches what the per-row int() produced.
    year = expires.dt.year.astype('int64')
    month = expires.dt.month.astype('int64')
    day = expires.dt.day.astype('int64')
    frame['expiration_year'] = year
    frame['expiration_month'] = month
    # NOTE: the original datetime column is overwritten with the day of the
    # month; the column keeps its name because later cells refer to it.
    frame['expiration_date'] = day


_split_expiration_date(ds_train)
_split_expiration_date(ds_test)

Dividindo o ISRC em país, organização que gravou a música, ano de gravação e número único da música

In [16]:
def _isrc_fields(isrc_codes):
    """Slice raw ISRC strings into country, registrant and year parts.

    Parameters
    ----------
    isrc_codes : pandas.Series
        ISRC strings; missing values are NaN.

    Returns
    -------
    tuple of pandas.Series
        ``(country, registrant, year)`` where ``country`` is characters 0-1,
        ``registrant`` is characters 2-4 and ``year`` is characters 5-6
        expanded to a four-digit float year (NaN where the code is missing).
    """
    country = isrc_codes.str.slice(0, 2)
    registrant = isrc_codes.str.slice(2, 5)
    year = isrc_codes.str.slice(5, 7).astype(float)  # ISRC two-digit year
    # Two-digit years above 17 are read as 19xx, 00-17 as 20xx.
    # NaN compares False and stays NaN after the addition.
    year = year + np.where(year > 17, 1900.0, 2000.0)
    return country, registrant, year


def _shared_factorize(train_values, test_values):
    """Integer-encode train and test values with one common code table.

    Factorizing each set on its own numbers values by order of first
    appearance, so the same code would stand for different values in train
    and test. Encoding the concatenation keeps the codes consistent, and
    because train comes first its codes equal a train-only factorize.
    Missing values are encoded as -1.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_codes, test_codes)``.
    """
    combined = pd.concat([train_values, test_values], ignore_index=True)
    codes = pd.factorize(combined)[0]
    n_train = len(train_values)
    return codes[:n_train], codes[n_train:]


# Pull the raw codes out (re-indexed from 0) before dropping the column.
isrc_values = pd.Series(ds_train.isrc.values)
isrc_values_test = pd.Series(ds_test.isrc.values)
ds_train.drop(['isrc'], axis=1, inplace=True)
ds_test.drop(['isrc'], axis=1, inplace=True)

train_country, train_org, train_year = _isrc_fields(isrc_values)
test_country, test_org, test_year = _isrc_fields(isrc_values_test)

train_country_codes, test_country_codes = _shared_factorize(train_country, test_country)
train_org_codes, test_org_codes = _shared_factorize(train_org, test_org)

ds_train['isrc_pais'] = train_country_codes
ds_train['isrc_org'] = train_org_codes
ds_train['isrc_year'] = train_year

ds_test['isrc_pais'] = test_country_codes
ds_test['isrc_org'] = test_org_codes
ds_test['isrc_year'] = test_year

Teste

substituindo os dados faltantes pela moda do conjunto

In [17]:
# Impute missing values (NaN or the literal string "null") in every training
# column with that column's most frequent value.
columns_tt = ds_train.columns.tolist()
for i in columns_tt:
    # Series.mode() ignores NaN by default, so there is no need to copy the
    # whole frame just to filter out the rows where this column is null.
    modes = ds_train[i].mode()
    if modes.empty:
        # Column has no non-missing value: nothing to impute with.
        continue
    moda = modes[0]
    # Assign the result back to the frame. An in-place replace on the
    # selected column is a chained operation that is not guaranteed to
    # update ds_train (it never does under pandas Copy-on-Write).
    ds_train[i] = ds_train[i].replace([np.nan, "null"], moda)
    print (i, moda)

msno MXIMDXO0j3UpaT7FvOSGW6Y5zfhlh+xYjTqGoUdMzEE=
song_id reXuGcEWDDCnL0K3Th//3DFG4S1ACSpJMzA+CFipo1g=
source_system_tab my library
source_screen_name Local playlist more
source_type local-library
target 1
city 1
bd 0
gender male
registered_via 9
expiration_date 30
song_length 235415.0
genre_ids 465
artist_name Various Artists
composer 周杰倫
lyricist 阿信
language 3.0
name 演員
membership_days 3
artist_composer 0
number_of_genres 1
number_of_composers 1
number_of_lyricists 1
registration_year 2016
registration_month 12
registration_date 6
expiration_year 2017
expiration_month 9
isrc_pais 3
isrc_org 0
isrc_year 2016.0


In [18]:
# Impute missing values (NaN or the literal string "null") in every test
# column with that column's most frequent value.
# NOTE(review): the mode is computed on the test set itself, so a column can
# be filled with a different value than in the training set - confirm that
# this is intended.
columns_tt = ds_test.columns.tolist()
for i in columns_tt:
    # Series.mode() ignores NaN by default, so there is no need to copy the
    # whole frame just to filter out the rows where this column is null.
    modes = ds_test[i].mode()
    if modes.empty:
        # Column has no non-missing value: nothing to impute with.
        continue
    moda = modes[0]
    # Assign the result back to the frame. An in-place replace on the
    # selected column is a chained operation that is not guaranteed to
    # update ds_test (it never does under pandas Copy-on-Write).
    ds_test[i] = ds_test[i].replace([np.nan, "null"], moda)
    print (i, moda)

id 0
msno KGXNZ/H3VxvET/+rGxlrAe7Gpz2eKMXyuSg3xh8Ij1M=
song_id ZcKgNis1AP1LA0sdtIddrtk7P04iiJzJrXvwXdT/X3Q=
source_system_tab my library
source_screen_name Local playlist more
source_type online-playlist
city 1
bd 0
gender male
registered_via 9
expiration_date 30
song_length 234161.0
genre_ids 465
artist_name Various Artists
composer 周杰倫
lyricist 方文山
language 3.0
name 妮妮 (Nini)
membership_days 3
artist_composer 0
number_of_genres 1
number_of_composers 1
number_of_lyricists 1
registration_year 2016
registration_month 1
registration_date 6
expiration_year 2017
expiration_month 9
isrc_pais 0
isrc_org 0
isrc_year 2016.0


### Retirando as colunas de id do usuário e da música e o nome da música

In [19]:
# Remove the user id, song id and song name columns from both frames.
identifier_columns = ["msno", "song_id", "name"]
for frame in (ds_train, ds_test):
    frame.drop(columns=identifier_columns, inplace=True)

### Dividindo em input e output

In [20]:
# Training set: the label goes to y, every remaining column is a feature.
y = ds_train['target']
X = ds_train.drop(columns=['target'])

# Test set: keep the row ids aside for the submission file and use every
# remaining column as a feature.
index_test = ds_test['id'].values
X_test = ds_test.drop(columns=['id'])

### Treinamento com LGBM

In [21]:
# The training data is also the only validation set, so the AUC logged
# after each boosting round is measured on the training data itself.
d_train = lgb.Dataset(X, y)
watchlist = [d_train]

# Those parameters are almost out of hat, so feel free to play with them. I can tell
# you, that if you do it right, you will get better results for sure ;)
print('Training LGBM model...')
params = {
    'learning_rate': 0.5,
    'application': 'binary',
    # 'feature_fraction': 0.8,  (left disabled)
    'max_depth': 8,
    'num_leaves': 2**8,
    'verbosity': 0,
    'metric': 'auc',
}

model = lgb.train(
    params,
    train_set=d_train,
    num_boost_round=150,
    valid_sets=watchlist,
    verbose_eval=1,
)

print ("done")

Training LGBM model...
[1]	training's auc: 0.675523
[2]	training's auc: 0.684953
[3]	training's auc: 0.69067
[4]	training's auc: 0.693497
[5]	training's auc: 0.697877
[6]	training's auc: 0.699972
[7]	training's auc: 0.70172
[8]	training's auc: 0.702612
[9]	training's auc: 0.703397
[10]	training's auc: 0.70418
[11]	training's auc: 0.70536
[12]	training's auc: 0.705597
[13]	training's auc: 0.706047
[14]	training's auc: 0.706363
[15]	training's auc: 0.707552
[16]	training's auc: 0.707996
[17]	training's auc: 0.708233
[18]	training's auc: 0.708699
[19]	training's auc: 0.709407
[20]	training's auc: 0.710184
[21]	training's auc: 0.710452
[22]	training's auc: 0.710949
[23]	training's auc: 0.711219
[24]	training's auc: 0.71207
[25]	training's auc: 0.713361
[26]	training's auc: 0.713665
[27]	training's auc: 0.715109
[28]	training's auc: 0.715668
[29]	training's auc: 0.715919
[30]	training's auc: 0.716294
[31]	training's auc: 0.71652
[32]	training's auc: 0.716941
[33]	training's auc: 0.718025
[3

### Fazendo o predict com o LGBM

In [22]:
# Score the test features and write the (id, target) pairs to a gzipped CSV
# with predictions rounded to five decimals.
print('Making predictions and saving them...')
p_test = model.predict(X_test)

subm = pd.DataFrame({'id': index_test, 'target': p_test})
subm.to_csv(
    'submission.csv.gz',
    index=False,
    compression='gzip',
    float_format='%.5f',
)
print('Done!')

Making predictions and saving them...
Done!
