In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb

## Lendo informações dos usuários e das músicas

In [2]:
# Load the member (user) table. The listed columns are read as categoricals,
# `bd` as an unsigned 8-bit integer, and both date columns are parsed by pandas.
member_dtypes = {
    'city': 'category',
    'bd': np.uint8,
    'gender': 'category',
    'registered_via': 'category',
}
member = pd.read_csv(
    "../input/members.csv",
    dtype=member_dtypes,
    parse_dates=['registration_init_time', 'expiration_date'],
)

In [3]:
# Load the song table; every column listed here is read as a categorical.
song_dtypes = {
    name: 'category'
    for name in ('genre_ids', 'language', 'artist_name', 'composer', 'lyricist', 'song_id')
}
song = pd.read_csv("../input/songs.csv", dtype=song_dtypes)

In [4]:
# Extra per-song metadata; the column listing printed further down shows
# song_id, name and isrc.
song_extra = pd.read_csv("../input/song_extra_info.csv")

Pegando os nomes das colunas existentes nas tabelas de usuários e de músicas

In [5]:
# Remember each table's column names; they are used to select the columns
# that get merged into the train/test frames.
columns_member = list(member.columns)
columns_song = list(song.columns)
columns_song_extra = list(song_extra.columns)

for column_names in (columns_member, columns_song, columns_song_extra):
    print(column_names)

['msno', 'city', 'bd', 'gender', 'registered_via', 'registration_init_time', 'expiration_date']
['song_id', 'song_length', 'genre_ids', 'artist_name', 'composer', 'lyricist', 'language']
['song_id', 'name', 'isrc']


## Lendo base de dados de treinamento

In [None]:
# Load the training interactions: identifiers and source descriptors as
# categoricals, the binary `target` label as an unsigned 8-bit integer.
train_dtypes = {
    'msno': 'category',
    'source_system_tab': 'category',
    'source_screen_name': 'category',
    'source_type': 'category',
    'target': np.uint8,
    'song_id': 'category',
}
train = pd.read_csv("../input/train.csv", dtype=train_dtypes)

## Lendo base de dados de teste

In [None]:
# Load the test interactions with the same categorical columns as the
# training file (there is no `target` column to type here).
test_dtypes = {
    name: 'category'
    for name in ('msno', 'source_system_tab', 'source_screen_name', 'source_type', 'song_id')
}
test = pd.read_csv("../input/test.csv", dtype=test_dtypes)

### Criando conjuntos de treinamento e teste

Conjunto de treinamento

In [None]:

# Left-join member, song and extra song info onto the training interactions;
# `how='left'` keeps every training row even when a lookup finds no match.
ds_train = (
    train
    .merge(member[columns_member], on='msno', how='left')
    .merge(song[columns_song], on='song_id', how='left')
    .merge(song_extra[columns_song_extra], on='song_id', how='left')
)

Conjunto de teste

In [None]:
# Same three left joins as for the training frame, applied to the test
# interactions so both frames end up with the same joined columns.
ds_test = (
    test
    .merge(member[columns_member], on='msno', how='left')
    .merge(song[columns_song], on='song_id', how='left')
    .merge(song_extra[columns_song_extra], on='song_id', how='left')
)

# 2. Adição de features

tempo de assinatura

In [None]:
# Membership length in whole days: expiration date minus registration date.
for frame in (ds_train, ds_test):
    membership_span = frame['expiration_date'] - frame['registration_init_time']
    frame['membership_days'] = membership_span.dt.days.astype(int)

Indicador de que o artista e o compositor são o mesmo

In [None]:
# 1 when the artist and composer values of a row are equal, else 0.
# The columns are converted to plain arrays before the elementwise comparison
# (presumably because the two categorical columns do not share categories).
for frame in (ds_train, ds_test):
    artists = np.asarray(frame['artist_name'])
    composers = np.asarray(frame['composer'])
    frame['artist_composer'] = (artists == composers).astype(np.int8)

Quantidade de gêneros, compositores e letristas

In [None]:
def count_vals(x, sep='|'):
    """Count how many `sep`-separated values a field holds.

    Parameters
    ----------
    x : object
        The raw field value. Strings (including str subclasses such as
        ``numpy.str_``) are split on `sep`; anything else (NaN, None,
        numbers) is treated as a single value.
    sep : str, default '|'
        Separator between multiple values inside one field.

    Returns
    -------
    int
        ``1 + number of separators`` for strings, ``1`` for non-strings.
    """
    # isinstance (not an exact type comparison) so that str subclasses are
    # still recognised as text and have their separators counted.
    if not isinstance(x, str):
        return 1
    return 1 + x.count(sep)

In [None]:
# New count feature -> multi-valued source column it is derived from.
multi_value_features = {
    'number_of_genres': 'genre_ids',
    'number_of_composers': 'composer',
    'number_of_lyricists': 'lyricist',
}

# Apply count_vals to each source column, first for train and then for test.
for frame in (ds_train, ds_test):
    for feature_name, source_column in multi_value_features.items():
        frame[feature_name] = frame[source_column].apply(count_vals)

data de registro na plataforma

In [None]:
# Split the registration timestamp into year / month / day-of-month features
# and then drop the raw timestamp column.
# The vectorized `.dt` accessor replaces the former per-row
# str(timestamp)[a:b] -> int round-trip: it yields the same numbers for valid
# dates, avoids a Python call per row, and produces NaN instead of raising
# when a timestamp is missing (NaT).
registration_train = ds_train['registration_init_time']
ds_train['registration_year'] = registration_train.dt.year
ds_train['registration_month'] = registration_train.dt.month
ds_train['registration_date'] = registration_train.dt.day  # day of month
ds_train = ds_train.drop(columns=['registration_init_time'])

registration_test = ds_test['registration_init_time']
ds_test['registration_year'] = registration_test.dt.year
ds_test['registration_month'] = registration_test.dt.month
ds_test['registration_date'] = registration_test.dt.day  # day of month
ds_test = ds_test.drop(columns=['registration_init_time'])

data de expiração do registro na plataforma

In [None]:
# Split the expiration timestamp into year / month / day-of-month features.
# The vectorized `.dt` accessor replaces the former per-row
# str(timestamp)[a:b] -> int round-trip (same values for valid dates, no
# Python call per row, NaN instead of an exception for missing dates).
for frame in (ds_train, ds_test):
    expiration = frame['expiration_date']  # still the parsed datetime here
    frame['expiration_year'] = expiration.dt.year
    frame['expiration_month'] = expiration.dt.month
    # Overwritten last and in place: from here on `expiration_date` holds the
    # day of month, and the column keeps its original position in the frame.
    frame['expiration_date'] = expiration.dt.day

Dividindo o ISRC em país, organização que gravou a música, ano de gravação e número único da música

In [None]:
# Break the ISRC string into features for both frames:
#   chars 0-1 -> isrc_pais (integer code), chars 2-4 -> isrc_org (integer code),
#   chars 5-6 -> isrc_year (two-digit year expanded to four digits).
# The remaining characters (7-11) are not used as a feature.
isrc_values = pd.Series(ds_train['isrc'].values)
isrc_values_test = pd.Series(ds_test['isrc'].values)

for frame, isrc in ((ds_train, isrc_values), (ds_test, isrc_values_test)):
    frame.drop(columns=['isrc'], inplace=True)

    # Integer-encode the two textual parts; codes are assigned per frame.
    for column, (start, stop) in (('isrc_pais', (0, 2)), ('isrc_org', (2, 5))):
        as_category = isrc.str.slice(start, stop).astype('category')
        frame[column] = pd.factorize(as_category)[0]

    # Two-digit year: values above 17 are read as 19xx, values up to 17 as 20xx.
    frame['isrc_year'] = isrc.str.slice(5, 7).astype(float)
    frame.loc[frame['isrc_year'] > 17, 'isrc_year'] += 1900
    frame.loc[frame['isrc_year'] < 18, 'isrc_year'] += 2000

Teste

In [77]:
# Encode the ISRC country and registrant codes with ONE mapping shared by the
# train and test frames.
# pd.factorize numbers values in order of first appearance, so factorizing
# each frame on its own gives the same integer different meanings in train and
# in test, and the model then scores test rows with mismatched codes.
# Factorizing the concatenation of both slices guarantees a given country /
# registrant always maps to the same integer; missing ISRCs get -1.
# `isrc_values` / `isrc_values_test` are the raw ISRC strings captured in the
# ISRC-splitting cell, row-aligned with ds_train / ds_test.
n_train_rows = len(isrc_values)
for column, (start, stop) in (('isrc_pais', (0, 2)), ('isrc_org', (2, 5))):
    combined_part = pd.concat(
        [isrc_values.str.slice(start, stop), isrc_values_test.str.slice(start, stop)],
        ignore_index=True,
    )
    shared_codes = pd.factorize(combined_part)[0]
    ds_train[column] = shared_codes[:n_train_rows]
    ds_test[column] = shared_codes[n_train_rows:]

In [85]:
# Sanity check of the test frame: for every column print its position, name,
# number of distinct values and the type of the first distinct value.
columns_tt = list(ds_test.columns)
for ind, column in enumerate(columns_tt):
    distinct_values = list(ds_test[column].unique())
    print (ind, column, "==>", len(distinct_values), type(distinct_values[0]), "\n")
# ds_test

0 id ==> 2556790 <class 'numpy.int64'> 

1 source_system_tab ==> 10 <class 'str'> 

2 source_screen_name ==> 23 <class 'str'> 

3 source_type ==> 13 <class 'str'> 

4 city ==> 21 <class 'str'> 

5 bd ==> 88 <class 'numpy.uint64'> 

6 gender ==> 3 <class 'float'> 

7 registered_via ==> 6 <class 'str'> 

8 expiration_date ==> 31 <class 'numpy.int64'> 

9 song_length ==> 45658 <class 'numpy.float64'> 

10 genre_ids ==> 502 <class 'str'> 

11 artist_name ==> 27564 <class 'str'> 

12 composer ==> 52307 <class 'str'> 

13 lyricist ==> 24911 <class 'float'> 

14 language ==> 11 <class 'str'> 

15 membership_days ==> 4240 <class 'numpy.int64'> 

16 artist_composer ==> 2 <class 'numpy.int64'> 

17 number_of_genres ==> 8 <class 'numpy.int64'> 

18 number_of_composers ==> 26 <class 'numpy.int64'> 

19 number_of_lyricists ==> 20 <class 'numpy.int64'> 

20 registration_year ==> 14 <class 'numpy.int64'> 

21 registration_month ==> 12 <class 'numpy.int64'> 

22 registration_date ==> 31 <class 'numpy.

In [None]:
# ds_test.columns.tolist()

### retirando da base algumas colunas que tem muitas categorias

### Treinamento

In [None]:
# columns_dftrain = [
#  'target',
#  'source_system_tab',
#  'source_screen_name',
#  'source_type',
#  'city',
#  'bd',
#  'song_length',
#  'language',
#  'gender',
#  'isrc',
#  'registration_year',
#  'registration_month',
#  'registration_date',
#  'expiration_year',
#  'expiration_month',
#  'registered_via',
#  'expiration_date']

# dftrain = ds_train.reindex(columns=columns_dftrain)
# # dftrain

### Teste

In [None]:
# columns_dftest = [
#  'id',
#  'source_system_tab',
#  'source_screen_name',
#  'source_type',
#  'city',
#  'bd',
#  'song_length',
#  'language',
#  'gender',
#  'isrc',
#  'registration_year',
#  'registration_month',
#  'registration_date',
#  'expiration_year',
#  'expiration_month',
#  'registered_via',
#  'expiration_date']

# dftest = ds_test.reindex(columns=columns_dftest)
# # dftrain

In [None]:
# ds_train

### Tratando a coluna isrc

### Treinamento

In [None]:
# isrc_values = pd.Series(dftrain.isrc.values)
# dftrain.drop(['isrc'], axis=1, inplace=True)
# dftrain['isrc_pais'] = isrc_values.str.slice(0,2)
# dftrain['isrc_org'] = isrc_values.str.slice(2,5)
# dftrain['isrc_year'] = isrc_values.str.slice(5,7).astype(float)  # IRSC issue date
# dftrain['isrc_number'] = isrc_values.str.slice(7,12)

# dftrain.loc[dftrain['isrc_year'] > 17, 'isrc_year'] += 1900  # 1900's songs
# dftrain.loc[dftrain['isrc_year'] < 18, 'isrc_year'] += 2000  # 2000's songs

### Teste

In [None]:
# isrc_values_test = pd.Series(dftest.isrc.values)
# dftest.drop(['isrc'], axis=1, inplace=True)
# dftest['isrc_pais'] = isrc_values_test.str.slice(0,2)
# dftest['isrc_org'] = isrc_values_test.str.slice(2,5)
# dftest['isrc_year'] = isrc_values_test.str.slice(5,7).astype(float)  # IRSC issue date
# dftest['isrc_number'] = isrc_values_test.str.slice(7,12)

# dftest.loc[dftest['isrc_year'] > 17, 'isrc_year'] += 1900  # 1900's songs
# dftest.loc[dftest['isrc_year'] < 18, 'isrc_year'] += 2000  # 2000's songs

### Removendo valores nulos

### Treinamento

In [None]:
# for i in dftrain.columns.tolist():
#     dftrain[i].replace([np.nan, "null"], 0, inplace=True)

### Teste

In [None]:
# for i in dftest.columns.tolist():
#     dftest[i].replace([np.nan, "null"], 0, inplace=True)

In [None]:
# dftest

### Fazendo dummies na base

### Treinamento

In [None]:
# str_columns = ['source_system_tab', 'source_screen_name', 'source_type', 'gender']
# for i in str_columns:
#     print (i)
#     dummies = pd.get_dummies(dftrain[i], prefix=i)
#     dftrain = pd.concat([dftrain,dummies], axis=1)
    
# dftrain.drop(str_columns, axis=1, inplace=True)

### Teste

In [None]:
# str_columns_test = ['source_system_tab', 'source_screen_name', 'source_type', 'gender']
# for i in str_columns_test:
#     print (i)
#     dummies = pd.get_dummies(dftest[i], prefix=i)
#     dftest = pd.concat([dftest,dummies], axis=1)

# dftest.drop(str_columns_test, axis=1, inplace=True)

### Removendo do dataframe as colunas que foram transformadas em dummies

In [None]:

# dftest

## treinamento

In [None]:
# dftrain.to_csv('data.csv.gz', compression = 'gzip', index=False)

### Retirando colunas com texto

In [80]:
# Remove the identifier and free-text columns from both frames, in place.
text_columns = ["msno", "song_id", "name"]
for frame in (ds_train, ds_test):
    frame.drop(columns=text_columns, inplace=True)

### Dividindo em input e output

In [88]:
# Training data: label vector and the feature matrix without the label.
y = ds_train['target']
X = ds_train.drop(columns=['target'])

# Test data: keep the row ids for the submission file, drop them from the features.
index_test = ds_test['id'].values
X_test = ds_test.drop(columns=['id'])

# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()

# X = sc.fit_transform(X)
# X_test = sc.transform(X_test)

### Treinamento com LGBM

In [None]:
d_train = lgb.Dataset(X, y)
# The only validation set is the training set itself, so the AUC printed
# during training is the training AUC.
watchlist = [d_train]

# These parameters are close to arbitrary starting values; tuning them is
# expected to improve the results.
print('Training LGBM model...')
params = {
    'learning_rate': 0.0001,
    'application': 'binary',
    'feature_fraction': 0.8,
    # NOTE(review): presumably positional indices of the categorical columns
    # in X -- confirm against X.columns before changing the feature set.
    'categorical_feature': [0,1,2,3,5,6,13],
    'max_depth': 8,
    'num_leaves': 2**8,
    'verbosity': 0,
    'metric': 'auc',
}

model = lgb.train(
    params,
    train_set=d_train,
    num_boost_round=150,
    valid_sets=watchlist,
    verbose_eval=1,
)

print ("done")

Training LGBM model...


### Fazendo o predict com o LGBM

In [None]:
print('Making predictions and saving them...')
p_test = model.predict(X_test)

# One row per test id with its predicted score, written as a gzip-compressed
# CSV with five decimal places.
subm = pd.DataFrame({'id': index_test, 'target': p_test})
subm.to_csv(
    'submission.csv.gz',
    compression='gzip',
    index=False,
    float_format='%.5f',
)
print('Done!')

### Treinando com linear regression

In [None]:
# from sklearn.linear_model import LinearRegression

# reg = LinearRegression(normalize=True)
# reg.fit(X,y)

In [None]:
# a = reg.predict(X[:50000])
# print (list(filter(lambda x : x < 0.15, a)))
# # a = list(map(lambda x: 1 if x > 0.8 else 0, a))
# b = pd.Series(a, y[:50000])
# b

## Teste

In [None]:
# columns_dftest = [
#  'source_system_tab',
#  'source_screen_name',
#  'source_type',
#  'city',
#  'bd',
#  'registered_via',
#  'registration_init_time',
#  'expiration_date',
#  'song_length',
#  'language',
#  'isrc']

# dftest = tttest.reindex(columns=columns_dftest)
# dftest

#### tratando os valores nulos

In [None]:
# for i in dftest.columns.tolist():
#     value = dftest[np.logical_not(np.logical_or(dftest[i].isnull(), dftest[i].isin(['null'])))]
#     print (len(value), i)

# for i in dftest.columns.tolist():
#     dftest[i].replace([np.nan, "null"], 0, inplace=True)
    
# for i in dftest.columns.tolist():
#     value = dftest[np.logical_not(np.logical_or(dftest[i].isnull(), dftest[i].isin(['null'])))]
#     print (len(value), i)
    

#### tratando colunas categoricas

In [None]:
# str_columns = ['source_system_tab', 'source_screen_name', 'source_type']
# for i in str_columns:
#     print (i)
#     dummies = pd.get_dummies(dftest[i], prefix=i)
#     dftest = pd.concat([dftest,dummies], axis=1)
# dftest

In [None]:
# dftest.drop(str_columns, axis=1, inplace=True)
# dftest

#### tratando coluna isrc

In [None]:
# isrc_values_test = pd.Series(dftest.isrc.values)
# dftest.drop(['isrc'], axis=1, inplace=True)
# # isrc_column = dftrain.reindex(columns=['isrc'])

# # isrc_column['cc'] = isrc_values.str.slice(0,2)  # Country Code column
# # isrc_column['xxx'] = isrc_values.str.slice(2,5) # IRSC Issuer
# dftest['isrc'] = isrc_values_test.str.slice(5,7).astype(float)  # IRSC issue date

In [None]:
# dftest.loc[dftest['isrc'] > 17, 'isrc'] += 1900  # 1900's songs
# dftest.loc[dftest['isrc'] < 18, 'isrc'] += 2000  # 2000's songs

# dftest

In [None]:
# reg.predict(dftest)

In [None]:
# columns_isrc = isrc_column.columns.tolist()
# for i in columns_isrc:
#     cc = list(isrc_column[i].unique())
#     print (i, "==>" , len(cc), type(cc[0]) , "\n")

Juntando com a base de treinamento

In [None]:
# dftrain.drop(['isrc'], axis=1, inplace=True) # removendo a coluna isrc de dftrain
# dftrain = pd.concat([dftrain,isrc_column], axis=1)
# dftrain

In [None]:
# columns = tt.columns.tolist()
# # print (columns)
# dd = tt.reindex(columns=columns)
# # dd.plot.bar(x="XA", y="BA")
# # print (len(dd.msno.unique()))
# # print (len(dd.song_id.unique()))
# datas = []
# labels = []
# for column in columns:
#     datas.append(len(dd[column].unique()))
#     labels.append(column)
    
# print (datas, labels)
# print ("\n", dd.genre_ids.unique())
# for i in dd.genre_ids.unique():
#     print (len(dd[dd.genre_ids.isin([i])]), i)
# plot_bar(datas[2:], labels[2:0])

In [None]:
# def source_system_tab(data_frame):
#     """
#     recebe o data_frame da base de dados e retorna o data frame retirando as 
#     linhas que tem valores nulos  na coluna source_system_tab
#     """
#     df = data_frame[pd.notnull(data_frame.source_system_tab)]
#     df = df[~df.source_system_tab.isin(["null"])]
#     return df
# for i in df.source_system_tab.unique():
#     print (i, len(df[df.source_system_tab.isin([i])]))

In [None]:
# tt.describe()

In [None]:
# columns = data_set.columns.tolist()[:-1]
# data_set.drop_duplicates(subset=columns, keep='first', inplace=True) 

In [None]:
# data_set.describe()

In [None]:
# import numpy as np
# import pandas as pd
# import lightgbm as lgb

# from sklearn.neural_network import MLPRegressor

# print('Loading data...')
# data_path = './'
# train = pd.read_csv(data_path + 'train.csv', nrows=50000, dtype={'msno' : 'category',
#                                                 'source_system_tab' : 'category',
#                                                   'source_screen_name' : 'category',
#                                                   'source_type' : 'category',
#                                                   'target' : np.uint8,
#                                                   'song_id' : 'category'})
# test = pd.read_csv(data_path + 'test.csv', nrows=50000, dtype={'msno' : 'category',
#                                                 'source_system_tab' : 'category',
#                                                 'source_screen_name' : 'category',
#                                                 'source_type' : 'category',
#                                                 'song_id' : 'category'})
# songs = pd.read_csv(data_path + 'songs.csv', nrows=50000, dtype={'genre_ids': 'category',
#                                                   'language' : 'category',
#                                                   'artist_name' : 'category',
#                                                   'composer' : 'category',
#                                                   'lyricist' : 'category',
#                                                   'song_id' : 'category'})
# members = pd.read_csv(data_path + 'members.csv',dtype={'city' : 'category',
#                                                       'bd' : np.uint8,
#                                                       'gender' : 'category',
#                                                       'registered_via' : 'category'})
# songs_extra = pd.read_csv(data_path + 'song_extra_info.csv', nrows=50000)

# print('Data preprocessing...')
# song_cols = ['song_id', 'artist_name', 'genre_ids', 'song_length', 'language']
# train = train.merge(songs[song_cols], on='song_id', how='left')
# test = test.merge(songs[song_cols], on='song_id', how='left')

# members['registration_year'] = members['registration_init_time'].apply(lambda x: int(str(x)[0:4]))
# members['registration_month'] = members['registration_init_time'].apply(lambda x: int(str(x)[4:6]))
# members['registration_date'] = members['registration_init_time'].apply(lambda x: int(str(x)[6:8]))

# members['expiration_year'] = members['expiration_date'].apply(lambda x: int(str(x)[0:4]))
# members['expiration_month'] = members['expiration_date'].apply(lambda x: int(str(x)[4:6]))
# members['expiration_date'] = members['expiration_date'].apply(lambda x: int(str(x)[6:8]))
# members = members.drop(['registration_init_time'], axis=1)

# def isrc_to_year(isrc):
#     if type(isrc) == str:
#         if int(isrc[5:7]) > 17:
#             return 1900 + int(isrc[5:7])
#         else:
#             return 2000 + int(isrc[5:7])
#     else:
#         return np.nan
        
# songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)
# songs_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

# train = train.merge(members, on='msno', how='left')
# test = test.merge(members, on='msno', how='left')

# train = train.merge(songs_extra, on = 'song_id', how = 'left')
# test = test.merge(songs_extra, on = 'song_id', how = 'left')

# import gc
# del members, songs; gc.collect();

# for col in train.columns:
#     if train[col].dtype == object:
#         train[col] = train[col].astype('category')
#         test[col] = test[col].astype('category')

# X = train.drop(['target'], axis=1)
# y = train['target'].values

# print (X)
# X_test = test.drop(['id'], axis=1)
# ids = test['id'].values


# mlp = MLPRegressor(hidden_layer_sizes=(3), activation='relu', verbose=True)
# mlp.fit(X, y)

# p_test = mlp.predict(X_test)

# """
# del train, test; gc.collect();

# d_train = lgb.Dataset(X, y)
# watchlist = [d_train]

# #Those parameters are almost out of hat, so feel free to play with them. I can tell
# #you, that if you do it right, you will get better results for sure ;)
# print('Training LGBM model...')
# params = {}
# params['learning_rate'] = 0.2
# params['application'] = 'binary'
# params['max_depth'] = 8
# params['num_leaves'] = 2**8
# params['verbosity'] = 0
# params['metric'] = 'auc'

# print (d_train)

# model = lgb.train(params, train_set=d_train, num_boost_round=50, valid_sets=watchlist, \
# verbose_eval=5)

# print('Making predictions and saving them...')
# p_test = model.predict(X_test)
# """
# subm = pd.DataFrame()
# subm['id'] = ids
# subm['target'] = p_test
# subm.to_csv('submissio.csv.gz', compression = 'gzip', index=False, float_format = '%.5f')
# print('Done!')

In [None]:
# x = np.random.randn(1000, 3)
# print (x)
# # fig, axes = plt.subplots()

# colors = ['red', 'blue', 'green']
# N = 7
# menMeans = [20, 35, 30, 35, 27,5,2]
# womenMeans = [25, 32, 34, 20, 25,15,3]
# menStd = (2, 3, 4, 1, 2)
# womenStd = (3, 5, 2, 3, 3)
# ind = np.arange(N)    # the x locations for the groups
# width = 0.75       # the width of the bars: can also be len(x) sequence

# p1 = plt.bar(ind, tuple(menMeans), width, color='#d62728'#, yerr=menStd
#             )
# p2 = plt.bar(ind, tuple(womenMeans), width,
#              bottom=menMeans#, yerr=womenStd
#             )

# plt.ylabel('Scores')
# plt.title('Scores by group and gender')
# plt.xticks(ind, ['G1', 'G2', 'G3', 'G4', 'G5'])
# plt.yticks(np.arange(0, 81, 10))
# plt.legend((p1[0], p2[0]), ('Men', 'Women'))

# plt.show()