In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp

In [2]:
import glob
for x in glob.glob("../input/*"):
    print(x)

../input\archive
../input\members.csv
../input\sample_submission.csv
../input\songs.csv
../input\song_extra_info.csv
../input\test.csv
../input\train.csv


In [3]:
df_train = pd.read_csv("../input/train.csv")
df_test  = pd.read_csv('../input/test.csv')
df_songs = pd.read_csv('../input/songs.csv')
df_song_extra = pd.read_csv("../input/song_extra_info.csv")
df_members = pd.read_csv("../input/members.csv",parse_dates=["registration_init_time","expiration_date"])

# Preprocessing

## Members

In [4]:
print(df_members.columns)

Index(['msno', 'city', 'bd', 'gender', 'registered_via',
       'registration_init_time', 'expiration_date'],
      dtype='object')


In [5]:
df_members['validate_days'] = (df_members['expiration_date'] - df_members['registration_init_time']).dt.days

df_members['registration_year'] = df_members['registration_init_time'].apply(lambda x: str(x)[0:4])
df_members['registration_month'] = df_members['registration_init_time'].apply(lambda x: str(x)[4:6])
df_members['registration_date'] = df_members['registration_init_time'].apply(lambda x: str(x)[6:8])

df_members['expiration_year'] = df_members['expiration_date'].apply(lambda x: str(x)[0:4])
df_members['expiration_month'] = df_members['expiration_date'].apply(lambda x: str(x)[4:6])
df_members['expiration_date'] = df_members['expiration_date'].apply(lambda x: str(x)[6:8])

### Song year

In [6]:
def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan

In [7]:
df_song_extra['song_year'] = df_song_extra['isrc'].apply(isrc_to_year)

In [8]:
df_song_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

In [9]:
df_train = df_train.merge(df_songs, how="left", on="song_id")
df_train = df_train.merge(df_members, how="left", on="msno")
df_train = df_train.merge(df_song_extra, how='left', on='song_id')

df_test  = df_test.merge(df_songs, how="left", on="song_id")
df_test  = df_test.merge(df_members, how="left", on="msno")
df_test = df_test.merge(df_song_extra, how='left', on='song_id')

In [10]:
df_train.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,...,registered_via,registration_init_time,expiration_date,validate_days,registration_year,registration_month,registration_date,expiration_year,expiration_month,song_year
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471.0,359,Bastille,Dan Smith| Mark Crew,...,7,2012-01-02,0-,2103,2012,0,1-,2017,-1,2016.0
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,284584.0,1259,Various Artists,,...,9,2011-05-25,9-,2301,2011,0,5-,2017,0,1999.0
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1,225396.0,1259,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,...,9,2011-05-25,9-,2301,2011,0,5-,2017,0,2006.0
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1,255512.0,1019,Soundway,Kwadwo Donkoh,...,9,2011-05-25,9-,2301,2011,0,5-,2017,0,2010.0
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1,187802.0,1011,Brett Young,Brett Young| Kelly Archer| Justin Ebach,...,7,2012-01-02,0-,2103,2012,0,1-,2017,-1,2016.0


## Fill in null

In [11]:
for col in df_train.columns:
    if df_train[col].isnull().any() or df_train[col].isnull().any():
        print('{:10} {}'.format(str(df_train[col].dtype), col))

object     source_system_tab
object     source_screen_name
object     source_type
float64    song_length
object     genre_ids
object     artist_name
object     composer
object     lyricist
float64    language
object     gender
float64    song_year


In [12]:
col_fill_with_unknown = [
    'source_system_tab', 'source_screen_name', 'source_type', 'gender',
    'genre_ids', 'artist_name', 'composer', 'lyricist'
]
for col in col_fill_with_unknown:
    df_train[col].fillna(value="Unknown", inplace=True)
    df_test[col].fillna(value="Unknown", inplace=True)

for col in ['song_year']:
    df_train[col].fillna(value=-1, inplace=True)
    df_test[col].fillna(value=-1, inplace=True)

In [13]:
df_train['song_length'].fillna(value=df_train['song_length'].mean(),inplace=True)
df_test['song_length'].fillna(value=df_test['song_length'].mean(),inplace=True)

In [14]:
df_train['language'].fillna(value=df_train['language'].mode()[0],inplace=True)
df_test['language'].fillna(value=df_test['language'].mode()[0],inplace=True)

In [15]:
df_train['genre_ids'] = df_train['genre_ids'].str.split("|")
df_test['genre_ids'] = df_test['genre_ids'].str.split("|")
df_train['genre_count'] = df_train['genre_ids'].apply(lambda x : len(x) if "Unknown" not in x else 0)
df_test['genre_count'] = df_test['genre_ids'].apply(lambda x : len(x) if "Unknown" not in x else 0)

In [16]:
df_artists = df_train.loc[:,["artist_name","target"]]
artists_repeat = df_artists.groupby(["artist_name"], as_index=False).sum().rename(columns={"target":"repeat_count"})
artists_play = df_artists.groupby(["artist_name"], as_index=False).count().rename(columns = {"target":"play_count"})
df_artists_repeat = artists_repeat.merge(artists_play, how='inner', on='artist_name')

In [17]:
df_artists_repeat['artist_repeat_percentage'] = round(
    (df_artists_repeat['repeat_count']*100) / df_artists_repeat['play_count'], 
    1
)

In [18]:
df_train = df_train.merge(df_artists_repeat, on="artist_name",how="left")
df_test = df_test.merge(df_artists_repeat,on="artist_name",how="left")

In [19]:
# df_test['repeat_count'].fillna(np.nanmedian(df_test['repeat_count']), inplace=True)
# df_test['play_count'].fillna(np.nanmedian(df_test['play_count']), inplace=True)
# df_test['artist_repeat_percentage'].fillna(np.nanmedian(df_test['artist_repeat_percentage']), inplace=True)

for col in ['repeat_count', 'play_count', 'artist_repeat_percentage']:
    df_test[col].fillna(np.nanmedian(df_test[col]), inplace=True, downcast='infer')

In [20]:
del df_artists
del df_artists_repeat
del artists_play
del artists_repeat

In [21]:
# 可能需要处理顿号
df_train['composer'] = df_train['composer'].str.split("|") 
df_test['composer'] = df_test['composer'].str.split("|")
df_train['composer_count'] = df_train['composer'].apply(lambda x : len(x) if "Unknown" not in x else 0 )
df_test['composer_count'] = df_test['composer'].apply(lambda x : len(x) if "Unknown" not in x else 0 )

In [23]:
df_train['lyricist'] = df_train['lyricist'].str.split("|")
df_test['lyricist'] = df_test['lyricist'].str.split("|")
df_train['lyricist_count'] = df_train['lyricist'].apply(lambda x : len(x) if "Unknown" not in x else 0 )
df_test['lyricist_count'] = df_test['lyricist'].apply(lambda x : len(x) if "Unknown" not in x else 0 )

In [24]:
df_train.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,...,registration_date,expiration_year,expiration_month,song_year,genre_count,repeat_count,play_count,artist_repeat_percentage,composer_count,lyricist_count
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471.0,[359],Bastille,"[Dan Smith, Mark Crew]",...,1-,2017,-1,2016.0,1,528,1140,46.3,2,0
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,284584.0,[1259],Various Artists,[Unknown],...,5-,2017,0,1999.0,1,154799,303616,51.0,0,0
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1,225396.0,[1259],Nas,[N. Jones、W. Adams、J. Lordan、D. Ingle],...,5-,2017,0,2006.0,1,62,289,21.5,1,0
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1,255512.0,[1019],Soundway,[Kwadwo Donkoh],...,5-,2017,0,2010.0,1,1,1,100.0,1,0
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1,187802.0,[1011],Brett Young,"[Brett Young, Kelly Archer, Justin Ebach]",...,1-,2017,-1,2016.0,1,161,427,37.7,3,0


In [25]:
df_train.columns

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target', 'song_length', 'genre_ids', 'artist_name',
       'composer', 'lyricist', 'language', 'city', 'bd', 'gender',
       'registered_via', 'registration_init_time', 'expiration_date',
       'validate_days', 'registration_year', 'registration_month',
       'registration_date', 'expiration_year', 'expiration_month', 'song_year',
       'genre_count', 'repeat_count', 'play_count', 'artist_repeat_percentage',
       'composer_count', 'lyricist_count'],
      dtype='object')

# Categorical

In [26]:
import time
# print(time.strftime("%Y-%m-%d %H:%M:%S"))

In [27]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelBinarizer

In [28]:
print(time.strftime("%Y-%m-%d %H:%M:%S"))
one_hot_cols = [
    'source_system_tab', 'source_screen_name', 'source_type', 
    'artist_name', 'language', 'city', 'bd', 'gender', 'registered_via',
    'song_year'
]
one_hot_train = dict()
one_hot_test = dict()
for col in one_hot_cols:
    print(col, end='..')
    olb = LabelBinarizer(sparse_output=True)
    all_data = pd.concat([df_train[col], df_test[col]])
    olb.fit(all_data)
    one_hot_train[col] = olb.transform(df_train[col])
    one_hot_test[col] = olb.transform(df_test[col])
print()
print(time.strftime("%Y-%m-%d %H:%M:%S"))

2017-11-19 22:19:32
source_system_tab..source_screen_name..source_type..artist_name..language..city..bd..gender..registered_via..song_year..
2017-11-19 22:24:20


In [29]:
print(time.strftime("%Y-%m-%d %H:%M:%S"))
mul_hot_cols = ['genre_ids', 'composer', 'lyricist']
mul_hot_train = dict()
mul_hot_test = dict()
for col in mul_hot_cols:
    print(col, end='..')
    mlb = MultiLabelBinarizer(sparse_output=True)
    all_data = pd.concat([df_train[col], df_test[col]])
    mlb.fit(all_data)
    mul_hot_train[col] = mlb.transform(df_train[col])
    mul_hot_test[col] = mlb.transform(df_test[col])
print()
print(time.strftime("%Y-%m-%d %H:%M:%S"))

2017-11-19 22:24:20
genre_ids..composer..lyricist..
2017-11-19 22:25:04


In [30]:
num_cols = ['song_length', 'validate_days', 'genre_count', 'repeat_count',
            'play_count', 'artist_repeat_percentage', 'composer_count', 'lyricist_count']
for col in num_cols:
    print('{:10} {}'.format(str(df_train[col].dtype), col))

float64    song_length
int64      validate_days
int64      genre_count
int64      repeat_count
int64      play_count
float64    artist_repeat_percentage
int64      composer_count
int64      lyricist_count


In [31]:
print(len(num_cols) + len(one_hot_cols) + len(mul_hot_cols), len(df_train.columns))
print(len(num_cols) + len(one_hot_cols) + len(mul_hot_cols), len(df_test.columns))

21 31
21 31


In [32]:
print(set(df_train.columns) - set(num_cols) - set(one_hot_cols) - set(mul_hot_cols))
print(set(df_test.columns) - set(num_cols) - set(one_hot_cols) - set(mul_hot_cols))

{'target', 'msno', 'registration_date', 'registration_year', 'expiration_month', 'expiration_date', 'song_id', 'registration_month', 'expiration_year', 'registration_init_time'}
{'msno', 'registration_date', 'registration_year', 'expiration_month', 'expiration_date', 'song_id', 'registration_month', 'id', 'expiration_year', 'registration_init_time'}


In [33]:
X_trn = sp.sparse.hstack(
    [sp.sparse.csr_matrix(df_train[num_cols].values)] + \
    list(one_hot_train.values()) + \
    list(mul_hot_train.values())
)
y_trn = df_train['target'].values

In [34]:
X_tst = sp.sparse.hstack(
    [sp.sparse.csr_matrix(df_test[num_cols].values)] + \
    list(one_hot_test.values()) + \
    list(mul_hot_test.values())
)
test_ids = df_test['id'].values

In [35]:
X_trn

<7377418x200733 sparse matrix of type '<class 'numpy.float64'>'
	with 155767061 stored elements in COOrdinate format>

# Tuning Models

In [39]:
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [38]:
print(time.strftime("%Y-%m-%d %H:%M:%S"))
clf = LogisticRegression(n_jobs=5)
clf.fit(X_trn, y_trn)
y_prd = clf.predict(X_trn)


2017-11-19 22:29:17


NameError: name 'metrics' is not defined

In [40]:
roc_auc_score(y_trn, y_prd)

0.56114016792032018

In [36]:
print(time.strftime("%Y-%m-%d %H:%M:%S"))
clf = LogisticRegression(n_jobs=5)
scores = cross_val_score(clf, X_trn, y_trn, cv=5, scoring='roc_auc')
print(scores)
print(np.mean(scores))
print(time.strftime("%Y-%m-%d %H:%M:%S"))

2017-11-17 20:16:09
[ 0.71488343  0.58852425  0.58051265  0.63095915  0.61026334]
2017-11-17 20:34:27


In [42]:
np.mean([ 0.71488343 , 0.58852425 , 0.58051265 , 0.63095915 , 0.61026334])

0.62502856399999995

In [37]:
print(time.strftime("%Y-%m-%d %H:%M:%S"))
clf = RandomForestClassifier(n_jobs=5, max_depth=8)
scores = cross_val_score(clf, X_trn, y_trn, cv=5, scoring='roc_auc')
print(scores)
print(np.mean(scores))
print(time.strftime("%Y-%m-%d %H:%M:%S"))

2017-11-17 20:34:27
[ 0.69326307  0.67312315  0.61006419  0.6092595   0.58454601]
0.634051183373
2017-11-17 20:39:09


# Producing Results

In [39]:
# Models and Public Leaderboard score
# clf = LogisticRegression(n_jobs=5) # 0.54230
clf = RandomForestClassifier(n_jobs=5, max_depth=8) # 0.53213

clf.fit(X_trn, y_trn)
y_pred = clf.predict(X_tst)
result_df = pd.DataFrame()
result_df['id'] = test_ids
result_df['target'] = y_pred
timestamp = time.strftime("%Y-%m-%d_%H_%M_%S")
filename = '../result/' + timestamp + '.csv.gz'
result_df.to_csv(filename, compression = 'gzip', index=False, float_format = '%.5f')

# Saving and loading features

In [263]:
# Save
path = '../tmp/v1/'
sp.sparse.save_npz(path + 'X_trn.npz', X_trn)
sp.sparse.save_npz(path + 'X_tst.npz', X_tst)
np.save(file=path + 'y_trn.npy', arr=y_trn)

In [None]:
# Load
path = '../tmp/v1/'
X_trn = sp.sparse.load_npz(path + 'X_trn.npz')
X_tst = sp.sparse.load_npz(path + 'X_tst.npz')
y_trn = np.load(path + 'y_trn.npy')

# Experiments on commands