In [1]:
import csv
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# The task is to characterize users from their profile (bd, city, registered_via, registration duration)
# and songs from their metadata (language, genre_ids, ...), so that for a given user and song
# we can predict whether the user will like (replay) the song or not.
# One can output predict_proba() for an individual classifier and read off the probability of
# being classified as 1 (= replay).

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
songs = pd.read_csv('songs.csv')

In [4]:
user = pd.read_csv('members.csv')

In [5]:
# Build the frame used to fill in "bd" from the member data alone.
# The identifier and the registration channel are left out here; they are
# stored separately and attached again further down.
user_bd_fill = user.drop(['msno', 'registered_via'], axis=1)

In [6]:
user_id = user['msno']
user_reg = user['registered_via']

In [7]:
#df_train['bd'].unique()

In [8]:
#user_num.loc[(user_num['gender'].isnull()),'gender']=0
#user_num['gender'] = user_num['gender'].map({0:0,'female':1,'male':2}).astype(int)

In [9]:
from scipy.stats import spearmanr,pearsonr

In [10]:
user_bd_fill.loc[(user_bd_fill['bd']<0)|(user_bd_fill['bd']>100),'bd']=0

In [11]:
# Replace each raw YYYYMMDD date column by two integer features:
# the calendar year and the day of the year.
for date_col, prefix in (('expiration_date', 'exp'), ('registration_init_time', 'reg')):
    parsed = pd.to_datetime(user_bd_fill[date_col], format='%Y%m%d')
    user_bd_fill[prefix + '_year'] = parsed.dt.year.astype(int)
    user_bd_fill[prefix + '_doy'] = parsed.dt.dayofyear.astype(int)
    # the raw date is no longer needed once the two features exist
    user_bd_fill = user_bd_fill.drop(date_col, axis=1)

In [12]:
spearmanr(user_bd_fill['bd'],user_bd_fill['exp_doy'])
#pearsonr(user_num['registration_init_time'],user_num['expiration_date'])
#city:0.7847 / 0.5461
#gender:0.8713 /0.691
#registered_via:0.2274 /0.2266
#reg_init_time:-0.5262 / -0.452
#expiration_date:0.3371 / 0.119
# pearsonr and spearmanr give similar correlation values.
# let's drop gender
# let's predict bd based on other user data (#registration_init_time, registered_via and expiration_date)
# we use decision tree algorithm to fill in the missing bd

SpearmanrResult(correlation=0.060697900431217984, pvalue=1.8826363771224628e-29)

In [13]:
user_bd_fill = user_bd_fill.drop('city',axis=1)
user_bd_fill = user_bd_fill.drop('gender',axis=1)
user_bd = user_bd_fill['bd']

In [14]:
user_bd_null = user_bd_fill.loc[user_bd_fill['bd']==0]
user_bd_null = user_bd_null.drop('bd',axis=1)

In [15]:
user_bd_train = user_bd_fill.loc[user_bd_fill['bd']!=0]
Xuser_bd_train = user_bd_train.drop('bd',axis=1)
Yuser_bd_train = user_bd_train['bd']

In [16]:
#Let's normalize the user variables here.
#user_bd_fill.head(5)
# Standardize the predictors column by column: divide by the standard
# deviation first, then subtract the mean of the already scaled columns.
# Both statistics are taken from the rows with a known bd only and are reused
# unchanged for the rows whose bd is going to be predicted.
# DataFrame.std / DataFrame.mean are called explicitly with axis=0 because
# np.std(df) / np.mean(df) forward axis=None to pandas, which newer pandas
# releases may interpret as a reduction over the whole frame (one scalar)
# instead of one value per column. ddof=0 matches the np.std default.
scale = Xuser_bd_train.std(axis=0, ddof=0)
Xuser_bd_train = Xuser_bd_train / scale
user_bd_null = user_bd_null / scale
mean = Xuser_bd_train.mean(axis=0)
Xuser_bd_train = Xuser_bd_train - mean
user_bd_null = user_bd_null - mean

In [17]:
Xuser_bd_train['registered_via'] = user.loc[user_bd_fill['bd']!=0,'registered_via']
user_bd_null['registered_via'] = user.loc[user_bd_fill['bd']==0,'registered_via']

In [18]:
Xuser_bd_train.isnull().any()

exp_year          False
exp_doy           False
reg_year          False
reg_doy           False
registered_via    False
dtype: bool

In [19]:
# Impute the missing ages: fit a decision tree on the members with a known bd
# (every distinct bd value acts as a class label) and predict bd for the rest.
# random_state is fixed so that re-running the notebook rebuilds the same tree.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(Xuser_bd_train, Yuser_bd_train)
Y_pred = decision_tree.predict(user_bd_null)
# NOTE: the score below is computed on the very rows the tree was fitted on,
# i.e. it is a training accuracy and not an estimate for unseen members.
acc_decision_tree = round(decision_tree.score(Xuser_bd_train, Yuser_bd_train) * 100, 2)
print(acc_decision_tree)

96.88


In [20]:
user_bd_null['bd'] = Y_pred
Xuser_bd_train['bd']=Yuser_bd_train

In [21]:
user_bd_fill = pd.concat([user_bd_null,Xuser_bd_train])

In [22]:
len(user_bd_fill)

34403

In [23]:
user_bd_fill = user_bd_fill.sort_index()

In [24]:
user_bd_fill['msno'] = user['msno']

In [25]:
#song_info = pd.read_csv('song_extra_info.csv')

In [26]:
# we are not going to use additional song_info.
#songs.head(3)

In [27]:
# Remove the columns that are not used as features before merging, so the
# merged frames stay smaller.
songs = songs.drop(['lyricist', 'composer', 'artist_name'], axis=1)

In [28]:
# Bring song_length to unit standard deviation, then center it on zero
# (the mean is taken after the scaling step).
scale = np.std(songs['song_length'])
length_scaled = songs['song_length'] / scale
mean = np.mean(length_scaled)
songs['song_length'] = length_scaled - mean

In [29]:
df_train = pd.merge(train,user_bd_fill,on='msno',how='left')
df_test = pd.merge(test,user_bd_fill,on='msno',how='left')

In [30]:
#song_mg = pd.merge(songs,song_info,on='song_id',how='left')

In [31]:
df_train = pd.merge(df_train,songs,on='song_id',how='left')
df_test = pd.merge(df_test,songs,on='song_id',how='left')

In [32]:
len(df_test)

2556790

In [33]:
#df_train.loc[(df_train['bd']<0)|(df_train['bd']>100),'bd']=0

In [34]:
# train does not miss an index. How did missing index happen in tidy_split?

In [35]:
# gender does not correlate much with othervariable : cannot be filled
#user_gender_drop =  user_num[user_num.gender!=0]

In [36]:
#spearmanr(user_gender_drop['gender'],user_gender_drop['reg_year'])
#bd-registered_via 0.2258
#bd-registration_init_time -0.332
#bd-expiration_date 0.128
# gender does not have significant correlation to any of the variables: we can drop gender? should we check it against song variables?

In [37]:
#plt.hist(df_train['bd'])
# too many unknown bds : about 40 % : instead of dropping them, find correlation and fill them in!
# e.g. registered_via, registration_init_time, expiration_date-registration_init_time
# is it different from including them in the training data separately?
# it is, because it may induce false correlation if we simply decide all the unknown bds are 0s #

In [38]:
#plt.hist(user_bd_fill['bd'])

In [39]:
#df_train.loc[(df_train['gender'].isnull()),'gender']=0

In [40]:
#df_train['gender'] = df_train['gender'].map({0:0,'female':1,'male':2}).astype(int)

In [41]:
df_test.loc[df_test['genre_ids'].isnull(),'genre_ids']=0
df_train.loc[df_train['genre_ids'].isnull(),'genre_ids']=0

In [42]:
# a fast way to split elements on a given (especially an unusual) delimiter
def tidy_split(df, column, sep='|', keep=False):
    """
    Expand a delimited column so that every piece gets a row of its own.

    Rows whose `column` value is missing are dropped first. Each remaining
    row is repeated once per piece obtained by splitting the (string-cast)
    value on `sep`; the repeated rows keep their original index label, so
    the result may contain duplicated index labels.

    Params
    ------
    df : pandas.DataFrame
        frame holding the column to expand
    column : str
        name of the column whose values are split
    sep : str
        delimiter the values are split on
    keep : bool
        if True, a value that splits into more than one piece is also kept,
        unsplit, as an extra row placed before its pieces

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as `df` and one piece per row.
    """
    present = df.dropna(subset=[column])
    row_positions = []
    pieces = []
    for position, raw in enumerate(present[column].astype(str)):
        parts = raw.split(sep)
        # optionally emit the unsplit value ahead of its pieces
        emitted = [raw] + parts if (keep and len(parts) > 1) else parts
        row_positions.extend([position] * len(emitted))
        pieces.extend(emitted)
    # positions refer to the NaN-free frame, hence iloc rather than loc
    expanded = present.iloc[row_positions, :].copy()
    expanded[column] = pieces
    return expanded

In [43]:
#df_train[['genre1','genre2','genre3']] = df_train['genre_ids'].apply(lambda x: pd.Series(x.split('|')))
#df_train[['genre_id']] = pd.DataFrame(df_train['genre_ids'].str.split('|').tolist()).head()
df_train= tidy_split(df_train, 'genre_ids', sep='|')
df_test= tidy_split(df_test, 'genre_ids', sep='|')
# we will train the models, using all the duplicates
# then we can average the target likelihood of duplicates to estimate the final target probability

In [44]:
# genre_ids should not be normalized, as it is categorical and not continuous variable
#df_train['genre_ids']=df_train['genre_ids'].astype(int)
#df_test['genre_ids']=df_test['genre_ids'].astype(int)

In [45]:
#scale = np.std(df_train['genre_ids'])
#df_train['genre_ids'] /= scale
#df_test['genre_ids'] /= scale
#mean = np.mean(df_train['genre_ids'])
#df_train['genre_ids'] -= mean
#df_test['genre_ids'] -= mean

In [46]:
# save the duplicated index to dataframe
test_idx = pd.DataFrame(df_test.index)
train_idx = pd.DataFrame(df_train.index)

In [47]:
# check what variable correlates with genre_ids
#plt.scatter(df_train['bd'],df_train['registered_via'])
#pd.crosstab(df_train['genre_cut'],df_train['gender'],normalize='columns')
#spearmanr(df_train['genre_ids'],df_train['gender'])
#somehow this correlation calculation (and crosstab) never finishes#
#is it because it is too big?#
# bd does play a role in preferred genre # song_length, language (a bit)
# expiration_date does!!! WHY????
# registered_via does a little
# city does a little too
# registration_init_time not much
# gender does not seem to #
# are these gender - genre correlation at any statistical significance? how can I tell that? what is the standard deviation? #
# for some genres, mixed gender fraction is significantly outside either one of the gender: does it mean it is small number stats#

In [48]:
# are song language and user city strongly correlated? - some exclusions do exist
# is there a language strongly preferred in 

In [49]:
train_id = df_train['msno']
test_id = df_test['msno']

In [50]:
len(df_test)

2705361

In [51]:
X_train = df_train.drop('msno',axis=1)
X_test = df_test.drop('msno',axis=1)
X_train = X_train.drop('target',axis=1)

In [52]:
X_train = X_train.drop('song_id',axis=1)
X_test = X_test.drop('song_id',axis=1)

In [53]:
X_train.loc[X_train['language'].isnull(),'language']=0
X_test.loc[X_test['language'].isnull(),'language']=0

In [54]:
test_id = X_test['id']
X_test = X_test.drop('id',axis=1)

In [55]:
Y_train = pd.DataFrame(df_train['target'])

In [56]:
# Missing values in the listening-context columns and in song_length are
# replaced by 0 in the training features.
for col in ('source_system_tab', 'source_screen_name', 'source_type', 'song_length'):
    X_train.loc[X_train[col].isnull(), col] = 0

In [57]:
# Missing values in the listening-context columns and in song_length are
# replaced by 0 in the test features.
for col in ('source_system_tab', 'source_screen_name', 'source_type', 'song_length'):
    X_test.loc[X_test[col].isnull(), col] = 0

In [58]:
n_train = len(X_train)
n_test = len(X_test)

In [59]:
#X_test.loc[X_test['genre_ids'].isnull(),'genre_ids']=0

In [60]:
X_tgt = pd.concat([X_train,X_test])

In [61]:
X_tgt['src_sys_tab'] = pd.factorize(X_tgt['source_system_tab'])[0]

In [62]:
X_tgt = X_tgt.drop('source_system_tab',axis=1)

In [63]:
X_tgt['src_scr_nm'] = pd.factorize(X_tgt['source_screen_name'])[0]

In [64]:
X_tgt = X_tgt.drop('source_screen_name',axis=1)

In [65]:
X_tgt['src_typ'] = pd.factorize(X_tgt['source_type'])[0]

In [66]:
X_tgt = X_tgt.drop('source_type',axis=1)

In [67]:
X_tgt.head(5)
# categorical variables : registered_via, genre_ids, language, scr_sys_tab, src_scr_nm, src_typ
# we should properly deal with them! : let's use binary encoder to turn them into categorical variables.

Unnamed: 0,exp_year,exp_doy,reg_year,reg_doy,registered_via,bd,song_length,genre_ids,language,src_sys_tab,src_scr_nm,src_typ
0,0.067646,0.577033,-0.06842,-1.651606,7,36,-0.251818,359,52.0,0,0,0
1,0.067646,0.298691,-0.370239,-0.383273,9,24,0.233597,1259,52.0,1,1,1
2,0.067646,0.298691,-0.370239,-0.383273,9,24,-0.134213,1259,52.0,1,1,1
3,0.067646,0.298691,-0.370239,-0.383273,9,24,0.052936,1019,-1.0,1,1,1
4,0.067646,0.577033,-0.06842,-1.651606,7,36,-0.367832,1011,52.0,0,0,0


In [68]:
import category_encoders as ce

In [69]:
encoder = ce.binary.BinaryEncoder(cols=['registered_via','genre_ids','language','src_sys_tab','src_scr_nm','src_typ'])

In [70]:
X_tgt = encoder.fit_transform(X_tgt)

In [71]:
X_tgt.tail(4)

Unnamed: 0,registered_via_0,registered_via_1,registered_via_2,genre_ids_0,genre_ids_1,genre_ids_2,genre_ids_3,genre_ids_4,genre_ids_5,genre_ids_6,...,src_typ_0,src_typ_1,src_typ_2,src_typ_3,exp_year,exp_doy,reg_year,reg_doy,bd,song_length
2556786,0,0,1,0,0,0,0,0,1,1,...,0,0,0,0,0.067646,0.310289,0.535219,-0.551793,24,-0.310257
2556787,0,0,1,0,0,0,0,0,1,1,...,0,0,0,0,0.067646,0.310289,0.535219,-0.551793,24,-0.211555
2556788,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0.067646,0.310289,0.535219,-0.551793,24,-0.513171
2556789,0,0,1,0,0,0,0,0,1,1,...,0,0,0,0,0.067646,0.310289,0.535219,-0.551793,24,-0.095958


In [72]:
X_train = X_tgt[:n_train]
X_test = X_tgt[n_train:]

In [73]:
X_test.isnull().any().sum()

0

In [74]:
X_train.isnull().any().sum()

0

In [75]:
len(X_test)

2705361

In [78]:
from keras.models import Sequential
from keras.layers import Dense

In [94]:
# Fully connected binary classifier: four ReLU hidden layers narrowing from
# 40 to 10 units, followed by one sigmoid unit.
# The input width is read from the feature matrix instead of being
# hard-coded, so the model still builds if the encoded column count changes.
n_features = X_train.shape[1]
model = Sequential()
model.add(Dense(40, input_dim=n_features, activation='relu'))
model.add(Dense(30, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [80]:
X_train = np.asarray(X_train)
Y_train = np.asarray(Y_train)

In [95]:
# Train the network. No validation data is passed, so the loss/acc values
# logged per epoch are measured on the training data.
model.fit(X_train, Y_train, epochs=100, batch_size=50000,  verbose=2)
# batch size ~ 1% of the training dataset (author's estimate)
# NOTE(review): an earlier note here said "10 iterations", but the call above
# requests 100 epochs -- confirm which one is intended.
# calculate predictions: one output of the final sigmoid unit per test row
X_test = np.asarray(X_test)
Y_pred = model.predict(X_test)

Epoch 1/100
23s - loss: 0.6839 - acc: 0.5892
Epoch 2/100
22s - loss: 0.6531 - acc: 0.6219
Epoch 3/100
25s - loss: 0.6515 - acc: 0.6235
Epoch 4/100
24s - loss: 0.6505 - acc: 0.6243
Epoch 5/100
24s - loss: 0.6497 - acc: 0.6255
Epoch 6/100
25s - loss: 0.6489 - acc: 0.6268
Epoch 7/100
25s - loss: 0.6482 - acc: 0.6277
Epoch 8/100
24s - loss: 0.6476 - acc: 0.6284
Epoch 9/100
22s - loss: 0.6472 - acc: 0.6290
Epoch 10/100
24s - loss: 0.6469 - acc: 0.6293
Epoch 11/100
26s - loss: 0.6466 - acc: 0.6297
Epoch 12/100
23s - loss: 0.6465 - acc: 0.6298
Epoch 13/100
23s - loss: 0.6462 - acc: 0.6302
Epoch 14/100
26s - loss: 0.6461 - acc: 0.6304
Epoch 15/100
26s - loss: 0.6459 - acc: 0.6305
Epoch 16/100
24s - loss: 0.6457 - acc: 0.6306
Epoch 17/100
25s - loss: 0.6457 - acc: 0.6308
Epoch 18/100
26s - loss: 0.6454 - acc: 0.6310
Epoch 19/100
24s - loss: 0.6453 - acc: 0.6311
Epoch 20/100
25s - loss: 0.6452 - acc: 0.6312
Epoch 21/100
24s - loss: 0.6451 - acc: 0.6312
Epoch 22/100
27s - loss: 0.6450 - acc: 0.63

In [None]:
#X_test.tail(5)

In [78]:
#random_forest = RandomForestClassifier(n_estimators=20, max_depth=30)
#random_forest.fit(X_train, Y_train)
#Y_pred = clf.predict(X_test)
#print clf.score(X_train, Y_train)
#acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
#print (acc_random_forest)
# accuracy too high: this indicates a high-variance (overfitting) problem.
# Remember that even the best models in the competition reach only about 70 % success on the test dataset.
# best performance so far : RF w/ depth 30 est 20, normalized (minor improvement over depth 20 / not normalized case)
# now let's implement NN!

  from ipykernel import kernelapp as app


83.39


In [79]:
#Y_pred = random_forest.predict_proba(X_test)

In [89]:
# Tag every prediction with the index label saved in test_idx (taken from
# df_test after tidy_split, so a label repeats once per genre of the song),
# then average the predictions that share a label.
Y_pred = pd.DataFrame(Y_pred)
Y_pred['index'] =test_idx
Y_pred = Y_pred.groupby('index').mean()
# alternative kept for reference: keep only the first duplicate instead of averaging
#Y_pred = Y_pred[~Y_pred.index.duplicated(keep='first')]

In [90]:
Y_pred.columns=['pred']

In [None]:
# modify this such that it won't discard, but average, the duplicates
#df_train = df_train[~df_train.index.duplicated(keep='first')]

In [None]:
# modify this such that it won't discard, but average, the duplicates
#df_test = df_test[~df_test.index.duplicated(keep='first')]

In [91]:
# Build the submission table (one row per test id with its averaged prediction).
# test_id still carries the index of the genre-expanded test frame, so its
# labels repeat, whereas Y_pred['pred'] has one entry per label. Handing both
# to the DataFrame constructor forces an index alignment whose outcome with
# duplicated labels depends on the pandas version (recent releases raise).
# Keeping the first id per label gives two uniquely indexed Series that align
# one to one.
test_id_unique = test_id[~test_id.index.duplicated(keep='first')]
submission = pd.DataFrame({
    "id": test_id_unique,
    "target": Y_pred['pred'],
})
# kept from the original flow: collapses any repeated id and orders rows by id
submission = submission.groupby('id').mean().reset_index()

submission.to_csv('NNsubmission.csv', index=False)

In [92]:
# Sanity check: the submission must have exactly one row per row of test.csv.
rows_match = len(test) == len(submission)
if not rows_match:
    print('something is wrong')