In [1]:
from IPython.core.display import HTML, display
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
# Specify the path to the fastai directory
lib_PATH = '/home/ubuntu/fastai/fastai'   # Update this
!cd {lib_PATH}
# Create sym-link to the fastai library
!ln -s {lib_PATH} ./

ln: failed to create symbolic link './fastai': File exists


In [4]:
# Load required modules
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display
from sklearn import metrics
from dateutil.relativedelta import relativedelta

# Load Data

## `train.csv`

In [5]:
PATH = '/home/ubuntu/wsmd_music_rec/data/'

In [6]:
train = pd.read_csv(f'{PATH}train.csv')
train.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1


In [None]:
def display_all(df):
    """Display `df` with pandas' row and column display limits raised to 1000.

    The raised limits are active only for the duration of the call;
    `pd.option_context` restores the previous settings on exit.
    """
    wide_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*wide_limits):
        display(df)
display_all(train.describe(include='all').transpose())

In [None]:
train.isnull().any()

## `songs.csv`

In [None]:
songs = pd.read_csv(f'{PATH}songs.csv')

In [None]:
songs.head()

In [None]:
songs.isnull().any()

In [None]:
display_all(songs.describe(include='all').transpose())

## `members.csv`

In [None]:
members = pd.read_csv(f'{PATH}members.csv', parse_dates=['registration_init_time','expiration_date'])
members.head()

In [None]:
display_all(members.describe(include='all').transpose())

## `song_extra_info.csv`

In [None]:
songs_extra = pd.read_csv(f'{PATH}song_extra_info.csv')
songs_extra.head()

In [None]:
display_all(songs_extra.describe(include='all').transpose())

## `test.csv`

In [None]:
test = pd.read_csv(f'{PATH}test.csv')
test.head()

In [None]:
display_all(test.describe(include='all').transpose())

# Merge Data & Feature Engineering

In [None]:
train = train.merge(songs, on='song_id', how='left')

In [None]:
test = test.merge(songs, on='song_id', how='left')

In [None]:
# Whole days between registration and expiration for every member.
membership_span = members['expiration_date'] - members['registration_init_time']
members['membership_days'] = membership_span.dt.days.astype(int)

In [None]:
# Break the registration timestamp into year / month / day-of-month columns.
registered_on = members['registration_init_time']
members['registration_year'] = registered_on.dt.year
members['registration_month'] = registered_on.dt.month
members['registration_date'] = registered_on.dt.day

In [None]:
# Break the expiration timestamp into year / month / day-of-month columns.
# The last assignment replaces the original `expiration_date` timestamps with
# their day-of-month, so the `.dt` accessor is no longer available on that
# column afterwards (the cell is meant to run once per load of `members`).
expires_on = members['expiration_date']
members['expiration_year'] = expires_on.dt.year
members['expiration_month'] = expires_on.dt.month
members['expiration_date'] = expires_on.dt.day

In [None]:
members.head()

In [None]:
def isrc_to_year(isrc, cutoff=17):
    """Extract the four-digit year encoded in an ISRC code.

    Characters 5-6 (0-based) of the code are read as a two-digit year. Values
    greater than `cutoff` are placed in the 1900s, all others in the 2000s.

    Parameters
    ----------
    isrc : object
        The ISRC code. Anything that is not a string (e.g. NaN for a missing
        code) yields NaN.
    cutoff : int, default 17
        Largest two-digit year that is still interpreted as 20xx.

    Returns
    -------
    int or float
        The year, or `np.nan` when `isrc` is not a string or its year field
        cannot be parsed as an integer.
    """
    if not isinstance(isrc, str):
        return np.nan
    try:
        # Parse the year field once instead of re-slicing for every use.
        two_digit_year = int(isrc[5:7])
    except ValueError:
        # A malformed code is treated like a missing one rather than aborting
        # a whole-column `.apply`.
        return np.nan
    century = 1900 if two_digit_year > cutoff else 2000
    return century + two_digit_year

In [None]:
songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)

In [None]:
train = train.merge(members, on='msno', how='left')
test = test.merge(members, on='msno', how='left')

In [None]:
# Attach the columns of `songs_extra` to every training row (left join on song_id).
train = train.merge(songs_extra, on='song_id', how='left')
# Missing song lengths get 200000 as a placeholder. The result is assigned back
# instead of calling fillna(inplace=True) on `train.song_length`: the in-place
# form acts on an intermediate object and is not guaranteed to reach `train`
# (pandas Copy-on-Write).
train['song_length'] = train['song_length'].fillna(200000).astype(np.uint32)
train['song_id'] = train['song_id'].astype('category')

In [None]:
# Attach the columns of `songs_extra` to every test row (left join on song_id).
test = test.merge(songs_extra, on='song_id', how='left')
# Missing song lengths get 200000 as a placeholder. The result is assigned back
# instead of calling fillna(inplace=True) on `test.song_length`: the in-place
# form acts on an intermediate object and is not guaranteed to reach `test`
# (pandas Copy-on-Write).
test['song_length'] = test['song_length'].fillna(200000).astype(np.uint32)
test['song_id'] = test['song_id'].astype('category')

In [None]:
# Run fastai's `train_cats` (from fastai.structured) on both frames; later cells
# use the `.cat` accessor on text columns, so this presumably converts string
# columns to pandas categoricals in place -- confirm against the fastai source.
# NOTE(review): the two frames are converted independently, so the same string
# may end up with a different category code in `train` than in `test`.
# fastai.structured also offers `apply_cats(df, trn)`, which presumably reuses
# the training categories -- verify whether that is what is wanted here.
train_cats(train)
train_cats(test)

In [None]:
display_all(train.isnull().sum().sort_index()/len(train))

# More Features

In [None]:
def genre_id_count(x):
    """Return how many genre ids are packed into the '|'-separated string `x`.

    The placeholder 'no_genre_id' counts as zero genres; any other string
    counts as one more than its number of '|' characters.
    """
    return 0 if x == 'no_genre_id' else x.count('|') + 1

# Give missing genre ids an explicit 'no_genre_id' category, then count the
# genres listed on every row. Same treatment for train and test.
for frame in (train, test):
    genres = frame['genre_ids']
    # `.cat` exposes the categorical API. The placeholder must be a known
    # category before it can be used as a fill value; adding it only when it is
    # absent keeps the cell re-runnable.
    if 'no_genre_id' not in genres.cat.categories:
        genres = genres.cat.add_categories(['no_genre_id'])
    # Assign back instead of fillna(inplace=True) on the selected column, which
    # is not guaranteed to modify the frame itself (pandas Copy-on-Write).
    frame['genre_ids'] = genres.fillna('no_genre_id')
    frame['genre_ids_count'] = frame['genre_ids'].apply(genre_id_count).astype(np.int8)

In [None]:
def lyricist_count(x):
    """Count the lyricists named in the string `x`.

    The count is one more than the total number of separator characters
    ('|', '/', backslash, ';') found in `x`. The placeholder 'no_lyricist'
    counts as zero.
    """
    if x == 'no_lyricist':
        return 0
    separators = ('|', '/', '\\', ';')
    return sum(x.count(sep) for sep in separators) + 1

# Give missing lyricists an explicit 'no_lyricist' category, then count the
# lyricists listed on every row. Same treatment for train and test.
for frame in (train, test):
    lyricists = frame['lyricist']
    # The placeholder must be a known category before it can be used as a fill
    # value; adding it only when absent keeps the cell re-runnable.
    if 'no_lyricist' not in lyricists.cat.categories:
        lyricists = lyricists.cat.add_categories(['no_lyricist'])
    # Assign back instead of fillna(inplace=True) on the selected column, which
    # is not guaranteed to modify the frame itself (pandas Copy-on-Write).
    frame['lyricist'] = lyricists.fillna('no_lyricist')
    frame['lyricists_count'] = frame['lyricist'].apply(lyricist_count).astype(np.int8)

In [None]:
def composer_count(x):
    """Count the composers named in the string `x`.

    Starts at one and adds one for every '|', '/', backslash or ';' found in
    `x`. The placeholder 'no_composer' counts as zero.
    """
    if x == 'no_composer':
        return 0
    total = 1
    for separator in ('|', '/', '\\', ';'):
        total += x.count(separator)
    return total

# Give missing composers an explicit 'no_composer' category, then count the
# composers listed on every row. Same treatment for train and test.
for frame in (train, test):
    composers = frame['composer']
    # The placeholder must be a known category before it can be used as a fill
    # value; adding it only when absent keeps the cell re-runnable.
    if 'no_composer' not in composers.cat.categories:
        composers = composers.cat.add_categories(['no_composer'])
    # Assign back instead of fillna(inplace=True) on the selected column, which
    # is not guaranteed to modify the frame itself (pandas Copy-on-Write).
    frame['composer'] = composers.fillna('no_composer')
    frame['composer_count'] = frame['composer'].apply(composer_count).astype(np.int8)



In [None]:
def is_featured(x):
    """Return 1 when the text form of `x` contains the substring 'feat', else 0.

    The match is a plain, case-sensitive substring test on `str(x)`.
    """
    return int('feat' in str(x))

# Give missing artist names an explicit 'no_artist' category, then flag rows
# whose artist name mentions a featured artist. Same treatment for train and test.
for frame in (train, test):
    artists = frame['artist_name']
    # The placeholder must be a known category before it can be used as a fill
    # value; adding it only when absent keeps the cell re-runnable.
    if 'no_artist' not in artists.cat.categories:
        artists = artists.cat.add_categories(['no_artist'])
    # Assign back instead of fillna(inplace=True) on the selected column, which
    # is not guaranteed to modify the frame itself (pandas Copy-on-Write).
    frame['artist_name'] = artists.fillna('no_artist')
    frame['is_featured'] = frame['artist_name'].apply(is_featured).astype(np.int8)

In [None]:
def artist_count(x):
    """Estimate how many artists are credited in the string `x`.

    Starts at one and adds one for every occurrence of ' and', ',', ' feat'
    or '&'. The placeholder 'no_artist' counts as zero.
    """
    if x == 'no_artist':
        return 0
    joiners = (' and', ',', ' feat', '&')
    return 1 + sum(x.count(joiner) for joiner in joiners)

for frame in (train, test):
    # Number of credited artists on each row.
    frame['artist_count'] = frame['artist_name'].apply(artist_count).astype(np.int8)

    # if artist is same as composer
    artist_values = np.asarray(frame['artist_name'])
    composer_values = np.asarray(frame['composer'])
    frame['artist_composer'] = (artist_values == composer_values).astype(np.int8)

In [None]:
# if artist, lyricist and composer are all three same
for frame in (train, test):
    # Convert each column to a plain array first, so that every comparison is
    # an element-wise array == array test (previously some operands were
    # compared as Series vs. array because of misplaced parentheses).
    artist_values = np.asarray(frame['artist_name'])
    composer_values = np.asarray(frame['composer'])
    lyricist_values = np.asarray(frame['lyricist'])
    all_same = ((artist_values == composer_values) &
                (artist_values == lyricist_values) &
                (composer_values == lyricist_values))
    frame['artist_composer_lyricist'] = all_same.astype(np.int8)

In [None]:
# is song language 17 or 45.
def song_lang_boolean(x):
    """Return 1 when the text form of `x` contains '17.0' or '45.0', else 0."""
    text = str(x)
    return int(any(code in text for code in ('17.0', '45.0')))

# Flag rows whose language value matches song_lang_boolean, for both frames.
for frame in (train, test):
    frame['song_lang_boolean'] = frame['language'].apply(song_lang_boolean).astype(np.int8)

In [None]:
# Median song length of the training rows; the same threshold is used for
# both train and test.
_median_song_length = np.median(train['song_length'])

def smaller_song(x):
    """Return 1 if `x` is strictly below the training median song length, else 0."""
    return 1 if x < _median_song_length else 0

for frame in (train, test):
    frame['smaller_song'] = frame['song_length'].apply(smaller_song).astype(np.int8)

In [None]:
from collections import Counter

# Occurrences of every song id over the train and test rows combined; each row
# of either frame contributes one occurrence.
song_count = Counter(list(train['song_id']) + list(test['song_id']))

def count_song_played(x):
    """Return how often song id `x` occurs in train and test together.

    `Counter` returns 0 for keys it has never seen, so no KeyError handling
    is required.
    """
    return song_count[x]


train['count_song_played'] = train['song_id'].apply(count_song_played).astype(np.int64)
test['count_song_played'] = test['song_id'].apply(count_song_played).astype(np.int64)

In [None]:
# Occurrences of every artist name over the train and test rows combined.
# Stored under its own name so that the `artist_count` function defined in an
# earlier cell is not overwritten by this Counter.
artist_play_count = Counter(list(train['artist_name']) + list(test['artist_name']))

def count_artist_played(x):
    """Return how often artist name `x` occurs in train and test together.

    `Counter` returns 0 for keys it has never seen, so no KeyError handling
    is required.
    """
    return artist_play_count[x]


train['count_artist_played'] = train['artist_name'].apply(count_artist_played).astype(np.int64)
test['count_artist_played'] = test['artist_name'].apply(count_artist_played).astype(np.int64)

In [None]:
# train['registration_year'][train['registration_year']==np.inf] = np.nan
# Treat infinite song years as missing. Assigned back rather than replaced
# in place on `train.song_year`, which is not guaranteed to modify `train`
# itself (pandas Copy-on-Write).
train['song_year'] = train['song_year'].replace([np.inf, -np.inf], np.nan)

In [None]:
# Missing song years become -1 so the column can be stored as int64. Assigned
# back rather than filled in place on `train.song_year`, which is not
# guaranteed to modify `train` itself (pandas Copy-on-Write).
train['song_year'] = train['song_year'].fillna(-1).astype('int64')

In [None]:
train = train.drop(['registration_init_time'], axis=1)

In [None]:
# Infinite or missing song years become -1 so the column can be stored as
# int64. Results are assigned back rather than modified in place on
# `test.song_year`, which is not guaranteed to modify `test` itself
# (pandas Copy-on-Write).
test['song_year'] = (
    test['song_year']
    .replace([np.inf, -np.inf], np.nan)
    .fillna(-1)
    .astype('int64')
)
test = test.drop(['registration_init_time'], axis=1)

In [None]:
train = train.drop(['isrc'], axis=1)
test = test.drop(['isrc'], axis=1)

In [None]:
# Missing language values become -1 and the column is stored as int8. Results
# are assigned back rather than filled in place on the attribute-accessed
# column, which is not guaranteed to modify the frame (pandas Copy-on-Write).
for frame in (train, test):
    frame['language'] = frame['language'].fillna(-1).astype('int8')

In [None]:
# Number of times a song was played in a city
# https://stackoverflow.com/questions/41682240/pandas-aggregate-count-in-dataframe
# Counts training rows per (city, song_id) pair, renames the 'size' aggregate
# to `num_plays_by_city`, and left-joins it onto both frames -- so the test
# feature is derived from training rows only.
# NOTE(review): `song_id` was cast to category in an earlier cell; depending on
# the pandas version, grouping on a categorical key may also emit unobserved
# (city, song_id) combinations -- confirm the row count of `plays_by_city`.
# NOTE(review): `as_index=False` followed by `.reset_index()` looks redundant
# and may add a stray `index` column on some pandas versions -- verify.
plays_by_city = train.groupby(by=['city', 'song_id'], as_index=False)['target'].agg(['size']).reset_index()
plays_by_city = plays_by_city.rename(columns = {'size':'num_plays_by_city'})
train = pd.merge(train, plays_by_city, how='left', on=['city', 'song_id'])
test = pd.merge(test, plays_by_city, how='left', on=['city', 'song_id'])

In [None]:
# set to -1 because we divide using this value later; 0 will be a problem
# Assigned back rather than filled in place on the attribute-accessed column,
# which is not guaranteed to modify the frame (pandas Copy-on-Write).
for frame in (train, test):
    frame['count_song_played'] = frame['count_song_played'].fillna(-1)

In [None]:
# For each song, Ratio of 1 and 0 for target var
# `successes` is the per-song sum of `target` over the training rows; it is
# divided by `count_song_played` to form `addiction_ratio`.
# NOTE(review): the sum includes each row's own target, so this feature is
# built from the label it will be used to predict -- confirm this is intended.
successes = (
    train.groupby(by=['song_id'], as_index=False)['target']
    .sum()
    .rename(columns={'target': 'successes'})
)
train_with_successes = train.merge(successes, how='left', on=['song_id'])
train_with_successes['addiction_ratio'] = (
    train_with_successes['successes'] / train_with_successes['count_song_played']
)
train = train_with_successes.drop(['successes'], axis=1)

In [None]:
# Carry the per-song training `successes` over to the test rows and form the
# same ratio; the helper column is dropped again afterwards.
test_with_successes = test.merge(successes, how='left', on=['song_id'])
test_with_successes['addiction_ratio'] = (
    test_with_successes['successes'] / test_with_successes['count_song_played']
)
test = test_with_successes.drop(['successes'], axis=1)

In [None]:
# Rows without a ratio (no matching `successes` after the left join) get 0.
# Assigned back rather than filled in place on the attribute-accessed column,
# which is not guaranteed to modify the frame (pandas Copy-on-Write).
for frame in (train, test):
    frame['addiction_ratio'] = frame['addiction_ratio'].fillna(0)

In [None]:
# Persist the engineered frames in feather format. The target paths are built
# from PATH (the data directory set near the top of the notebook) so that
# changing the data location in one place is enough.
train.to_feather(f'{PATH}tmp/wsdm_train')
test.to_feather(f'{PATH}tmp/wsdm_test')

In [7]:
?apply_cats