# 1. Import packages

In [1]:
import time
import datetime
import numpy as np
import pandas as pd
import os
import random

from sklearn.preprocessing import LabelEncoder

In [2]:
print(time.strftime("%Y-%m-%d %H:%M:%S"))

2018-04-24 22:15:55


In [3]:
class Timer():
    def __init__(self):
        self.info = 'main'
        self.start_time = time.time()
    
    def start(self, info):
        self.info = info
        self.start_time = time.time()
        self.checkpoint('start', elapsed_on=False)
    
    def end(self):
        self.checkpoint(' end ')
        
    def checkpoint(self, tag, elapsed_on=True):
        if elapsed_on:
            elapsed = datetime.timedelta(seconds=round(time.time() - self.start_time))
            expanded_info = self.info + ' [time elapsed: %s]' % str(elapsed)
        else:
            expanded_info = self.info
        self.output(tag, info=expanded_info)
        
    def output(self, tag=' '*5, info=''):
        if type(info) != type(''):
            info = str(info)
        print('[%s] (%s) %s' % (Timer.get_current_time(), tag, info))
    
    @staticmethod
    def get_current_time():
        return time.strftime("%Y-%m-%d %H:%M:%S")

In [4]:
timer = Timer()
sub_timer = Timer()

# 2. Load Data

In [5]:
timer.start('load data')
input_dir = '../input/'
df_train = pd.read_csv(input_dir + "train.csv")
df_train.rename(columns={'msno': 'user_id'}, inplace=True)
df_songs = pd.read_csv(input_dir + 'songs.csv')
df_song_extra = pd.read_csv(input_dir + "song_extra_info.csv")
df_members = pd.read_csv(input_dir + "members.csv", parse_dates=["registration_init_time","expiration_date"])
df_members.rename(columns={'bd': 'age', 'msno': 'user_id'}, inplace=True)
timer.end()

[2018-04-24 22:15:55] (start) load data
[2018-04-24 22:16:15] ( end ) load data [time elapsed: 0:00:19]


In [6]:
df_train.columns

Index(['user_id', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target'],
      dtype='object')

# 3. ID Mapping: mapping user_id, song_id, interaction_id

In [7]:
len(df_train)

7377418

In [8]:
uid_set = set(df_train['user_id'])
iid_set = set(df_train['song_id'])
uid_map = {uid: mapped_uid for mapped_uid, uid in enumerate(uid_set)}
iid_map = {iid: mapped_iid for mapped_iid, iid in enumerate(iid_set)}
num_user = len(uid_set)
num_item = len(iid_set)
print(num_user)
print(num_item)

30755
359966


In [9]:
df_train['user_id'] = df_train['user_id'].apply(lambda x: uid_map[x])
df_train['song_id'] = df_train['song_id'].apply(lambda x: iid_map[x])

In [10]:
# drop users and items that are irrelevant (thus not in the id mapping)
df_members.drop(df_members[~df_members['user_id'].isin(uid_map)].index, inplace=True)
df_songs.drop(df_songs[~df_songs['song_id'].isin(iid_map)].index, inplace=True)
df_song_extra.drop(df_song_extra[~df_song_extra['song_id'].isin(iid_map)].index, inplace=True)

In [11]:
print(
    len(df_members),
    len(df_songs),
    len(df_song_extra)
)

30755 359914 359807


In [12]:
df_members['user_id'] = df_members['user_id'].apply(lambda x: uid_map[x])
df_songs['song_id'] = df_songs['song_id'].apply(lambda x: iid_map[x])
df_song_extra['song_id'] = df_song_extra['song_id'].apply(lambda x: iid_map[x])

In [13]:
timer.start('mapping id\'s')
directory = '../data/mapping/'
if not os.path.exists(directory):
    os.makedirs(directory)

with open(directory+'user_id_map.csv', 'w') as file:
    for uid in uid_set:
        file.write('%s,%d\n' % (uid, uid_map[uid]))

with open(directory+'item_id_map.csv', 'w') as file:
    for iid in iid_set:
        file.write('%s,%d\n' % (iid, iid_map[iid]))

comb_map = dict()
with open(directory+'interaction_id_map.csv', 'w') as file:
    for interaction_id, (uid, iid) in enumerate(zip(df_train['user_id'], df_train['song_id'])):
        comb_map[(uid, iid)] = interaction_id
        file.write('%d,%d,%d\n' % (uid, iid, interaction_id))

del uid_set, iid_set
timer.end()

[2018-04-24 22:16:27] (start) mapping id's
[2018-04-24 22:16:46] ( end ) mapping id's [time elapsed: 0:00:19]


In [14]:
df_train.head()

Unnamed: 0,user_id,song_id,source_system_tab,source_screen_name,source_type,target
0,13362,232041,explore,Explore,online-playlist,1
1,3437,267911,my library,Local playlist more,local-playlist,1
2,3437,96294,my library,Local playlist more,local-playlist,1
3,3437,38663,my library,Local playlist more,local-playlist,1
4,13362,216653,explore,Explore,online-playlist,1


# 4. Split the data into train and test sets

In [15]:
timer.start('split data')
RANDOM_STATE = 42

# count the interactions owned by user/item
sub_timer.start('count interaction')
user_interaction_cnt = [0 for _ in range(num_user)]
item_interaction_cnt = [0 for _ in range(num_item)]
for index, row in df_train.iterrows():
    user_interaction_cnt[row.user_id] += 1
    item_interaction_cnt[row.song_id] += 1
sub_timer.end()

df_train['user_interaction_cnt'] = df_train['user_id'].apply(lambda x: user_interaction_cnt[x])
df_train['item_interaction_cnt'] = df_train['song_id'].apply(lambda x: item_interaction_cnt[x])

[2018-04-24 22:16:46] (start) split data
[2018-04-24 22:16:46] (start) count interaction
[2018-04-24 22:25:18] ( end ) count interaction [time elapsed: 0:08:32]


In [16]:
train_index_set = set(range(len(df_train)))
# randomly pick interactions that can be test_warm
sub_timer.start('select warm')
test_warm_size = 500*1000
t_u = 200
t_i = 20

qualified = df_train[(df_train['user_interaction_cnt'] >= t_u) & (df_train['item_interaction_cnt'] >= t_i)]
# around 70% are qualified for 200 / 20 configuration
# print(len(qualified))
# print(len(qualified)/len(df_train))

test_warm_index_set = set(qualified.sample(n=test_warm_size, random_state=RANDOM_STATE).index)
train_index_set -= test_warm_index_set
sub_timer.end()


# randomly pick interactions that can be test_cold_item
sub_timer.start('select test_cold_item')
t_u = 200
t_i = 10
qualified = df_train[(df_train['user_interaction_cnt'] >= t_u) & (df_train['item_interaction_cnt'] < t_i)]
print(len(qualified))
print(len(qualified)/len(df_train))

test_cold_item_index_set = set(qualified.index)
test_cold_item_index_set = test_cold_item_index_set.intersection(train_index_set)
train_index_set -= test_cold_item_index_set
sub_timer.end()


# randomly pick interactions that can be test_cold_user
sub_timer.start('select test_cold_user')
t_u = 120
t_i = 10
qualified = df_train[(df_train['user_interaction_cnt'] < t_u) & (df_train['item_interaction_cnt'] >= t_i)]
print(len(qualified))
print(len(qualified)/len(df_train))

test_cold_user_index_set = set(qualified.index)
test_cold_user_index_set = test_cold_user_index_set.intersection(train_index_set)
train_index_set -= test_cold_user_index_set
sub_timer.end()

[2018-04-24 22:25:24] (start) select warm
[2018-04-24 22:25:25] ( end ) select warm [time elapsed: 0:00:01]
[2018-04-24 22:25:25] (start) select test_cold_item
584142
0.07917973469850834
[2018-04-24 22:25:25] ( end ) select test_cold_item [time elapsed: 0:00:00]
[2018-04-24 22:25:25] (start) select test_cold_user
516379
0.06999454280616878
[2018-04-24 22:25:26] ( end ) select test_cold_user [time elapsed: 0:00:00]


In [17]:
# save interactions to files
sub_timer.start('save to files')
def save_split(df, index_set, filename):
    with open(filename, 'w') as file:
#         df_train.loc[index_set][['user_id', 'song_id', 'target']].to_csv(filename, header=False, index=False)
        df_train.loc[index_set][['user_id', 'song_id', 'target']].to_csv(filename)
        
directory = '../data/split/'
if not os.path.exists(directory):
    os.makedirs(directory)
    
save_split(df=df_train, index_set=train_index_set, filename=directory + 'train.csv')
save_split(df=df_train, index_set=test_warm_index_set, filename=directory + 'test_warm.csv')
save_split(df=df_train, index_set=test_cold_item_index_set, filename=directory + 'test_cold_item.csv')
save_split(df=df_train, index_set=test_cold_user_index_set, filename=directory + 'test_cold_user.csv')
sub_timer.end()
timer.end()

[2018-04-24 22:25:26] (start) save to files
[2018-04-24 22:25:50] ( end ) save to files [time elapsed: 0:00:24]
[2018-04-24 22:25:50] ( end ) split data [time elapsed: 0:09:04]


In [18]:
# assert that user_id and item_id in test_warm appear in training data
df_warm = df_train.iloc[list(test_warm_index_set)]
df_for_train = df_train.iloc[list(train_index_set)]
assert not (~df_warm.user_id.isin(df_for_train.user_id)).any() # should be False
assert not (~df_warm.song_id.isin(df_for_train.song_id)).any() # should be False

# 5 Preprocess and save the contexts

In [19]:
timer.start('save context')

[2018-04-24 22:25:51] (start) save context


## 5.0 Append contexts for missing songs
some songs informations are missing

In [20]:
print((~df_train.user_id.isin(df_members.user_id)).sum(), 
      (~df_train.song_id.isin(df_songs.song_id)).sum(),
      (~df_train.song_id.isin(df_song_extra.song_id)).sum())

0 114 1455


In [21]:
set(iid_map.values()) - set(df_songs.song_id)

{3834,
 9847,
 30822,
 31963,
 46056,
 61040,
 64607,
 65306,
 65450,
 82125,
 83598,
 97927,
 104234,
 110812,
 139832,
 155371,
 158292,
 159690,
 161608,
 163691,
 164195,
 169329,
 171311,
 171449,
 171556,
 175246,
 179634,
 183063,
 184084,
 191297,
 194511,
 200985,
 201145,
 207654,
 212701,
 213207,
 231930,
 233308,
 252531,
 258572,
 271475,
 279864,
 286016,
 302487,
 306555,
 326332,
 337586,
 345913,
 353157,
 353320,
 353988,
 356924}

In [22]:
# just add the entries with ids in it
def fix_songs(df):
    missed_iid = list(set(iid_map.values()) - set(df.song_id))
    df_new = pd.DataFrame(missed_iid, columns=['song_id'])
    return df_new

df_songs = df_songs.append(fix_songs(df_songs), ignore_index=True)
df_song_extra = df_song_extra.append(fix_songs(df_song_extra), ignore_index=True)

In [23]:
timer.checkpoint('5.0')

[2018-04-24 22:25:56] (5.0) save context [time elapsed: 0:00:05]


## 5.1 Simple member related and song related

also join df_songs and df_song_extra

In [24]:
# member related
sub_timer.start('member related')
df_members.loc[df_members['age'] < 5, 'age'] = 0
df_members.loc[df_members['age'] >= 80, 'age'] = 0
df_members['weird_age'] = 0
df_members.loc[df_members['age'] == 0, 'weird_age'] = 1
df_members['validate_days'] = (df_members['expiration_date'] - df_members['registration_init_time']).dt.days.astype(int)

df_members['registration_year'] = df_members['registration_init_time'].dt.year.astype(int)
df_members['registration_month'] = df_members['registration_init_time'].dt.month.astype(int)
df_members['registration_day'] = df_members['registration_init_time'].dt.day.astype(int)

df_members['expiration_year'] = df_members['expiration_date'].dt.year.astype(int)
df_members['expiration_month'] = df_members['expiration_date'].dt.month.astype(int)
df_members['expiration_day'] = df_members['expiration_date'].dt.day.astype(int)

df_members.drop(['registration_init_time', 'expiration_date'], axis=1, inplace=True)

sub_timer.end()

[2018-04-24 22:25:57] (start) member related
[2018-04-24 22:25:57] ( end ) member related [time elapsed: 0:00:00]


In [25]:
# song related
sub_timer.start('song related')

def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan

df_song_extra['song_year'] = df_song_extra['isrc'].apply(isrc_to_year)
df_song_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

# 1000 <=> 1s
df_songs['song_length'] /= 1000.0
df_songs.loc[df_songs['song_length'] > 1800, 'song_length'] = 1800

df_songs = df_songs.merge(df_song_extra, how='left', on='song_id')
sub_timer.end()

[2018-04-24 22:25:57] (start) song related
[2018-04-24 22:25:58] ( end ) song related [time elapsed: 0:00:01]


## 5.2 Fill in missing value

In [26]:
sub_timer.start('find missing columns')
print('df_train')
for col in df_train.columns:
    num_null = df_train[col].isnull().sum()
    if num_null > 0:
        print('{:10} {} {}'.format(str(df_train[col].dtype), col, num_null))
print()

print('df_members')
for col in df_members.columns:
    num_null = df_members[col].isnull().sum()
    if num_null > 0:
        print('{:10} {} {}'.format(str(df_members[col].dtype), col, num_null))
print()

print('df_songs')
for col in df_songs.columns:
    num_null = df_songs[col].isnull().sum()
    if num_null > 0:
        print('{:10} {} {}'.format(str(df_songs[col].dtype), col, num_null))

sub_timer.end()

[2018-04-24 22:25:58] (start) find missing columns
df_train
object     source_system_tab 24849
object     source_screen_name 414804
object     source_type 21539

df_members
object     gender 17092

df_songs
object     artist_name 52
object     composer 155554
object     genre_ids 7285
float64    language 53
object     lyricist 268760
float64    song_length 52
float64    song_year 48338
[2018-04-24 22:25:59] ( end ) find missing columns [time elapsed: 0:00:01]


In [27]:
sub_timer.start('fill in null value')
UNKNOWN = 'Unknown'
df_train['source_system_tab'].fillna(value=UNKNOWN, inplace=True)
df_train['source_screen_name'].fillna(value=UNKNOWN, inplace=True)
df_train['source_type'].fillna(value=UNKNOWN, inplace=True)

df_members['gender'].fillna(value=UNKNOWN, inplace=True)

for col in ['artist_name', 'composer', 'genre_ids', 'lyricist']:
    df_songs[col].fillna(value=UNKNOWN, inplace=True)
    
print('language mode', df_songs['language'].mode())
df_songs['language'].fillna(value=df_songs['language'].mode()[0], inplace=True) # 52.0 is the mode

print('song_length median', df_songs['song_length'].mean())
df_songs['song_length'].fillna(value=df_songs['song_length'].mean(), inplace=True)

print('song year median', df_songs['song_year'].median())
df_songs['song_year'].fillna(value=df_songs['song_year'].median(), inplace=True)

sub_timer.end()

[2018-04-24 22:25:59] (start) fill in null value
language mode 0    52.0
dtype: float64
song_length median 247.05283130970574
song year median 2011.0
[2018-04-24 22:26:00] ( end ) fill in null value [time elapsed: 0:00:01]


## 5.3 Count and binary features

In [28]:
sub_timer.start('count and binary features engineering')
def artist_count(x):
    if x == UNKNOWN:
        return 0
    else:
        return x.count('and') + x.count(',') + x.count('feat') + x.count('&') + 1
df_songs['artist_count'] = df_songs['artist_name'].apply(artist_count).astype(np.int8)

def _count(x):
    if x == UNKNOWN:
        return 0
    else:
        return sum(map(x.count, ['|', '/', '\\', ';', '、', ','])) + 1
df_songs['genre_count'] = df_songs['genre_ids'].apply(_count).astype(int)
df_songs['lyricist_count'] = df_songs['lyricist'].apply(_count).astype(int)
df_songs['composer_count'] = df_songs['composer'].apply(_count).astype(int)

def is_featured(x):
    if 'feat' in str(x) :
        return 1
    return 0
df_songs['is_featured'] = df_songs['artist_name'].apply(is_featured).astype(np.int8)

# if artist is same as composer
df_songs['artist_composer'] = (df_songs['artist_name'] == df_songs['composer']).astype(np.int8)

# if artist, lyricist and composer are all three same
df_songs['artist_composer_lyricist'] = ((df_songs['artist_name'] == df_songs['composer']) 
                                        & (df_songs['artist_name'] == df_songs['lyricist']) 
                                        & (df_songs['composer'] == df_songs['lyricist'])).astype(np.int8)

# is song language 17 or 45. 
def song_lang_boolean(x):
    if '17.0' in str(x) or '45.0' in str(x):
        return 1
    return 0
df_songs['song_lang_boolean'] = df_songs['language'].apply(song_lang_boolean).astype(np.int8)

# smaller song
_mean_song_length = df_songs['song_length'].mean()
print(_mean_song_length)
print(0 < _mean_song_length)
def smaller_song(x):
    if x < _mean_song_length:
        return 1
    return 0
df_songs['smaller_song'] = df_songs['song_length'].apply(smaller_song).astype(np.int8)
sub_timer.end()

[2018-04-24 22:26:00] (start) count and binary features engineering
247.05283130970534
True
[2018-04-24 22:26:03] ( end ) count and binary features engineering [time elapsed: 0:00:03]


# 5.4 standarize numerical

In [29]:
for df in [df_train, df_members, df_songs]:
    for col in df.columns:
        print(col, len(df[col].unique()))
    print()

user_id 30755
song_id 359966
source_system_tab 9
source_screen_name 20
source_type 13
target 2
user_interaction_cnt 1564
item_interaction_cnt 1798

user_id 30755
city 21
age 68
gender 3
registered_via 5
weird_age 2
validate_days 4321
registration_year 14
registration_month 12
registration_day 31
expiration_year 18
expiration_month 12
expiration_day 31

artist_name 40583
composer 76064
genre_ids 573
language 10
lyricist 33888
song_id 359966
song_length 59933
song_year 100
artist_count 11
genre_count 9
lyricist_count 24
composer_count 27
is_featured 2
artist_composer 2
artist_composer_lyricist 2
song_lang_boolean 2
smaller_song 2



In [30]:
sub_timer.start('standarize')
def standarize_numerical(df, cols):
    for col in cols:
        mean = np.mean(df[col])
        stdd = np.std(df[col])
        df[col] = (df[col] - mean) / stdd
        print('%20s' %col, mean, stdd)

print('standarize df_songs')
standarize_numerical(df_songs, ['song_length', 'genre_count', 'lyricist_count', 'composer_count', 'artist_count', 
    'is_featured', 'artist_composer', 'artist_composer_lyricist', 'song_lang_boolean', 'smaller_song'])
print('standarize df_members')
standarize_numerical(df_members, ['age', 'weird_age', 'validate_days'])
sub_timer.end()

[2018-04-24 22:26:05] (start) standarize
standarize df_songs
         song_length 247.05283130970534 115.79556125560042
         genre_count 1.0664618325064035 0.39541568542263716
      lyricist_count 0.3968930398982126 0.9317668455635831
      composer_count 1.0127567603607008 1.667934525381824
        artist_count 1.05236605679 0.257709773866
         is_featured 0.00357811571093 0.0597102403185
     artist_composer 0.052210486546 0.222451234298
artist_composer_lyricist 0.0170043837474 0.129287411146
   song_lang_boolean 0.0799797758677 0.271261886965
        smaller_song 0.593039342605 0.491267422824
standarize df_members
                 age 12.801528206795643 15.554090126424791
           weird_age 0.5573077548366119 0.49670496397332436
       validate_days 1175.467338644123 1160.4635983491546
[2018-04-24 22:26:05] ( end ) standarize [time elapsed: 0:00:00]


In [31]:
# some other checks: should just print an empty line
def check_null(df):
    for col in df.columns:
        null_num = df[col].isnull().sum()
        if null_num > 0:
            print(col, null_num)
    print()

df = df_train.merge(df_songs, how="left", on="song_id")
df = df.merge(df_members, how="left", on="user_id")
check_null(df)




## 5.5 encode all string features to id

In [32]:
def encode_categorical(df, cols):
    for col in cols:
        df[col] = df[col].astype(str)
        encoder = LabelEncoder()
        encoder.fit(df[col])
        df[col] = encoder.transform(df[col])
        print(col, len(df[col].unique()), df[col].min(), df[col].max())
    print('-' * 20)

In [33]:
df_train.head()

Unnamed: 0,user_id,song_id,source_system_tab,source_screen_name,source_type,target,user_interaction_cnt,item_interaction_cnt
0,13362,232041,explore,Explore,online-playlist,1,5511,215
1,3437,267911,my library,Local playlist more,local-playlist,1,622,1
2,3437,96294,my library,Local playlist more,local-playlist,1,622,4
3,3437,38663,my library,Local playlist more,local-playlist,1,622,1
4,13362,216653,explore,Explore,online-playlist,1,5511,412


In [34]:
df_songs.head()

Unnamed: 0,artist_name,composer,genre_ids,language,lyricist,song_id,song_length,song_year,artist_count,genre_count,lyricist_count,composer_count,is_featured,artist_composer,artist_composer_lyricist,song_lang_boolean,smaller_song
0,張信哲 (Jeff Chang),董貞,465,3.0,何啟弘,146621,0.005071,2014.0,-0.203198,-0.168081,0.647272,-0.007648,-0.059925,-0.234705,-0.131524,-0.294843,-1.207162
1,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,444,31.0,TEDDY,329770,-0.429419,2011.0,-0.203198,-0.168081,0.647272,1.19144,-0.059925,-0.234705,-0.131524,-0.294843,0.828389
2,SUPER JUNIOR,Unknown,465,31.0,Unknown,195553,-0.131886,2011.0,-0.203198,-0.168081,-0.425957,-0.607192,-0.059925,-0.234705,-0.131524,-0.294843,0.828389
3,S.H.E,湯小康,465,3.0,徐世珍,294749,0.228862,2002.0,-0.203198,-0.168081,0.647272,-0.007648,-0.059925,-0.234705,-0.131524,-0.294843,-1.207162
4,貴族精選,Traditional,726,52.0,Traditional,254554,-0.921657,2011.0,-0.203198,-0.168081,0.647272,-0.007648,-0.059925,-0.234705,-0.131524,-0.294843,0.828389


In [35]:
df_members.head()

Unnamed: 0,user_id,city,age,gender,registered_via,weird_age,validate_days,registration_year,registration_month,registration_day,expiration_year,expiration_month,expiration_day
0,28200,1,-0.823033,Unknown,7,0.891258,0.902685,2011,8,20,2017,9,20
1,4571,1,-0.823033,Unknown,7,0.891258,-0.388179,2015,6,28,2017,6,22
2,15275,1,-0.823033,Unknown,4,0.891258,-0.619121,2016,4,11,2017,7,12
3,17563,1,-0.823033,Unknown,9,0.891258,-1.012067,2015,9,6,2015,9,7
4,7682,1,-0.823033,Unknown,4,0.891258,-0.894011,2017,1,26,2017,6,13


In [36]:
df_train.head()

Unnamed: 0,user_id,song_id,source_system_tab,source_screen_name,source_type,target,user_interaction_cnt,item_interaction_cnt
0,13362,232041,explore,Explore,online-playlist,1,5511,215
1,3437,267911,my library,Local playlist more,local-playlist,1,622,1
2,3437,96294,my library,Local playlist more,local-playlist,1,622,4
3,3437,38663,my library,Local playlist more,local-playlist,1,622,1
4,13362,216653,explore,Explore,online-playlist,1,5511,412


In [37]:
sub_timer.start('encode_categorical')
encode_categorical(df_train, ['source_system_tab', 'source_screen_name', 'source_type'])
encode_categorical(df_songs, ['genre_ids', 'artist_name', 'composer', 'lyricist', 'language', 'song_year'])
encode_categorical(df_members, [
    'city', 'gender', 'registered_via', 
    'registration_year', 'registration_month', 'registration_day', 
    'expiration_year', 'expiration_month', 'expiration_day'
])
sub_timer.end()

[2018-04-24 22:26:19] (start) encode_categorical
source_system_tab 9 0 8
source_screen_name 20 0 19
source_type 13 0 12
--------------------
genre_ids 573 0 572
artist_name 40583 0 40582
composer 76064 0 76063
lyricist 33888 0 33887
language 10 0 9
song_year 100 0 99
--------------------
city 21 0 20
gender 3 0 2
registered_via 5 0 4
registration_year 14 0 13
registration_month 12 0 11
registration_day 31 0 30
expiration_year 18 0 17
expiration_month 12 0 11
expiration_day 31 0 30
--------------------
[2018-04-24 22:27:16] ( end ) encode_categorical [time elapsed: 0:00:57]


## 5.6 save to file

In [38]:
sub_timer.start('5.6 save to file')
directory = '../data/context/'
if not os.path.exists(directory):
    os.makedirs(directory)
df_train.to_csv(directory + 'event_context.csv')
df_songs.to_csv(directory + 'song_context.csv')
df_members.to_csv(directory + 'user_context.csv')
sub_timer.end()

[2018-04-24 22:27:16] (start) 5.6 save to file
[2018-04-24 22:28:03] ( end ) 5.6 save to file [time elapsed: 0:00:48]


In [39]:
timer.end()

[2018-04-24 22:28:03] ( end ) save context [time elapsed: 0:02:12]


In [40]:
if not os.path.exists('../data/source'):
    os.makedirs('../data/source')
df_train.to_csv('../data/source/df_train.csv')
df_songs.to_csv('../data/source/df_songs.csv')
df_members.to_csv('../data/source/df_members.csv')