# 1. Load library and data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import time
print(time.strftime("%Y-%m-%d %H:%M:%S"))

2017-11-21 20:15:08


In [2]:
import glob
for x in glob.glob("../input/*"):
    print(x)

../input/members.csv
../input/sample_submission.csv
../input/song_extra_info.csv
../input/songs.csv
../input/test.csv
../input/train.csv


In [3]:
df_train = pd.read_csv("../input/train.csv")
df_test  = pd.read_csv('../input/test.csv')
df_songs = pd.read_csv('../input/songs.csv')
df_song_extra = pd.read_csv("../input/song_extra_info.csv")
df_members = pd.read_csv("../input/members.csv",parse_dates=["registration_init_time","expiration_date"])

In [4]:
df_song_extra.head()

Unnamed: 0,song_id,name,isrc
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,我們,TWUM71200043
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,Let Me Love You,QMZSY1600015
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,原諒我,TWA530887303
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=,Classic,USSM11301446
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=,愛投羅網,TWA471306001


# 2. Feature Engineering 

## 2.1 Members Dataset
- rename column 'bd' -> 'age' 
- registration_init_time -> registration_year + registration_month + registration_day (2011-08-20 -> 2011 08 20)
- expiration_date -> expiration_year + expiration_month + expiration_day (2011-08-20 -> 2011 08 20)

In [5]:
# Members
df_members.rename(columns={'bd': 'age'}, inplace=True)
df_members['validate_days'] = (df_members['expiration_date'] - df_members['registration_init_time']).dt.days

df_members['registration_year'] = df_members['registration_init_time'].apply(lambda x: str(x)[0:4])
df_members['registration_month'] = df_members['registration_init_time'].apply(lambda x: str(x)[5:7])
df_members['registration_day'] = df_members['registration_init_time'].apply(lambda x: str(x)[8:10])

df_members['expiration_year'] = df_members['expiration_date'].apply(lambda x: str(x)[0:4])
df_members['expiration_month'] = df_members['expiration_date'].apply(lambda x: str(x)[5:7])
df_members['expiration_day'] = df_members['expiration_date'].apply(lambda x: str(x)[8:10])

df_members.drop(['registration_init_time', 'expiration_date'], axis=1, inplace=True)

In [6]:
df_members.head()

Unnamed: 0,msno,city,age,gender,registered_via,validate_days,registration_year,registration_month,registration_day,expiration_year,expiration_month,expiration_day
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,,7,2223,2011,8,20,2017,9,20
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,,7,725,2015,6,28,2017,6,22
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,,4,457,2016,4,11,2017,7,12
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,,9,1,2015,9,6,2015,9,7
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,,4,138,2017,1,26,2017,6,13


## 2.2 Song_extra Dataset
- isrc -> song_year (TWUM7 12 00043 -> 2012, QMZSY 16 00015 -> 2016, TWA53 08 87303 -> 2008)
- drop column song_name


In [7]:
def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan

df_song_extra['song_year'] = df_song_extra['isrc'].apply(isrc_to_year)
df_song_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

## 2.3 Merge all dataset based on song_id and msno(user_id)

In [8]:
# left join train and song based on song_id
# left join train and members based on msno(user id)
# left join train and song_extra based on song_id
df_train = df_train.merge(df_songs, how="left", on="song_id")
df_train = df_train.merge(df_members, how="left", on="msno")
df_train = df_train.merge(df_song_extra, how='left', on='song_id')

df_test  = df_test.merge(df_songs, how="left", on="song_id")
df_test  = df_test.merge(df_members, how="left", on="msno")
df_test = df_test.merge(df_song_extra, how='left', on='song_id')

In [9]:
for col in df_train.columns:
    print(col)

msno
song_id
source_system_tab
source_screen_name
source_type
target
song_length
genre_ids
artist_name
composer
lyricist
language
city
age
gender
registered_via
validate_days
registration_year
registration_month
registration_day
expiration_year
expiration_month
expiration_day
song_year


## 2.4 Fill in missing value
- categorical feature -> 'Unknown'
- continuous feaure -> mean or mode

In [10]:
# find columns with missing value
for col in df_train.columns:
    if df_train[col].isnull().any() or df_train[col].isnull().any():
        print('{:10} {}'.format(str(df_train[col].dtype), col))

object     source_system_tab
object     source_screen_name
object     source_type
float64    song_length
object     genre_ids
object     artist_name
object     composer
object     lyricist
float64    language
object     gender
float64    song_year


In [11]:
# fill categorical columns with tag: 'Unknown'
UNKNOWN = 'Unknown'
col_fill_with_unknown = [
    'source_system_tab', 'source_screen_name', 'source_type', 'gender',
    'genre_ids', 'artist_name', 'composer', 'lyricist',
    'song_year'
]
for col in col_fill_with_unknown:
    df_train[col].fillna(value=UNKNOWN, inplace=True)
    df_test[col].fillna(value=UNKNOWN, inplace=True)
df_train['song_year'] = df_train['song_year'].astype(str)
    
# fill in song length with mean
df_train['song_length'].fillna(value=df_train['song_length'].mean(),inplace=True)
df_test['song_length'].fillna(value=df_test['song_length'].mean(),inplace=True)

# fill in language with mode
df_train['language'].fillna(value=df_train['language'].mode()[0],inplace=True)
df_test['language'].fillna(value=df_test['language'].mode()[0],inplace=True)

## 2.5 Deal with multiple labels
- split into lists (242|726 -> [242, 726]) preparing for later one-hot or multi-hot

In [12]:
# genre
# TODO: genre_ids seldomly have values seperated by comma
df_train['genre_ids'] = df_train['genre_ids'].str.split("|")
df_test['genre_ids'] = df_test['genre_ids'].str.split("|")
df_train['genre_count'] = df_train['genre_ids'].apply(lambda x : len(x) if UNKNOWN not in x else 0)
df_test['genre_count'] = df_test['genre_ids'].apply(lambda x : len(x) if UNKNOWN not in x else 0)

df_train['lyricist'] = df_train['lyricist'].str.split("|")
df_test['lyricist'] = df_test['lyricist'].str.split("|")
df_train['lyricist_count'] = df_train['lyricist'].apply(lambda x : len(x) if "Unknown" not in x else 0 )
df_test['lyricist_count'] = df_test['lyricist'].apply(lambda x : len(x) if "Unknown" not in x else 0 )

In [13]:
df_train['genre_count'].describe()


count    7.377418e+06
mean     1.037353e+00
std      2.948952e-01
min      0.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      8.000000e+00
Name: genre_count, dtype: float64

In [14]:
df_train['lyricist_count'].describe()

count    7.377418e+06
mean     7.751240e-01
std      1.079586e+00
min      0.000000e+00
25%      0.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      2.300000e+01
Name: lyricist_count, dtype: float64

## 2.6 Historical Data (artist's historical repeat rate, etc.)
maybe sometimes later

In [15]:
# df_artists = df_train.loc[:,["artist_name","target"]]
# artists_repeat = df_artists.groupby(["artist_name"], as_index=False).sum().rename(columns={"target":"repeat_count"})
# artists_play = df_artists.groupby(["artist_name"], as_index=False).count().rename(columns = {"target":"play_count"})
# df_artists_repeat = artists_repeat.merge(artists_play, how='inner', on='artist_name')

# df_artists_repeat['artist_repeat_percentage'] = round(
#     (df_artists_repeat['repeat_count']*100) / df_artists_repeat['play_count'], 
#     1
# )

# df_train = df_train.merge(df_artists_repeat, on="artist_name",how="left")
# df_test = df_test.merge(df_artists_repeat,on="artist_name",how="left")

# for col in ['repeat_count', 'play_count', 'artist_repeat_percentage']:
#     df_test[col].fillna(np.nanmedian(df_test[col]), inplace=True, downcast='infer')

# del df_artists
# del df_artists_repeat
# del artists_play
# del artists_repeat

In [16]:
df_train.head(2)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,...,validate_days,registration_year,registration_month,registration_day,expiration_year,expiration_month,expiration_day,song_year,genre_count,lyricist_count
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471.0,[359],Bastille,Dan Smith| Mark Crew,...,2103,2012,1,2,2017,10,5,2016.0,1,0
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,284584.0,[1259],Various Artists,Unknown,...,2301,2011,5,25,2017,9,11,1999.0,1,0


# 3. Preparing Data For TensorFlow

- TODO later
- [ ] adjust the embedding dimension for crossed
- [ ] add more crossed
- [ ] (other parameters adjustment, add it here)
- [ ] preprocessing the numerical data

In [17]:
# define categorical columns, continuous columns and labels
CATEGORICAL_COLUMNS = [
    'msno',
    'song_id',
    'artist_name',
    'song_year',
    'source_screen_name',
    'source_type',
    'source_system_tab',
    'language',
    'city',
    'gender',
    'registered_via',
    'registration_year',
    'registration_month',
    'registration_day',
    'expiration_year',
    'expiration_month',
    'expiration_day',
]


CONTINUOUS_COLUMNS = [
    'age', 
    'song_length',
    'genre_count', 
    'lyricist_count',
]

LABEL = 'target'

In [18]:
# convert categorical columns to string
print(time.strftime("%Y-%m-%d %H:%M:%S"))
for col in CATEGORICAL_COLUMNS:
    df_train[col] = df_train[col].astype(str)
    df_test[col] = df_test[col].astype(str)
print(time.strftime("%Y-%m-%d %H:%M:%S"))

2017-11-21 20:19:14
2017-11-21 20:20:44


In [19]:
columns = dict()

## 3.1 Source Related Feature

In [20]:
# Source related categorical feature
# Columns with a small set of categories transform them into sparse columns with keys
_source_cols = ['source_screen_name', 'source_type', 'source_system_tab']
for col in _source_cols:
    col_set = set(df_train[col].unique()).union(set(df_test[col].unique()))
#     print(col, len(col_set))
    columns[col] = tf.contrib.layers.sparse_column_with_keys(column_name=col, keys=col_set)

## 3.2 Members Related Feature

In [21]:
# members related categorical feature
# Columns with a small set of categories transform them into sparse columns with keys
_members_cols = [
    'city', 'gender', 'registered_via', 
    'registration_year', 'registration_month', 'registration_day',
    'expiration_year', 'expiration_month', 'expiration_day'
]
for col in _members_cols:
    col_set = set(df_train[col].unique()).union(set(df_test[col].unique()))
#     print(col, len(col_set))
    columns[col] = tf.contrib.layers.sparse_column_with_keys(column_name=col, keys=col_set)

    
# members related numerical feature   
# bucket the ages. 
# Bucketization allows us to find the music recommendtion by certain age groups 
columns['age'] = tf.feature_column.numeric_column('age')
columns['age_buckets'] = tf.feature_column.bucketized_column(
    columns['age'], boundaries=[15, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

# validate_days as continuous column
columns['validate_days'] = tf.feature_column.numeric_column('validate_days')

## 3.3 Song Related Feature

In [22]:
# songs related categorical feature
# Columns with a small set of categories transform them into sparse columns with keys
_songs_cols = ['language', 'song_year']
for col in _songs_cols:
    col_set = set(df_train[col].unique()).union(set(df_test[col].unique()))
#     print(col, len(col_set))
    columns[col] = tf.contrib.layers.sparse_column_with_keys(column_name=col, keys=col_set)

# artist_name
# Columns with a large set of categories hash into real value 
col = 'artist_name'
columns[col] = tf.contrib.layers.sparse_column_with_hash_bucket(col, hash_bucket_size=10000)

# song length (ms) 
# bucketization allows us to find the music recommendtion by certain song length groups 
# TODO 1: use it as numerical by cutting cutting >3600s to 3600s
# TODO 2: improved bucket
col = 'song_length'
columns[col] = tf.feature_column.numeric_column(col)
columns['song_length_buckets'] = tf.feature_column.bucketized_column(
    columns[col], boundaries=[x*1000 for x in [30, 60, 120, 180, 240, 300, 360, 420, 480, 540, 600]])

# genre_count: number of genre 
# lyricist_count: number of lyricist
for col in ['genre_count', 'lyricist_count']:
    columns[col] = tf.feature_column.numeric_column(col)

# TODO: genre_ids, composer, lyricist

##  3.4 Hash msno(usr_id) and song_id

In [23]:
for col in ['msno', 'song_id']:
    print(len(set(df_train[col]).union(set(df_test[col]))))

34403
419839


In [24]:
columns['msno'] = tf.contrib.layers.sparse_column_with_hash_bucket(
      "msno", hash_bucket_size=30000)
columns['song_id'] = tf.contrib.layers.sparse_column_with_hash_bucket(
      "song_id", hash_bucket_size=10**5)

## 3.5 Final state

In [25]:
for key in columns.keys():
    print(key)

source_screen_name
source_type
source_system_tab
city
gender
registered_via
registration_year
registration_month
registration_day
expiration_year
expiration_month
expiration_day
age
age_buckets
validate_days
language
song_year
artist_name
song_length
song_length_buckets
genre_count
lyricist_count
msno
song_id


In [60]:
df_train['language'].isnull().any()

False

In [45]:
# wide model input
# categorical feature
wide_columns_tag = ['song_id', 
                    'msno', 
                    'song_length_buckets', 
                    'language', 
                    'song_year',
                    'source_screen_name',
                    'source_type',
                    'source_system_tab',
                    'gender',
                    'registered_via',
                    'registration_year',
                    'registration_month',
                    'registration_day',
                    'expiration_year',
                    'expiration_month',
                    'expiration_day',
                    'artist_name',
                    'age_buckets']
wide_columns = []
for tag in wide_columns_tag:
    wide_columns.append(columns[tag])

# cross column feature 
# these feature is related so hash them together to produce new feature
wide_columns.append(tf.feature_column.crossed_column([columns['msno'], columns['song_id']], 
                                                     hash_bucket_size=int(1e6)))

wide_columns.append(tf.feature_column.crossed_column([columns['registration_year'], columns['expiration_year']], 
                                                     hash_bucket_size=int(1e2)))

wide_columns.append(tf.feature_column.crossed_column([columns['language'], columns['msno']], 
                                                     hash_bucket_size=int(1e6)))

wide_columns.append(tf.feature_column.crossed_column([columns['age_buckets'], columns['artist_name']], 
                                                     hash_bucket_size=int(1e6)))

In [27]:
wide_columns

['source_screen_name',
 'source_type',
 'source_system_tab',
 _CrossedColumn(keys=(_SparseColumnHashed(column_name='msno', is_integerized=False, bucket_size=30000, lookup_config=None, combiner='sum', dtype=tf.string), _SparseColumnHashed(column_name='song_id', is_integerized=False, bucket_size=100000, lookup_config=None, combiner='sum', dtype=tf.string)), hash_bucket_size=1000000, hash_key=None),
 _CrossedColumn(keys=(_SparseColumnKeys(column_name='registration_year', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=('2012', '2008', '2015', '2009', '2006', '2014', '2013', '2005', '2010', '2011', '2016', '2007', '2017', '2004'), num_oov_buckets=0, vocab_size=14, default_value=-1), combiner='sum', dtype=tf.string), _SparseColumnKeys(column_name='expiration_year', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=('2012', '2008', '2015', '2009', '2006', '2019', '2014', '2013', '2018'

In [29]:
# deep model input
# dense embedded feature
deep_columns = []
_categoricals = [
    'source_screen_name',
    'source_type',
    'source_system_tab',
    'language',
    'city',
    'gender',
    'registered_via',
    'registration_year',
    'registration_month',
    'registration_day',
    'expiration_year',
    'expiration_month',
    'expiration_day',
    'song_year',
#     'age_buckets',
#     'song_length_buckets',
    'artist_name',
    'msno',
    'song_id'
]
for col in _categoricals:
    deep_columns.append(tf.contrib.layers.embedding_column(columns[col], dimension=8))

# 这一点有问题 我加到deep_column 里面了
# Represents multi-hot representation of given categorical column???
# _buckets = ['age_buckets', 'song_length_buckets', 'artist_name']
# for col in _buckets:
#     deep_columns.append(tf.feature_column.indicator_column(columns[col]))

# numerical columns
_numericals = [
    'age', 
    'song_length',
    'genre_count', 
    'lyricist_count'
]
for col in _numericals:
    deep_columns.append(columns[col])

# Todo: high dimension categorical
# for col in ['msno', 'song_id', 'artist_name']:
#     deep_columns.append(tf.contrib.layers.embedding_column(columns[col], dimension=64))



In [30]:
# 这也有问题 这个应该multihot! 不该drop 掉 gereids吧 我还没找到怎么加明天看你也可以先看看
# here use multi-hot???
# tf.feature_column.categorical_column_with_vocabulary_list??
# tf.feature_column.indicator_column(columns[col])
X_train = df_train.drop(['target', 'genre_ids', 'composer', 'lyricist'], axis=1)
# cannot accept element with type list
y_train = df_train['target']

In [31]:
X_test = df_test.drop(['genre_ids', 'composer', 'lyricist'], axis=1)
test_ids = df_test['id']

# 4 Modeling and evaluating

## 4.1 split into train and validation set

In [32]:
# split into train and validation set
def split(X, y, val_size):
    '''
    split the data into training and validation set
    '''
    test_num = int(val_size * X.shape[0])
    return X[test_num:], X[:test_num], y[test_num:], y[:test_num]

In [33]:
X_trn, X_val, y_trn, y_val = split(X_train, y_train, val_size=0.1)

## 4.2 Build model

In [34]:
# use DNNLinearCombinedClassifier estimator provided by tensorflow API
def build_estimator(model_dir):
      return tf.contrib.learn.DNNLinearCombinedClassifier(
          model_dir=model_dir,
          linear_feature_columns=wide_columns,
          dnn_feature_columns=deep_columns,
          # dnn_dropout=0.4,
          dnn_optimizer=tf.train.AdagradOptimizer(learning_rate=0.3),
          dnn_hidden_units=[100, 32],
          fix_global_step_increment_bug=True)

In [35]:
# input data to model (train: small batch, test: whole testdata)
def train_input_fn(shuffle, train = True):
    if train:
        return tf.estimator.inputs.pandas_input_fn(
            x=X_trn,
            y=y_trn,
            batch_size=128,
            num_epochs=1,
            shuffle=shuffle,
            num_threads=1,
            target_column='target'
        )
    else:
        return tf.estimator.inputs.pandas_input_fn(
            x=X_val,
            y=y_val,
            batch_size=X_val.shape[0],
            num_epochs=1,
            shuffle=False,
            num_threads=1,
            target_column='target'
        )
def test_input_fn():
    return tf.estimator.inputs.pandas_input_fn(
        x=X_test,
        y=None,
        batch_size=128,
        num_epochs=1,
        shuffle=False,
        num_threads=1
    )

In [36]:
# produce submission
def produce(m):
    print(time.strftime("%Y-%m-%d %H:%M:%S"))
    predictions = m.predict(input_fn=test_input_fn())
    result_df = pd.DataFrame()
    result_df['id'] = test_ids
    result_df['target'] = list(predictions)

    timestamp = time.strftime("%Y-%m-%d_%H_%M_%S")
    filename = '../result/' + timestamp + '.csv.gz'
    result_df.to_csv(filename, compression = 'gzip', index=False, float_format = '%.5f')
    print(time.strftime("%Y-%m-%d %H:%M:%S"))

# 5 Experiment

In [None]:
# add evaludation!!!
# def train_and_eval():
# model_dir = "../models/model2"
model_dir = None
# print("model directory = %s" % model_dir)

m = build_estimator(model_dir)

print(time.strftime("%Y-%m-%d %H:%M:%S"))
for iter in range(1):
    m.fit(input_fn=train_input_fn(shuffle=True))
    if iter % 2 == 0:
        print(time.strftime("%Y-%m-%d %H:%M:%S"))
        results = m.evaluate(input_fn=train_input_fn(shuffle=False, train = False))
        print('results:' + '=' * 30)
        for key in sorted(results):
            print("%s: %s" % (key, results[key]))
        print('results end:' + '=' * 30)
        print(time.strftime("%Y-%m-%d %H:%M:%S"))
produce(m)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a9dd3b240>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/var/folders/_n/8c5rjjsj4kl5v76kz7z1p9qm0000gn/T/tmpsspr74fg'}
2017-11-21 20:47:07
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a te

INFO:tensorflow:global_step/sec: 35.9466
INFO:tensorflow:loss = 0.577265, step = 6701 (2.776 sec)
INFO:tensorflow:global_step/sec: 37.4652
INFO:tensorflow:loss = 0.603229, step = 6801 (2.669 sec)
INFO:tensorflow:global_step/sec: 36.5803
INFO:tensorflow:loss = 0.595863, step = 6901 (2.734 sec)
INFO:tensorflow:global_step/sec: 34.2659
INFO:tensorflow:loss = 0.653866, step = 7001 (2.920 sec)
INFO:tensorflow:global_step/sec: 39.9406
INFO:tensorflow:loss = 0.59747, step = 7101 (2.504 sec)
INFO:tensorflow:global_step/sec: 39.8653
INFO:tensorflow:loss = 0.667987, step = 7201 (2.507 sec)
INFO:tensorflow:global_step/sec: 40.8815
INFO:tensorflow:loss = 0.685732, step = 7301 (2.446 sec)
INFO:tensorflow:global_step/sec: 42.5913
INFO:tensorflow:loss = 0.591227, step = 7401 (2.350 sec)
INFO:tensorflow:global_step/sec: 46.8584
INFO:tensorflow:loss = 0.58356, step = 7501 (2.134 sec)
INFO:tensorflow:global_step/sec: 43.5045
INFO:tensorflow:loss = 0.583701, step = 7601 (2.297 sec)
INFO:tensorflow:global

In [None]:
print(time.strftime("%Y-%m-%d %H:%M:%S"))
for iter in range(5):
    # m.fit(input_fn=lambda: input_fn(df=df_train[:100], train=True), steps=10)
    # results = m.evaluate(input_fn=lambda: input_fn(df_train[:10], train=True), steps=1)
    m.fit(input_fn=train_input_fn(shuffle=True))

print(time.strftime("%Y-%m-%d %H:%M:%S"))
results = m.evaluate(input_fn=train_input_fn(shuffle=False))
# results = m.evaluate(input_fn=lambda: input_fn(df_train[:10], train=True), steps=1)
print('results:' + '=' * 30)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))
print('results end:' + '=' * 30)
print(time.strftime("%Y-%m-%d %H:%M:%S"))

produce(m)

# Experiment 001

- larger for msno 10000->30000, embedding dimension 32 -> 64
- song_id, 10000->1e4, embedding dimension 32 -> 64
- adagrad lr=0.3

In [None]:
# def train_and_eval():
# model_dir = "../models/model2"
model_dir = None
print("model directory = %s" % model_dir)

m = build_estimator(model_dir)

print(time.strftime("%Y-%m-%d %H:%M:%S"))
for iter in range(5):
    # m.fit(input_fn=lambda: input_fn(df=df_train[:100], train=True), steps=10)
    # results = m.evaluate(input_fn=lambda: input_fn(df_train[:10], train=True), steps=1)
    m.fit(input_fn=train_input_fn(shuffle=True))

print(time.strftime("%Y-%m-%d %H:%M:%S"))
results = m.evaluate(input_fn=train_input_fn(shuffle=False))
# results = m.evaluate(input_fn=lambda: input_fn(df_train[:10], train=True), steps=1)
print('results:' + '=' * 30)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))
print('results end:' + '=' * 30)
print(time.strftime("%Y-%m-%d %H:%M:%S"))

produce(m)

In [None]:
print(time.strftime("%Y-%m-%d %H:%M:%S"))
for iter in range(1):
    # m.fit(input_fn=lambda: input_fn(df=df_train[:100], train=True), steps=10)
    # results = m.evaluate(input_fn=lambda: input_fn(df_train[:10], train=True), steps=1)
    m.fit(input_fn=train_input_fn(shuffle=True))

print(time.strftime("%Y-%m-%d %H:%M:%S"))
results = m.evaluate(input_fn=train_input_fn(shuffle=False))
# results = m.evaluate(input_fn=lambda: input_fn(df_train[:10], train=True), steps=1)
print('results:' + '=' * 30)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))
print('results end:' + '=' * 30)
print(time.strftime("%Y-%m-%d %H:%M:%S"))

produce(m)