In [1]:
import addict
import numpy as np
from utils import save_obj, load_obj
from my_logger import Timer, print_info
from sklearn.metrics import mean_squared_error
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, Conv2D, GlobalAveragePooling2D, Reshape, Multiply, Lambda
from keras.models import Model
from keras.models import load_model as load_model_keras
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from sklearn.preprocessing import StandardScaler
import time

Using TensorFlow backend.


In [2]:
from data_io import load_train
from data_io import features_downcast
from downcast import load_dataframe32

# load data

In [3]:
main_m_code16 = load_dataframe32(features_downcast("main_m_code16"))
main_m_value16 = load_dataframe32(features_downcast("main_m_value16"))
topk_m_code150 = load_dataframe32(features_downcast("topk_m_code150"))
topk_m_value150 = load_dataframe32(features_downcast("topk_m_value150"))

In [4]:
main_m_code16.head().T.tail()

Unnamed: 0,0,1,2,3,4
main_m_lag12,193598,13012,232546,0,18933
main_m_lag13,173571,273815,232546,0,148211
main_m_lag14,0,216501,224019,0,90518
main_m_lag15,855,200595,0,0,78198
card_id,C_ID_00007093c1,C_ID_0001238066,C_ID_0001506ef0,C_ID_0001793786,C_ID_000183fdda


In [5]:
main_m_value16.head().T.tail()

Unnamed: 0,0,1,2,3,4
main_m_lag12,430,1008.22,1100,0,2337
main_m_lag13,500,771.83,1491,0,1374.93
main_m_lag14,0,444.94,9.92,0,191.2
main_m_lag15,60,250,0,0,425.4
card_id,C_ID_00007093c1,C_ID_0001238066,C_ID_0001506ef0,C_ID_0001793786,C_ID_000183fdda


In [6]:
topk_m_code150.head().T.tail()

Unnamed: 0,0,1,2,3,4
m_code_rank147,0,0,0,283388,0
m_code_rank148,0,0,0,54335,0
m_code_rank149,0,0,0,259109,0
m_code_rank150,0,0,0,330051,0
card_id,C_ID_00007093c1,C_ID_0001238066,C_ID_0001506ef0,C_ID_0001793786,C_ID_000183fdda


In [7]:
topk_m_value150.head().T.tail()

Unnamed: 0,0,1,2,3,4
m_value_rank147,0,0,0,13.5,0
m_value_rank148,0,0,0,6,0
m_value_rank149,0,0,0,6,0
m_value_rank150,0,0,0,2,0
card_id,C_ID_00007093c1,C_ID_0001238066,C_ID_0001506ef0,C_ID_0001793786,C_ID_000183fdda


In [8]:
train = load_train()

train_part = train[:20000]
valid_part = train[10000:20000]

In [9]:
from data_io import rename_columns

for name in ("main_m_code16",
             "main_m_value16",
             "topk_m_code150",
             "topk_m_value150"):
    part = load_dataframe32(features_downcast(name))
    print(name)
    print(part.shape)
    part = rename_columns(part, name)
    print(part.shape)
    train_part = train_part.merge(part, how="left", left_on="card_id", right_on="card_id")
    valid_part = valid_part.merge(part, how="left", left_on="card_id", right_on="card_id")

main_m_code16
(325540, 17)
(325540, 17)
main_m_value16
(325540, 17)
(325540, 17)
topk_m_code150
(325540, 151)
(325540, 151)
topk_m_value150
(325540, 151)
(325540, 151)


In [10]:
train_part.head()

Unnamed: 0,feature_1,feature_2,feature_3,target,first_active_month,card_id,main_m_lag0_main_m_code16,main_m_lag1_main_m_code16,main_m_lag2_main_m_code16,main_m_lag3_main_m_code16,...,m_value_rank141_topk_m_value150,m_value_rank142_topk_m_value150,m_value_rank143_topk_m_value150,m_value_rank144_topk_m_value150,m_value_rank145_topk_m_value150,m_value_rank146_topk_m_value150,m_value_rank147_topk_m_value150,m_value_rank148_topk_m_value150,m_value_rank149_topk_m_value150,m_value_rank150_topk_m_value150
0,5.0,2.0,1.0,-0.820283,2017-06,C_ID_92a2005557,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,1.0,0.0,0.392913,2017-01,C_ID_3d0044924f,46728,102082,121630,297903,...,8.7,8.0,8.0,7.0,5.0,4.99,4.0,3.0,0.0,0.0
2,2.0,2.0,0.0,0.688056,2016-08,C_ID_d639edf6cd,112442,112442,30590,112442,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,3.0,0.0,0.142495,2017-09,C_ID_186d6a6901,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,3.0,0.0,-0.159749,2017-11,C_ID_cdbd2c0db2,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# X_train & X_valid

In [11]:
def get_columns(data, name):
    return [col for col in data.columns if col.endswith(name)]

columns_m_code16 = get_columns(train_part, "main_m_code16")
columns_m_values16 = get_columns(train_part, "main_m_value16")
columns_topk_m_code150 = get_columns(train_part, "topk_m_code150")
columns_topk_m_value150 = get_columns(train_part, "topk_m_value150")

names = ("main_m_code16",
         "main_m_value16",
         "topk_m_code150",
         "topk_m_value150")

X_train = {
    name: train_part[get_columns(train_part, name)].values
    for name in names
}

y_train = train_part.target.values

X_valid = {
    name: valid_part[get_columns(train_part, name)].values
    for name in names
}

y_valid = valid_part.target.values

# model structure

In [16]:
import addict
import numpy as np
from utils import save_obj, load_obj
from my_logger import Timer, print_info
from sklearn.metrics import mean_squared_error
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, Conv2D, GlobalAveragePooling2D, Reshape, Multiply, Lambda, Concatenate
from keras.layers import RepeatVector, MaxPooling2D, Flatten
from keras.models import Model
from keras.models import load_model as load_model_keras
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from sklearn.preprocessing import StandardScaler
import time

In [17]:
max_features = 440000
embed_size = 256

main_code = Input((16,), name="main_m_code16")
topk_code = Input((150,), name="topk_m_code150")
main_value = Input((16,), name="main_m_value16")
topk_value = Input((150,), name="topk_m_value150")

ebd_layer = Embedding(max_features, embed_size)
main_ebd = ebd_layer(main_code)
topk_ebd = ebd_layer(topk_code)
con_ebd = Concatenate(axis=1)([main_ebd, topk_ebd])
con_ebd = Reshape((166, embed_size, 1))(con_ebd)
con_ebd = BatchNormalization()(con_ebd)
conv = Conv2D(64, (5, 5))(con_ebd)
conv = Activation("relu")(conv)
conv = MaxPooling2D((5, 5))(conv)
conv = BatchNormalization()(conv)
conv = Conv2D(32, (3, 3))(conv)
conv = Activation("relu")(conv)
flat_code = Flatten()(conv)

main_dense = Dense(8, activation="sigmoid")(main_value)
topk_dense = Dense(64, activation="sigmoid")(topk_value)
con_dense = Concatenate()([main_dense, topk_dense, flat_code])
con_dense = Dense(64, activation="relu")(con_dense)
con_dense = Dense(1, activation="linear")(con_dense)

model = Model(inputs=[main_code, topk_code, main_value, topk_value], outputs=con_dense)

In [18]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_m_code16 (InputLayer)      (None, 16)           0                                            
__________________________________________________________________________________________________
topk_m_code150 (InputLayer)     (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         multiple             112640000   main_m_code16[0][0]              
                                                                 topk_m_code150[0][0]             
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 166, 256)     0           embedding_2[0][0]                
          

In [19]:
model.compile(loss='mean_squared_error', optimizer="adam", metrics=['mean_squared_error'])

# trainning

In [28]:
def get_curdate():
    return time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time()))

model_name = "notebook038"
metric_name = "val_mean_squared_error"
batch_size = 64  # 512
epochs = 50  # 5
cur_time = get_curdate()
model_filepath = "./keras_temp/{}_{}.h5".format(model_name, cur_time)
log_filepath = "./keras_temp/{}_{}.csv".format(model_name, cur_time)
# X_train = pad_sequences(X_train, maxlen=maxlen, padding="pre", truncating="pre")
# X_valid = pad_sequences(X_valid, maxlen=maxlen, padding="pre", truncating="pre")

# print_info("keras_params", keras_params)
# print_info("keras_params.num_boost_round", keras_params.num_boost_round)
# print_info("keras_params.early_stopping_rounds", keras_params.early_stopping_rounds)
# print_info("keras_params.verbose_eval", keras_params.verbose_eval)

early_stopping = EarlyStopping(monitor=metric_name, mode='min', patience=10, verbose=1)
model_checkpoint = ModelCheckpoint(model_filepath, monitor=metric_name, mode='min', save_best_only=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor=metric_name, mode='min', factor=0.5, patience=5, min_lr=0.000005, verbose=1)  # patience=5, factor=0.2
model_logger = CSVLogger(log_filepath, separator=',', append=False)

In [29]:
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_valid, y_valid),
          callbacks=[early_stopping, reduce_lr, model_checkpoint, model_logger],
          verbose=20)

Train on 20000 samples, validate on 10000 samples
Epoch 1/50

Epoch 00001: val_mean_squared_error improved from inf to 14.52244, saving model to ./keras_temp/notebook038_20190225_181632.h5
Epoch 2/50

Epoch 00002: val_mean_squared_error improved from 14.52244 to 14.49120, saving model to ./keras_temp/notebook038_20190225_181632.h5
Epoch 3/50

Epoch 00003: val_mean_squared_error improved from 14.49120 to 14.46098, saving model to ./keras_temp/notebook038_20190225_181632.h5
Epoch 4/50

Epoch 00004: val_mean_squared_error improved from 14.46098 to 14.43747, saving model to ./keras_temp/notebook038_20190225_181632.h5
Epoch 5/50

Epoch 00005: val_mean_squared_error improved from 14.43747 to 14.42164, saving model to ./keras_temp/notebook038_20190225_181632.h5
Epoch 6/50

Epoch 00006: val_mean_squared_error improved from 14.42164 to 14.41160, saving model to ./keras_temp/notebook038_20190225_181632.h5
Epoch 7/50

Epoch 00007: val_mean_squared_error improved from 14.41160 to 14.40600, saving 

<keras.callbacks.History at 0x123c7b55b00>