In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler, Imputer

In [28]:
import keras
from keras import Model
from keras.callbacks import Callback
from keras.layers import Input, Embedding, Dense, Dropout, concatenate, Flatten, Activation, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.regularizers import l2
from keras.constraints import maxnorm

In [3]:
class AucCallback(Callback):
    """Keras callback that records training loss and validation ROC AUC per epoch.

    Attributes (created in ``on_train_begin``):
        aucs: list with one validation ROC AUC score per finished epoch.
        losses: list with the ``'loss'`` entry of the Keras logs per finished epoch.
    """

    def on_train_begin(self, logs=None):
        """Reset the recorded history at the start of a training run."""
        self.aucs = []
        self.losses = []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's loss and validation AUC to the history.

        Args:
            epoch: index of the epoch that just finished (not used here).
            logs: metrics dict handed over by Keras; ``None`` is tolerated.
        """
        # A `{}` default would be shared between calls; normalise None instead.
        logs = logs or {}
        self.losses.append(logs.get('loss'))
        # The first three entries of validation_data are passed to the model as
        # its inputs; the fourth entry is used as the ground-truth target.
        y_pred = self.model.predict(self.validation_data[0:3])
        self.aucs.append(roc_auc_score(y_true=self.validation_data[3], y_score=y_pred))

# Load data

In [4]:
# The file carries a plain ".csv" extension, so the gzip codec is named explicitly.
data_path = "../data/prepared/data_v4_0_60_under.csv"
data = pd.read_csv(data_path, compression="gzip")

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Pair every distinct raw id with a dense 0-based position.
customers = list(enumerate(data.CustomerIdx.unique()))
bonds = list(enumerate(data.IsinIdx.unique()))

In [6]:
# Lookup tables: raw id -> position taken from the (position, raw id) pairs.
customer2idx = dict((raw_id, pos) for pos, raw_id in customers)
bond2idx = dict((raw_id, pos) for pos, raw_id in bonds)

In [7]:
# Vocabulary sizes for the two embeddings: distinct customers and distinct bonds
n_customer = data["CustomerIdx"].nunique()
n_bond = data["IsinIdx"].nunique()

## Train-test split

In [8]:
# Training rows: weeks strictly between 90 and 112, excluding rows whose
# Recency1 equals the column maximum.
in_train_weeks = (data.Week > 90) & (data.Week < 112)
below_max_recency = data.Recency1 < data.Recency1.max()
trainIdx = np.array(in_train_weeks & below_max_recency)
# Validation rows: weeks 116 through 120, both ends included.
valIdx = np.array(data.Week.between(116, 120))
# Test rows: every row that carries a PredictionIdx.
testIdx = np.array(data.PredictionIdx.notnull())

In [9]:
# Columns removed from the frames before they are used as dense model features
drop_vars = [
    'CustomerInterest',
    'CustomerIdx',
    'IsinIdx',
    'Week',
    'PredictionIdx',
    'BuySell',
]

In [10]:
# Translate raw customer / bond ids into their embedding positions and pick the
# matching target for each split. Series.map performs the dictionary lookup in
# one vectorized call instead of a Python-level loop over every row, and it
# avoids shadowing the built-in `id` as a loop variable.
# Every id originates from `data` itself, so each lookup has a matching key.
cust_train = data.CustomerIdx.loc[trainIdx].map(customer2idx).values
bond_train = data.IsinIdx.loc[trainIdx].map(bond2idx).values
y_train = data.CustomerInterest[trainIdx]

cust_val = data.CustomerIdx.loc[valIdx].map(customer2idx).values
bond_val = data.IsinIdx.loc[valIdx].map(bond2idx).values
y_val = data.CustomerInterest[valIdx]

cust_test = data.CustomerIdx.loc[testIdx].map(customer2idx).values
bond_test = data.IsinIdx.loc[testIdx].map(bond2idx).values
y_test = data.CustomerInterest[testIdx]

In [11]:
# Feature frames per split. Train and validation lose the drop_vars columns here;
# the test frame keeps every column for now.
X_train = data.loc[trainIdx].drop(labels=drop_vars, axis="columns")
X_val = data.loc[valIdx].drop(labels=drop_vars, axis="columns")
X_test = data.loc[testIdx].copy()

In [12]:
# Display the (rows, columns) shape of the training feature frame.
X_train.shape

(835551, 39)

In [13]:
# Columns that are handed to the imputer in the following cells
vars_with_missing = (
    'MeanPrice',
    'StdPrice',
    'MeanYield',
    'StdYield',
    'MeanZScore',
    'StdZScore',
    'YieldMarktDelta',
    'ZScoreMarktDelta',
)

In [14]:
# Mean imputation: the column means are learned from the training rows only
# and then written back into the training frame.
imputer = Imputer(missing_values="NaN", strategy="mean")
imputer.fit(X_train.loc[:, vars_with_missing])
X_train.loc[:, vars_with_missing] = imputer.transform(X_train.loc[:, vars_with_missing])

In [15]:
# Reuse the already-fitted imputer on the validation and test frames.
for frame in (X_val, X_test):
    frame.loc[:, vars_with_missing] = imputer.transform(frame.loc[:, vars_with_missing])

Standardization of float vars

In [16]:
# Names of all float64 columns of the training frame; these get standardized.
continuous_vars = X_train.select_dtypes(include=["float64"]).columns

In [17]:
# Fit the scaler on the training rows only, then standardize them in place.
scaler = StandardScaler()
scaler.fit(X_train.loc[:, continuous_vars])
X_train.loc[:, continuous_vars] = scaler.transform(X_train.loc[:, continuous_vars])

In [18]:
# Apply the training-set scaling to the validation and test frames.
for frame in (X_val, X_test):
    frame.loc[:, continuous_vars] = scaler.transform(frame.loc[:, continuous_vars])

In [19]:
# Replace the train / validation frames with their underlying arrays
# (X_test stays a DataFrame).
X_train, X_val = X_train.values, X_val.values

In [20]:
# Display "balanced" class weights for the labels 0 and 1 of the training target.
compute_class_weight(class_weight="balanced", classes=np.array([0, 1]), y=y_train)

array([0.66524867, 2.01287147])

# Build model

In [21]:
# Number of columns in X_train; used as the width of the dense feature input.
n_features = X_train.shape[1]

In [22]:
# Three model inputs: one integer id per sample for each of the two embeddings,
# plus the dense feature vector
cust_in = Input(shape=(1,), dtype='int64', name="cust_in")
bond_in = Input(shape=(1,), dtype='int64', name="bond_in")
features_in = Input(shape=(n_features,), name="features_in")

# Each id is mapped to a trainable vector of latent factors.
# L2 regularization is added to avoid very large embedding weights.
n_latent = 50
cust = Embedding(input_dim=n_customer, output_dim=n_latent, input_length=1,
                 embeddings_regularizer=l2(1e-3))(cust_in)
bond = Embedding(input_dim=n_bond, output_dim=n_latent, input_length=1,
                 embeddings_regularizer=l2(1e-3))(bond_in)

In [23]:
# Join the customer and bond embeddings along the last axis.
embeddings = concatenate([cust, bond], axis=-1)

In [24]:
# Flatten the joined embeddings so they can be concatenated with the feature input.
flatten_layer = Flatten()
embeddings = flatten_layer(embeddings)

In [25]:
# Combine the flattened embeddings with the dense features along the last axis.
x = concatenate([embeddings, features_in], axis=-1)

In [26]:
# Light dropout on the combined input vector
x = Dropout(0.02)(x)

# Two hidden blocks of the same shape: max-norm constrained Dense -> ReLU -> dropout
for n_units in (128, 64):
    x = Dense(n_units, kernel_constraint=maxnorm(5))(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)

# Single output unit, batch-normalized before the sigmoid
x = Dense(1)(x)
x = BatchNormalization()(x)
out = Activation('sigmoid')(x)

In [29]:
# Wire the three inputs to the sigmoid output, then compile with binary
# cross-entropy and an RMSprop optimizer that clips gradients by norm and by value
model = Model(inputs=[cust_in, bond_in, features_in], outputs=out)
optimizer = RMSprop(clipnorm=1, clipvalue=1)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=['accuracy'])

In [30]:
# Callback instance that collects the per-epoch loss and validation AUC while training.
auc_history = AucCallback()

In [31]:
# One minus the mean of the training target. Assuming the target is 0/1, this is
# the share of negatives, i.e. the accuracy of always predicting 0.
1-y_train.mean()

0.7515986456841055

In [None]:
# Options that are currently switched off:
#   class_weight={0:0.74958666, 1:1.5016561}
#   keras.callbacks.ModelCheckpoint('../models/weights.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
train_inputs = [cust_train, bond_train, X_train]
val_inputs = [cust_val, bond_val, X_val]
model.fit(
    x=train_inputs,
    y=y_train,
    validation_data=(val_inputs, y_val),
    epochs=50,
    batch_size=5000,
    callbacks=[auc_history],
    verbose=1
)

Train on 835551 samples, validate on 236752 samples
Epoch 1/50
Epoch 2/50


In [37]:
# Validation ROC AUC values recorded by the callback, one per completed epoch.
auc_history.aucs

[0.5530095489943554,
 0.5732080911276227,
 0.5783734773183309,
 0.5808701090294107,
 0.5883136446477099]

In [None]:
# Score the validation split with the trained model and display its ROC AUC.
pred_val = model.predict(x=[cust_val, bond_val, X_val], batch_size=10000)
roc_auc_score(y_val, pred_val)

In [31]:
# The test frame still holds the drop_vars columns; remove them before predicting.
test_features = X_test.drop(drop_vars, axis=1).values
pred_test = model.predict([cust_test, bond_test, test_features], batch_size=10000)

In [34]:
# Build the submission table from plain arrays, which gives it a fresh 0..n-1
# index, and write it to disk without that index.
submission = pd.DataFrame({
    'PredictionIdx': X_test['PredictionIdx'].values,
    'CustomerInterest': pred_test.flatten(),
})
submission.to_csv("../submissions/FCNN_20180619.csv", index=False)

In [33]:
# Load an earlier submission file.
# NOTE(review): `old` is not referenced by any other visible cell - confirm it is still needed.
old = pd.read_csv("../submissions/auc845086_data_v4_0_80_lgb_1stage.csv")

In [38]:
# Summary statistics of the submission frame, to inspect the spread of the predicted scores.
submission.describe()

Unnamed: 0,CustomerInterest
count,484758.0
mean,0.507189
std,0.211071
min,0.003557
25%,0.324898
50%,0.534475
75%,0.711395
max,0.999672
