In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler

# Load data

In [None]:
# Read the prepared dataset. The compression is passed explicitly because the
# path ends in .csv rather than .gz.
data = pd.read_csv(
    "../data/prepared/data_basic.csv",
    compression="gzip",
)

In [None]:
# Display the column labels of the loaded frame.
data.columns

In [None]:
# Pair every distinct raw ID with a consecutive integer position,
# giving lists of (position, raw_id) tuples for customers and bonds.
customers = list(enumerate(data.CustomerIdx.unique()))
bonds = list(enumerate(data.IsinIdx.unique()))

In [None]:
# Invert the (position, raw_id) pairs into raw_id -> position lookup tables.
customer2idx = dict((raw_id, pos) for pos, raw_id in customers)
bond2idx = dict((raw_id, pos) for pos, raw_id in bonds)

In [None]:
# Input sizes: the number of distinct customer and bond IDs in the data.
# These are used as the vocabulary sizes of the two embedding layers.
n_customer = data["CustomerIdx"].nunique()
n_bond = data["IsinIdx"].nunique()
#n_features = X_train.shape[1]

## Train-test split

In [None]:
# Keep only the rows whose CustomerInterest label is present.
data = data[data.CustomerInterest.notnull()]

In [None]:
# Boolean mask over the rows of `data`: True where Week is below 116.
# Rows selected by the mask are used for training, the rest for testing.
trainIdx = np.array(data.Week.lt(116))

In [None]:
# Training rows: translate raw customer / bond IDs into their integer
# positions via the lookup tables, and pick the matching labels.
cust_train = np.array([customer2idx[raw] for raw in data.loc[trainIdx, "CustomerIdx"]])
bond_train = np.array([bond2idx[raw] for raw in data.loc[trainIdx, "IsinIdx"]])
y_train = data.loc[trainIdx, "CustomerInterest"]

# Test rows: the same encoding applied to the complement of the training mask.
cust_test = np.array([customer2idx[raw] for raw in data.loc[~trainIdx, "CustomerIdx"]])
bond_test = np.array([bond2idx[raw] for raw in data.loc[~trainIdx, "IsinIdx"]])
y_test = data.loc[~trainIdx, "CustomerInterest"]

In [None]:
# Standardise the extra feature matrices: fit the scaler on the training rows
# only, then apply that same fitted transform to the test rows.
# NOTE(review): X_train / X_test are not defined anywhere in the visible
# notebook (the feature input of the model is commented out) -- confirm where
# they are supposed to come from before running this cell.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# A StandardScaler instance is not callable; the fitted scaler is applied
# through its transform() method.
X_test = scaler.transform(X_test)

In [None]:
# Show the "balanced" weights for classes 0 and 1, computed from the training labels.
compute_class_weight(class_weight="balanced", classes=np.array([0, 1]), y=y_train)

# Build model

In [None]:
import keras
from keras import Model
from keras.layers import Input, Embedding, Dense, Dropout, concatenate, Flatten, BatchNormalization
from keras.optimizers import Adam
from keras.regularizers import l2

In [None]:
# Two integer inputs, each carrying a single ID per sample
cust_in = Input(shape=(1,), dtype='int64', name="cust_in")
bond_in = Input(shape=(1,), dtype='int64', name="bond_in")
#features_in = Input(shape = (n_features,), name = "features_in")

# Map every customer / bond ID to a trainable vector of 25 latent factors.
# An L2 penalty (1e-5) on the embedding weights discourages very large values.
cust = Embedding(
    input_dim=n_customer,
    output_dim=25,
    input_length=1,
    embeddings_regularizer=l2(1e-5),
)(cust_in)
bond = Embedding(
    input_dim=n_bond,
    output_dim=25,
    input_length=1,
    embeddings_regularizer=l2(1e-5),
)(bond_in)

# Join both embeddings and flatten them into one vector per sample
x = concatenate([cust, bond]) #, features_in])
x = Flatten()(x)

# Hidden stack: Dense(64) -> Dropout -> BatchNorm -> Dense(32) -> Dropout
x = Dense(units=64, activation='relu')(x)
x = Dropout(rate=0.5)(x)
x = BatchNormalization()(x)
x = Dense(units=32, activation='relu')(x)
x = Dropout(rate=0.5)(x)

# Single sigmoid unit: one score in (0, 1) per (customer, bond) pair
out = Dense(units=1, activation = "sigmoid")(x)

In [None]:
# Wrap the graph in a Model (two ID inputs -> one sigmoid output) and compile it
# with Adam at a 0.01 learning rate, binary cross-entropy loss and accuracy tracking.
model = Model(inputs=[cust_in, bond_in], outputs=out)
model.compile(
    optimizer=Adam(0.01),
    loss="binary_crossentropy",
    metrics=['accuracy'],
)

In [None]:
# Derive the class weights from the training labels instead of pasting them in
# as literals, so they stay in step with the current data and train/test split.
class_weights = compute_class_weight("balanced", classes=np.array([0, 1]), y=y_train)

# Train for 5 epochs in batches of 50000, using the held-out rows as validation data.
model.fit([cust_train, bond_train], y_train,
          validation_data = ([cust_test, bond_test], y_test),
          class_weight={0: float(class_weights[0]), 1: float(class_weights[1])},
          batch_size = 50000, epochs = 5
    # Optional checkpointing, currently disabled.
    # NOTE(review): Keras documents `callbacks` as a list -- wrap the callback in [...] when enabling this.
    #,callbacks = keras.callbacks.ModelCheckpoint('../models/weights.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
)

In [None]:
# Score the held-out (customer, bond) pairs with the trained model.
pred_test = model.predict(
    x=[cust_test, bond_test],
    batch_size=100000,
)

In [None]:
# Display the predictions produced by model.predict for the test rows.
pred_test

In [None]:
# ROC AUC of the predicted scores against the held-out labels.
roc_auc_score(y_test, pred_test)