In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import pickle
import gc

# modeling
import tensorflow as tf
import pandas as pd
import datatable as dt
from tqdm import tqdm
import sklearn.metrics
import sklearn.model_selection
import numpy as np

# other
import matplotlib.pyplot as plt

# Constants

In [None]:
# Mini-batch size applied to every tf.data pipeline built in this notebook.
BATCH_SIZE = 512

# Columns pulled from the raw frame, each tagged with how it is encoded:
#   'con' -> continuous: the raw value is written into a single feature slot
#   'cat' -> categorical: a 1 is written into the slot of the observed value
# 'answered_correctly' is the prediction target; it is extracted as the label
# vector and is not encoded as a feature.
# Commented-out entries are candidate features that are currently disabled.
cols_def = {
    'answered_correctly' : 'con',
    'timestamp'          : 'cat',
#     'user_id'            : 'cat',
    'content_id'         : 'cat',
#     'part'               : 'cat',
#     'content_mean'       : 'con',
#     'part_mean'          : 'con',
#     'kmeans_label'       : 'cat',
#     'kmeans_mean'        : 'con',
}

# Pre-built feature-index lookup. As consumed by the encoding code,
# mapping_dict[col]['features'] is indexed by the observed value for 'cat'
# columns (presumably a dict value -> column index) and is used directly as
# the column index for 'con' columns.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a mapping_dict.pkl that comes from a trusted source.
mapping_dict = None
with open('mapping_dict.pkl', 'rb') as f:
    mapping_dict = pickle.load(f)

# Functions

In [None]:
class FM(tf.keras.Model):
    """Second-order factorization machine with a sigmoid output.

    For a dense input batch ``x`` with ``p`` feature columns the model returns

        sigmoid(w0 + sum_j w_j * x_j
                + 0.5 * sum_f [ (x @ V)_f ** 2 - (x**2 @ V**2)_f ])

    where ``V`` has shape ``(p, k)``. The returned values are probabilities
    (the sigmoid is already applied), not logits.
    """

    def __init__(self, k=4):
        """Remember the latent dimension ``k``; weights are created in ``build``."""
        super().__init__()
        self.k = k

    def build(self, input_shape):
        """Create the bias, linear weights and latent factors.

        ``input_shape`` must unpack into exactly two entries; the second is
        taken as the number of feature columns.
        """
        _batch, n_features = input_shape

        # Bias and linear weights start at zero; the latent factors get small
        # Gaussian noise (stddev 0.01).
        self.w0 = tf.Variable(tf.zeros([1]))
        self.w = tf.Variable(tf.zeros([n_features]))
        self.V = tf.Variable(
            tf.random.normal(shape=(n_features, self.k), stddev=0.01)
        )

    def call(self, inputs):
        """Return one probability per input row."""
        # First-order part: weighted sum of the features of each row.
        linear_part = tf.reduce_sum(tf.math.multiply(self.w, inputs), axis=1)

        # Second-order part: (square of sum) - (sum of squares), summed over
        # the k latent dimensions and halved.
        square_of_sum = tf.math.pow(tf.matmul(inputs, self.V), 2)
        sum_of_squares = tf.matmul(tf.math.pow(inputs, 2), tf.math.pow(self.V, 2))
        pairwise_part = 0.5 * tf.reduce_sum(
            square_of_sum - sum_of_squares,
            axis=1,
            keepdims=False,
        )

        return tf.sigmoid(self.w0 + linear_part + pairwise_part)

# Load Data

In [None]:
# Read the first 10,000 rows of the training log and keep only rows whose
# content_type_id is 0 (presumably question interactions - TODO confirm).
df = (
    pd.read_csv('train.csv', nrows=10000)
    .loc[lambda frame: frame.content_type_id == 0, :]
)

In [None]:
# Fixed seed so that the shuffle and both splits are identical on every
# Restart & Run All; change the value to draw a different partition.
SPLIT_SEED = 42

# Shuffle into a new name instead of overwriting `df`, so that re-running this
# cell alone always starts from the same row order and yields the same split.
df_shuffled = df.sample(frac=1, random_state=SPLIT_SEED)

# 80% train+val / 20% test, then 10% of the remainder is held out as validation.
train, test = sklearn.model_selection.train_test_split(
    df_shuffled, test_size=0.2, random_state=SPLIT_SEED)
train, val = sklearn.model_selection.train_test_split(
    train, test_size=0.1, random_state=SPLIT_SEED)

# Keep only the configured columns (in cols_def order) and renumber rows from 0
# so that row positions can be used as sparse-matrix row indices.
selected_cols = list(cols_def)
train = train[selected_cols].reset_index(drop=True)
test  = test[selected_cols].reset_index(drop=True)
val   = val[selected_cols].reset_index(drop=True)

In [None]:
# Extract the target column of each split as a NumPy vector.
y_train, y_test, y_val = (
    split['answered_correctly'].values for split in (train, test, val)
)

In [None]:
indicies = []
values = []

for row in train[train.columns[1:]].reset_index(drop=True).reset_index().to_dict('records'):
    for c in list(row):
        if c=='index':
            continue

        if c not in cols_def:
            continue
            
        if cols_def[c] == 'cat':
            if row[c] not in mapping_dict[c]['features']:
                continue

            ix = [row['index'], mapping_dict[c]['features'][row[c]]]
            v = 1
        else:
            ix = [row['index'], mapping_dict[c]['features']]
            v = row[c]

        indicies.append(ix)
        values.append(v)

X = tf.sparse.SparseTensor(indices=indicies, values=np.array(values, dtype=np.float32), dense_shape=(len(y_train),409562))
train_ds = tf.data.Dataset.from_tensor_slices((X, np.reshape(y_train, (len(y_train))))).batch(BATCH_SIZE)

In [None]:
indicies = []
values = []

for row in test[test.columns[1:]].reset_index(drop=True).reset_index().to_dict('records'):
    for c in list(row):
        if c=='index':
            continue

        if c not in cols_def:
            continue
            
        if cols_def[c] == 'cat':
            if row[c] not in mapping_dict[c]['features']:
                continue

            ix = [row['index'], mapping_dict[c]['features'][row[c]]]
            v = 1
        else:
            ix = [row['index'], mapping_dict[c]['features']]
            v = row[c]

        indicies.append(ix)
        values.append(v)
        
_X = tf.sparse.SparseTensor(indices=indicies, values=np.array(values, dtype=np.float32), dense_shape=(len(y_test),409562))
test_ds = tf.data.Dataset.from_tensor_slices((_X, np.reshape(y_test, (len(y_test))))).batch(BATCH_SIZE)

In [None]:
indicies = []
values = []

for row in val[val.columns[1:]].reset_index(drop=True).reset_index().to_dict('records'):
    for c in list(row):
        if c=='index':
            continue

        if c not in cols_def:
            continue
            
        if cols_def[c] == 'cat':
            if row[c] not in mapping_dict[c]['features']:
                continue

            ix = [row['index'], mapping_dict[c]['features'][row[c]]]
            v = 1
        else:
            ix = [row['index'], mapping_dict[c]['features']]
            v = row[c]

        indicies.append(ix)
        values.append(v)
        
_X = tf.sparse.SparseTensor(indices=indicies, values=np.array(values, dtype=np.float32), dense_shape=(len(y_val),409562))
val_ds = tf.data.Dataset.from_tensor_slices((_X, np.reshape(y_val, (len(y_val))))).batch(BATCH_SIZE)

### Train Model

In [None]:
# Number of full passes over the training dataset.
EPOCHS = 10

# Strength of the L2 penalty that the step functions add to the loss.
_lambda = tf.Variable(0.0002, name='lambda')

optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.2)

# Streaming AUC trackers: train_auc is fed by train_step, test_auc by test_step.
train_auc = tf.keras.metrics.AUC(name='train_auc')
test_auc = tf.keras.metrics.AUC(name='test_auc')

@tf.function
def train_step(model, optimizer, inputs, targets):
    """Run one optimisation step on a batch and return its regularised loss.

    Args:
        model: model exposing `w`, `V` and `trainable_variables` whose output
            is a probability (sigmoid already applied), as FM does.
        optimizer: Keras optimizer used to apply the gradients.
        inputs: feature batch passed to `model`.
        targets: labels aligned with `inputs` (expected to be 0/1).

    Returns:
        Binary cross-entropy of the batch plus the L2 penalty
        `_lambda * (l2_loss(model.w) + l2_loss(model.V))`.

    Side effects:
        Updates the model's trainable variables and the global `train_auc`.
    """
    with tf.GradientTape() as tape:
        y_pred = model(inputs)

        # y_pred is already a probability, so from_logits must be False.
        # With from_logits=True the loss would push the prediction through a
        # second sigmoid and optimise the wrong objective.
        loss = tf.keras.losses.binary_crossentropy(y_true=targets,
                                                   y_pred=y_pred,
                                                   from_logits=False)

        # The bias w0 is deliberately left out of the penalty.
        l2_reg = _lambda*(tf.nn.l2_loss(model.w) + tf.nn.l2_loss(model.V))
        loss = loss + l2_reg

    grads = tape.gradient(target=loss, sources=model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    train_auc.update_state(targets, y_pred)
    return loss

@tf.function
def test_step(model, inputs, targets):
    """Evaluate one batch without updating the weights.

    Args:
        model: model exposing `w` and `V` whose output is a probability
            (sigmoid already applied), as FM does.
        inputs: feature batch passed to `model`.
        targets: labels aligned with `inputs` (expected to be 0/1).

    Returns:
        Binary cross-entropy of the batch plus the L2 penalty
        `_lambda * (l2_loss(model.w) + l2_loss(model.V))`.

    Side effects:
        Updates the global `test_auc` metric.
    """
    y_pred = model(inputs, training=False)

    # y_pred is already a probability, so from_logits must be False.
    # With from_logits=True the loss would apply a second sigmoid and report
    # a value that does not match the model's actual predictions.
    loss = tf.keras.losses.binary_crossentropy(y_true=targets,
                                               y_pred=y_pred,
                                               from_logits=False)

    l2_reg = _lambda*(tf.nn.l2_loss(model.w) + tf.nn.l2_loss(model.V))
    loss = loss + l2_reg

    test_auc.update_state(targets, y_pred)
    return loss



def densify_batch(sparse_batch):
    """Convert a sparse feature batch to a dense tensor with NaNs set to 0."""
    dense = tf.sparse.to_dense(sparse_batch)
    return tf.where(tf.math.is_nan(dense), tf.zeros_like(dense), dense)


model = FM(4)

with tf.device('/cpu:0'):
    for epoch in range(EPOCHS):
        print(f'Starting Epoch: {epoch+1}')
        # Reset the evaluation metrics so each epoch is scored on its own.
        # NOTE(review): newer Keras releases spell this reset_state();
        # confirm against the installed TensorFlow version.
        train_auc.reset_states()
        test_auc.reset_states()

        for step, (sparse_batch, labels) in enumerate(train_ds):
            loss = train_step(model, optimizer, densify_batch(sparse_batch), labels)

            if step % 200 == 0:
                print(f'Training Loss (for one batch) at step {step}: {loss:.5f}')
                print(f'Seen so far: {((step+1)*BATCH_SIZE)} samples')

        print(f'Training AUC over epoch: {train_auc.result():.5f}')

        # Validation pass over val_ds. The loss is averaged over every
        # validation row (weighted by batch size) rather than reporting only
        # the final, usually partial, batch.
        val_loss_total = 0.0
        val_rows = 0
        for sparse_batch, labels in val_ds:
            t_loss = test_step(model, densify_batch(sparse_batch), labels)
            n_rows = int(labels.shape[0])
            val_loss_total += float(t_loss) * n_rows
            val_rows += n_rows

        if val_rows:
            # test_auc is the metric updated by test_step; in this loop it
            # has only seen validation batches.
            print(f'Validation loss: {val_loss_total / val_rows:.5f}')
            print(f'Validation AUC: {test_auc.result():.5f}')
        else:
            print('Validation set is empty; no validation metrics to report')
        print('==========================================================')

### Test

In [None]:
# Collect predictions and labels for the held-out test set, batch by batch.
with tf.device('/cpu:0'):
    y_true = []
    y_pred = []
    for sparse_batch, labels in test_ds:
        dense_batch = tf.sparse.to_dense(sparse_batch)
        dense_batch = tf.where(tf.math.is_nan(dense_batch),
                               tf.zeros_like(dense_batch),
                               dense_batch)

        # Predict once per batch and reuse the result for both the printed
        # batch mean and the collected outputs; a second predict call would
        # repeat the forward pass over the whole dense batch.
        batch_pred = model.predict(dense_batch)
        print(np.mean(batch_pred))

        y_pred += list(batch_pred)
        y_true += list(labels.numpy())

### Visualize

In [None]:
def rand_jitter(arr, tol):
    """Return `arr` with Gaussian noise added, for de-overlapping scatter points.

    The noise standard deviation is `tol` times the value range
    (max - min) of `arr`; input with zero range therefore receives zero noise.
    Draws from NumPy's global random state.
    """
    spread = max(arr) - min(arr)
    noise = np.random.randn(len(arr)) * (tol * spread)
    return arr + noise

# Predicted probability (x) against the jittered true label (y); the vertical
# jitter only spreads the two label bands so overlapping points stay visible.
fig, ax = plt.subplots()
ax.scatter(y_pred, rand_jitter(y_true, 0.1), alpha=0.2)
ax.set_xlabel('Correct Probability')
ax.set_yticks([0, 1])
plt.show()

In [None]:
# ROC curve and AUC of the collected test-set predictions.
fpr, tpr, _ = sklearn.metrics.roc_curve(y_true, y_pred)
auc = sklearn.metrics.roc_auc_score(y_true, y_pred)

plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
# Diagonal = performance of a random classifier, for visual reference.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve on the test set')
plt.legend(loc=4)
plt.show()
auc