goal:
=====

basically a factorization machine with cross entropy loss where interaction effects come from deep nonlinear relu-activated embeddings and with an additional "metric" kernal matrix.

todo: dropout. currently no regularization on the interaction layers in the cost function. can handle with FTRL optimization

In [17]:
# import this stuff
import time
import sys
from pylab import *
from scipy import sparse
import numpy as np

import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import datasets
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

In [22]:
def make_embeddings(x, rank, num_features, depth=1, seed=12345):
    """
    assumes that all hidden layers are width `rank`
    """
    assert depth > 0
    V = tf.Variable(
        tf.random.truncated_normal([rank, num_features], stddev=0.2, mean=0, seed=seed),
        name="v_1",
    )
    b = tf.Variable(
        tf.random.truncated_normal([rank, 1], stddev=0.2, mean=0, seed=seed), name="b_1"
    )
    Vx = tf.nn.relu(tf.matmul(V, x) + b)
    for i in range(depth - 1):
        V = tf.Variable(
            tf.random.truncated_normal([rank, rank], stddev=0.2, mean=0, seed=seed),
            name="v_%s" % i,
        )
        b = tf.Variable(
            tf.random.truncated_normal([rank, 1], stddev=0.2, mean=0, seed=seed),
            name="b_%s" % i,
        )
        Vx = tf.nn.relu(tf.matmul(V, Vx) + b)

    return Vx


def factorize(
    observed_features,
    labels,
    observed_features_validation,
    labels_validation,
    rank,
    max_iter=100,
    verbose=False,
    lambda_v=0,
    lambda_k=0,
    lambda_w=0,
    lambda_constants=0,
    epsilon=0.001,
    optimizer=tf.compat.v1.train.AdamOptimizer(),
    depth=3,
    seed=12345,
):

    # Extract info about shapes etc from the training data
    num_items = observed_features.shape[0]
    num_features = observed_features.shape[1]

    # matrix defining the inner product weights when doing interactions
    K = tf.Variable(
        tf.random.truncated_normal([rank, rank], stddev=0.2, mean=0, seed=seed),
        name="metric_matrix",
    )

    # coefficients for linear function on inputs (wide part)
    w = tf.Variable(
        tf.random.truncated_normal([1, num_features], stddev=0.2, mean=0, seed=seed),
        name="hyperplane",
    )

    # coefficients for linear functinos on inputs (deep part)
    lw = tf.Variable(
        tf.random.truncated_normal([1, rank], stddev=0.2, mean=0, seed=seed),
        name="latenthyperplane",
    )

    # bias in linear function
    b = tf.Variable(
        tf.random.truncated_normal([1, 1], stddev=0.2, mean=0, seed=seed), name="b_one"
    )

    x = tf.placeholder(tf.float32, [None, num_features])
    y = tf.placeholder(tf.float32)

    norm_x = tf.nn.l2_normalize(x, dim=0)

    Vx = make_embeddings(
        tf.transpose(norm_x), rank, num_features, depth=depth, seed=seed
    )
    right_kern = tf.matmul(K, Vx)

    full_kern = tf.matmul(tf.transpose(Vx), right_kern)
    linear = tf.matmul(w, tf.transpose(norm_x))
    latent_linear = tf.matmul(lw, Vx)

    pred = tf.reduce_sum(tf.sigmoid(linear + latent_linear + full_kern + b))

    # todo: dropout. currently no regularization on the interaction layers in the cost functino
    # can handle with FTRL optimization
    cost = tf.reduce_mean(
        -y * tf.log(pred + 0.0000000001)
        - (1 - y) * tf.log((1 - pred + 0.0000000001))
        + lambda_k * tf.nn.l2_loss(K)
        + lambda_w * tf.nn.l2_loss(w)
        + lambda_constants * tf.nn.l2_loss(b)
    )
    optimize = optimizer.minimize(cost)
    norm = tf.reduce_mean(tf.nn.l2_loss(w))

    init = tf.initialize_all_variables()

    with tf.Session() as sess:
        sess.run(init)
        last_cost = 1000000
        for iter in range(0, max_iter):
            avg_cost = 0

            for i in range(num_items):
                _, c, n = sess.run(
                    [optimize, cost, norm],
                    feed_dict={
                        x: observed_features[i].reshape(1, num_features),
                        y: labels[i],
                    },
                )
                avg_cost += c / num_items
            if verbose:
                print("epoch: %s, cost: %s" % (iter + 1, avg_cost))

            # check for convergence
            if abs(avg_cost - last_cost) / avg_cost < epsilon:
                break

            last_cost = avg_cost

        if verbose:
            print("optimization finished")
        predictions = []
        total_costs = 0
        for i in range(observed_features_validation.shape[0]):
            p, c = sess.run(
                [pred, cost],
                feed_dict={
                    x: observed_features_validation[i].reshape(1, num_features),
                    y: labels_validation[i],
                },
            )
            predictions.append(p)
            total_costs += c
        return (
            predictions,
            total_costs / observed_features_validation.shape[0],
            sess.run([norm]),
        )

In [19]:
# use this data for now

categories = ["alt.atheism", "soc.religion.christian", "comp.graphics", "sci.med"]

X, y = datasets.fetch_20newsgroups(
    categories=categories,
    shuffle=True,
    remove=["headers", "footers", "quotes"],
    return_X_y=True,
)
y = np.array([1 if y_i == 1 else 0 for y_i in y])
tfidf = TfidfVectorizer(decode_error=False, min_df=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train = tfidf.fit_transform(X_train).todense()
X_test = tfidf.transform(X_test).todense()

In [23]:
r = 10
predictions, test_costs, norm = factorize(
    X_train, y_train, X_test, y_test, r, verbose=True, lambda_v=0.1, max_iter=300
)
print("rank: %s, cost: %s, overall AUC: %s, norm: %s") % (
    r,
    test_costs,
    roc_auc_score(y_test, predictions, average="weighted"),
    norm,
)

AttributeError: module 'tensorflow' has no attribute 'placeholder'

In [2]:
# with some regularization via the optimizer
r = 10
predictions, test_costs, norm = factorize(
    X_train,
    y_train,
    X_test,
    y_test,
    r,
    verbose=True,
    max_iter=30,
    optimizer=tf.train.FtrlOptimizer(1.0, l2_regularization_strength=1.0),
)
print("rank: %s, cost: %s, overall AUC: %s, norm: %s") % (
    r,
    test_costs,
    roc_auc_score(y_test, predictions, average="weighted"),
    norm,
)

NameError: name 'factorize' is not defined