In [23]:
import numpy as np
import tensorflow as tf
import time
from tqdm import tqdm

# Load data

In [24]:
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

# Build a binary "3 vs 5" problem from MNIST (train and test parts pooled),
# randomly zero out most pixels to get a sparse matrix, then split it 70/30.
mnist = tf.keras.datasets.mnist
(X_tr, y_tr), (X_te, y_te) = mnist.load_data()

# Flatten every image into a 784-dimensional row vector.
X_tr = np.reshape(X_tr, [-1, 784])
X_te = np.reshape(X_te, [-1, 784])

# Row order matters: all threes first, then all fives (y_all relies on it).
X_all = np.vstack([
    X_tr[y_tr == 3, :],
    X_te[y_te == 3, :],
    X_tr[y_tr == 5, :],
    X_te[y_te == 5, :]
]).astype(np.float32)

# Keep each pixel with probability 0.2. The mask is drawn from a dedicated,
# seeded generator so that "Restart & Run All" reproduces the same dataset
# (the train/test split below is already seeded).
mask_rng = np.random.default_rng(42)
X_all = X_all * (mask_rng.uniform(0, 1, X_all.shape) > 0.8)

# Count labels with NumPy instead of the builtin sum(), which would iterate
# over the boolean arrays element by element in Python.
n_three = int(np.count_nonzero(y_tr == 3) + np.count_nonzero(y_te == 3))
n_five = int(np.count_nonzero(y_tr == 5) + np.count_nonzero(y_te == 5))
# Label 1 = digit three, label 0 = digit five, in the same order as X_all.
y_all = np.array([1]*n_three + [0]*n_five)

print('Dataset shape: {}'.format(X_all.shape))
print('Non-zeros rate: {:.05f}'.format(np.mean(X_all != 0)))
print('Classes balance: {:.03f} / {:.03f}'.format(np.mean(y_all==0), np.mean(y_all==1)))

X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, random_state=42, test_size=0.3)

Dataset shape: (13454, 784)
Non-zeros rate: 0.04034
Classes balance: 0.469 / 0.531


# Baselines

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Reference points: fit two off-the-shelf scikit-learn classifiers on the same
# split and print the test accuracy of each.
baseline_models = [
    LogisticRegression(),
    RandomForestClassifier(n_jobs=-1, n_estimators=200),
]

for model in baseline_models:
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_te)
    acc = accuracy_score(y_te, predictions)
    print('model: {}'.format(str(model)))
    print('accuracy: {}'.format(acc))
    print()

# Dense example

In [None]:
from tffm2 import TFFMClassifier

# Train a factorization machine of order 2 and of order 3 on the dense
# matrix and print the test accuracy of each.
for order in (2, 3):
    fm_params = dict(
        order=order,
        rank=10,
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
        n_epochs=50,
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        seed=42,
    )
    model = TFFMClassifier(**fm_params)
    model.fit(X_train=X_tr, y_train=y_tr, show_progress=True)

    # The result of predict() is materialised into a list and only the "pred"
    # entry of its first element is scored.
    # NOTE(review): presumably one element per batch of `pred_batch_size`
    # rows, so this relies on X_te fitting into a single batch - confirm.
    predictions = list(model.predict(X=X_te, pred_batch_size=5000))
    test_accuracy = accuracy_score(y_te, predictions[0]["pred"])
    print('[order={}] accuracy: {}'.format(order, test_accuracy))

# Regression example

In [None]:
from tffm2 import TFFMRegressor
from sklearn.metrics import mean_squared_error

# Fit an order-3 factorization machine as a regressor on rescaled labels and
# report both the sign-based accuracy and the mean squared error.
order = 3

model = TFFMRegressor(
    order=order, 
    rank=10, 
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),  
    n_epochs=50, 
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
)
# translate Y from {0,1} to {-10, 10}
model.fit(X_train=X_tr, y_train=y_tr*20-10, show_progress=True)
predictions = list(model.predict(X_te, pred_batch_size=5000))
# Both metrics must be computed on the predicted values themselves; the raw
# `predictions` list holds result dicts and cannot be passed to a metric.
y_pred = predictions[0]["pred"]
print('[order={}] accuracy: {}'.format(order, accuracy_score(y_te, y_pred > 0)))
print('MSE: {}'.format(mean_squared_error(y_te*20-10, y_pred)))

# Save/load example (todo)

In [None]:
# Placeholder for the save/load walkthrough: only builds a classifier that is
# configured with a checkpoint directory; nothing is trained or saved here.
checkpoint_model_params = dict(
    order=3,
    rank=10,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
    n_epochs=10,
    batch_size=-1,  # NOTE(review): presumably "use the full dataset" - confirm
    init_std=0.001,
    reg=0.001,
    checkpoint_dir='./checkpoint/',
    verbose=1,
)
model = TFFMClassifier(**checkpoint_model_params)

# Different optimizers

In [25]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Compare optimizers: train the same order-3 model with Adam and with FTRL,
# scoring on the test split after every `n_epochs` epochs of training.
# NOTE(review): like the original cell, this assumes that calling fit()
# repeatedly continues training the same model - confirm against tffm2.
n_epochs = 5
# Cumulative epochs trained at each scoring point. Starts at n_epochs (not 0)
# because the first score is taken after the first round of training.
anchor_epochs = range(n_epochs, 200+1, n_epochs)

for optim, title in [(tf.keras.optimizers.Adam(learning_rate=0.00001), 'Adam'), 
              (tf.keras.optimizers.Ftrl(0.01, l1_regularization_strength=0.001), 'FTRL')]:
    acc = []
    model = TFFMClassifier(
        order=3, 
        rank=10, 
        optimizer=optim,
        # Without this the value above is never used and each fit() call
        # trains for the classifier's default number of epochs instead.
        n_epochs=n_epochs,
        batch_size=1024,
        init_std=0.001,
        reg=0.1
    )
    for _ in anchor_epochs:
        # score result every 5 epochs
        model.fit(X_train=X_tr, y_train=y_tr)
        predictions = list(model.predict(X_te, pred_batch_size=5000))
        acc.append(accuracy_score(y_te, predictions[0]["pred"]))
    plt.plot(anchor_epochs, acc, label=title)

plt.xlabel('n_epochs')
plt.ylabel('accuracy')
plt.legend()
plt.grid()

# Different regularization strategies

In [None]:
# Rebuild the "3 vs 5" dataset from scratch with a much more aggressive
# random mask, then split it 70/30 again.
mnist = tf.keras.datasets.mnist
(X_tr, y_tr), (X_te, y_te) = mnist.load_data()

# Flatten every image into a 784-dimensional row vector.
X_tr = np.reshape(X_tr, [-1, 784])
X_te = np.reshape(X_te, [-1, 784])

# Row order matters: all threes first, then all fives (y_all relies on it).
X_all = np.vstack([
    X_tr[y_tr == 3, :],
    X_te[y_te == 3, :],
    X_tr[y_tr == 5, :],
    X_te[y_te == 5, :]
]).astype(np.float32)

# Make it more sparse: keep each pixel with probability 0.03, i.e. the mask
# alone zeroes about 97% of the entries (the printed non-zeros rate is lower
# still, since many pixels were zero to begin with). The mask is drawn from a
# seeded generator so that re-running the notebook gives the same dataset.
mask_rng = np.random.default_rng(42)
X_all = X_all * (mask_rng.uniform(0, 1, X_all.shape) > 0.97)

# Count labels with NumPy instead of the builtin sum(), which would iterate
# over the boolean arrays element by element in Python.
n_three = int(np.count_nonzero(y_tr == 3) + np.count_nonzero(y_te == 3))
n_five = int(np.count_nonzero(y_tr == 5) + np.count_nonzero(y_te == 5))
# Label 1 = digit three, label 0 = digit five, in the same order as X_all.
y_all = np.array([1]*n_three + [0]*n_five)

print('Dataset shape: {}'.format(X_all.shape))
print('Non-zeros rate: {:.05f}'.format(np.mean(X_all != 0)))
print('Classes balance: {:.03f} / {:.03f}'.format(np.mean(y_all==0), np.mean(y_all==1)))

X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, random_state=42, test_size=0.3)

In [None]:
# Compare plain vs. reweighted regularization: train the same order-3 model
# with `reweight_reg` off and on, scoring on the test split after every
# `n_epochs` epochs of training.
# NOTE(review): like the original cell, this assumes that calling fit()
# repeatedly continues training the same model - confirm against tffm2.
n_epochs = 2
# Cumulative epochs trained at each scoring point. Starts at n_epochs (not 0)
# because the first score is taken after the first round of training.
anchor_epochs = range(n_epochs, 20+1, n_epochs)

for use_reweight, title in [(False, 'no reweight reg'), (True, 'reweight reg')]:
    acc = []
    model = TFFMClassifier(
        order=3, 
        rank=10, 
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
        # Epoch count is given to the constructor, as in the other tffm2
        # cells of this notebook, rather than to fit().
        n_epochs=n_epochs,
        batch_size=1024,
        init_std=0.001,
        reg=1.0,
        reweight_reg = use_reweight
    )
    for _ in anchor_epochs:
        # score result every `n_epochs` epochs
        model.fit(X_train=X_tr, y_train=y_tr)
        # Same calling convention as the other tffm2 cells: materialise the
        # result of predict() and score the "pred" entry of its first element.
        predictions = list(model.predict(X_te, pred_batch_size=5000))
        acc.append(accuracy_score(y_te, predictions[0]["pred"]))
    plt.plot(anchor_epochs, acc, label=title)
plt.xlabel('n_epochs')
plt.ylabel('accuracy')
plt.legend(loc=4)
plt.grid()

# Weighted Loss Function

When using `TFFMClassifier`, one can set the parameter `sample_weight` (or `pos_class_weight`) in order to:

1. Use a "balanced" weighting scheme, in which the weight applied to the positive class is $w_+ = n_- / n_+$.
2. Provide a custom weight that is applied to every sample from the positive class.
3. Provide arbitrary weights to be applied to each sample.

We will demonstrate the first two approaches.

In [None]:
from sklearn.metrics import confusion_matrix

# Generate imbalanced data: X_all stacks the class-1 rows first, so skipping
# the leading rows thins out that class only (as long as it has more rows
# than are skipped) and leaves class 0 untouched.
n_skipped = 4000
X_imbalanced, y_imbalanced = X_all[n_skipped:, :], y_all[n_skipped:]

neg_share = np.mean(y_imbalanced == 0)
pos_share = np.mean(y_imbalanced == 1)
print('Classes balance: {:.03f} / {:.03f}'.format(neg_share, pos_share))

# Ratio of negative to positive samples, i.e. n_- / n_+.
print('Balanced positive weight is {:.03f}.'.format(neg_share / pos_share))

X_tr, X_te, y_tr, y_te = train_test_split(
    X_imbalanced,
    y_imbalanced,
    random_state=42,
    test_size=0.3,
)

In [None]:
# use default weighting
model = TFFMClassifier(
    order=2,
    rank=10, 
    # tf.train.AdamOptimizer is not available in the TF2 namespace that the
    # rest of this notebook uses; the Keras optimizer is its counterpart.
    # NOTE(review): the learning rate is kept at this cell's original 0.001,
    # while the weighted runs in this notebook use 0.00001 - confirm whether
    # the comparison should share one value.
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
    n_epochs=50, 
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    seed=42
)
model.fit(X_tr, y_tr, show_progress=True)
# Same calling convention as the other tffm2 cells: materialise the result of
# predict() and take the "pred" entry of its first element, so `predictions`
# is directly usable by the metrics below and by the confusion-matrix cell.
predictions = list(model.predict(X=X_te, pred_batch_size=5000))[0]["pred"]
print('accuracy: {}'.format(accuracy_score(y_te, predictions)))

In [None]:
# Per-class breakdown of the test predictions made by the cell above.
confusion_matrix(y_true=y_te, y_pred=predictions)

The unweighted loss shows good performance on the prevalent class, but poor performance on the class with smaller representation.

In [None]:
# use balanced weighting
model = TFFMClassifier(
    order=2,
    sample_weight='balanced',
    rank=10, 
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
    n_epochs=50, 
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    seed=42
)
model.fit(X_tr, y_tr, show_progress=True)
# Same calling convention as the other tffm2 cells: materialise the result of
# predict() and take the "pred" entry of its first element, so `predictions`
# is directly usable by the metrics below and by the confusion-matrix cell.
predictions = list(model.predict(X=X_te, pred_batch_size=5000))[0]["pred"]
print('accuracy: {}'.format(accuracy_score(y_te, predictions)))
# NOTE(review): the earlier tffm2 cells in this notebook never call
# destroy(); confirm that the method still exists.
model.destroy()

In [None]:
# Per-class breakdown of the test predictions made by the cell above.
confusion_matrix(y_true=y_te, y_pred=predictions)

Performance in underrepresented class improved, at the cost of performance in prevalent class.

In [None]:
# use manual weighting for the positive class
model = TFFMClassifier(
    order=2,
    pos_class_weight=6.0,
    rank=10, 
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001), 
    n_epochs=50, 
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='dense',
    seed=42
)
model.fit(X_tr, y_tr, show_progress=True)
# Same calling convention as the other tffm2 cells: materialise the result of
# predict() and take the "pred" entry of its first element, so `predictions`
# is directly usable by the metrics below and by the confusion-matrix cell.
predictions = list(model.predict(X=X_te, pred_batch_size=5000))[0]["pred"]
print('accuracy: {}'.format(accuracy_score(y_te, predictions)))
# NOTE(review): the earlier tffm2 cells in this notebook never call
# destroy(); confirm that the method still exists.
model.destroy()

In [None]:
# Per-class breakdown of the test predictions made by the cell above.
confusion_matrix(y_true=y_te, y_pred=predictions)

Here we've overdone it, but we're quite accurate on the underrepresented class. In the limiting case, the classifier will put all points into the over-weighted class.