In [None]:
#dataset: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html
import numpy as np
import sklearn as sk
import scipy as sp
from numpy.random import randn
from numpy.linalg import norm, svd
from IMC import IMC
from dirtyIMC import dirtyIMC

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve, auc

In [None]:
def load_multilabel(path, num_labels=101):
    """Parse a LIBSVM-style multilabel file into dense feature / label arrays.

    Each usable line looks like ``l1,l2,... idx:val idx:val ...``. The feature
    indices are discarded and only the values are kept, so every line is
    expected to list the same features in the same order.

    Parameters
    ----------
    path : str
        Location of the text file to read.
    num_labels : int, optional
        Width of the label indicator vector (defaults to 101).

    Returns
    -------
    features : numpy.ndarray
        Float array with one row per kept line.
    labels : numpy.ndarray
        Array with one row per kept line; +1 where the label is present,
        -1 elsewhere.
    """
    features = []
    labels = []
    with open(path) as f:
        for line in f:
            fields = line.split()
            # Skip blank lines, and lines whose first token is already a
            # feature (i.e. the sample carries no label list).
            if not fields or ':' in fields[0]:
                continue
            label_idx = [int(tok) for tok in fields[0].split(',')]
            indicator = np.zeros((num_labels,))
            indicator[label_idx] = 1
            labels.append(indicator)
            features.append([tok.split(':')[1] for tok in fields[1:]])
    # Builtin float/int replace the deprecated np.float/np.int aliases.
    features = np.array(features).astype(float)
    # Map the {0, 1} indicators to {-1, +1}.
    labels = np.array(labels) * 2 - 1
    return features, labels


features_train, labels_train = load_multilabel('data/mail/train-exp1')
features_test, labels_test = load_multilabel('data/mail/test-exp1')

In [None]:
# Number of pooled samples kept after shuffling the train + test data.
cases = 8000
# The first `num_test` of those samples have their labels hidden (zeroed)
# before fitting and are used for evaluation afterwards.
num_test = 3000
# Multiplied by the feature dimension to decide how many feature columns are
# overwritten in the noise-injection step; 0 leaves the features untouched.
noise_level = 0

In [None]:
# Fix the RNG so the shuffle below is reproducible.
seed = 3
np.random.seed(seed)

# Shuffle the pooled samples and keep at most `cases` of them. Features and
# labels are both concatenated test-first so that `perm` indexes matching rows.
total_samples = labels_train.shape[0] + labels_test.shape[0]
perm = np.random.permutation(total_samples)[:cases]
features = np.concatenate([features_test, features_train], axis=0)[perm, :]
labels = np.concatenate([labels_test, labels_train], axis=0)[perm, :]

# Append a constant bias column. Its height comes from the rows actually
# selected, so this also works when fewer than `cases` samples are available.
features = np.concatenate([features, np.ones((features.shape[0], 1))], axis=1)

# Keep the full ground truth, then hide the labels of the first `num_test`
# rows (0 marks an unobserved entry in the -1/+1 label matrix).
labels_gt = labels.copy()
labels[:num_test, :] = 0

In [None]:
# Re-seed so the noise columns and random initial factors are reproducible
# when this cell is run on its own.
seed = 3
np.random.seed(seed)

# Solver settings handed to dirtyIMC / IMC below.
# NOTE(review): lamb1 / lamb2 look like regularization weights -- confirm
# against the IMC and dirtyIMC modules.
lamb1 = 10
lamb2 = 60
maxiter = 25

# Matrix sizes and the inner dimensions of the factor matrices.
n1 = labels.shape[0]
n2 = labels.shape[1]
d1 = features.shape[1]
d2 = labels.shape[1]
k1 = 30
k2 = 5

X = features      # row-side features
Y = np.eye(d2)    # identity: no side information on the label side
A = labels_gt     # full ground-truth matrix
R = labels        # observed matrix (hidden rows are zero)

# Optional feature corruption: overwrite t1 randomly chosen columns of X with
# columns of U taken from index d1 onward in the SVD of X.
t1 = int(round(noise_level * d1))
if t1 > 0:
    # Work on a copy so `features` is not modified in place; otherwise
    # re-running this cell would corrupt already corrupted features.
    X = features.copy()
    U, S, V = svd(X)
    N = U[:, d1:]
    I = np.random.choice(range(d1), t1, replace=False)
    X[:, I] = N[:, :t1]

# Random starting points shared by both solvers.
W0 = randn(d1, k1)
H0 = randn(d2, k1)
U0 = randn(n1, k2)
V0 = randn(n2, k2)

# Fit dirtyIMC, then the plain IMC baseline with twice the iteration budget.
W, H, U, V, losses = dirtyIMC(R, X, Y, k1, k2, lamb1, lamb2, maxiter, W0, H0, U0, V0)
W_imc, H_imc, losses_imc = IMC(R, X, Y, k1, lamb1, maxiter * 2, W0, H0)

# Relative squared Frobenius error (in percent) against the full ground truth.
# To restrict it to observed entries, zero Diff wherever R == 0 first.
Diff = X.dot(W.T).dot(H).dot(Y.T) + U.T.dot(V) - A
relerr = norm(Diff, 'fro')**2 / norm(A, 'fro')**2 * 100
print('dirtyIMC RelErr = %g' % relerr)

Diff = X.dot(W_imc.T).dot(H_imc).dot(Y.T) - A
relerr = norm(Diff, 'fro')**2 / norm(A, 'fro')**2 * 100
print('IMC RelErr = %g' % relerr)

# `losses` / `losses_imc` hold the values returned by the solvers and can be
# plotted on a log scale to inspect convergence.

In [None]:
# Reconstruct the full score matrices, then keep only the first `num_test`
# rows (the rows whose labels were hidden before fitting).
imc_scores = X.dot(W_imc.T).dot(H_imc).dot(Y.T)
IMC_pred = imc_scores[:num_test, :]

# dirtyIMC adds a second U^T V term on top of the feature-based term.
dimc_feature_part = X.dot(W.T).dot(H).dot(Y.T)
dimc_scores = dimc_feature_part + U.T.dot(V)
DIMC_pred = dimc_scores[:num_test, :]

In [None]:
# Ground-truth labels for the first `num_test` rows, aligned with IMC_pred / DIMC_pred.
test_gt = labels_gt[0:num_test, :]

In [None]:
# Per-label precision/recall and ROC statistics for the plain IMC scores.
precision, recall, average_precision = {}, {}, {}
fpr, tpr, roc_auc = {}, {}, {}

for i in range(labels.shape[1]):
    y_true = test_gt[:, i]
    y_score = IMC_pred[:, i]
    precision[i], recall[i], _ = precision_recall_curve(y_true, y_score)
    average_precision[i] = average_precision_score(y_true, y_score)
    fpr[i], tpr[i], _ = roc_curve(y_true, y_score)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Collapse the per-label AUCs into an array and report them together with
# their mean over the labels whose AUC is not NaN.
roc_auc = np.array(list(roc_auc.values()))
finite_auc = roc_auc[~np.isnan(roc_auc)]
print('%s %s' % (roc_auc, np.mean(finite_auc)))

In [None]:
# Per-label precision/recall and ROC statistics for the dirtyIMC scores.
precision, recall, average_precision = {}, {}, {}
fpr, tpr, roc_auc = {}, {}, {}

for i in range(labels.shape[1]):
    y_true = test_gt[:, i]
    y_score = DIMC_pred[:, i]
    precision[i], recall[i], _ = precision_recall_curve(y_true, y_score)
    average_precision[i] = average_precision_score(y_true, y_score)
    fpr[i], tpr[i], _ = roc_curve(y_true, y_score)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Collapse the per-label AUCs into an array and report them together with
# their mean over the labels whose AUC is not NaN.
roc_auc = np.array(list(roc_auc.values()))
finite_auc = roc_auc[~np.isnan(roc_auc)]
print('%s %s' % (roc_auc, np.mean(finite_auc)))

In [None]:
def thresholded_hits(scores, truth, cutoff):
    """Mark where thresholded scores agree with the -1/+1 ground truth.

    Scores >= cutoff are mapped to +1 and scores < cutoff to -1; anything
    matching neither comparison (e.g. NaN) stays 0 and so never counts as a hit.

    Returns a boolean array with the same shape as `scores`.
    """
    signs = np.zeros(scores.shape)
    signs[scores >= cutoff] = 1
    signs[scores < cutoff] = -1
    return signs == truth


# Cutoff applied to the real-valued scores before comparing with the labels.
thresh = -0.3

# Report the per-label accuracy averaged over all labels, for each model.
IMC_acc = thresholded_hits(IMC_pred, test_gt, thresh)
print('IMC: %s' % np.mean(np.sum(IMC_acc, axis=0) * 1.0 / IMC_acc.shape[0]))

DIMC_acc = thresholded_hits(DIMC_pred, test_gt, thresh)
# This line was previously labelled 'IMC:' although it reports dirtyIMC.
print('dirtyIMC: %s' % np.mean(np.sum(DIMC_acc, axis=0) * 1.0 / DIMC_acc.shape[0]))

In [None]:
# Rebuild the -1/+1 IMC decisions at `thresh` and count the negative ones.
at_or_above = IMC_pred >= thresh
below = IMC_pred < thresh
IMC_acc = np.zeros(IMC_pred.shape)
IMC_acc[at_or_above] = 1
IMC_acc[below] = -1
(IMC_acc == -1).sum()