In [14]:
import math
import pandas as pd 
import numpy as np
from numpy.linalg import inv
from scipy.linalg import eigh
import csv
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [15]:
# % dica - supervised domain-invariant component analysis on colored MNIST
# %
# % Synopsis
# %   [V,D,X,Xt] = dica(Kx, Ky, Kt, groupIdx, lambda, epsilon, M)
# %
# % Description
# %   Domain-invariant component analysis (DICA) finds a low dimensional
# %   subspace of data points from several distributions so as to minimize 
# %   the variance among the distributions of projected data points. It also
# %   takes into account the affinity in the output space.
# % 
# %
# % Inputs ([]s are optional)
# %   (matrix) Kx         NxN kernel matrix between data points
# %   (matrix) Ky         NxN kernel matrix between outputs
# %   (matrix) Kt         NtxN kernel matrix between test samples and
# %                           training samples
# %   (vector) groupIdx   Nx1 vector of group membership of data points
# %   (scalar) lambda     The regularization parameter (input)
# %   (scalar) epsilon    The regularization parameter (output)
# %   (scalar) M          The dimensionality of subspace (M < N)
# %
# % Outputs ([]s are optional)
# %   (matrix) V          Nxdim matrix in which each column is the
# %                       eigenvector
# %   (matrix) D          MxM diagonal matrix in which the diagonal elements
# %                       are eigenvalues associated with the eigenvectors in
# %                       the matrix V
# %   (matrix) X          MxN matrix in which each column is the projection
# %                       of original data point onto the subspace spanned by
# %                       the eigenvectors in the matrix V
# %   (matrix) Xt         MxNt matrix in which each column is the projection
# %                       of test data point onto the subspace spanned by
# %                       the eigenvectors in the matrix V

# % References
# %   K. Muandet, D.Balduzzi,and B.Schölkopf, Domain Generalization via 
# %   Invariant Feature Representation. The 30th International Conference on 
# %   Machine Learning (ICML 2013), pages 10?18, Atlanta, Georgia.
# %
# % DICA Code Reference from
# %   Krikamol Muandet <krikamol@tuebingen.mpg.de>

In [16]:
myfilename = "mnist_digit100_color90flipped_testpurple_022120.npz"
data = np.load(myfilename)

In [23]:
x_train = data['x_train'][data['train_inds']]
y_train = data['y_train'][data['train_inds']]
a_train = data['attr_train'][data['train_inds']]
x_test = data['x_test']
y_test = data['y_test']
x_test.shape

(10000, 2352)

In [18]:
x_train_1_arr = []
y_train_1_arr = []
x_train_2_arr = []
y_train_2_arr = []
for i in range(len(x_train)):
    if np.all(a_train[i] == [1.,0.,0.]): # study 1
        x_train_1_arr.append(x_train[i])
        y_train_1_arr.append(y_train[i])
    elif np.all(a_train[i] == [0.,1.,0.]): # study 2
        x_train_2_arr.append(x_train[i])
        y_train_2_arr.append(y_train[i])
    else:
        raise ValueError()

x_train_1 = np.asarray(x_train_1_arr)
y_train_1 = np.asarray(y_train_1_arr)
x_train_2 = np.asarray(x_train_2_arr)
y_train_2 = np.asarray(y_train_2_arr)
x_train_1.shape

(7978, 2352)

In [19]:
def get_variance_x():
    variance_sum = 0
    for j in range(len(x_train_1[0])):
        variance_sum += np.var(x_train_1[:,j])
    for j in range(len(x_train_2[0])):
        variance_sum += np.var(x_train_2[:,j])

    return 1.0 * variance_sum / (len(x_train_1[0]) + len(x_train_2[0]))

In [20]:
lmbda = 0.1
eps = 0.001
sigma_x = get_variance_x()
M = 10000
sigma_x

0.029947557405004578

In [21]:
# define kernels

def g_kernel_x(x_p, x_q): # x_p, x_q are images (encoded as 3*28*28 vectors) in i'th domain
    dist = np.linalg.norm(x_p-x_q)**2
    power = -1.0/(2.0*(sigma_x**2)) * dist
    return math.exp(power)

def g_kernel_y(a, b):
    if np.all(a == b):
        return 1
    else:
        return 0

x_p = np.ones(3*28*28)
x_q = np.ones(3*28*28)
print(g_kernel_x(x_p, x_q))

1.0


In [22]:
# create groupIdx
study_1s = np.asarray([1]*len(x_train_1))
study_2s = np.asarray([2]*len(x_train_2))
groupIdx = np.concatenate((study_1s, study_2s), axis=None)
groupIdx

array([1, 1, 1, ..., 2, 2, 2])

In [25]:
n = len(x_train)
s_1 = len(x_train_1)

In [14]:
# create k_x

k_x = np.zeros((n, n))
k_x.shape

for i in range(n):
    if i < s_1:
        x_p = x_train_1[i]
    else:
        x_p = x_train_2[i-s_1]
        
    for j in range(n):
        if j < s_1:
            x_q = x_train_1[j]
        else:
            x_q = x_train_2[j-s_1]
        k_x[i][j] = g_kernel_x(x_p, x_q)
k_x[0]
        

KeyboardInterrupt: 

In [15]:
# create k_y

k_y = np.zeros((n, n))
k_y.shape

for i in range(n):
    if i < s_1:
        y_p = y_train_1[i]
    else:
        y_p = y_train_2[i-s_1]
        
    for j in range(n):
        if j < s_1:
            y_q = y_train_1[j]
        else:
            y_q = y_train_2[j-s_1]
        k_y[i][j] = g_kernel_y(y_p, y_q)
k_y[0]

KeyboardInterrupt: 

In [29]:
# create k_t

n_t = 10 # len(x_test)
k_t = np.zeros((n_t, n))
k_t.shape

for i in range(n_t):
    x_pt = x_test[i]
    print(x_pt)
        
    for j in range(n):
        if j < s_1:
            x_q = x_train_1[j]
        else:
            x_q = x_train_2[j-s_1]
        k_t[i][j] = g_kernel_x(x_pt, x_q)
s = ""
for i in range(len(k_t[0])):
    if k_t[0][i] != 0.:
        s += str(i)
print(s)




In [24]:
k_y.shape

(16000, 16000)

In [33]:
def dica(Kx, Ky, Kt, groupIdx, lmbda, eps, M):
    N = len(Kx[0])
    Nt = len(Kt[0])
    uniqueGroupIdx = np.unique(groupIdx)
    G = len(uniqueGroupIdx)
    NG = [s_1, n-s_1]


    H = 1.0*np.identity(N)-1.0*np.ones(N)/N

    L = np.zeros((N, N))
    

    for i in range(0, N):
        for j in range(0,N):
            if groupIdx[i] == groupIdx[j]:
                groupSize = NG[groupIdx[i]-1]
            else: 
                groupSize_i = NG[groupIdx[i]-1]
                groupSize_j = NG[groupIdx[j]-1]
                L[i][j] = -1.0/(G*G*groupSize_i*groupSize_j)


    Ky = np.dot(np.dot(H,Ky),H)
    Kx = np.dot(np.dot(H,Kx),H)

    B = np.dot(Ky, np.dot(inv(Ky+N*eps*np.identity(N)),np.dot(Kx,Kx)))
    A = np.dot(inv(np.dot(np.dot(Kx,L),Kx)+Kx+lmbda*np.identity(N)),B)

    w, v = eigh(A)  # w is the eigenvalues and v are the eigenmatrix, increasing order
    V = v[:, (len(w)-M):]
    D = np.diag(w[(len(w)-M):])
    Evals = np.real(D)

    for i in range(0,M):
        V[:,i] = V[:,i]/(Evals[i,i])**0.5


    X = np.dot(V.T,Kx)

    Ht = np.identity(Nt)-np.ones(Nt)/Nt
    Kt = np.dot(np.dot(Ht,Kt),H)
    Xt = np.dot(V.T,Kt.T)
    return (V, D, X, Xt)

In [32]:
Nt = 100
N = 160
H = 1.0*np.identity(N)-1.0*np.ones(N)/N
Ht = np.identity(Nt)-np.ones(Nt)/Nt
Kt = np.zeros((Nt, N))
Kt.shape
Kt = np.dot(np.dot(Ht,Kt),H)

In [20]:
np.dot(Ht,Kt).shape
H.shape

(160, 160)

In [None]:
# run DICA to get X and Xt
(V, D, X, Xt) = dica(k_x, k_y, k_t, groupIdx, lmbda, eps, M)

In [12]:
y_train_labels = y_train[:,0]
y_test_labels = y_test[:,0]
y_test_labels

array([0, 1, 1, ..., 1, 0, 0])

In [41]:
small = 5000
x_train.shape
X = np.zeros((6000, 16000))
np.transpose(X).shape

(16000, 6000)

In [15]:

svclassifier = SVC(kernel='linear')
svclassifier.fit(x_train[:small], y_train_labels[:small])
y_pred = svclassifier.predict(x_test[:small])
print(confusion_matrix(y_test_labels[:small],y_pred))
print(classification_report(y_test_labels[:small],y_pred))

[[1927  512]
 [ 447 2114]]
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      2439
           1       0.81      0.83      0.82      2561

    accuracy                           0.81      5000
   macro avg       0.81      0.81      0.81      5000
weighted avg       0.81      0.81      0.81      5000



In [54]:
# train SVM on X and Xt?
svclassifier = SVC(kernel='linear')
svclassifier.fit(X, y_train_labels)

In [None]:
# evaluate SVM?

y_pred = svclassifier.predict(Xt)
print(confusion_matrix(y_test_labels,y_pred))
print(classification_report(y_test_labels,y_pred))