# Quantitative Experiments with LXDR
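In this notebook, we run the quantitative experiments for LXDR: we measure how closely its explanations match the underlying dimensionality reduction models, and how it scales with the number of features.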

In [1]:
import csv
import math
import site, pprint
import time

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA, KernelPCA
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics.pairwise import cosine_similarity as cd, euclidean_distances as ed
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, maxabs_scale
from tensorflow.keras import layers, losses, callbacks, Sequential
from tensorflow.keras.models import Model

from lxdr import LXDR

We will need PCA, Kernel PCA (KPCA) and an autoencoder (AE) as dimensionality reduction (DR) techniques.

In [2]:
class Autoencoder(Model):
    """Fully connected autoencoder with a mirrored encoder/decoder pair.

    The encoder narrows the input through two hidden layers down to
    ``latent_dim`` units; the decoder mirrors those hidden layers and maps
    back to ``input_dim`` units. Every layer uses a ``tanh`` activation.
    """

    def __init__(self, input_dim, latent_dim):
        """Build the encoder and decoder stacks.

        Args:
            input_dim: number of input (and reconstructed output) features.
            latent_dim: number of units in the bottleneck layer.
        """
        super().__init__()
        self.latent_dim = latent_dim

        # Hidden layer widths are derived from half of the input width.
        half_width = int(input_dim / 2)
        wide, narrow = half_width + 2, half_width + 1

        encoder_stack = [
            layers.Dense(wide, activation='tanh'),
            layers.Dense(narrow, activation='tanh'),
            layers.Dense(latent_dim, activation='tanh'),
        ]
        self.encoder = Sequential(encoder_stack)

        decoder_stack = [
            layers.Dense(narrow, activation='tanh'),
            layers.Dense(wide, activation='tanh'),
            layers.Dense(input_dim, activation='tanh'),
        ]
        self.decoder = Sequential(decoder_stack)

    def call(self, x):
        """Encode ``x`` and return its reconstruction from the decoder."""
        return self.decoder(self.encoder(x))

We will use 3 real datasets, and later a few synthetic ones.

In [3]:
# Datasets
# Labels for the three real datasets, in the order they are processed.
dataset_names = ['Iris', 'Diabetes', 'Digits']

# For each dataset we keep the feature matrix, the targets, and the number
# of dimensions the DR techniques will reduce it to.
iris = datasets.load_iris()
iris_data, iris_predictions = iris.data, iris.target
iris_dimensions = 3

diabetes = datasets.load_diabetes()
diabetes_data, diabetes_predictions = diabetes.data, diabetes.target
diabetes_dimensions = 8

digits = datasets.load_digits()
digits_data, digits_predictions = digits.data, digits.target
digits_dimensions = 25

sample = 0

We prepare our data! And we train the different DR techniques!

In [None]:
# Per-dataset inputs. 'd' is the target dimensionality and 'n' is the value
# later forwarded to LXDR.explain_instance; 'scaler' is filled in below.
data = {'X': [iris_data, diabetes_data, digits_data],
        'y': [iris_predictions, diabetes_predictions, digits_predictions],
        'd': [iris_dimensions, diabetes_dimensions, digits_dimensions],
        'n': [50, 150, 750],
        'scaler': []}
# Fitted DR models, one entry per dataset and in the same order as data['X'].
model = {'PCA': [], 'KPCA': [], 'AE': []}

for X, y, d in zip(data['X'], data['y'], data['d']):
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # The scaler is fitted on the training split only and stored for reuse.
    scaler = MaxAbsScaler().fit(x_train)
    data['scaler'].append(scaler)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    pca = PCA(d, random_state=42)
    pca.fit(x_train, y_train)
    model['PCA'].append(pca)

    # Fitted once (the model was previously fitted twice on the same data).
    kpca = KernelPCA(d, kernel='rbf', random_state=42)
    kpca.fit(x_train, y_train)
    model['KPCA'].append(kpca)

    # NOTE(review): early stopping watches the training 'loss' although a
    # validation split is configured -- confirm 'val_loss' is not intended.
    callback = callbacks.EarlyStopping(monitor='loss', patience=3, verbose=0, restore_best_weights=True)
    autoencoder = Autoencoder(x_train.shape[1], d)
    autoencoder.compile(optimizer='adam', loss='mae')
    # The autoencoder is trained to reproduce its own input.
    autoencoder.fit(x_train, x_train,
                    epochs=200,
                    shuffle=True,
                    validation_split=0.1,
                    callbacks=[callback],
                    verbose=0)
    model['AE'].append(autoencoder)

We are ready to measure the weights difference and the instance difference as presented in our paper!

In [None]:
# Evaluate LXDR on the real datasets and write one CSV row per
# (dataset, technique, scope): dataset name, technique, scope, then the mean
# euclidean distance, mean cosine distance and mean absolute error.
#
# newline='' is required by the csv module for files passed to csv.writer;
# without it extra blank lines appear on platforms that write '\r\n'.
with open('results_real_datasets.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    for dataset, (X, y, d, n) in enumerate(zip(data['X'], data['y'], data['d'], data['n'])):
        # Same split parameters and the same fitted scaler as at training time.
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        feature_names = ['F'+str(i) for i in range(1, len(x_train[0])+1)]
        scaler = data['scaler'][dataset]
        x_train = scaler.transform(x_train)
        mean = x_train.mean(axis=0)
        x_test = scaler.transform(x_test)

        pca = model['PCA'][dataset]
        x_test_pca = pca.transform(x_test)

        kpca = model['KPCA'][dataset]
        x_test_kpca = kpca.transform(x_test)

        ae = model['AE'][dataset]
        # NOTE(review): Autoencoder.call returns the decoder output, so this is
        # the reconstruction rather than the latent encoding -- confirm this is
        # the representation LXDR's weights are meant to reproduce.
        x_test_ae = ae.predict(x_test)

        for scope in ['local', 'global']:
            lxdr_pca = LXDR(pca, feature_names, scope, x_train, mean=mean)
            lxdr_kpca = LXDR(kpca, feature_names, scope, x_train)
            lxdr_ae = LXDR(ae, feature_names, scope, x_train, True)

            # Per-technique lists of euclidean distance, cosine distance and MAE.
            scores = {name: {'ed': [], 'cd': [], 'mae': []} for name in ('pca', 'kpca', 'ae')}

            for instance, x in enumerate(x_test):
                weights_pca = lxdr_pca.explain_instance(x, n, auto_alpha=True, use_LIME=False)
                weights_kpca = lxdr_kpca.explain_instance(x, n, auto_alpha=True, use_LIME=False)
                weights_ae = lxdr_ae.explain_instance(x, n, auto_alpha=True, use_LIME=False)

                # (reference, LXDR) vector pairs to compare:
                #  - PCA: the flattened true components vs. the flattened LXDR weights;
                #  - KPCA / AE: the model's output for this instance vs. the
                #    instance multiplied by the LXDR weights.
                pairs = {
                    'pca': (pca.components_.ravel(), np.array(weights_pca).ravel()),
                    'kpca': (x_test_kpca[instance], np.dot(x, weights_kpca.T)),
                    'ae': (x_test_ae[instance], np.dot(x, weights_ae.T)),
                }
                for name, (a, b) in pairs.items():
                    scores[name]['ed'].append(ed([a], [b])[0][0])
                    # cd is cosine similarity, so 1 - cd is the cosine distance.
                    scores[name]['cd'].append(1 - cd([a], [b])[0][0])
                    scores[name]['mae'].append(mae([a], [b]))

            for name in ('pca', 'kpca', 'ae'):
                writer.writerow([dataset_names[dataset], name, scope,
                                 sum(scores[name]['ed'])/len(x_test),
                                 sum(scores[name]['cd'])/len(x_test),
                                 sum(scores[name]['mae'])/len(x_test)])

We also experiment with 25 synthetically created datasets to measure the aforementioned metrics, as well as to perform a scalability analysis.

In [None]:
N_SAMPLES = 1000   # rows in every synthetic dataset
N_EXPLAINED = 5    # number of test instances explained per configuration


def run_configuration(pca, kpca, x_train, mean, feature_names, x_eval, x_eval_kpca, scope, n):
    """Explain ``x_eval`` with LXDR for one (scope, n) setting and average the metrics.

    Args:
        pca: fitted PCA model.
        kpca: fitted KernelPCA model.
        x_train: scaled training data handed to LXDR.
        mean: per-feature mean of ``x_train`` (passed to LXDR for PCA only).
        feature_names: list of feature labels handed to LXDR.
        x_eval: the instances to explain.
        x_eval_kpca: ``kpca.transform(x_eval)``, row-aligned with ``x_eval``.
        scope: 'global' or 'local'.
        n: value forwarded as the second argument of ``explain_instance``.

    Returns:
        A pair ``(pca_metrics, kpca_metrics)``:
        ``pca_metrics`` is [mean euclidean distance, mean cosine distance,
        mean MAE] between the flattened PCA components and LXDR weights;
        ``kpca_metrics`` is [mean euclidean distance, mean cosine distance,
        seconds per instance]. The time is the wall-clock duration of the
        whole loop (PCA and KPCA explanations together) divided by the
        number of instances.
    """
    lxdr_pca = LXDR(pca, feature_names, scope, x_train, mean=mean)
    lxdr_kpca = LXDR(kpca, feature_names, scope, x_train)

    ed_pca, cd_pca, mae_pca = [], [], []
    ed_kpca, cd_kpca = [], []

    ts = time.time()
    # enumerate keeps `instance` in step with x, so each explanation is
    # compared with the KPCA projection of the same instance.
    for instance, x in enumerate(x_eval):
        weights_pca = lxdr_pca.explain_instance(x, n, auto_alpha=True, use_LIME=False)
        a = pca.components_.ravel()
        b = np.array(weights_pca).ravel()

        ed_pca.append(ed([a], [b])[0][0])
        cd_pca.append(1 - cd([a], [b])[0][0])
        mae_pca.append(mae([a], [b]))

        weights_kpca = lxdr_kpca.explain_instance(x, n, auto_alpha=True, use_LIME=False)
        a = x_eval_kpca[instance]
        b = np.dot(x, weights_kpca.T)

        ed_kpca.append(ed([a], [b])[0][0])
        cd_kpca.append(1 - cd([a], [b])[0][0])
    elapsed = time.time() - ts

    count = len(x_eval)
    pca_metrics = [sum(ed_pca)/count, sum(cd_pca)/count, sum(mae_pca)/count]
    kpca_metrics = [sum(ed_kpca)/count, sum(cd_kpca)/count, elapsed/count]
    return pca_metrics, kpca_metrics


# Row layout: features, technique, scope, n, mean euclidean distance,
# mean cosine distance, then MAE for 'pca' rows or seconds per instance for
# 'kpca' rows.
# newline='' is required by the csv module for files passed to csv.writer.
with open('scalability.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    for features in range(10, 260, 10):
        feature_names = ['F'+str(i) for i in range(features)]
        # We increase here the features of a dataset!
        x, y = make_classification(n_samples=N_SAMPLES, n_features=features,
                                   n_informative=features // 2, n_redundant=features // 2,
                                   n_classes=2, shuffle=True, random_state=1)
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
        scaler = MaxAbsScaler().fit(x_train)
        x_train = scaler.transform(x_train)
        mean = x_train.mean(axis=0)
        x_test = scaler.transform(x_test)

        pca = PCA(features // 2, random_state=42)
        pca.fit(x_train, y_train)
        x_test_pca = pca.transform(x_test)

        kpca = KernelPCA(features // 2, random_state=42)
        kpca.fit(x_train, y_train)
        x_test_kpca = kpca.transform(x_test)

        # Only the first few test instances are explained per configuration.
        x_eval = x_test[:N_EXPLAINED]
        x_eval_kpca = x_test_kpca[:N_EXPLAINED]

        # Global scope: a single configuration with n = 0.
        scope, n = 'global', 0
        pca_metrics, kpca_metrics = run_configuration(
            pca, kpca, x_train, mean, feature_names, x_eval, x_eval_kpca, scope, n)
        writer.writerow([features, 'kpca', scope, n] + kpca_metrics)
        writer.writerow([features, 'pca', scope, n] + pca_metrics)

        # Local scope: n is a quarter, a half and three quarters of N_SAMPLES.
        scope = 'local'
        for n in [N_SAMPLES // 4, 2 * N_SAMPLES // 4, 3 * N_SAMPLES // 4]:
            pca_metrics, kpca_metrics = run_configuration(
                pca, kpca, x_train, mean, feature_names, x_eval, x_eval_kpca, scope, n)
            writer.writerow([features, 'pca', scope, n] + pca_metrics)
            writer.writerow([features, 'kpca', scope, n] + kpca_metrics)