In [2]:
!pip install geomle



In [4]:
# --- Make the project's local modules importable ---------------------------
import sys
import os
from os import path
# path.abspath('') resolves to the current working directory, so despite its
# name `current_folder` is the *parent* of the directory the notebook runs in.
current_folder = path.dirname(path.abspath('')) 
sys.path.append(current_folder)
# Wildcard import of the project-local estimator implementations.
# NOTE(review): presumably the source of runDANCo, ess_py, twonn_py, corint_py,
# SeparabilityAnalysis, ... used further down -- confirm against estimators.py.
from estimators import *
from geomle import geomle, mle, DataGenerator
import multiprocessing as mp
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from scipy.linalg import cholesky
from scipy.special import gammainc, lambertw
import scipy.io
import matplotlib as mpl
from matplotlib import pyplot as plt
import umap
import seaborn as sns
import random
import time
import numpy as np
import pandas as pd
import pickle
import rpy2
import rpy2.robjects as ro
import rpy2.robjects.numpy2ri
import rpy2.robjects.packages as rpackages
from functools import wraps
import subprocess
from IPython.display import display_html
from operator import itemgetter
# Small accessors used to split (result, elapsed_time) tuples.
ig0 = itemgetter(0)
ig1 = itemgetter(1)
ig2 = itemgetter(2)
# Enable rpy2's numpy <-> R conversion before any R function is called.
rpy2.robjects.numpy2ri.activate()
utils = rpackages.importr('utils')
# One-off installation of the required R packages (uncomment on a fresh R setup).
#utils.install_packages('intrinsicDimension')
#utils.install_packages('ider')
# Handles to the R packages whose functions are called from Python below.
intdimr = rpackages.importr('intrinsicDimension')
ider   = rpackages.importr('ider')
r_base = rpackages.importr('base')

In [1]:
def display_side_by_side(*args):
    '''
    Render several DataFrames next to each other in the notebook output.

    Each positional argument must provide ``to_html()``; the generated tables
    are concatenated and given an inline display style before being shown.
    '''
    combined_html = ''.join(frame.to_html() for frame in args)
    inline_html = combined_html.replace('table','table style="display:inline"')
    display_html(inline_html, raw=True)

def mean_sqe(estimations, truth):
    '''
    Mean squared error, with each squared deviation divided by the true value.

    Parameters
    ----------
    estimations, truth : array-like of equal length
        Estimated and ground-truth values.

    Returns
    -------
    float
        ``mean((estimations - truth)**2 / truth)``.
    '''
    # `**` is exponentiation; the previous `^` is bitwise XOR and raises a
    # TypeError on floating-point arrays.
    return ((estimations - truth)**2/truth).sum() /len(truth)
    
def mean_pe(estimations, truth):
    '''
    Mean absolute error relative to the true value, expressed in percent.
    '''
    relative_errors = abs(estimations - truth) / truth
    return relative_errors.sum() / len(truth) * 100

def mean_ge(estimations, truth):
    '''
    Geometric mean of the per-element error ratio.

    For every element the larger of estimate/truth and truth/estimate is taken,
    so each factor (and hence the result) is >= 1.
    '''
    over = (estimations / truth)[np.newaxis, :]
    under = (truth / estimations)[np.newaxis, :]
    worst_ratio = np.concatenate((over, under), axis=0).max(axis=0)
    n_values = len(estimations)
    return np.power(worst_ratio.prod(), 1.0 / n_values)

def med_pe(estimations, truth):
    '''
    Median absolute error relative to the true value, expressed in percent.
    '''
    relative_errors = abs(estimations - truth) / truth
    median_fraction = np.percentile(relative_errors, q=50)
    return median_fraction * 100


def randball(n_points,ndim,radius,center = None):
    '''
    Generate points uniformly sampled from the interior of an ndim-ball.

    Parameters
    ----------
    n_points : int
        Number of points to draw.
    ndim : int
        Dimension of the ball.
    radius : float
        Radius of the ball.
    center : array-like of length ndim, optional
        Centre of the ball. ``None`` or an empty sequence (the former default)
        selects the origin.

    Returns
    -------
    p : ndarray of shape (n_points, ndim)
        The sampled points.
    center
        The centre that was used (a zero array when none was supplied).
    '''
    # `center == []` is an elementwise comparison when `center` is an ndarray
    # (ambiguous truth value / broadcast error), so test for "not given" explicitly.
    if center is None or np.size(center) == 0:
        center = np.array([0]*ndim)
    x = np.random.normal(size=(n_points, ndim))
    ssq = np.sum(x**2,axis=1)
    # Rescale every Gaussian direction so that the resulting radii are
    # distributed uniformly in volume inside the ball.
    fr = radius*gammainc(ndim/2,ssq/2)**(1/ndim)/np.sqrt(ssq)
    p = center + x*fr[:, np.newaxis]
    return p, center

def proxy(tup):
    '''
    Call ``function(X, **Dict)`` for a packed ``(function, X, Dict)`` tuple.

    Lets a callable with keyword arguments be dispatched through single-argument
    mapping APIs such as ``Pool.map``.
    '''
    func, positional_arg, keyword_args = tup
    return func(positional_arg, **keyword_args)

def get_nn(X,k,n_jobs=1):
    '''
    Query the k nearest neighbours of every row of X within X itself.

    Returns the ``(distances, indices)`` pair produced by scikit-learn's
    ``NearestNeighbors.kneighbors`` when called without query points.
    '''
    knn_model = NearestNeighbors(n_neighbors=k, n_jobs=n_jobs)
    knn_model.fit(X)
    distances, indices = knn_model.kneighbors(return_distance=True)
    return distances, indices

def asPointwise(data,function, params, precomputed_knn = None, n_neighbors=100, n_jobs=1):
    '''
    Use a global estimator as a pointwise one by evaluating it on kNN neighborhoods.

    Parameters
    ----------
    data : ndarray
        Dataset, indexed by row.
    function : callable
        Estimator called as ``function(neighborhood, **params)``.
    params : dict
        Keyword arguments forwarded to ``function``.
    precomputed_knn : iterable of index arrays, optional
        One array of row indices per neighborhood. When omitted the
        neighborhoods are computed with ``get_nn(data, k=n_neighbors)``.
    n_neighbors : int
        Neighborhood size used when ``precomputed_knn`` is not given.
    n_jobs : int
        Number of worker processes; values > 1 evaluate the neighborhoods
        in a ``multiprocessing.Pool``.

    Returns
    -------
    list
        One result of ``function`` per neighborhood, in neighborhood order.
    '''
    if precomputed_knn is not None:
        knn = precomputed_knn
    else:
        _, knn = get_nn(data, k=n_neighbors, n_jobs=n_jobs)

    if n_jobs > 1:
        # The context manager releases the worker processes even when an
        # estimator raises inside pool.map (previously the pool was leaked).
        with mp.Pool(n_jobs) as pool:
            return pool.map(proxy,[(function,data[i,:],params) for i in knn])

    return [function(data[i,:],**params) for i in knn]


from functools import wraps
def calculate_time(func):
    '''
    Decorator that makes ``func`` return ``(result, elapsed_seconds)``.

    The elapsed time is measured with ``time.perf_counter``, a monotonic
    high-resolution clock, so the duration cannot be distorted (or become
    negative) when the system wall clock is adjusted during the call.
    '''
    @wraps(func)
    def inner_func(*args, **kwargs):
        begin = time.perf_counter()
        res = func(*args, **kwargs)
        elapsed = time.perf_counter() - begin
        return res, elapsed
    return inner_func

class DimEst():
    '''
    Common front-end to a collection of intrinsic-dimension (ID) estimators.

    Every estimator method is wrapped with ``calculate_time`` and therefore
    returns a ``(result, elapsed_seconds)`` tuple. ``estimateAllMethods`` and
    ``estimateAllMethodsLocally`` run a fixed selection of estimators and split
    those tuples into a dict of estimates and a dict of run times.
    '''
    def __init__(self):
        self.names = ['MLE', 'GeoMLE', 'MIND', 'DANCo', 'FastDANCo', 'ESS', 'PCA', 'CD','FisherS','ANOVA','TwoNN']
        # Calibration data returned by R's dancoDimEst, keyed by the number of
        # samples, so that repeated DANCo calls on equally sized data can reuse it.
        self.caldatas = {}

    def estimateAllMethods(self, data,ConditionalNumber=10):
        '''
        Run the enabled global estimators on ``data``.

        Parameters
        ----------
        data : ndarray of shape (n_samples, n_features)
        ConditionalNumber : float
            Forwarded to the FisherS estimator.

        Returns
        -------
        (dict, dict)
            Estimates and run times in seconds, both keyed by estimator name.
            The same dicts are stored in ``self.funcs`` and ``self.times``.
        '''
        dim = data.shape[1]  # only needed by the disabled estimators below
        timed = {'MLE':          self.mle(data),
                 #'GeoMLE':       self.geomle(data, dim),
                 #'DANCo':        self.danco(data, dim),
                 'FastDANCo':    self.fast_danco(data),
                 #'ESS':          self.ess(data),
                 'PCA':          self.pca(data),
                 #'CD':           self.cd(data),
                 'FisherS':      self.fisherS(data,ConditionalNumber),
                 'ANOVA':        self.anova(data),
                 'TwoNN':        self.twonn(data)
                }

        # Each value is a (estimate, elapsed_seconds) tuple.
        self.times = {key: val[1] for key, val in timed.items()}
        self.funcs = {key: val[0] for key, val in timed.items()}
        return self.funcs, self.times

    def estimateAllMethodsLocally(self, data, k, n_jobs = 1, ConditionalNumber = 10):
        '''
        Run the enabled estimators pointwise, on the k-nearest-neighbor
        neighborhood of every sample.

        Parameters
        ----------
        data : ndarray of shape (n_samples, n_features)
        k : int
            Neighborhood size.
        n_jobs : int
            Worker count for the neighbor search and for the FisherS and TwoNN
            neighborhood evaluations.
        ConditionalNumber : float
            Forwarded to the FisherS estimator.

        Returns
        -------
        (dict, dict)
            Arrays of estimates and run times in seconds, keyed by estimator
            name; no run time is recorded for the 'rado*' entries. The same
            dicts are stored in ``self.funcs`` and ``self.times``.
        '''
        dim = data.shape[1]

        _, knn = get_nn(data, k, n_jobs)

        # Relies on the key order of the dict returned by radovanovic_estimators_matlab.
        mle_pw, tle_pw, mom_pw, ed_pw, ged_pw, pca_pw = self.rado_ests(data,k).values()
        raw = {'MLE':          self.mlelocal(data,k),
               #'GeoMLE':       self.geomlelocal(data, dim),
               'mind_mlk':         asPointwise(data,self.mind_mlk,{'dim':dim},precomputed_knn=knn,n_jobs=1),
               'mind_mli':         asPointwise(data,self.mind_mli,{'dim':dim},precomputed_knn=knn,n_jobs=1),
               #'DANCo':        asPointwise(data,self.danco,{'dim':dim},precomputed_knn=knn,n_jobs=1),
               'FastDANCo':    self.fast_dancoloop(data),
               'ESS':          asPointwise(data,self.ess,{},precomputed_knn=knn,n_jobs=1),
               #'PCA':          self.pca(data),
               'CD':           asPointwise(data,self.lcd,{},precomputed_knn=knn,n_jobs=1),
               'FisherS':      asPointwise(data,self.fisherS,{'ConditionalNumber':ConditionalNumber},precomputed_knn=knn,n_jobs=n_jobs),
               'ANOVA':        self.anovalocal(data,k),
               'TwoNN':        asPointwise(data,self.twonn,{},precomputed_knn=knn,n_jobs=n_jobs),
               'radoMLE':      mle_pw,
               'radoTLE':      tle_pw,
               'radoMOM':      mom_pw,
               'radoED':       ed_pw,
               'radoGED':      ged_pw,
               'radoPCA':      pca_pw
              }

        # Build fresh result dicts instead of rewriting the dict being iterated.
        estimates = {}
        times = {}
        for key, val in raw.items():
            if key in ['MLE','ANOVA','FastDANCo']:
                # One timed call over the whole dataset: (array, elapsed_seconds).
                estimates[key] = np.array(val[0])
                times[key] = val[1]
            elif 'rado' in key:
                # Untimed arrays of pointwise estimates.
                estimates[key] = np.array(val)
            else:
                # One (estimate, elapsed_seconds) tuple per neighborhood.
                estimates[key] = np.array([i[0] for i in val])
                times[key] = np.sum([i[1] for i in val])

        self.funcs = estimates
        self.times = times
        return self.funcs, self.times


    @staticmethod
    def rado_ests(data,k):
        '''Return the result of ``radovanovic_estimators_matlab(data, k=k)`` (untimed).'''
        # An unreachable statement that followed the return (and referenced an
        # undefined name) was removed.
        return radovanovic_estimators_matlab(data,k=k)

    @staticmethod
    @calculate_time
    def mle(data):
        '''Global 'dim.est' of R's maxLikGlobalDimEst with k=20.'''
        return intdimr.maxLikGlobalDimEst(data,k=20).rx2('dim.est')[0]

    @staticmethod
    @calculate_time
    def mlelocal(data,k):
        '''Pointwise estimates of R's maxLikPointwiseDimEst with neighborhood size k.'''
        res = intdimr.maxLikPointwiseDimEst(data,k=k)
        return np.array([i[0] for i in res])

    @staticmethod
    @calculate_time
    def geomle(data, dim):
        '''Mean of the GeoMLE estimates (k1=20, k2=55); ``dim`` is currently unused.'''
#         k1 =  k1_log(dim)
#         k2 =  k2_log(dim)
        # Inside a method body the name `geomle` resolves to the imported
        # module-level function, not to this static method.
        return geomle(pd.DataFrame(data), k1=20, k2=55, nb_iter1=1, alpha=5e-3).mean()

    @staticmethod
    @calculate_time
    def geomlelocal(data, dim):
        '''GeoMLE estimates (k1=20, k2=55) without averaging; ``dim`` is currently unused.'''
#         k1 =  k1_log(dim)
#         k2 =  k2_log(dim)
        return geomle(pd.DataFrame(data), k1=20, k2=55, nb_iter1=1, alpha=5e-3)

    @staticmethod
    @calculate_time
    def mind_mlk(data, dim):
        ''''dim.est' of R's dancoDimEst in "MIND_MLk" mode (k=10, D capped at 100).'''
        return intdimr.dancoDimEst(data, k=10, D=min(dim,100), ver="MIND_MLk").rx2('dim.est')[0]


    @staticmethod
    @calculate_time
    def mind_mli(data, dim):
        ''''dim.est' of R's dancoDimEst in "MIND_MLi" mode (k=10, D capped at 100).'''
        return intdimr.dancoDimEst(data, k=10, D=min(dim,100), ver="MIND_MLi").rx2('dim.est')[0]

    # Instance method (not static): it reads and updates the calibration cache.
    @calculate_time
    def danco(self,data, dim):
        '''
        'dim.est' of R's dancoDimEst in "DANCo" mode (k=10, D capped at 100).

        Calibration data cached for datasets with the same number of samples
        is passed to R when available; the calibration data of the call that
        succeeded is stored back in ``self.caldatas``.
        '''
        n_samples = len(data)
        res = None
        if n_samples in self.caldatas:
            try:
                res = intdimr.dancoDimEst(data, k=10, D=min(dim,100), calibration_data = self.caldatas[n_samples], ver="DANCo")
            except Exception:
                # Best effort: cached calibration data rejected by R -> recompute
                # from scratch below. Unlike a bare `except:`, this lets
                # KeyboardInterrupt / SystemExit stop a long-running cell.
                res = None
        if res is None:
            res = intdimr.dancoDimEst(data, k=10, D=min(dim,100), ver="DANCo")
        self.caldatas[n_samples]=res[2]
        return res.rx2('dim.est')[0]

    @staticmethod
    @calculate_time
    def fast_danco(data):
        '''First element of ``runDANCo(data)``.'''
        return runDANCo(data)[0]

    @staticmethod
    @calculate_time
    def fast_dancoloop(data):
        '''Result of ``runDANColoop(data)``.'''
        return runDANColoop(data)

    @staticmethod
    @calculate_time
    def ess(data):
        '''First element of ``ess_py(data)``.'''
        return ess_py(data)[0]

    @staticmethod
    @calculate_time
    def pca(data):
        ''''dim.est' of R's pcaLocalDimEst called with the 'FO' version.'''
        return intdimr.pcaLocalDimEst(data, 'FO').rx2('dim.est')[0]

    @staticmethod
    @calculate_time
    def cd(data):
        '''First element of ``corint_py(data, k1=10, k2=20)``.'''
        return corint_py(data, k1=10, k2=20)[0]

    @staticmethod
    @calculate_time
    def lcd(data):
        '''Like ``cd`` but with ``k2 = len(data) - 1``; used on single neighborhoods.'''
        return corint_py(data, k1=10, k2=len(data)-1)[0]

    @staticmethod
    @calculate_time
    def fisherS(data,ConditionalNumber):
        '''
        First entry of the second output of ``SeparabilityAnalysis``, evaluated
        for alphas 0.2, 0.22, ..., 0.98 with plotting disabled.
        '''
        return SeparabilityAnalysis(data,ProducePlots=0,alphas=np.arange(.2,1,.02)[None],ConditionalNumber=ConditionalNumber)[1][0]

    @staticmethod
    @calculate_time
    def anova(data):
        '''Element [0, 0] of ``runANOVAglobal(data)``.'''
        return runANOVAglobal(data)[0,0]

    @staticmethod
    @calculate_time
    def anovalocal(data,k):
        '''First column of ``runANOVAlocal(data, k=k)``.'''
        return runANOVAlocal(data,k=k)[:,0]

    @staticmethod
    @calculate_time
    def twonn(data):
        '''Result of ``twonn_py(data)``.'''
        res = twonn_py(data)
        return res

# Synthetic data

In [2]:
# Shared estimator front-end used by all analysis cells below.
DE = DimEst()

In [5]:
# Load every '*.data*' file of the synthetic benchmark as a NumPy array,
# keyed by its file name (space-separated values, no header row).
data_files = [fname for fname in os.listdir('../data/id-tle-synth-m10000-data/data/m10000/') if '.data' in fname]
synthetic_data = {
    file: np.array(pd.read_csv('../data/id-tle-synth-m10000-data/data/m10000/'+file,sep=' ',header=None))
    for file in data_files
}

### Global ID saturation analysis

In [7]:
# testing separability saturation
#
# For every synthetic dataset the global estimators are run on random
# subsamples of increasing size and finally on the full dataset. One 2-D table
# per estimator (rows = sample sizes, columns = repeats) is written to
# ../results so that the plotting cell can read it back.

n_repeats = 1
sample_sizes = [1,2,4,8,16,32,64]
sample_sizes_halves = [x/2 for x in sample_sizes]

# Percentages to evaluate: every size and its half, without duplicates.
all_sample_sizes = sorted(set(sample_sizes+sample_sizes_halves))

# Row order of `all_dim_estimates`:
# (key in the dict returned by DE.estimateAllMethods, tag used in the file name).
saturation_methods = [('FisherS',   'fisherS'),
                      ('FastDANCo', 'danco'),
                      ('TwoNN',     'twonn'),
                      ('ANOVA',     'anova'),
                      ('MLE',       'mle'),
                      ('PCA',       'pca')]
n_methods = len(saturation_methods)

for key,data in synthetic_data.items():

    datasets_done = [i.split('_')[0] for i in list(filter(lambda x: '.data' in x, os.listdir('../results')))]
    dataset_name = key
    n_samples = data.shape[0]

    if dataset_name in datasets_done:
        print('already computed ', dataset_name)
        continue

    print('\n',dataset_name)
    print('Running subsampling analysis...\nSubsample percentages = {}\nNumber of repeats = {}\nNumber of samples = {}\nDimension = {}'.format(sample_sizes,n_repeats,n_samples,data.shape[1]))
    print('----------------------------\n')

    # The extra last row along axis 1 holds the estimates on the full dataset.
    all_dim_estimates = np.empty([n_methods,len(all_sample_sizes)+1,n_repeats])

    runtimes = []
    for i,sz in enumerate(all_sample_sizes):
        sample_size = int(n_samples*sz/100)
        print('Sample size = {}'.format(sample_size))
        start_time = time.time()
        for j in range(0,n_repeats):
            sample = np.random.choice(n_samples,replace=False, size=sample_size)
            xs = data[sample,:]

            #Run estimators
            allres = DE.estimateAllMethods(xs)
            results = allres[0]
            runtimes.append(allres[1])

            #Store (every method is indexed by sample size i AND repeat j;
            #the MLE and PCA rows previously dropped the sample-size index)
            for m,(result_key,_) in enumerate(saturation_methods):
                all_dim_estimates[m,i,j] = results[result_key]

        print("Elapsed time = {}".format(time.time()-start_time))

    # Reference run on the full dataset, replicated over the repeat axis.
    allres = DE.estimateAllMethods(data)
    results = allres[0]
    runtimes.append(allres[1])

    for m,(result_key,_) in enumerate(saturation_methods):
        all_dim_estimates[m,len(all_sample_sizes),:] = results[result_key]

    # np.savetxt only accepts 1-D or 2-D arrays, so the 3-D result cube is
    # written as one (sample sizes x repeats) table per estimator.
    for m,(_,file_tag) in enumerate(saturation_methods):
        np.savetxt("../results/"+dataset_name+"_"+file_tag+"_dim_estimates.txt", all_dim_estimates[m], delimiter="\t")
    # 100 % stands for the full-dataset row appended above.
    np.savetxt("../results/"+dataset_name+"_all_sample_sizes.txt", all_sample_sizes+[100], delimiter="\t")
    np.savetxt("../results/"+dataset_name+"_sample_sizes.txt", sample_sizes+[100], delimiter="\t")


 m10a-09.data
Running subsampling analysis...
Subsample percentages = [1, 2, 4, 8, 16, 32, 64]
Number of repeats = 1
Number of samples = 10000
Dimension = 11
----------------------------

Sample size = 50
Elapsed time = 6.547706842422485
Sample size = 100


KeyboardInterrupt: 

In [None]:
# plot convergence curve
#
# Reads the result tables stored under ../results for `dataset_name` (the
# dataset processed last by the subsampling analysis) and draws, per estimator,
# the estimated dimension against the subsample percentage (left) and the
# relative change of the standard deviation between a sample size and its
# half (right).
alls=pd.read_csv('../results/'+dataset_name+'_all_sample_sizes.txt', sep='\t',header=None)
all_sample_sizes = alls.to_numpy()[:,0]
sizes=pd.read_csv('../results/'+dataset_name+'_sample_sizes.txt', sep='\t',header=None)
sample_sizes =sizes.to_numpy()[:,0]

estimators = ['fisherS_dim_estimates','ess_dim_estimates','danco_dim_estimates',
              'twonn_dim_estimates','anova_dim_estimates','mle_dim_estimates','mind_dim_estimates',
              'pca_dim_estimates']

for estimator in estimators:
    results_file = '../results/'+dataset_name+'_'+estimator+'.txt'
    # Not every estimator is computed for every dataset: skip missing tables
    # instead of aborting the whole cell with FileNotFoundError.
    if not os.path.exists(results_file):
        print('no results found for', estimator, '- skipping')
        continue

    print(estimator)
    ds=pd.read_csv(results_file, sep='\t',header=None)
    dim_estimates=ds.to_numpy()

    # rows = sample sizes, columns = repeats
    mn = np.mean(dim_estimates[:,:],1)
    std = np.std(dim_estimates[:,:],1)

    plt.figure(figsize=(10,3))
    plt.subplot(121)
    plt.plot(all_sample_sizes,mn,'bs-')
    plt.plot(all_sample_sizes,mn-std,'r--')
    plt.plot(all_sample_sizes,mn+std,'r--')
    plt.plot(all_sample_sizes,dim_estimates,'b+')
    plt.xlabel('Percentage of points')
    plt.ylabel('Estimated intrinsic dimension')

    ratio_sizes = []
    ratios = []
    for sz in sample_sizes:
        k = np.flatnonzero(np.abs(all_sample_sizes-sz)<0.001)
        k_half = np.flatnonzero(np.abs(all_sample_sizes-sz/2)<0.001)
        # Sizes whose half was never evaluated (e.g. 100 % -> 50 %) have no
        # ratio; skip them instead of failing with an IndexError.
        if k.size == 0 or k_half.size == 0:
            continue
        ratio_sizes.append(sz)
        # A zero std yields nan/inf here; nan is replaced right below.
        with np.errstate(divide='ignore', invalid='ignore'):
            ratios.append(1-std[k[0]]/std[k_half[0]])

    #avoid case of 0 std (nan ratio)
    ratios=np.array(ratios)
    ratios[np.isnan(ratios)]=1

    plt.subplot(122)
    plt.plot(ratio_sizes,ratios,'bs-')
    plt.xlabel('Percentage of points')
    plt.ylabel('1 - std(size) / std(size/2)')
    plt.show()

### Local estimates convergence

In [103]:
# Load the poker-hand training set and keep a fixed random subset of 5000 rows
# (drawn without replacement, seeded so the subset is the same on every run).
poker = np.array(pd.read_csv('../data/poker-hand-training-true.data', header=None))
n_samples = poker.shape[0]
np.random.seed(0)
subsample = np.random.choice(n_samples, size=5000, replace=False)
poker = poker[subsample]

In [None]:
# Pointwise ID estimates in kNN neighborhoods of several sizes for the
# poker-hand subsample; one pickle is written per neighborhood size.
#
# NOTE(review): the previous version of this cell contained a pasted inner loop
# that re-bound `dataset_name`/`data` to ('mnist', real_data_subsampled), so the
# poker data selected by the outer loop was never analysed. It also ran a
# subsampling loop whose result (`xs`) was discarded, and never filled
# `ests_pw_dict_all_neighbors`. The cell now analyses the dataset of the outer
# loop -- confirm that this is the intended experiment.
num_neighbors = [25,50,100,200]
n_jobs = 4
ests_pw_dict_all_neighbors = []

for key,data in [('poker',poker)]:
    dataset_name = key
    n_samples = data.shape[0]

    print('\n',dataset_name)
    print('Running kNN ID for all estimators...\nNumber of samples = {}\nDimension = {}'.format(n_samples,data.shape[1]))
    print('----------------------------\n')

    #Run ID estimators pointwise in KNN neighborhoods of different sizes
    for n_neighbors in num_neighbors:
        print('kNN = ',n_neighbors)

        start_all=time.time()

        ests_pw_dict = DE.estimateAllMethodsLocally(data, k = n_neighbors, n_jobs = n_jobs, ConditionalNumber = np.inf)
        ests_pw_dict_all_neighbors.append(ests_pw_dict)

        print('elapsed :',round(time.time()-start_all,2))

        with open('../results/ests_pw_dict_'+dataset_name+'_kNN'+str(n_neighbors)+'.pkl','wb') as f:
            pickle.dump(ests_pw_dict,f)

### Study kNN ID

In [None]:
# Neighborhood sizes to evaluate and worker processes for the estimators.
num_neighbors = [100]
n_jobs = 4

# Pointwise ID estimation in kNN neighborhoods: one pickle with the
# (estimates, run times) pair is written per dataset and neighborhood size.
for n_neighbors in num_neighbors:
    for dataset_name,data in [('mnist',real_data_subsampled)]:

        n_samples = data.shape[0]
        print(dataset_name,
              'Running kNN ID for all estimators...\nNumber of samples = {}\nDimension = {}'.format(n_samples,data.shape[1]),
              '----------------------------\n',
              sep='\n')
        print('kNN = ',n_neighbors)

        start_all = time.time()
        ests_pw_dict = DE.estimateAllMethodsLocally(data, k=n_neighbors, n_jobs=n_jobs)
        print('elapsed :',round(time.time()-start_all,2))

        with open('../results/ests_pw_dict_'+dataset_name+'_kNN'+str(n_neighbors)+'.pkl', mode='wb') as f:
            pickle.dump(ests_pw_dict, f)

mnist
Running kNN ID for all estimators...
Number of samples = 2000
Dimension = 784
----------------------------

kNN =  100


In [1]:
# Neighborhood sizes 25, 50, ..., 400.
num_neighbors = np.arange(start=25, stop=425, step=25)

NameError: name 'np' is not defined

In [None]:
# Reload the pointwise estimates pickled for the current
# `dataset_name` / `n_neighbors` combination.
with open(''.join(['../results/ests_pw_dict_', dataset_name, '_kNN', str(n_neighbors), '.pkl']), mode='rb') as f:
    res_pw = pickle.load(f)

### Study global pointwise ID

In [8]:
# Global and pointwise ID from the separability analysis, for the first
# dataset stored in `real_data_subsampled`.
list_global_id, list_inseparability_id = [], []
for dataset_name,data in list(real_data_subsampled.items())[:1]:

    print(dataset_name)
    start_all = time.time()

    n_alpha, n_single, p_alpha, alphas, separable_fraction, Xp = SeparabilityAnalysis(data, ProducePlots=0)
    n_pointwise, idx = point_inseparability_to_pointID(
        n_alpha, n_single, p_alpha, alphas, idx='all_separable'
    )

    list_global_id.append(n_single[0])
    list_inseparability_id.append(n_pointwise)

    print('elapsed :',round(time.time()-start_all,2))

### Study the behavior of the statistics used by the various estimators