In [1]:
import os

import numpy as np
import pandas as pd
import cv2
from PIL import Image
import PIL.ImageOps
from sklearn.model_selection import train_test_split
from sklearn import datasets

import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.datasets import load_digits
from sklearn.manifold import MDS
from sklearn.manifold import Isomap 
from sklearn.manifold import LocallyLinearEmbedding as LLE
from sklearn.manifold import SpectralEmbedding as SE
from sklearn.random_projection import GaussianRandomProjection as GRP
import umap

import matplotlib.pyplot as plt
import seaborn as sns

import ray 


In [2]:
# Tear down any Ray runtime left over from a previous execution of this cell,
# then start a fresh one with explicit resource limits.
ray_resources = dict(
    num_cpus=8,                # worker processes available to tasks
    memory=8e+9,               # bytes requested for workers
    object_store_memory=4e+9,  # bytes requested for the shared object store
)
ray.shutdown()
ray.init(**ray_resources)


2020-03-10 17:16:42,902	INFO resource_spec.py:216 -- Starting Ray with 7.42 GiB memory available for workers and up to 3.73 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


{'node_ip_address': '129.69.205.59',
 'redis_address': '129.69.205.59:33559',
 'object_store_address': '/tmp/ray/session_2020-03-10_17-16-42_892552_38340/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-03-10_17-16-42_892552_38340/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2020-03-10_17-16-42_892552_38340'}

In [3]:
def flatten_image(data):
    """Collapse every axis after the first into a single axis.

    Parameters
    ----------
    data : numpy.ndarray
        Array with at least one axis; the first axis is kept as-is.

    Returns
    -------
    numpy.ndarray
        A new 2-D array of shape ``(data.shape[0], prod(data.shape[1:]))``.
        A 1-D input of length ``n`` becomes shape ``(n, 1)``.
    """
    shape = data.shape
    # np.prod replaces the np.product alias that NumPy 2.0 removed. The int()
    # cast matters for 1-D input, where the product of an empty tuple is the
    # float 1.0 and reshape would reject it.
    row_width = int(np.prod(shape[1:]))
    # flatten() copies, so the result never aliases the caller's array.
    return data.flatten().reshape(shape[0], row_width)
    

@ray.remote
def apply_dr(dr_name, file_name, perplexity = None, n_neighbors=None, min_dist=None, dataset_name = 'MNIST/', max_samples = 5000):
    """Project a stratified sample of one dataset to 2-D and store the result.

    The embedding is written to ``<dataset_name>/data/<name>.csv`` and a
    scatter plot coloured by label to ``<dataset_name>/figures/<name>.pdf``,
    where ``<name>`` is ``dr_name`` followed by ``_p<perplexity>``,
    ``_n<n_neighbors>`` and ``_d<min_dist>`` for every parameter that is not
    ``None``. If the CSV already exists the task returns immediately without
    reading the input.

    Parameters
    ----------
    dr_name : str
        One of ``'PCA'``, ``'TSNE'``, ``'ISM'``, ``'LLE'``, ``'SE'``,
        ``'UMAP'``, ``'GRP'``.
    file_name : str
        CSV file to read. It must contain a ``labels`` column; every column
        except the last two is used as a feature.
    perplexity : float, optional
        Passed to t-SNE.
    n_neighbors : int, optional
        Passed to Isomap, LLE, SpectralEmbedding and UMAP.
    min_dist : float, optional
        Passed to UMAP.
    dataset_name : str
        Directory under which the ``data`` and ``figures`` folders live.
    max_samples : int
        Number of rows embedded when the file has more rows than this;
        otherwise 70% of the rows are embedded.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``dr_name`` is not one of the supported techniques.
    """
    print('---------Starting: {}-----------'.format(file_name))

    # The output name encodes the technique and the hyper-parameters in use.
    out_name = "{dr_name}{perp}{neigh}{mindist}".format(
        dr_name = dr_name,
        perp = "_p" + str(perplexity) if perplexity is not None else "",
        neigh = "_n" + str(n_neighbors) if n_neighbors is not None else "",
        mindist = "_d" + str(min_dist) if min_dist is not None else "")
    print(out_name)

    data_dir = os.path.join(dataset_name, 'data')
    figure_dir = os.path.join(dataset_name, 'figures')
    csv_path = os.path.join(data_dir, out_name + '.csv')

    # Check for an existing result before any expensive work (CSV parsing,
    # splitting), so reruns skip finished tasks cheaply.
    if os.path.exists(csv_path):
        print('File was already calculated. Skipping ....')
        return

    # Lazy factories: only the requested estimator is constructed.
    # NOTE(review): none of the stochastic estimators is seeded, so repeated
    # runs can produce different embeddings - confirm whether that is wanted.
    factories = {
        'PCA': lambda: PCA(n_components = 2),
        'TSNE': lambda: TSNE(n_components = 2, perplexity = perplexity, verbose = 1),
        'ISM': lambda: Isomap(n_components = 2, n_neighbors = n_neighbors),
        'LLE': lambda: LLE(n_components = 2, n_neighbors = n_neighbors),
        'SE': lambda: SE(n_components = 2, n_neighbors = n_neighbors),
        'UMAP': lambda: umap.UMAP(n_components = 2, n_neighbors = n_neighbors, verbose=False, min_dist=min_dist),
        'GRP': lambda: GRP(n_components = 2),
    }
    if dr_name not in factories:
        raise ValueError('Unknown dr_name {!r}; expected one of {}'.format(dr_name, sorted(factories)))

    df = pd.read_csv(file_name)
    y = df['labels']
    # Drops the last two columns; presumably 'labels' plus one other
    # non-feature column - TODO confirm against the CSV schema.
    X = df.iloc[:,:-2]

    # The *test* part of the stratified split is the sample that gets embedded:
    # exactly max_samples rows for large files, 70% of the rows otherwise.
    test_size = max_samples if df.shape[0] > max_samples else 0.7
    _, features, _, labels = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    ## apply dr
    x_col = '{}_1'.format(dr_name)
    y_col = '{}_2'.format(dr_name)
    dr_data = pd.DataFrame(factories[dr_name]().fit_transform(features), columns=[x_col, y_col])
    # list() discards the sampled index so labels line up positionally.
    dr_data['labels'] = list(labels)

    ## save stuff
    # Create the output folders if needed; exist_ok keeps concurrent workers
    # from failing when they race to create the same folder.
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(figure_dir, exist_ok=True)

    fig, ax = plt.subplots()
    try:
        # x/y are passed as keywords: newer seaborn releases reject them
        # positionally.
        sns.scatterplot(x=x_col, y=y_col, hue='labels', data=dr_data, ax=ax)
        fig.savefig(os.path.join(figure_dir, out_name + '.pdf'))
    finally:
        # Workers are reused for many tasks; unclosed figures would pile up.
        plt.close(fig)

    dr_data.to_csv(csv_path, index=False)
    print('---------Finished: {}-----------'.format(out_name))

    return
    

            

def load_and_combine(folder):
    """Read every CSV file in ``folder`` and join them side by side.

    Parameters
    ----------
    folder : str
        Directory to scan (not recursive). Entries whose name does not end in
        ``.csv`` are ignored.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation (``axis=1``) of the files, taken in sorted
        file-name order so the column order is reproducible.

    Raises
    ------
    ValueError
        Raised by ``pandas.concat`` when the folder contains no CSV file.
    """
    # sorted(): os.listdir returns entries in arbitrary order.
    csv_names = sorted(name for name in os.listdir(folder) if name.lower().endswith('.csv'))
    # Join with the individual file name (the original concatenated the folder
    # with the whole list, which raised TypeError).
    frames = [pd.read_csv(os.path.join(folder, name)) for name in csv_names]
    return pd.concat(frames, axis=1)


In [4]:
# Run every dimensionality-reduction technique, over its hyper-parameter grid,
# on every dataset folder. Each combination is one Ray task.

files = ['fashionmnist','coil-100','paris_buildings', 'pets', 'oxford_buildings','dogs','cinic10', 'caltech']
print(files)

# Hyper-parameter grids, kept in the original submission order.
TSNE_PERPLEXITIES = [5,10,30,50,150,100]
NEIGHBOR_COUNTS = [2,3,5,7,10,15]
UMAP_MIN_DISTS = [0.05, 0.1, 0.25, 0.5, 0.8, 0.99]

pending = []  # ((dataset, dr_name, params), ObjectRef) for every submitted task
for f in files:
    # Skip names that are not existing folders, plus hidden/private/MNIST ones.
    if not os.path.isdir(f) or f.startswith(('.', '_', 'MNIST')):
        continue
    csv_path = f+'/flat_img_50_1.csv'
    for dr_name in ['PCA','TSNE', 'ISM', 'LLE', 'UMAP', 'SE', 'GRP']:
        if dr_name in ['PCA','GRP']:
            param_grid = [{}]
        elif dr_name == 'TSNE':
            param_grid = [{'perplexity': p} for p in TSNE_PERPLEXITIES]
        elif dr_name == 'UMAP':
            param_grid = [{'n_neighbors': n, 'min_dist': d}
                          for n in NEIGHBOR_COUNTS for d in UMAP_MIN_DISTS]
        else:
            param_grid = [{'n_neighbors': n} for n in NEIGHBOR_COUNTS]
        for params in param_grid:
            ref = apply_dr.remote(dr_name, csv_path, dataset_name = f, **params)
            pending.append(((f, dr_name, params), ref))

# Wait for every task. Without ray.get the cell returns while work is still
# running, and task exceptions only show up as "Possible unhandled error from
# worker" log lines that are easy to miss.
failed = []
for task, ref in pending:
    try:
        ray.get(ref)
    except Exception as err:  # report the failure and keep collecting the rest
        failed.append(task)
        print('FAILED {}: {}'.format(task, err))
print('{} of {} tasks failed'.format(len(failed), len(pending)))


['fashionmnist', 'coil-100', 'paris_buildings', 'pets', 'oxford_buildings', 'dogs', 'cinic10', 'caltech']
[2m[36m(pid=38354)[0m ---------Starting: fashionmnist/flat_img_50_1.csv-----------
[2m[36m(pid=38361)[0m ---------Starting: fashionmnist/flat_img_50_1.csv-----------
[2m[36m(pid=38357)[0m ---------Starting: fashionmnist/flat_img_50_1.csv-----------
[2m[36m(pid=38355)[0m ---------Starting: fashionmnist/flat_img_50_1.csv-----------
[2m[36m(pid=38360)[0m ---------Starting: fashionmnist/flat_img_50_1.csv-----------
[2m[36m(pid=38358)[0m ---------Starting: fashionmnist/flat_img_50_1.csv-----------
[2m[36m(pid=38359)[0m ---------Starting: fashionmnist/flat_img_50_1.csv-----------
[2m[36m(pid=38356)[0m ---------Starting: fashionmnist/flat_img_50_1.csv-----------
[2m[36m(pid=38354)[0m TSNE_p5
[2m[36m(pid=38354)[0m File was already calculated. Skipping ....
[2m[36m(pid=38354)[0m ---------Starting: fashionmnist/flat_img_50_1.csv-----------
[2m[36m(pid=38361

[2m[36m(pid=38358)[0m ---------Starting: fashionmnist/flat_img_50_1.csv-----------
[2m[36m(pid=38356)[0m UMAP_n10_d0.05
[2m[36m(pid=38356)[0m File was already calculated. Skipping ....
[2m[36m(pid=38356)[0m ---------Starting: fashionmnist/flat_img_50_1.csv-----------
[2m[36m(pid=38361)[0m UMAP_n10_d0.1
[2m[36m(pid=38361)[0m File was already calculated. Skipping ....
[2m[36m(pid=38361)[0m ---------Starting: fashionmnist/flat_img_50_1.csv-----------
[2m[36m(pid=38354)[0m UMAP_n10_d0.25
[2m[36m(pid=38354)[0m File was already calculated. Skipping ....
[2m[36m(pid=38354)[0m ---------Starting: fashionmnist/flat_img_50_1.csv-----------
[2m[36m(pid=38357)[0m UMAP_n10_d0.99
[2m[36m(pid=38357)[0m File was already calculated. Skipping ....
[2m[36m(pid=38357)[0m ---------Starting: fashionmnist/flat_img_50_1.csv-----------
[2m[36m(pid=38359)[0m UMAP_n10_d0.5
[2m[36m(pid=38359)[0m File was already calculated. Skipping ....
[2m[36m(pid=38359)[0m -------

2020-03-10 17:18:04,438	ERROR worker.py:994 -- Possible unhandled error from worker: [36mray::__main__.apply_dr()[39m (pid=38355, ip=129.69.205.59)
  File "/Users/morarica/miniconda3/envs/SepMe/lib/python3.7/site-packages/scipy/sparse/linalg/eigen/arpack/arpack.py", line 1642, in eigsh
    hermitian=True, tol=tol)
  File "/Users/morarica/miniconda3/envs/SepMe/lib/python3.7/site-packages/scipy/sparse/linalg/eigen/arpack/arpack.py", line 1059, in get_OPinv_matvec
    return get_inv_matvec(A, hermitian=hermitian, tol=tol)
  File "/Users/morarica/miniconda3/envs/SepMe/lib/python3.7/site-packages/scipy/sparse/linalg/eigen/arpack/arpack.py", line 1052, in get_inv_matvec
    return SpLuInv(M).matvec
  File "/Users/morarica/miniconda3/envs/SepMe/lib/python3.7/site-packages/scipy/sparse/linalg/eigen/arpack/arpack.py", line 914, in __init__
    self.M_lu = splu(M)
  File "/Users/morarica/miniconda3/envs/SepMe/lib/python3.7/site-packages/scipy/sparse/linalg/dsolve/linsolve.py", line 326, in spl

[2m[36m(pid=38355)[0m ISM_n3
[2m[36m(pid=38360)[0m ---------Finished: ISM_n2-----------
[2m[36m(pid=38360)[0m ---------Starting: coil-100/flat_img_50_1.csv-----------
[2m[36m(pid=38360)[0m ISM_n5
[2m[36m(pid=38355)[0m ---------Finished: ISM_n3-----------
[2m[36m(pid=38355)[0m ---------Starting: coil-100/flat_img_50_1.csv-----------
[2m[36m(pid=38355)[0m ISM_n7
[2m[36m(pid=38356)[0m [t-SNE] Computed neighbors for 5000 samples in 511.711s...
[2m[36m(pid=38356)[0m [t-SNE] Computed conditional probabilities for sample 1000 / 5000
[2m[36m(pid=38356)[0m [t-SNE] Computed conditional probabilities for sample 2000 / 5000
[2m[36m(pid=38356)[0m [t-SNE] Computed conditional probabilities for sample 3000 / 5000
[2m[36m(pid=38356)[0m [t-SNE] Computed conditional probabilities for sample 4000 / 5000
[2m[36m(pid=38356)[0m [t-SNE] Computed conditional probabilities for sample 5000 / 5000
[2m[36m(pid=38356)[0m [t-SNE] Mean sigma: 328.163510
[2m[36m(pid=38356)