In [47]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [64]:
from DRflow.metrics import ABW, CAL, DSC, HM, NH, SC
from DRflow.metrics import AUClogRNX, CCA, CC, LCMC, NeRV, NLM, TandC, Stress,scagnostics
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import numpy as np

In [49]:
def sample_and_save(input_file, output_folder, size_limit = 800, idx=None):
    fn = input_file.split('/')[-1]
    print(input_file)
    df = pd.read_csv(input_file)
    
    if idx is None:
        if df.shape[0] > size_limit:
            try:
                _, df = train_test_split(df, test_size = size_limit, stratify = df["labels"])
            except: 
                _, df = train_test_split(df, test_size = size_limit)
    
    else:
        df = df.loc[idx].copy()

    df.to_csv(output_folder+fn, index=False)
    idx = list(df.index)
    
    print('Saved mini batch: {}'.format(fn))
    return idx
    
    
    
    
    

In [50]:
all_names = ['MNIST','coil-100','stanfordfaces','yalefaces','Caltech_instruments', 'Caltech_plants', 'Caltech_vehicles', 
         'fashionmnist',   'flowers',
         'paintings','oxford_buildings','paris_buildings']

In [53]:
# create mini batches
for f in all_names: 
    output_folder = '../../data/{}/mini_batch/'.format(f)
    
    try:
        os.makedirs(output_folder + "metrics/")
        os.makedirs(output_folder + "flatfiles/")
        os.makedirs(output_folder + "dr/")
    except:
        continue
        
    
    flat_files = '../../data/{}/flatfiles/'.format(f)
    for flat in os.listdir(flat_files):
        if not flat.endswith('.csv'):
            continue
        print(flat) 
        idx = sample_and_save(flat_files + flat, output_folder + "flatfiles/", size_limit = 800)

        if 'classes' not in flat:
            c = -1
            size = flat.split('flat')[-1].split('.csv')[0]
        else: 
            tmp = flat.split('.csv')[0]
            tmp = tmp.split('_')
            print(tmp)

            size = tmp[-2].split('flat')[-1]
            print(size)
            c = tmp[-1].split('classes')[0]
            print(c)

        dr_files = '../../data/{}/dr_data/size{}/'.format(f, size)
        for dr in os.listdir(dr_files):
            if not dr.endswith('.csv'):
                continue
                
            if ('c{}'.format(c) in dr):
                print(dr)
                sample_and_save(dr_files + dr, output_folder + "dr/", size_limit = 800, idx=idx)
            
            elif ('call' in dr) and (c==-1):
                print(dr)
                sample_and_save(dr_files + dr, output_folder + "dr/", size_limit = 800, idx=idx)
                
    

In [81]:
import json


def apply_supervised(filename):
    
    dr = pd.read_csv(filename)
    visu = np.array(dr.iloc[:,[0,1]])
    labels = dr.labels
    name = 'supervised_' + filename.split('/')[-1].split('.csv')[0]
    
    abw = ABW.compute(visu, labels)
    print('AWB complete for: {}'.format(name))
    cal = CAL.compute(visu, labels)
    print('CAL complete for: {}'.format(name))
    dsc = DSC.compute(visu, labels)
    print('DSC complete for: {}'.format(name))
    hm = HM.compute(visu, labels)
    print('HM complete for: {}'.format(name))
    nh = NH.compute(dr.iloc[:,[0,1,2]])
    print('NH complete for: {}'.format(name))
    sc = SC.compute(visu, labels)
    print('SC complete for: {}'.format(name))
    
    res ={}
    res[name] = {'abw':abw, 'cal':cal, 'dsc':dsc,'hm':hm, 'nh':nh, 'sc':sc}
    
    path = filename.split('dr')[0] + 'metrics/'
    # Serialize data into file:
    json.dump( res, open( path + name + '.json', 'w' ) )

    
    return res


def apply_highlow(drname, flatname):
    
    high = pd.read_csv(flatname)
    
    dr = pd.read_csv(drname)
    visu = np.array(dr.iloc[:,[0,1]])
    labels = dr.labels

    name = 'highlow_' + drname.split('/')[-1].split('.csv')[0]
    
    auclog = AUClogRNX.compute(high.iloc[:, 0:-2], visu)
    print('AUClogRNX complete for: {}'.format(name))
    
    cca = CCA.compute(high.iloc[:, 0:-2], visu)
    print('CCA complete for: {}'.format(name))
    
    cc = CC.compute(high.iloc[:, 0:-2], visu)
    print('CC complete for: {}'.format(name))
    
    lcmc = LCMC.compute(high.iloc[:, 0:-2], visu)
    print('LCMC complete for: {}'.format(name))
    
#     nerv = NeRV.compute(high.iloc[:, 0:-2], visu)
#     print('NeRV complete for: {}'.format(name))
    
#     nlm = NLM.compute(high.iloc[:, 0:-2], visu)
#     print('NLM complete for: {}'.format(name))
    
#     tnc = TandC.compute(high.iloc[:, 0:-2], visu)
#     print('TandC complete for: {}'.format(name))
    
#     stress = Stress.compute(high.iloc[:, 0:-2], visu)
#     print('Stress complete for: {}'.format(name))

    
    res ={}
    res[name] = {'auclog':auclog, 'cca':cca, 'cc':cc,'lcmc':lcmc, }
#                  'nerv':nerv, 'nlm':nlm, 'stress':stress, 'tnc': tnc}
    
    path = drname.split('dr')[0] + 'metrics/'
    json.dump(res, open( path + name + '.json', 'w' ))
    
    return res
  
def apply_scagnostics(drname):
    dr = pd.read_csv(drname)
    x = np.array(dr.iloc[:, [0]])
    y = np.array(dr.iloc[:, [1]])
    name = 'scagnostics_' + drname.split('/')[-1].split('.csv')[0]
    res = {name: scagnostics.compute(x, y)}
    
    path = drname.split('dr')[0] + 'metrics/'
    json.dump(res, open( path + name + '.json', 'w' ))
    
    return res

In [84]:
all_names


def evaluate_files(types=['supervised','scagnostics', 'highlow'], 
                   high_dir = '../../data/{}/mini_batch/flatfiles/',
                  dr_dir = '../../data/{}/mini_batch/dr/'):
    
    for flat in os.listdir(high_dir):
        if not flat.endswith('.csv'):
            continue
        if 'classes' not in flat:
            c = 'all'
            size = flat.split('flat')[-1].split('.csv')[0]
        else: 
            tmp = flat.split('.csv')[0]
            tmp = tmp.split('_')
            print(tmp)

            size = tmp[-2].split('flat')[-1]
            print(size)
            c = tmp[-1].split('classes')[0]
            print(c)
        
        for dr in os.listdir(dr_dir):
            if not dr.endswith('.csv'):
                continue
                
            if ('c{}'.format(c) in dr):
                print(dr)
                if 'supervised' in types:
                    apply_supervised(dr_dir + dr)
                if 'scagnostics' in types:
                    apply_scagnostics(dr_dir + dr)
                if 'highlow' in types:
                    apply_highlow(dr_dir + dr, high_dir+flat)
                    
            

        
    
evaluate_files(high_dir = '../../data/{}/mini_batch/flatfiles/'.format('MNIST'),
                  dr_dir = '../../data/{}/mini_batch/dr/'.format('MNIST'))

['MNIST', 'flat28', '10classes']
28
10
MNIST_size28_c10_UMAP_n5_d0.1.csv
AWB complete for: supervised_MNIST_size28_c10_UMAP_n5_d0.1
CAL complete for: supervised_MNIST_size28_c10_UMAP_n5_d0.1
DSC complete for: supervised_MNIST_size28_c10_UMAP_n5_d0.1
HM complete for: supervised_MNIST_size28_c10_UMAP_n5_d0.1
NH complete for: supervised_MNIST_size28_c10_UMAP_n5_d0.1
SC complete for: supervised_MNIST_size28_c10_UMAP_n5_d0.1
AUClogRNX complete for: highlow_MNIST_size28_c10_UMAP_n5_d0.1
CCA complete for: highlow_MNIST_size28_c10_UMAP_n5_d0.1
CC complete for: highlow_MNIST_size28_c10_UMAP_n5_d0.1
LCMC complete for: highlow_MNIST_size28_c10_UMAP_n5_d0.1
MNIST_size28_c10_MDS.csv


KeyboardInterrupt: 

In [29]:
high = pd.read_csv('../../data/stanfordfaces/mini_batch/flatfiles/stanfordfaces_flat50.csv')
dr = pd.read_csv('../../data/stanfordfaces/mini_batch/dr/stanfordfaces_size50_call_UMAP_n15_d0.5.csv')
dr.shape
dr

Unnamed: 0,UMAP_1,UMAP_2,labels,filename
0,7.503400,6.276552,left down,left down_74_74
1,5.383214,3.682751,mid left up,mid left up_317_317
2,0.621888,7.902173,mid right down,mid right down_374_374
3,4.269809,7.556913,mid left down,mid left down_276_276
4,-5.283817,7.042769,right down,right down_568_568
...,...,...,...,...
693,10.299421,11.949068,left down,left down_14_14
694,9.005250,4.143293,left up,left up_166_166
695,4.775728,5.389145,mid left up,mid left up_350_350
696,7.601295,9.128738,mid left down,mid left down_231_231


In [58]:
%time
path = '../../data/stanfordfaces/mini_batch/dr/stanfordfaces_size50_call_UMAP_n15_d0.5.csv'
res = apply_supervised(path)

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 4.77 µs
AWB complete for: supervised_stanfordfaces_size50_call_UMAP_n15_d0.5
CAL complete for: supervised_stanfordfaces_size50_call_UMAP_n15_d0.5
DSC complete for: supervised_stanfordfaces_size50_call_UMAP_n15_d0.5
HM complete for: supervised_stanfordfaces_size50_call_UMAP_n15_d0.5
NH complete for: supervised_stanfordfaces_size50_call_UMAP_n15_d0.5
SC complete for: supervised_stanfordfaces_size50_call_UMAP_n15_d0.5


In [83]:
%time
path = '../../data/stanfordfaces/mini_batch/dr/stanfordfaces_size50_call_UMAP_n15_d0.5.csv'
res = apply_scagnostics(path)
res

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs
/Users/morarica/Developer/DRflow/DRflow/examples


{'scagnostics_stanfordfaces_size50_call_UMAP_n15_d0.5': {'outlying': 0.0,
  'skewed': 0.6364482601161021,
  'clumpy': 0.024160320958999695,
  'sparse': 0.027796075460412125,
  'striated': 0.06747404844290658,
  'convex': 0.49258801328782537,
  'skinny': 0.7894510877773224,
  'stringy': 0.34934344879754786,
  'monotonic': 0.1744664112159387}}

In [77]:
%time
from datetime import datetime as time
start = time.now()
dr_path = '../../data/stanfordfaces/mini_batch/dr/stanfordfaces_size50_call_UMAP_n15_d0.5.csv'
flat_path = '../../data/stanfordfaces/mini_batch/flatfiles/stanfordfaces_flat50.csv'
res_dr = apply_unsupervised(dr_path, flat_path)
end = time.now()
print(end-start)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.81 µs
AUClogRNX complete for: stanfordfaces_size50_call_UMAP_n15_d0.5.csv
CCA complete for: stanfordfaces_size50_call_UMAP_n15_d0.5.csv


KeyboardInterrupt: 

In [20]:
path = '../../data/stanfordfaces/mini_batch/'
for folder in os.listdir(path):
    if not os.path.isdir(path + folder):
        continue
    ff = path + folder
    for f in os.listdir(ff):
        if not f.endswith('.csv'):
            continue
        print(f)
        file_path = ff + '/' + f
        df = pd.read_csv(file_path)
        df.labels = [i[0] for i in df.labels.str.split('_')]
        df.to_csv(file_path, index=False)  

stanfordfaces_size50_call_UMAP_n5_d0.5.csv
stanfordfaces_size50_call_UMAP_n15_d0.1.csv
stanfordfaces_size50_call_UMAP_n15_d0.5.csv
stanfordfaces_size50_call_UMAP_n5_d0.1.csv
stanfordfaces_size50_call_UMAP_n7_d0.8.csv
stanfordfaces_size50_call_TSNE_p30.csv
stanfordfaces_size50_call_UMAP_n2_d0.1.csv
stanfordfaces_size50_call_UMAP_n2_d0.5.csv
stanfordfaces_size50_call_TSNE_p100.csv
stanfordfaces_size50_call_TSNE_p5.csv
stanfordfaces_size50_call_SE_n7.csv
stanfordfaces_size50_call_UMAP_n7_d0.1.csv
stanfordfaces_size50_call_ISM_n2.csv
stanfordfaces_size50_call_TSNE_p10.csv
stanfordfaces_size50_call_UMAP_n2_d0.8.csv
stanfordfaces_size50_call_ISM_n7.csv
stanfordfaces_size50_call_UMAP_n7_d0.5.csv
stanfordfaces_size50_call_ISM_n15.csv
stanfordfaces_size50_call_ISM_n5.csv
stanfordfaces_size50_call_PCA.csv
stanfordfaces_size50_call_MDS.csv
stanfordfaces_size50_call_UMAP_n15_d0.8.csv
stanfordfaces_size50_call_GRP.csv
stanfordfaces_size50_call_UMAP_n5_d0.8.csv
stanfordfaces_flat50_10classes.csv
sta