In [1]:
from __future__ import division
import os
import pymongo as pm
import numpy as np
import scipy.stats as stats
import pandas as pd
import json

## INPUT: preprocessed group data CSV from mongo (see pilot2_analysis_sketchpad_basic)
## OUTPUT: full set of bdaInput files

__list of bdaInput files__: 
* sketchData CSV (generated inside generate_bdaInput_csv)
* test_examples TXT/JSON/CSV (generated inside generate_bdaInput_csv)
* costs JSON
* condition-lookup JSON
* similarity JSON

#### load in sketch data and filter to generate sketchData CSVs

In [2]:
# directory & file hierarchy
# every location below is resolved relative to the directory the notebook is run from
iterationName = 'pilot2'
exp_path = './'
exp_dir = './'
analysis_dir = os.getcwd()

# data/ and analysis/ both hang off the directory three levels above the working directory
_project_root = os.path.join(analysis_dir,'../../..')
data_dir = os.path.abspath(os.path.join(_project_root,'data',exp_path))
sketch_dir = os.path.abspath(os.path.join(_project_root,'analysis',exp_path,'sketches','pilot2'))

In [4]:
# preprocessed group data: the filtered version and the unfiltered version
filtered_csv = os.path.join(analysis_dir,'sketchpad_basic_pilot2_group_data.csv')
unfiltered_csv = os.path.join(analysis_dir,'sketchpad_basic_pilot2_group_data_unfiltered.csv')
D = pd.read_csv(filtered_csv)
DUNFIL = pd.read_csv(unfiltered_csv)

# lists of trials to filter out as well: one path per line, no header row,
# so the paths end up in column 0 of the parsed frame
incorrect_listing = pd.read_csv('./incorrect_trial_paths_pilot2.txt',header=None)
invalid_listing = pd.read_csv('./invalid_trial_paths_pilot2.txt',header=None)
incorrects = incorrect_listing[0].values
invalids = invalid_listing[0].values

def add_fnames(D):
    '''
    input: D is a data frame with gameID, trialNum and target columns
    output: new data frame (D itself is not modified) with two extra columns:
                :'fname', i.e., gameID_<gameID>_trial_<trialNum>_<target>.png
                :'fname_no_target', i.e., gameID_<gameID>_trial_<trialNum>.png
    '''
    with_target = []
    without_target = []
    for game_id, trial_num, target in zip(D['gameID'].values,D['trialNum'].values,D['target'].values):
        ## both names share the same game/trial stem
        stem = 'gameID_' + game_id + '_trial_' + str(trial_num)
        with_target.append(stem + '_' + target +'.png')
        without_target.append(stem +'.png')

    D = D.assign(fname=pd.Series(with_target).values)
    D = D.assign(fname_no_target=pd.Series(without_target).values)
    return D

def filter_out_incorrect(D, incorrects):
    '''
    input: D is a data frame with an 'fname' column, incorrects is a collection of filenames
    output: the rows of D whose 'fname' is NOT listed in incorrects
    '''
    is_incorrect = D['fname'].isin(incorrects)
    return D[~is_incorrect]

def filter_out_invalids(D, invalids):
    '''
    input: D is a data frame with an 'fname_no_target' column, invalids is a collection of filenames
    output: the rows of D whose 'fname_no_target' is NOT listed in invalids
    '''
    is_invalid = D['fname_no_target'].isin(invalids)
    return D[~is_invalid]
    
## attach the filename columns that the two filters match against
D = add_fnames(D)
DUNFIL = add_fnames(DUNFIL) ## version of dataframe with ALL trials, garbage games, incorrect trials, invalid trials

## partially filtered versions, both derived from the unfiltered frame
DNOINC = filter_out_incorrect(DUNFIL, incorrects) ## incorrect trials filtered out only
DNOINV = filter_out_invalids(DUNFIL, invalids) ## invalid trials filtered out only

## fully filtered version: both kinds of garbage filtered out
D = filter_out_incorrect(D, incorrects)
D = filter_out_invalids(D, invalids)

## single-argument print(...) calls produce the same output as the print statement
print(np.shape(D))
print('{} records in merged dataframe'.format(np.shape(D)[0]))

print('{} incorrect trials'.format(len(incorrects)))
print('{} invalid trials'.format(len(invalids)))
print(' ')
print('{} trials with NO GARBAGE filtered out'.format(DUNFIL.shape[0]))
print('{} trials with incorrects filtered out'.format(DNOINC.shape[0]))
print('{} trials with invalids filtered out'.format(DNOINV.shape[0]))
print('{} trials with ALL GARBAGE filtered out'.format(D.shape[0]))

(2682, 24)
2682 records in merged dataframe
194 incorrect trials
132 invalid trials
 
3072 trials with NO GARBAGE filtered out
2878 trials with incorrects filtered out
2940 trials with invalids filtered out
2682 trials with ALL GARBAGE filtered out


In [5]:
## now assign sketch_label and class_label for the other versions of D
import analysis_helpers as h
reload(h)
def add_extra_label_columns(D):
    '''
    input: D is a data frame with gameID, trialNum and target columns
    output: new data frame (D itself is not modified) with two extra columns:
                :'sketch_label', i.e., <last 12 characters of gameID>_<trialNum>
                :'category', i.e., the entry of h.objcat for the row's target
    '''
    game_ids = D['gameID'].values
    trial_nums = D['trialNum'].values
    sketch_label = [game_id[-12:] + '_' + str(trial_num)
                    for game_id, trial_num in zip(game_ids,trial_nums)]
    D = D.assign(sketch_label=pd.Series(sketch_label).values)

    # add class label (raises KeyError if a target is missing from h.objcat)
    category = [h.objcat[target] for target in D['target'].values]
    D = D.assign(category=pd.Series(category).values)
    return D

## every version of the data frame gets the same sketch_label / category columns
D, DUNFIL, DNOINC, DNOINV = map(add_extra_label_columns, (D, DUNFIL, DNOINC, DNOINV))

In [8]:
def generate_bdaInput_csv(D,filtration_level,train_test_split=True,
                          adaptor_type='sketch_avg_full25k',
                          split_type='splitbyobject',
                          out_dir='../models/bdaInput'):
    '''
    Build the bdaInput sketchData table from D and save it as a CSV.

    input:
        D: data frame with fname_no_target, sketch_label, condition, target,
           Distractor1, Distractor2 and Distractor3 columns
        filtration_level: string appended to the output filename; '' means no suffix
        train_test_split: if True, keep only the rows whose fname_no_target is listed in
           pilot2_<adaptor_type>_test_examples.json (read from the working directory);
           otherwise keep every row of D
        adaptor_type: selects the test examples file and is part of the output filename
        split_type: part of the output filename of the test-split version only
        out_dir: directory the CSV is written to
    output: nothing is returned; the CSV is written to out_dir and progress is printed.
            Columns are condition, sketchLabel, Target, Distractor1-3 (the three
            distractors of each row in sorted order) and coarseGrainedSketchInfo
            (condition_objectName ... e.g., further_knob).
    '''
    use_split = (train_test_split==True)

    if use_split:
        ### filter out training examples
        ## the examples file is only read in this mode, so the alldata version
        ## does not require it to exist
        test_examples = pd.read_json('pilot2_{}_test_examples.json'.format(adaptor_type),orient='records')
        keep_examples = [i.split('.')[0] + '.png' for i in test_examples[0].values]
    else:
        keep_examples = D['fname_no_target'].values ## keep all datapoints

    D0 = D[D['fname_no_target'].isin(keep_examples)]

    ## compose the rows of the new bdaInput CSV
    column_names = ['condition','sketchLabel','Target','Distractor1','Distractor2','Distractor3','coarseGrainedSketchInfo']
    rows = []
    for label, cond, target, d1, d2, d3 in zip(D0['sketch_label'].values,
                                               D0['condition'].values,
                                               D0['target'].values,
                                               D0['Distractor1'].values,
                                               D0['Distractor2'].values,
                                               D0['Distractor3'].values):
        d_list = sorted([d1, d2, d3])
        rows.append([cond, label, target] + d_list + ['{}_{}'.format(cond,target)])

    D2 = pd.DataFrame(rows,columns=column_names,dtype=object)
    print('{} datapoints x {} columns'.format(D2.shape[0],D2.shape[1]))

    if use_split:
        print('saving CSV with only test data')
        stem = 'sketchData_fixedPose_{}_{}_pilot2'.format(split_type,adaptor_type)
    else: ## run bda on ALL datapoints (not just test split)
        print('saving CSV including all datapoints')
        stem = 'sketchData_fixedPose_alldata_{}_pilot2'.format(adaptor_type)
    if len(filtration_level)>0:
        stem = '{}_{}'.format(stem,filtration_level)

    D2.to_csv(os.path.join(out_dir,stem + '.csv'))
    print('Saved out bdaInput CSV ... {}'.format(filtration_level))

#### now actually generate and save out the bdaInputCSV, both the split and alldata versions

In [9]:
## params
adaptor_type = 'sketch_avg_full25k'
split_type = 'splitbyobject'

## each version of the data frame, paired with the filtration level used in its filename
versions = [(D,''),
            (DNOINC,'no_incorrect'),
            (DNOINV,'no_invalid'),
            (DUNFIL,'unfiltered')]

# first split versions (train_test_split=True), now alldata versions (train_test_split=False)
for use_split in [True, False]:
    for version, filtration_level in versions:
        generate_bdaInput_csv(version,filtration_level,train_test_split=use_split,
                              adaptor_type = adaptor_type,split_type = split_type)

544 datapoints x 7 columns
saving CSV with only test data
Saved out bdaInput CSV ... 
573 datapoints x 7 columns
saving CSV with only test data
Saved out bdaInput CSV ... no_incorrect
574 datapoints x 7 columns
saving CSV with only test data
Saved out bdaInput CSV ... no_invalid
608 datapoints x 7 columns
saving CSV with only test data
Saved out bdaInput CSV ... unfiltered
2682 datapoints x 7 columns
saving CSV including all datapoints
Saved out bdaInput CSV ... 
2878 datapoints x 7 columns
saving CSV including all datapoints
Saved out bdaInput CSV ... no_incorrect
2940 datapoints x 7 columns
saving CSV including all datapoints
Saved out bdaInput CSV ... no_invalid
3072 datapoints x 7 columns
saving CSV including all datapoints
Saved out bdaInput CSV ... unfiltered


#### remove cost outliers

In [10]:
## load data in again and filter out cost outliers

def remove_outliers(X,column,n_sd=5):
    '''
    input: X is a data frame, column is the name of a numeric column of X,
           n_sd is how many standard deviations above the mean a value may lie
           before its row counts as an outlier
    output: new data frame (X itself is not modified) without the rows whose value
            in column exceeds mean + n_sd * sd. Only high values are removed.
            The sd is np.std with its defaults, i.e., the population sd.
    '''
    vals = X[column].values
    thresh = np.mean(vals) + n_sd*np.std(vals)
    ## select rows with a boolean mask rather than dropping by index label:
    ## dropping by label would also remove non-outlier rows that happen to
    ## share an index label with an outlier
    is_outlier = X[column] > thresh
    return X[~is_outlier].copy()
 
## make copy of D that has cost outliers removed
D2 = remove_outliers(D,'drawDuration')
D2 = remove_outliers(D2,'mean_intensity')
D2 = remove_outliers(D2,'numStrokes')

## sketches that were not cost outliers; the same set applies to both splits
remaining_sketches = list(np.unique(D2['sketch_label'].values))

splits = ['splitbyobject','alldata']
for split in splits:
    ### subset drawing data csv by sketches that are accounted for here (i.e., that were not cost outliers)
    ## read the CSV belonging to THIS split: the path previously used split_type, so the
    ## 'alldata' output was built from the splitbyobject file
    B = pd.read_csv('../models/bdaInput/sketchData_fixedPose_{}_{}_pilot2.csv'.format(split,adaptor_type))

    _B = B[B['sketchLabel'].isin(remaining_sketches)]
    ## NOTE(review): B is read without index_col, so the index column saved with the
    ## input CSV comes back as an ordinary column and is written out again -- confirm
    ## that whatever consumes this file expects that
    _B.to_csv('../models/bdaInput/sketchData_fixedPose_{}_{}_pilot2_costOutliersRemoved.csv'.format(split,adaptor_type))

#### make condition-lookup json

In [11]:
## generate condition-lookup.json to be able to pair sketches with condition
cond_json = {}
sketchID_list = np.unique(D['sketch_label'].values)
for i,d in enumerate(sketchID_list):
    cond = D[D['sketch_label']==d]['condition'].values[0]
    obj = D[D['sketch_label']==d]['target'].values[0]
    cond_json[d] = '{}_{}'.format(cond,obj)
    
## output json in the same format as the other cost json
output_path = '../models/bdaInput/condition-lookup.json'
with open(output_path, 'wb') as fp:
    json.dump(cond_json, fp)    

#### generate cost dictionary

In [12]:
def sigmoid(x,k=1,x0=0.5):
    '''
    logistic function of x with steepness k and midpoint x0 (value is 0.5 at x == x0)
    '''
    decay = np.exp(-k * (x - x0))
    return 1 / (1 + decay)

def add_rescaled_metric(X,metric,transform='maxnorm',k=5):
    '''
    input: X is a data frame, metric is the name of one of the (cost) metrics that you want to scale between 0 and 1
            transform options include:
                :'maxnorm', which means dividing each value by maximum in list
                :'minmaxnorm', which means subtracting the minimum from each value and
                               dividing by (maximum - minimum)
                :'sigmoid', which means passing each value through logistic function
                            centered on the median, with steepness k
            k is only used by the 'sigmoid' transform
    output: X with additional column 'rescaled_<metric>' that has the rescaled metric.
            X is modified in place and also returned; a scratch column 'vals' holding
            the raw metric is added as well.
    raises: ValueError if transform is not one of the options above
            (checked before X is touched)
    '''
    if transform not in ('maxnorm','minmaxnorm','sigmoid'):
        raise ValueError("unknown transform '{}': expected 'maxnorm', 'minmaxnorm' or 'sigmoid'".format(transform))

    vals = X[metric].values
    X['vals'] = vals ## scratch column, kept because downstream code may expect it

    ## np.true_divide keeps the division floating point for integer-valued metrics
    if transform=='maxnorm':
        rescaled_val = np.true_divide(vals,np.max(vals))
    elif transform=='minmaxnorm':
        ## if every value is identical the denominator is 0 and the result is NaN
        bottom_val = np.min(vals)
        top_val = np.max(vals)
        rescaled_val = np.true_divide(vals-bottom_val,top_val-bottom_val)
    else: ## 'sigmoid'
        rescaled_val = sigmoid(vals,k=k,x0=np.median(vals))

    X['rescaled_{}'.format(metric)] = rescaled_val
    return X

In [13]:
## actually add rescaled metric: one min-max rescaled column per cost metric
for cost_metric in ['numStrokes','mean_intensity','drawDuration']:
    D2 = add_rescaled_metric(D2,cost_metric,transform='minmaxnorm')

In [14]:
## generate cost dictionaries to try out with pragmatics model
## one json per metric, mapping sketch_label -> rescaled cost
sketchID_list = np.unique(D2['sketch_label'].values)
print(len(sketchID_list))
metrics = ['drawDuration','mean_intensity','numStrokes']

for metric in metrics:
    print(metric)
    rescaled_column = 'rescaled_{}'.format(metric)
    cost_json = {}
    for sketch_id in sketchID_list:
        costs = D2[D2['sketch_label']==sketch_id][rescaled_column].values
        ## every row carrying this label must agree on the cost
        assert len(np.unique(costs))==1
        cost_json[sketch_id] = costs[0]

    ## output json in the same format as the other cost json
    output_path = '../models/refModule/json/costs-fixedPose96-{}.json'.format(metric)
    with open(output_path, 'wb') as fp:
        json.dump(cost_json, fp)
        

2653
drawDuration
mean_intensity
numStrokes


### evaluate model predictions (bdaOutput)

In [15]:
## define set of models to compare
# model_zoo = ['fc6_combined_cost','fc6_combined_nocost','fc6_S0_cost','fc6_S0_nocost']
model_zoo = [
    'sketch-avg-full25k_combined_cost',
    'sketch-unroll-full25k_S0_cost',
]
this_model = model_zoo[0] ## the first model in the zoo is the one evaluated

## define paths to model predictions: one sub-directory of evaluateOutput per model
path_to_evaluate = '../models/evaluateOutput'
pred_path = os.path.join(path_to_evaluate,this_model)

### load in model param posterior (bdaOutput/**splitbyobjectParams.csv)

In [17]:
## get file with params from this model
this_params = os.path.join('../models/bdaOutput','{}_splitbyobjectParams.csv'.format(this_model))
params = pd.read_csv(this_params)

## sanity check: the exponentiated posteriorProb values must sum to 1 (to 12 decimal places)
total_posterior = np.sum(np.exp(params.posteriorProb.values))
assert np.round(total_posterior,12)==1

In [18]:
## get list of all predictives (accepted MCMC samples)
## i.e., every file in pred_path whose name ends in 'Predictives.csv'
pred_files = [fname for fname in os.listdir(pred_path) if fname.endswith('Predictives.csv')]

OSError: [Errno 2] No such file or directory: '../models/evaluateOutput/sketch-avg-full25k_combined_cost'