# imports

In [None]:
import os
import sys
import warnings

import pymongo as pm
import pandas as pd

# Silence deprecation warnings and the numpy binary-size-change messages so
# they do not clutter the notebook output.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

## add helpers to python path
# proj_dir is only assigned in the directory-setup cell further down, so a
# top-to-bottom run used to fail here with a NameError. Derive the same value
# locally so this cell can run first.
proj_dir = os.path.abspath('../../')
helpers_dir = os.path.join(proj_dir,'analysis','helpers')
if helpers_dir not in sys.path:
    sys.path.append(helpers_dir)

# Import the analysis helpers (module lives in <proj_dir>/analysis/helpers)
import df_generation_helpers as h
# reload() is a builtin on Python 2 but must be imported on Python 3
if sys.version_info[0]>=3:
    from importlib import reload

## directory setup

In [None]:
# directory & file hierarchy
# All locations are derived from the project root, which is assumed to sit two
# levels above the directory the notebook is launched from.
proj_dir = os.path.abspath('../../')
analysis_dir = os.getcwd()
results_dir = os.path.join(proj_dir,'data')
experiment_dir = os.path.join(results_dir,'experiment')
sketch_dir = os.path.abspath(os.path.join(proj_dir,'sketches'))
# NOTE(review): csv_dir is used by every to_csv() call in this notebook but was
# never assigned, which raised a NameError. The cell that reloads the
# 'unnormalized' CSVs reads the same filenames from results_dir, so the CSVs
# are written there -- confirm this is the intended location.
csv_dir = results_dir

In [None]:
# set vars
auth = pd.read_csv('auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'rxdhawkins.me' ## cocolab ip address

# Credentials embedded in a MongoDB URI must be percent-escaped; otherwise a
# password containing ':', '/', '@' or '%' produces an invalid or misparsed URI.
try:
    from urllib.parse import quote_plus  # Python 3
except ImportError:
    from urllib import quote_plus  # Python 2

# have to fix this to be able to analyze from local
# NOTE(review): the client connects to 127.0.0.1 rather than `host`;
# presumably the remote database is reached through a local tunnel -- confirm.
import pymongo as pm
conn = pm.MongoClient('mongodb://' + quote_plus(user) + ':' + quote_plus(pswd) + '@127.0.0.1')
db = conn['3dObjects']
coll = db['graphical_conventions']

# which iteration name should we use?
iterationName1 = 'run3_size4_waiting'
iterationName2 = 'run4_generalization'
iterationName3 = 'run5_submitButton_testing'

In [None]:
## mturk worker IDs belonging to the research team; passed to the helper that
## selects valid games so these workers can be ignored
jefan = [
    'A1MMCS8S8CTWKU',
    'A1MMCS8S8CTWKV',
    'A1MMCS8S8CTWKS',
]
hawkrobe = ['A1BOIDKD33QSDK']
megsano = ['A1DVQQLVZR7W6I']
# flatten the per-person lists into one list, preserving order
researchers = sum([jefan, hawkrobe, megsano], [])

In [None]:
def _find_events(iteration_name, event_type):
    """Return a cursor over all records of `event_type` logged under
    `iteration_name`, sorted by their 'time' field."""
    query = {'$and': [{'iterationName': iteration_name}, {'eventType': event_type}]}
    return coll.find(query).sort('time')

## run 3 - get total number of stroke and clickedObj events in the collection as a whole
S1 = _find_events(iterationName1, 'stroke')
C1 = _find_events(iterationName1, 'clickedObj')

## run 4 - get total number of stroke and clickedObj events in the collection as a whole
S2 = _find_events(iterationName2, 'stroke')
C2 = _find_events(iterationName2, 'clickedObj')

## run 5 - get total number of stroke and clickedObj events in the collection as a whole
S3 = _find_events(iterationName3, 'stroke')
C3 = _find_events(iterationName3, 'clickedObj')

# print(...) with one parenthesized argument gives identical output on
# Python 2 and Python 3; the bare print statement was a SyntaxError on Python 3.
# NOTE(review): Cursor.count() is deprecated in newer pymongo releases in
# favour of Collection.count_documents(); kept here so the cell still works
# with whichever pymongo version this project pins -- confirm before upgrading.
print(str(S1.count() + S2.count() + S3.count()) + ' stroke records in the database.')
print(str(C1.count() + C2.count() + C3.count()) + ' clickedObj records in the database.') # previously 722 so 882 ideally 

## generate group dataframes

In [None]:
reload(h)
## every distinct gameid in the collection is a candidate game
games = coll.distinct('gameid')

## options shared by all three runs when selecting complete and valid games
_valid_game_opts = dict(researchers=researchers, tolerate_undefined_worker=False)

## complete and valid games, one list per iteration
run3_complete_games = h.get_complete_and_valid_games(
    games, coll, iterationName1, **_valid_game_opts)
run4_complete_games = h.get_complete_and_valid_games(
    games, coll, iterationName2, **_valid_game_opts)
run5_complete_games = h.get_complete_and_valid_games(
    games, coll, iterationName3, **_valid_game_opts)

In [None]:
reload(h)
## build one dataframe per run from that run's complete games
## (per the original note: filtering out games with low accuracy, timeouts)
_build_dataframe = h.generate_dataframe  # bound after reload so the fresh helper is used
D_run3 = _build_dataframe(coll, run3_complete_games, iterationName1, experiment_dir)
D_run4 = _build_dataframe(coll, run4_complete_games, iterationName2, experiment_dir)
D_run5 = _build_dataframe(coll, run5_complete_games, iterationName3, experiment_dir)

In [None]:
## filtering outliers
# each run is passed through h.filter_crazies twice: first on 'numStrokes',
# then the result is filtered again on 'numCurvesPerSketch'
D_run3_filtered = h.filter_crazies(h.filter_crazies(D_run3, 'numStrokes'), 'numCurvesPerSketch')
D_run4_filtered = h.filter_crazies(h.filter_crazies(D_run4, 'numStrokes'), 'numCurvesPerSketch')
D_run5_filtered = h.filter_crazies(h.filter_crazies(D_run5, 'numStrokes'), 'numCurvesPerSketch')

# keep only the rows whose 'outcome' equals True (drops incorrect trials)
D_run3_correct = D_run3_filtered.loc[D_run3_filtered['outcome'] == True]
D_run4_correct = D_run4_filtered.loc[D_run4_filtered['outcome'] == True]
D_run5_correct = D_run5_filtered.loc[D_run5_filtered['outcome'] == True]

# independent (deep) copies to be normalized, so the *_correct frames stay untouched
D_run3_normalized = D_run3_correct.copy()
D_run4_normalized = D_run4_correct.copy()
D_run5_normalized = D_run5_correct.copy()

reload(h)

# columns that are passed through h.grand_mean_normalize for every run
_NORMALIZED_FEATURES = ('numStrokes', 'drawDuration', 'numCurvesPerSketch', 'meanPixelIntensity')

def _normalize_all_features(D, games):
    """Apply h.grand_mean_normalize to `D` once per feature in
    _NORMALIZED_FEATURES, feeding each call's result into the next, and
    return the final dataframe. `games` is forwarded unchanged."""
    for feature in _NORMALIZED_FEATURES:
        D = h.grand_mean_normalize(D, feature, games)
    return D

D_run3_normalized = _normalize_all_features(D_run3_normalized, run3_complete_games)
D_run4_normalized = _normalize_all_features(D_run4_normalized, run4_complete_games)
# Fixed copy-paste bug: run 5 used to be computed from D_run4_normalized with
# run4_complete_games, and each line discarded the previous line's result, so
# the "run5" normalized output was really run-4 data. It now starts from the
# run-5 copy, uses the run-5 game list, and chains like runs 3 and 4.
D_run5_normalized = _normalize_all_features(D_run5_normalized, run5_complete_games)


# writing out data
# every file is named graphical_conventions_<run>_<stage>.csv inside csv_dir

def _csv_path(run, stage):
    """Return the output path in csv_dir for the given run label and stage."""
    return os.path.join(csv_dir, 'graphical_conventions_{}_{}.csv'.format(run, stage))

## raw, unfiltered
D_run3.to_csv(_csv_path('run3', 'raw'))
D_run4.to_csv(_csv_path('run4', 'raw'))
D_run5.to_csv(_csv_path('run5', 'raw'))

## filtered, but includes correct and incorrect trials
D_run3_filtered.to_csv(_csv_path('run3', 'filtered'))
D_run4_filtered.to_csv(_csv_path('run4', 'filtered'))
D_run5_filtered.to_csv(_csv_path('run5', 'filtered'))

## filtered, and correct trials only
D_run3_correct.to_csv(_csv_path('run3', 'unnormalized'))
D_run4_correct.to_csv(_csv_path('run4', 'unnormalized'))
D_run5_correct.to_csv(_csv_path('run5', 'unnormalized'))

## filtered, correct trials only, and normalized within subject
D_run3_normalized.to_csv(_csv_path('run3', 'normalized'))
D_run4_normalized.to_csv(_csv_path('run4', 'normalized'))
D_run5_normalized.to_csv(_csv_path('run5', 'normalized'))

### load pre-existing dataframes and render the sketches to PNG for feature extraction

In [None]:
# Re-read the 'unnormalized' CSV of each run from results_dir, replacing the
# in-memory *_correct dataframes with the versions stored on disk.
_csv_name = 'graphical_conventions_{}_{}.csv'

fpath = os.path.join(results_dir, _csv_name.format('run3', 'unnormalized'))
D_run3_correct = pd.read_csv(fpath)

fpath = os.path.join(results_dir, _csv_name.format('run4', 'unnormalized'))
D_run4_correct = pd.read_csv(fpath)

fpath = os.path.join(results_dir, _csv_name.format('run5', 'unnormalized'))
D_run5_correct = pd.read_csv(fpath)

In [None]:
reload(h)
# hand each run's correct-trial dataframe to h.save_sketches together with
# sketch_dir, the 'combined' argument and the run label, in run order
for run_label, correct_trials in (
    ('run3', D_run3_correct),
    ('run4', D_run4_correct),
    ('run5', D_run5_correct),
):
    h.save_sketches(correct_trials, sketch_dir, 'combined', run_label)