In [1]:
%matplotlib inline
##from skimage import data, io, filters
import os
import numpy as np
from PIL import Image
import matplotlib
from matplotlib import pyplot,pylab
plt = pyplot
import scipy
from __future__ import division
import seaborn as sns
sns.set_style('white')
import string
import pandas as pd
import json
import pymongo as pm

#### helper funcs 

In [6]:
## this helps to sort in human order
import re

def tryint(s):
    """Return int(s) when s parses as an integer, otherwise return s unchanged."""
    try:
        converted = int(s)
    except ValueError:
        # not an integer literal: hand the original value back untouched
        converted = s
    return converted
     
def alphanum_key(s):
    """ Split a string into its text and digit runs, converting the digit runs to ints.
        "z23a" -> ["z", 23, "a"]
    """
    # the capturing group keeps the digit runs in the split result
    pieces = re.split('([0-9]+)', s)
    return list(map(tryint, pieces))

def sort_nicely(l):
    """ Reorder the list `l` in place into the order humans expect
        (embedded numbers compare numerically). Returns None.
    """
    l[:] = sorted(l, key=alphanum_key)
    
def load_text(path):
    """Return the first line of the text file at `path` with special tokens replaced.

    Only the first line is read; any further lines are ignored. The line's
    trailing newline, if present, is kept. An empty file yields ''.

    Replacements:
        '<DIA>' -> '-'
        '<UKN>' -> '___'
    """
    with open(path, 'r') as f:
        # readline() returns '' at EOF, so an empty file no longer raises IndexError
        utt = f.readline()
    # swap the special tokens for their printable stand-ins
    return utt.replace('<DIA>', '-').replace('<UKN>', '___')

#### setup

In [7]:
# configuration: local stimulus directory, S3 bucket name, and mongo collection name
upload_dir = './context_agnostic_False_rs_33'
bucket_name = 'shapenet-chairs-speaker-eval'
dataset_name = 'shapenet_chairs_speaker_eval'
# speaker conditions; each triplet directory holds one '<condition>_utterance.txt' per condition
conditions = ['literal','pragmatic']
# lookup from index 0..25 to the lowercase letter 'a'..'z'
alphanum = dict(enumerate(string.ascii_lowercase))

In [8]:
# collect every entry of upload_dir whose name starts with 'triplet', as a path
# under upload_dir, then put the paths in human (natural) order
triplet_dirs = [
    os.path.join(upload_dir, entry)
    for entry in os.listdir(upload_dir)
    if entry != '.DS_Store' and entry.startswith('triplet')
]
sort_nicely(triplet_dirs)

In [9]:
# go through and rename the images from 0,1,2 to
# '<shapenetid>_distractor1.png', '<shapenetid>_distractor2.png', '<shapenetid>_target.png'.
# shape_net_ids.npy is read as the ids for (distractor1, distractor2, target), in that order.
_roles = ['distractor1','distractor2','target']
for this_triplet in triplet_dirs:
    # Check each numbered image on its own (rather than only 0.png) so that a
    # triplet left half-renamed by an interrupted run is finished on the next run.
    _pending = [(_ind, _role) for _ind, _role in enumerate(_roles)
                if os.path.exists(os.path.join(this_triplet, '{}.png'.format(_ind)))]
    if not _pending:
        # nothing left to rename here (already done on an earlier run)
        continue
    _shapenet_ids = np.load(os.path.join(this_triplet,'shape_net_ids.npy'))
    shapenet_id_dict = dict(zip(_roles,_shapenet_ids))
    for _ind, _role in _pending:
        os.rename(os.path.join(this_triplet, '{}.png'.format(_ind)),
                  os.path.join(this_triplet, '{}_{}.png'.format(shapenet_id_dict[_role], _role)))

In [10]:
# _shapenet_ids = np.load(os.path.join(this_triplet,'shape_net_ids.npy'))
# shapenet_id_dict = dict(zip(['distractor1','distractor2','target'],_shapenet_ids))

In [11]:
# literal_utt = load_text(os.path.join(this_triplet,'literal_utterance.txt'))
# pragmatic_utt = load_text(os.path.join(this_triplet,'pragmatic_utterance.txt'))

#### upload stims to s3

In [12]:
import boto
# Safety switch: set to 1 to actually upload; left at 0 so re-running the
# notebook does not touch S3.
runThis = 0
if runThis:
    conn = boto.connect_s3()
    # reuse the bucket if it already exists, otherwise create it
    b = conn.lookup(bucket_name)
    if b is None:
        b = conn.create_bucket(bucket_name)
    for ind,this_triplet in enumerate(triplet_dirs):
        # every file in the triplet directory whose name ends in 'png'
        ims = [i for i in os.listdir(this_triplet) if i[-3:]=='png']
        for im in ims:
            # single pre-formatted argument prints identically on Python 2 and 3
            print('{} {}'.format(ind, im))
            # the S3 key is the bare filename, with no per-triplet prefix
            k = b.new_key(im)
            k.set_contents_from_filename(os.path.join(this_triplet,im))
            k.set_acl('public-read')

#### build stimulus dictionary & upload metadata to mongo

In [14]:
print('Generating list of triplets and their attributes...')

def _image_record(png_names, role):
    """Return the metadata dict for the image in `png_names` whose filename contains `role`.

    The first matching filename is used (IndexError if there is none). The
    shapenet id is taken as the part of the filename before the first '_'.
    """
    filename = [name for name in png_names if role in name][0]
    return {'filename': filename,
            'url': 'https://s3.amazonaws.com/{}/{}'.format(bucket_name, filename),
            'shapenetid': filename.split('_')[0]}

# One list per dataframe column; entry k of every list describes the same
# (condition, triplet) pair.
condition = []
family = []
utt = []
target = []
distractor1 = []
distractor2 = []
games = [] # this field keeps track of which games this triplet has been shown in
shuffler_ind = []

## generate permuted list of triplet indices in order to be able to retrieve triplets pseudorandomly
inds = np.arange(len(conditions)*len(triplet_dirs))
shuffled_inds = np.random.RandomState(0).permutation(inds)  # fixed seed -> same permutation every run
counter = 0
for cond_ind,this_condition in enumerate(conditions):
    for trip_ind,this_triplet in enumerate(triplet_dirs):
        ims = [i for i in os.listdir(this_triplet) if i[-3:]=='png']
        # roll filename, url and shapenetid into targ, d1, d2 dictionaries
        _target = _image_record(ims, 'target')
        _distractor1 = _image_record(ims, 'distractor1')
        _distractor2 = _image_record(ims, 'distractor2')
        # family is the triplet directory's own name; basename (not split('/'))
        # because the path was built with os.path.join
        this_family = os.path.basename(this_triplet)
        this_utt = load_text(os.path.join(this_triplet,'{}_utterance.txt'.format(this_condition)))
        # append to lists to prep for dataframe
        condition.append(this_condition)
        family.append(this_family)
        utt.append(this_utt)
        target.append(_target)
        distractor1.append(_distractor1)
        distractor2.append(_distractor2)
        games.append([])
        # store a plain Python int rather than a numpy scalar, since these
        # values are later written out as JSON
        shuffler_ind.append(int(shuffled_inds[counter]))
        counter += 1

Generating list of triplets and their attributes...


In [15]:
print('Generating pandas dataframe...')
headers = ['condition','family','utt','target','distractor1','distractor2','games','shuffler_ind']
table = [condition,family,utt,target,distractor1,distractor2,games,shuffler_ind]
# each list in `table` is loaded as one row labelled by its header;
# transposing turns those labelled rows into the dataframe's columns
df = pd.DataFrame(table, index=headers).T

Generating pandas dataframe...


In [16]:
## write the stimulus records to '<dataset_name>.js' as a JSON list (one dict per dataframe row)
print('Saving out json dictionary out to file...')
stimdict = df.to_dict(orient='records')
_out_path = '{}.js'.format(dataset_name)
with open(_out_path, 'w') as fout:
    json.dump(stimdict, fout)

Saving out json dictionary out to file...


In [17]:
### next todo is to upload this JSON to initialize the new stimulus collection
print('next todo is to upload this JSON to initialize the new stimulus collection...')
import json
# Read the records back from '<dataset_name>.js'. The context manager closes
# the handle, and plain 'r' is a valid mode on both Python 2 and 3.
with open('{}.js'.format(dataset_name), mode='r') as fin:
    J = json.load(fin)

next todo is to upload this JSON to initialize the new stimulus collection...


In [72]:
##assert len(J)==len(all_files)
# report the collection name and the number of records loaded into J
# (single-argument print calls behave the same on Python 2 and 3)
print('dataset_name: {}'.format(dataset_name))
print(len(J))

dataset_name: shapenet_chairs_speaker_eval
2408


In [73]:
# set vars
auth = pd.read_csv('auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'rxdhawkins.me' ## cocolab ip address (not used below: the client connects to 127.0.0.1)

try:
    from urllib.parse import quote_plus  # Python 3
except ImportError:
    from urllib import quote_plus  # Python 2

# have to fix this to be able to analyze from local
# Credentials are percent-escaped so characters such as '@', ':' or '/' in the
# password cannot corrupt the URI; str() covers an all-digit password that
# read_csv parsed as a number.
# NOTE(review): assumes auth.txt stores the raw, not already escaped, password - confirm.
conn = pm.MongoClient('mongodb://{}:{}@127.0.0.1'.format(quote_plus(user), quote_plus(str(pswd))))
db = conn['stimuli']
coll = db[dataset_name]

In [74]:
## insert every record of J into the collection, one document at a time
reallyRun = 1  # set to 0 to skip the inserts
if reallyRun:
    _total = len(J)
    for _rec_ind, _record in enumerate(J):
        # progress line every 100 records
        if _rec_ind % 100 == 0:
            print ('%d of %d' % (_rec_ind, _total))
        coll.insert_one(_record)

0 of 2408
100 of 2408
200 of 2408
300 of 2408
400 of 2408
500 of 2408
600 of 2408
700 of 2408
800 of 2408
900 of 2408
1000 of 2408
1100 of 2408
1200 of 2408
1300 of 2408
1400 of 2408
1500 of 2408
1600 of 2408
1700 of 2408
1800 of 2408
1900 of 2408
2000 of 2408
2100 of 2408
2200 of 2408
2300 of 2408
2400 of 2408


In [None]:
## fraction of records (those with shuffler_ind >= 0) whose 'games' list is non-empty
a = coll.find({'shuffler_ind':{'$gte':0}})
# number of entries in each record's 'games' list
numGames = [len(rec['games']) for rec in a]
b = np.array(numGames)
# mean of the boolean mask = proportion of records with at least one game
print(np.mean(b>0))

0.0
