# explore sketch object similarities

The purpose of this notebook is to:
* Measure how much more similar successive sketches of the SAME object are than sketches of DIFFERENT objects drawn in consecutive repetitions
* Measure how much more similar sketches of the SAME object are within a repetition (a proxy for resemblance) than arbitrarily sampled sketches from within a repetition. Examine how this similarity changes over time. 

In [1]:
import os, sys
import shutil
## add helpers to python path
if os.path.join('..','helpers') not in sys.path:
    sys.path.append(os.path.join('..','helpers'))

from embeddings import *
from extract_features import *
    
import pymongo as pm
import numpy as np
import scipy.stats as stats
import pandas as pd
import json
import requests
import re
from io import BytesIO
from PIL import Image, ImageFilter
import object_mask_utils as u
import socket
import glob
from scipy.stats import entropy

import matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')

from skimage import io, img_as_float
import base64

from IPython.core.pylabtools import figsize, getfigs

from IPython.display import clear_output
import importlib
import time

from collections import Counter
import operator

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

### setup paths

In [2]:
## experiment to analyze; options: refgame1.2, refgame2.0
curr_exp = 'refgame1.2'

## directory & file hierarchy
## (the project root is resolved two levels up from the current working directory)
analysis_dir = os.getcwd()
proj_dir = os.path.abspath('../../')
data_dir = os.path.join(proj_dir, 'data')
experiment_dir, feat_dir = [os.path.join(data_dir, subdir, curr_exp)
                            for subdir in ('experiment', 'features')]

## paths to the pre-extracted visual features & their metadata
path_to_feats, path_to_meta = [os.path.join(feat_dir, fname)
                               for fname in ('FEATURES_vgg_FC6.npy', 'METADATA.csv')]

## reload the helper module, then pull out the dictionaries that map between
## shapenet ids and the graphical conventions naming scheme
importlib.reload(u)
G2S, S2G = u.GC2SHAPENET, u.SHAPENET2GC

## Measure how much more similar successive sketches of the SAME object are than sketches of DIFFERENT objects drawn in consecutive repetitions
To do this analysis, we conduct either a "direct" analysis, asking how much more strongly drawings of Chair A "resemble" Chair A than they resemble Chair B, OR an "indirect" analysis, asking how much more strongly drawings of Chair A are similar to one another than are drawings of Chair B. Further, we ask whether Repetition 1 drawings display STRONGER resemblance than Repetition 8 drawings – validating the notion that resemblance goes down over time.

#### extract VGG features for each of the object stimuli

In [4]:
## path to object images
path_to_actual_stims = os.path.join(data_dir, 'stimuli','actual')
path_to_candidate_stims = os.path.join(data_dir, 'stimuli','candidate')
stim_list_shapenet = ['{}.png'.format(i) for i in list(S2G.keys())]

## make sure the destination directory exists before copying into it:
## shutil.copyfile does not create missing parent directories
os.makedirs(path_to_actual_stims, exist_ok=True)

## get paths of all candidate stimuli
candidate_image_paths = sorted(list_files(path_to_candidate_stims,'png'))
print('Number of candidate stimuli we originally considered: {}'.format(len(candidate_image_paths)))

## only copy over those candidate stims that were actually used in our experiment
## (filenames are matched against a set; basename is computed once per path)
stims_used = set(stim_list_shapenet)
for this_candidate_stim in candidate_image_paths:
    stim_fname = os.path.basename(this_candidate_stim)
    if stim_fname in stims_used:
        shutil.copyfile(this_candidate_stim, os.path.join(path_to_actual_stims, stim_fname))

## get paths to all actual stimuli
actual_image_paths = sorted(list_files(path_to_actual_stims,'png'))
print('Number of actual stimuli we used: {}'.format(len(actual_image_paths)))

Number of candidate stimuli we originally considered: 55
Number of actual stimuli we used: 16


In [8]:
## configure & instantiate the feature extractor
## NOTE(review): `torch` is never imported by name in this notebook; presumably it
## arrives via one of the star imports from the helpers -- confirm
layer_name = 5
data_type = 'images'
extractor = FeatureExtractor(
    actual_image_paths,
    layer=layer_name,
    data_type=data_type,
    use_cuda=torch.cuda.is_available(),
)

## run the extractor; the returned paths are flattened into a one-column metadata frame
features, paths = extractor.extract_feature_matrix()
meta = pd.DataFrame({'path' : list(flatten_list(paths))})

## directory the object features get written to (created on first use)
obj_feat_out_dir = os.path.join(data_dir, 'features','stimuli')
if not os.path.exists(obj_feat_out_dir):
    os.makedirs(obj_feat_out_dir)
features_fname = 'FEATURES_vgg_{}'.format(layer_name)

## write the feature matrix (.npy) and its metadata (.csv) side by side;
## the metadata row index is stored in a column named 'feature_ind'
features_out_path = os.path.join(obj_feat_out_dir, '{}.npy'.format(features_fname))
meta_out_path = os.path.join(obj_feat_out_dir, 'METADATA.csv')
np.save(features_out_path, features)
meta.to_csv(meta_out_path, index=True, index_label='feature_ind')

print('Saved features and meta out!')
print(os.listdir(obj_feat_out_dir))

stopped!
['FEATURES_vgg_5.npy', 'METADATA.csv']


In [59]:
## sketch features were pre-extracted earlier: read the metadata table and the feature array from disk
M_sketch = pd.read_csv(path_to_meta)
F_sketch = np.load(path_to_feats)

## object features were extracted above in this notebook: alias them to match the sketch naming
## (these are references, not copies -- later edits to M_obj also change `meta`)
F_obj, M_obj = features, meta

In [60]:
## APPLY PREPROCESSING TO THE SKETCH METADATA
## Every column below is computed for the whole frame at once, instead of writing one
## cell at a time inside iterrows() or re-masking the full frame for every group.

## add gameID, repetition number, and object ID information, all parsed from the filename
## NOTE(review): assumes filenames have at least three '_'-separated pieces, with the
## gameID first, the objectID as pieces 2-3, and the repetition number last -- confirm
sketch_fnames = [p.split('/')[-1] for p in M_sketch['path']]
M_sketch['gameID'] = [f.split('_')[0] for f in sketch_fnames]
M_sketch['repetition_number'] = [f.split('.')[0].split('_')[-1] for f in sketch_fnames]
M_sketch['objectID'] = [f.split('_')[1] + '_' + f.split('_')[2] for f in sketch_fnames]

## add condition information
## how many times is this object drawn in that game? if 2, then control condition; if 8, then repeated condition
group_keys = ['gameID', 'objectID']
num_instances = M_sketch.groupby(group_keys)['path'].transform('size')
## any other count maps to NaN, so malformed groups are easy to find afterwards
M_sketch['condition'] = num_instances.map({2: 'control', 8: 'repeated'})

## report the (gameID, objectID) groups whose sketch count was neither 2 nor 8
bad_groups = M_sketch.loc[M_sketch['condition'].isna(), group_keys].drop_duplicates()
if len(bad_groups) > 0:
    print('Something went wrong with the number of sketches!')
    print(bad_groups.to_string(index=False))

## add phase information
## initialize all phase to "repeated" then overwrite the first and last with "pre" and "post" labels, respectively
repetition = M_sketch['repetition_number']
## when repetition number is '00', you know that it is the pretest for ALL drawings in both conditions
is_pre = repetition == '00'
## for the repeated condition, the post phase repetition is '07'; for the control condition, the post phase repetition is '01'
is_post = (((M_sketch['condition'] == 'repeated') & (repetition == '07')) |
           ((M_sketch['condition'] == 'control') & (repetition == '01')))
M_sketch['phase'] = 'repeated'
M_sketch.loc[is_pre, 'phase'] = 'pre'
M_sketch.loc[is_post, 'phase'] = 'post'

## groups without a valid condition cannot get a "post" label
if len(bad_groups) > 0:
    print('Something went wrong when trying to add phase information!')
                       

In [70]:
## APPLY PREPROCESSING TO THE OBJECT METADATA
## derive each image's filename (last path component) and its shapenet ID
## (the filename up to its first '.') as whole columns rather than row by row
obj_fnames = [p.split('/')[-1] for p in M_obj['path']]
M_obj['filename'] = obj_fnames
M_obj['shapenetID'] = [f.split('.')[0] for f in obj_fnames]
                

In [68]:
## unified sketch dataframe: metadata columns followed by the feature columns
## NOTE(review): join() aligns on the row index, so this presumes M_sketch and F_sketch
## have matching row order and count -- confirm
sketch_feature_df = pd.DataFrame(F_sketch)
S = M_sketch.join(sketch_feature_df)

## unified object dataframe, built the same way from the object metadata and features
obj_feature_df = pd.DataFrame(F_obj)
O = M_obj.join(obj_feature_df)

Unnamed: 0,path,filename,shapenetID
0,/Users/judithfan/graphical_conventions/data/st...,1d1641362ad5a34ac3bd24f986301745.png,1d1641362ad5a34ac3bd24f986301745
1,/Users/judithfan/graphical_conventions/data/st...,1da9942b2ab7082b2ba1fdc12ecb5c9e.png,1da9942b2ab7082b2ba1fdc12ecb5c9e
2,/Users/judithfan/graphical_conventions/data/st...,23b0da45f23e5fb4f4b6538438a0b930.png,23b0da45f23e5fb4f4b6538438a0b930
3,/Users/judithfan/graphical_conventions/data/st...,2448d9aeda5bb9b0f4b6538438a0b930.png,2448d9aeda5bb9b0f4b6538438a0b930
4,/Users/judithfan/graphical_conventions/data/st...,2b5953c986dd08f2f91663a74ccd2338.png,2b5953c986dd08f2f91663a74ccd2338
5,/Users/judithfan/graphical_conventions/data/st...,2e291f35746e94fa62762c7262e78952.png,2e291f35746e94fa62762c7262e78952
6,/Users/judithfan/graphical_conventions/data/st...,2eaab78d6e4c4f2d7b0c85d2effc7e09.png,2eaab78d6e4c4f2d7b0c85d2effc7e09
7,/Users/judithfan/graphical_conventions/data/st...,309674bdec2d24d7597976c675750537.png,309674bdec2d24d7597976c675750537
8,/Users/judithfan/graphical_conventions/data/st...,30afd2ef2ed30238aa3d0a2f00b54836.png,30afd2ef2ed30238aa3d0a2f00b54836
9,/Users/judithfan/graphical_conventions/data/st...,30dc9d9cfbc01e19950c1f85d919ebc2.png,30dc9d9cfbc01e19950c1f85d919ebc2


In [None]:
## extract attributes from filename 


In [None]:
## take repetition 1 drawings only


In [None]:
## compare repetition 1 drawings of chair A to chair B (heatmap)
