In [1]:
from __future__ import division

import numpy as np
import os
from glob import glob
import scipy

from PIL import Image
from copy import deepcopy

from sklearn import linear_model, datasets, neighbors
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm

%matplotlib inline
from scipy.misc import imread, imresize
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import torch
import torchvision.models as models
import torch.nn as nn
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch.autograd import Variable

import pandas as pd
import pickle

# Target image width, height and channel count. load_images / normalize_images
# use these both as the thumbnail size and to size each flattened pixel row
# (VGG_SIZE_X * VGG_SIZE_Y * VGG_SIZE_Z values per image).
VGG_SIZE_X = 224
VGG_SIZE_Y = 224
VGG_SIZE_Z = 3

import sys
sys.path.insert(0, '/home/rslee/sketchfeat/sketches')


import embedding as emb
reload(emb)
from embedding import *


In [2]:
def normalize(X, u, std):
    """Center ``X`` by ``u`` and scale the result by ``std``.

    Returns ``(X - u) / std`` as a new object; the ``X`` passed in is not
    modified.
    """
    centered = X - u
    return centered / std

def list_files(path, ext='png'):
    """Recursively collect files under ``path`` whose names match ``*.<ext>``.

    Every directory visited by ``os.walk(path)`` is globbed for the pattern and
    the matches are returned as one flat list, in walk order.
    """
    pattern = '*.%s' % ext
    matches = []
    for dirpath, _dirnames, _filenames in os.walk(path):
        matches.extend(glob(os.path.join(dirpath, pattern)))
    return matches

def load_images(paths, num_sketches):
    """Load images into a flat ``uint8`` pixel matrix, one row per image.

    Each image is opened, converted to RGB, resized to exactly
    ``VGG_SIZE_X`` x ``VGG_SIZE_Y`` pixels and flattened into a row of
    ``VGG_SIZE_X * VGG_SIZE_Y * VGG_SIZE_Z`` values.

    Parameters
    ----------
    paths : sequence of str
        Image file paths; image ``i`` fills row ``i`` of the result.
    num_sketches : int
        Number of rows to allocate. Rows past ``len(paths)`` stay zero.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape
        ``(num_sketches, VGG_SIZE_X * VGG_SIZE_Y * VGG_SIZE_Z)``.

    Raises
    ------
    IndexError
        If ``paths`` holds more than ``num_sketches`` entries.
    """
    row_len = VGG_SIZE_X * VGG_SIZE_Y * VGG_SIZE_Z
    # zeros instead of empty: rows that are never written are deterministic
    # rather than uninitialized memory.
    X = np.zeros((num_sketches, row_len), np.uint8)

    for p_i, p in enumerate(paths):
        im = Image.open(p).convert('RGB')
        # thumbnail() preserves aspect ratio and never enlarges, so a
        # non-square or small image produced a row of the wrong length and
        # the assignment below raised. resize() always yields the exact size.
        im = im.resize((VGG_SIZE_X, VGG_SIZE_Y), Image.ANTIALIAS)
        X[p_i, :] = PIL2array(im).flatten()

    return X

# Loads images and normalizes their pixel values with the training statistics.
# NOTE(review): the original comment also mentioned re-saving the images; no
# save step was ever implemented and none is added here.
def normalize_images(paths, num_sketches, train_u, train_std):
    """Load images as flat pixel rows and normalize them.

    The previous body was a copy of ``load_images`` that ignored ``train_u``
    and ``train_std`` and returned the raw ``uint8`` pixels. It now delegates
    the loading to ``load_images`` and applies ``normalize``.

    Parameters
    ----------
    paths : sequence of str
        Image file paths; image ``i`` fills row ``i``.
    num_sketches : int
        Number of rows to allocate (forwarded to ``load_images``).
    train_u, train_std
        Offset and scale forwarded to ``normalize``.

    Returns
    -------
    numpy.ndarray
        ``float64`` array, ``(pixels - train_u) / train_std``, with one row
        per image.
    """
    X = load_images(paths, num_sketches)
    # Cast first so the arithmetic is done in floating point, never in uint8.
    return normalize(X.astype(np.float64), train_u, train_std)

def PIL2array(img):
    """Convert an image object to a ``(height, width, 3)`` ``uint8`` array.

    ``img`` must provide ``getdata()`` (the pixel sequence) and ``size``
    (``(width, height)``).
    """
    width, height = img.size[0], img.size[1]
    pixels = np.array(img.getdata(), np.uint8)
    return pixels.reshape(height, width, 3)

In [3]:
# short function to copy metadata file from sketch_paths to partial_sketches

# import shutil, os
# final_sketch_folder = '/home/rslee/sketch_data'

# def get_csv(path, ext='csv'):
#     result = [y for x in os.walk(path)
#               for y in glob(os.path.join(x[0], '*.%s' % ext))]
#     return result

# sketch_csv = get_csv(final_sketch_folder)

# psketch_folder = '/home/rslee/partial_sketches/'
# psketch_dest_folders = glob(os.path.join(psketch_folder, '*'))

# for src, dest in zip(sketch_csv, psketch_dest_folders):
#     shutil.copy(src, dest)

In [4]:
# we will train the network on the baseline_sketches (or sketches from a different dataset)

# get metadata


def get_train_label_from_path(path):
    """Return the part of the file name that precedes its first underscore."""
    filename = path.rsplit('/', 1)[-1]
    return filename.partition('_')[0]


def get_train_viewpoint_from_path(path):
    """Return the second underscore-separated field of the file name.

    Raises ``IndexError`` when the file name contains no underscore.
    """
    filename = path.rsplit('/', 1)[-1]
    fields = filename.split('_')
    return fields[1]

def get_train_ID_from_path(path):
    """Return the file name truncated at its first dot."""
    filename = path.rsplit('/', 1)[-1]
    return filename.partition('.')[0]


    
# collect the training sketch paths and parse per-file metadata from each name
path_to_train = '/home/rslee/baseline_sketches'
train_paths = list_files(path_to_train)
train_labels = [get_train_label_from_path(p) for p in train_paths]
train_viewpoint = [get_train_viewpoint_from_path(p) for p in train_paths]
train_ID = [get_train_ID_from_path(p) for p in train_paths]

num_train = len(train_paths)

# one row per training sketch: the four lists are stacked as rows, then flipped
Tr = pd.DataFrame([train_ID, train_labels, train_viewpoint, train_paths]).transpose()
Tr.columns = ['trainID', 'label', 'viewpoint', 'path']

print('Number of train sketches: {:d}'.format(num_train))

Number of train sketches: 1567


In [5]:
# Build a feature extractor over the training sketch paths and keep only the
# first of the two values returned by extract_feature_matrix().
# NOTE(review): FeatureExtractor is defined outside this notebook (presumably in
# the `embedding` module); the meaning of its second argument (6) is not
# visible here — confirm against that module.
extractor = FeatureExtractor(train_paths,6)
X_train, _ = extractor.extract_feature_matrix()

Batch 5
Batch 10
Batch 15
Batch 20
Batch 25
stopped!


In [6]:
# mean and standard deviation taken over the whole training feature matrix
# (no axis argument), then used to normalize the training features
train_u, train_std = X_train.mean(), X_train.std()
_X_train = normalize(X_train, train_u, train_std)

In [7]:
# Earlier alternative configuration, kept for reference:
# logreg = linear_model.LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial')
# Logistic regression with an explicit L2 penalty; every other setting is left
# at the library default.
logreg = linear_model.LogisticRegression(penalty = 'l2')

# Fit on the normalized training features, with labels parsed from file names.
trained_classifier = logreg.fit(_X_train, train_labels)

# Sanity check 

Run the full test sketches through the trained logistic regression.

In [8]:
def get_test_label_from_path(path):
    """Return the text after the last underscore in the segment before the final dot.

    Raises ``IndexError`` when ``path`` contains no dot.
    """
    stem = path.rsplit('.', 2)[-2]
    return stem.rsplit('_', 1)[-1]

def get_subj_from_path(path):
    """Return the second-to-last ``/``-separated component of ``path``.

    Raises ``IndexError`` when ``path`` contains no ``/``.
    """
    components = path.rsplit('/', 2)
    return components[-2]

# collect the test sketch paths and parse label / subject from each path
path_to_test = '/home/rslee/sketch_data'
test_paths = list_files(path_to_test)
test_labels = [get_test_label_from_path(p) for p in test_paths]
subj = [get_subj_from_path(p) for p in test_paths]


print('Number of test sketches: {:d}'.format(len(test_paths)))
print('Number of test subjects: {:d}'.format(np.unique(subj).size))

Number of test sketches: 1400
Number of test subjects: 35


In [9]:
# Same extraction as for the training set, now over the test sketch paths; only
# the first of the two returned values is kept. Note that `extractor` is
# rebound here, replacing the training extractor.
# NOTE(review): the meaning of the second argument (6) is defined outside this
# notebook — confirm against FeatureExtractor.
extractor = FeatureExtractor(test_paths,6)
X_test, _ = extractor.extract_feature_matrix()

Batch 5
Batch 10
Batch 15
Batch 20
stopped!


In [10]:
# normalize the test features with the training-set statistics
# (train_u, train_std), not with statistics of the test set itself
_X_test = normalize(X_test, train_u, train_std)

In [11]:
# score the fitted classifier on the normalized test features against the
# labels parsed from the test file names
trained_classifier.score(_X_test, test_labels)

0.78642857142857148

# Run thru one user

As a first step, compute the classifier's class probabilities for each partial sketch in a folder.

In [12]:
# for testing data, we will test for every partial sketch. To just test this, let's just try one set of partial sketches
def get_psketch_ID_from_path(path):
    """Return the file name truncated at its first dot."""
    filename = path.rsplit('/', 1)[-1]
    return filename.partition('.')[0]

def get_marg_for_psketch(path_to_test, plot = 0):
    """Compute class probabilities for every partial sketch under a folder.

    The PNG files under ``path_to_test`` are ordered by their integer
    file-name ID, run through ``FeatureExtractor``, normalized with the
    notebook-level ``train_u`` / ``train_std`` and scored with the
    notebook-level ``trained_classifier``.

    Parameters
    ----------
    path_to_test : str
        Folder searched recursively for ``*.png`` files. Each file name
        (truncated at its first dot) must parse as an integer.
    plot : bool or int, optional
        When truthy, plot the probabilities with one legend entry per class.

    Returns
    -------
    numpy.ndarray
        2-D array of probabilities, one row per row of the extracted feature
        matrix and one column per entry of ``trained_classifier.classes_``.
    """
    test_paths = list_files(path_to_test)
    # A real list (not a lazy map object) so np.asarray always gets a sequence.
    test_psketch_ID = [get_psketch_ID_from_path(p) for p in test_paths]
    # Sort numerically by psketch ID so that e.g. '10' comes after '2'.
    psketch_order = np.argsort(np.asarray(test_psketch_ID).astype(int))
    _test_paths = [test_paths[i] for i in psketch_order]

    # extract and normalize
    extractor = FeatureExtractor(_test_paths,6)
    X_test, _ = extractor.extract_feature_matrix()
    _X_test = normalize(X_test, train_u, train_std)

    # calculate marginals. predict_proba already returns a 2-D array; the
    # np.squeeze that used to wrap it collapsed a single-sketch result to 1-D,
    # which no longer matched `columns=classes_` in the callers or the
    # per-class legend below.
    prob = trained_classifier.predict_proba(_X_test)

    if plot:
        plt.plot(prob)
        plt.legend(trained_classifier.classes_, bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
                   ncol=4, mode="expand", borderaxespad=0.)

    return prob


In [13]:
path_to_psketch = '/home/rslee/partial_sketches/'
# Names of the folders directly inside path_to_psketch. Only the first tuple
# yielded by os.walk is needed, so take it with next() instead of walking the
# whole tree and discarding everything after the first entry.
# (If the path does not exist this raises StopIteration rather than IndexError.)
subject_folders = next(os.walk(path_to_psketch))[1]

In [None]:
# accumulator for the per-subject DataFrames appended by the loop in the next
# cell; that cell rebinds `data` to the concatenated DataFrame when it finishes
data = []

In [None]:

# For every subject folder: pair each partial-sketch folder with one metadata
# row from the subject's CSV and compute the classifier probabilities for every
# partial sketch in that folder.
# NOTE(review): the [2:] slice skips the first two entries of subject_folders
# and nothing in this notebook explains why (possibly resuming an interrupted
# run) — confirm before assuming the result covers every subject.
# NOTE(review): `data` must be the list created in the previous cell; this cell
# rebinds it to a DataFrame at the end, so it cannot simply be re-run.
for folder_name in subject_folders[2:]:
    subject_path = os.path.join(path_to_psketch, folder_name)
    # Every directory below the subject folder (the subject folder itself is
    # dropped). This was previously assigned to the misspelled name
    # `sketch_foldersto`, so the loop below read an undefined/stale variable.
    sketch_folders = [x[0] for x in os.walk(subject_path)][1:]
    # first CSV found anywhere under the subject folder
    csv_path = [y for x in os.walk(subject_path)
                for y in glob(os.path.join(x[0], '*.csv'))][0]
    df = pd.read_csv(csv_path)
    df = df.sort_values(['target', 'trial'])

    subject_data = []
    # integer midpoint: a float comparison never matched for odd folder counts
    halfway = len(sketch_folders) // 2

    for i, f in enumerate(np.sort(sketch_folders)):
        if i == halfway:
            print('Halfway there')

        # NOTE(review): assumes the i-th sorted folder corresponds to the i-th
        # row of df after sorting by target and trial — verify.
        metadata = df.loc[df.index[i], ['wID', 'viewpoint', 'trial', 'trialDuration', 'target', 'competitor']]

        prob_df = pd.DataFrame(get_marg_for_psketch(f), columns=trained_classifier.classes_)

        # One metadata row per partial sketch. The count comes from the
        # probabilities instead of the previously hard-coded 23.
        n_partial = len(prob_df)
        md = pd.DataFrame([metadata] * n_partial)
        md['numSketch'] = np.arange(n_partial)
        md = md.reset_index()

        subject_data.append(md.join(prob_df))

    data.append(pd.concat(subject_data).reset_index(drop=True))

data = pd.concat(data).reset_index(drop=True)

stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
Halfway there
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
Halfway there
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
Halfway there
stopped!
stopped!
stopped!
stopped!
stopped!
stopped!
stop

In [27]:
# write the combined table to disk as CSV (note: the target path has no
# file extension)
data.to_csv('/home/rslee/partial_sketch_full')

In [30]:
# display the combined table: trial metadata plus one probability column per class
data

Unnamed: 0,index,wID,viewpoint,trial,trialDuration,target,competitor,numSketch,bed,bench,chair,table
0,0,0119174_neurosketch,20,320,39.001440,bench,chair,0,0.026822,0.055895,9.162428e-01,0.001040
1,0,0119174_neurosketch,20,320,39.001440,bench,chair,1,0.934225,0.026225,3.908525e-02,0.000465
2,0,0119174_neurosketch,20,320,39.001440,bench,chair,2,0.006035,0.970770,2.713733e-03,0.020481
3,0,0119174_neurosketch,20,320,39.001440,bench,chair,3,0.009842,0.978927,3.783683e-03,0.007447
4,0,0119174_neurosketch,20,320,39.001440,bench,chair,4,0.019298,0.967674,1.463238e-03,0.011564
5,0,0119174_neurosketch,20,320,39.001440,bench,chair,5,0.034789,0.958795,1.097902e-03,0.005318
6,0,0119174_neurosketch,20,320,39.001440,bench,chair,6,0.089720,0.902217,3.614909e-03,0.004449
7,0,0119174_neurosketch,20,320,39.001440,bench,chair,7,0.084351,0.907088,4.322320e-03,0.004238
8,0,0119174_neurosketch,20,320,39.001440,bench,chair,8,0.007951,0.979885,6.957215e-03,0.005207
9,0,0119174_neurosketch,20,320,39.001440,bench,chair,9,0.005105,0.967654,1.299671e-02,0.014245


In [None]:
test_path1 = '/home/rslee/partial_sketches/0110171_neurosketch/'

# every directory below test_path1; the leading entry (test_path1 itself) is dropped
folders = [dirpath for dirpath, _dirs, _files in os.walk(test_path1)][1:]

# first CSV found anywhere under test_path1
csv_matches = []
for dirpath, _dirs, _files in os.walk(test_path1):
    csv_matches.extend(glob(os.path.join(dirpath, '*.csv')))
csv_path = csv_matches[0]

# trial metadata, ordered by target and then trial
df = pd.read_csv(csv_path).sort_values(['target', 'trial'])


In [None]:
# Single-subject version of the collection loop: pair each partial-sketch
# folder with one metadata row and compute the classifier probabilities for
# every partial sketch in that folder.
# The frames are gathered in their own list so that `data` is only ever bound
# to the final DataFrame, which makes this cell safe to re-run.
frames = []
# integer midpoint: a float comparison never matched for odd folder counts
halfway = len(folders) // 2
for i, f in enumerate(np.sort(folders)):
    if i == halfway:
        print('Halfway there')

    # NOTE(review): assumes the i-th sorted folder corresponds to the i-th row
    # of df after sorting by target and trial — verify.
    metadata = df.loc[df.index[i], ['wID', 'viewpoint', 'trial', 'trialDuration', 'target', 'competitor']]

    prob_df = pd.DataFrame(get_marg_for_psketch(f), columns=trained_classifier.classes_)

    # One metadata row per partial sketch. The count comes from the
    # probabilities instead of the previously hard-coded 23.
    n_partial = len(prob_df)
    md = pd.DataFrame([metadata] * n_partial)
    md['numSketch'] = np.arange(n_partial)
    md = md.reset_index()

    frames.append(md.join(prob_df))

data = pd.concat(frames).reset_index(drop=True)
