# Final Project for Cogs 118B
Group Members:
- Samruddhi Hande ([email](mailto:shande@ucsd.edu))
- Ron Hasson ([email](mailto:rhasson@ucsd.edu))
- Andrew Hernandez ([email](mailto:ash053@ucsd.edu))
- Mehail Mathew Sunny ([email](mailto:msmathew@ucsd.edu))
- Justin Yang ([email](mailto:justin-yang@ucsd.edu))

### Import packages and set up paths

In [1]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.decomposition import PCA

In [2]:
# Locate the photodraw project and define every directory/file path used below.
# The default location is Justin's local laptop; set the PHOTODRAW_ANALYSIS_DIR
# environment variable to point at the `analysis` folder of another checkout.

# Output folder for the exported arrays / CSV: `data` relative to the working
# directory the kernel had *before* the chdir below. Capture it only once, so
# that re-running this cell (after the chdir) does not silently relocate it
# into the photodraw tree.
if 'orig_wd' not in globals():
    orig_wd = os.path.abspath('data')

os.chdir(os.environ.get('PHOTODRAW_ANALYSIS_DIR', 'F:\\photodraw\\analysis'))

# directory & file hierarchy (proj_dir is absolute, so joins need no abspath)
proj_dir = os.path.abspath('..')
analysis_dir = os.getcwd()
results_dir = os.path.join(proj_dir, 'results')
csv_dir = os.path.join(results_dir, 'csv')
sketch_dir = os.path.join(proj_dir, 'sketches')
feature_dir = os.path.join(proj_dir, 'features')

# feature arrays and their metadata tables
meta_path = os.path.join(feature_dir, 'metadata_pixels.csv')
image_path = os.path.join(feature_dir, 'flattened_sketches_pixels.npy')
meta_path_fc6 = os.path.join(feature_dir, 'METADATA_sketch.csv')
image_path_fc6 = os.path.join(feature_dir, 'FEATURES_FC6_sketch_no-channel-norm.npy')

### Construct easy-to-use feature representations with corresponding metadata

In [3]:
# Collect every file in the photodraw2x2 sketch folder.
# os.listdir returns entries in arbitrary order, so sort them: a sketch's
# position in `sketchpaths` is later stored as its `raw_sketch_ind`, and that
# index should not depend on the filesystem or machine.
photodraw_sketch_dir = os.path.join(sketch_dir, 'photodraw2x2')
sketchnames = sorted(os.listdir(photodraw_sketch_dir))
sketchpaths = [os.path.join(photodraw_sketch_dir, name) for name in sketchnames]

In [4]:
def _category_from_path(path):
    """Return the category label embedded in a sketch file name.

    Up to three leading and three trailing underscore-separated fields of the
    file name are dropped; the remaining middle part (which may itself contain
    underscores) is returned as the category.
    """
    # os.path.basename handles the platform's separator; the previous
    # hard-coded split on '\\' only worked for Windows-style paths.
    filename = os.path.basename(path)
    return filename.split('_', 3)[-1].rsplit('_', 3)[0]


# One row per sketch: its category and its position in `sketchpaths`
# (the row index into the raw pixel array), ordered by category.
df = (
    pd.DataFrame(
        [(_category_from_path(path), i) for i, path in enumerate(sketchpaths)],
        columns=['category', 'raw_sketch_ind'],
    )
    .sort_values(by=['category', 'raw_sketch_ind'])
    .reset_index(drop=True)
)

In [5]:
# Put the data into a flattened (num_sketches, height*width) array, one row per
# sketch, using only the alpha channel (index 3) of each image.
# NOTE(review): the original comment gives the row length as 224*224 -- confirm
# that every sketch has that size; vstack fails if row lengths differ.
alpha_rows = []
for sketch_path in sketchpaths:
    # The context manager releases the file handle as soon as the pixels have
    # been copied into the numpy array.
    with Image.open(sketch_path) as sketch_img:
        alpha_rows.append(np.array(sketch_img)[:, :, 3].flatten())
flattened_sketch_raw = np.vstack(alpha_rows)

# Convert alpha to binary, discarding imprecision: only fully opaque pixels
# (alpha == 255) map to 1, everything else to 0. Integer floor division gives
# the same values as `/ 255` followed by truncation, without allocating a
# float64 copy of the whole matrix.
flattened_sketch_raw = (flattened_sketch_raw // 255).astype(int)

np.save(os.path.join(orig_wd, 'sketches_raw_nopca'), flattened_sketch_raw)

In [6]:
# Sketch-level metadata table, ordered by category.
# NOTE(review): rows that share a category keep whatever relative order the
# default sort algorithm produces -- confirm that is acceptable downstream.
sketch_csv_path = os.path.join(csv_dir, 'photodraw2x2_sketch_data.csv')
sketch_df = pd.read_csv(sketch_csv_path).sort_values(by=['category'])

# Feature matrix stored alongside the metadata (presumably fc6-layer
# activations, going by the file name -- confirm).
fc6_feature_path = os.path.join(feature_dir, 'FEATURES_FC6_photodraw2x2_sketch.npy')
F_fc6 = np.load(fc6_feature_path)

In [7]:
# Reorder the feature rows to follow the category-sorted metadata table, record
# the source row indices in `df`, and export the reordered matrix.
# NOTE(review): this cell overwrites F_fc6 with a reindexed copy of itself, so
# running it twice applies the reordering twice -- reload F_fc6 before re-running.
# NOTE(review): `fc6_sketch_ind` holds row numbers of the feature file as loaded,
# not of the reordered array saved here, and it is attached to `df` purely by
# position -- confirm both are what downstream code expects.
feature_order = sketch_df.feature_ind.values
F_fc6 = F_fc6[feature_order]
df['fc6_sketch_ind'] = feature_order

fc6_out_path = os.path.join(orig_wd, 'sketches_fc6_nopca')
np.save(fc6_out_path, F_fc6)

In [9]:
# Project the binarised pixel matrix onto its first 100 principal components
# and export the projection.
# For an input this wide scikit-learn's automatic solver choice is typically
# the randomized SVD, so the seed is pinned to make the saved array
# reproducible from one run to the next.
pca = PCA(n_components=100, random_state=0)
flattened_sketch_raw_pca = pca.fit_transform(flattened_sketch_raw)

# PCA keeps the row order, so the index into the projected array is the same
# as the index into the raw pixel array.
df['raw_sketch_pca_ind'] = df.raw_sketch_ind.values
np.save(os.path.join(orig_wd, 'sketches_raw_pca'), flattened_sketch_raw_pca)

In [10]:
# Fraction of total variance explained by each retained component of the PCA
# currently bound to `pca` (the raw-pixel PCA when cells are run in order).
pca.explained_variance_ratio_

array([0.01578763, 0.00921605, 0.00847987, 0.00707454, 0.00431856,
       0.00420534, 0.00406048, 0.00327873, 0.00308827, 0.00289928,
       0.00274532, 0.00270691, 0.0024798 , 0.00233328, 0.00225339,
       0.0021528 , 0.00202969, 0.00197784, 0.00188014, 0.00184932,
       0.00178209, 0.00174908, 0.00170956, 0.00167063, 0.00163295,
       0.00158864, 0.00157283, 0.00153249, 0.00148812, 0.00147843,
       0.00145237, 0.00143714, 0.00138856, 0.00137556, 0.00135323,
       0.00135114, 0.00131148, 0.00127875, 0.00127645, 0.00127292,
       0.00124718, 0.00123121, 0.00122015, 0.00121007, 0.00118175,
       0.00116402, 0.00115972, 0.00114664, 0.00113694, 0.00112911,
       0.00111525, 0.00111413, 0.00108532, 0.00107746, 0.00106794,
       0.00106016, 0.00105088, 0.00103849, 0.00103289, 0.00103023,
       0.00101597, 0.00101335, 0.00100215, 0.0009869 , 0.00098477,
       0.00097857, 0.00095704, 0.00094837, 0.00094558, 0.00093971,
       0.00093624, 0.00093289, 0.00092468, 0.00091711, 0.00091

In [11]:
# Total fraction of variance captured by all retained components of `pca`.
sum(pca.explained_variance_ratio_)

0.17226653149344687

In [12]:
# Project the reordered feature matrix F_fc6 onto its first 100 principal
# components and export the projection. This rebinds `pca`, replacing the
# model fitted on the raw pixels.
# The seed is pinned because scikit-learn's automatic solver choice may be the
# randomized SVD, which would otherwise give slightly different results on
# every run.
pca = PCA(n_components=100, random_state=0)
flattened_sketch_fc6_pca = pca.fit_transform(F_fc6)

# NOTE(review): these are row numbers of the feature file as loaded, attached
# to `df` by position -- confirm they are meant to index the saved projection.
df['fc6_sketch_pca_ind'] = sketch_df.feature_ind.values
np.save(os.path.join(orig_wd, 'sketches_fc6_pca'), flattened_sketch_fc6_pca)

In [13]:
# Fraction of total variance explained by each retained component of the PCA
# currently bound to `pca` (the F_fc6 PCA when cells are run in order).
pca.explained_variance_ratio_

array([0.15037596, 0.09233716, 0.05246543, 0.03805462, 0.03609551,
       0.02821388, 0.02515587, 0.02094992, 0.01914285, 0.01688796,
       0.0148682 , 0.01350217, 0.01162995, 0.01121422, 0.0099503 ,
       0.0096756 , 0.00913479, 0.00862238, 0.00764023, 0.00673766,
       0.00656597, 0.00624368, 0.00595396, 0.00567503, 0.00538422,
       0.00509788, 0.00481224, 0.00467246, 0.00435901, 0.00428869,
       0.00421399, 0.00398709, 0.00397775, 0.00378253, 0.00374653,
       0.0035372 , 0.00346483, 0.0033729 , 0.00330363, 0.0031794 ,
       0.00313625, 0.00301402, 0.00288464, 0.00282521, 0.0027513 ,
       0.00270727, 0.00267559, 0.00258625, 0.00256693, 0.00252794,
       0.00247262, 0.00240036, 0.00233745, 0.00229766, 0.00227147,
       0.00225098, 0.00211866, 0.00209548, 0.00199791, 0.0019473 ,
       0.00190209, 0.00184384, 0.00181227, 0.00180796, 0.00176567,
       0.00169692, 0.00167522, 0.00166307, 0.00163838, 0.00161411,
       0.00155621, 0.00153532, 0.00151751, 0.00149896, 0.00144

In [14]:
# Total fraction of variance captured by all retained components of `pca`.
sum(pca.explained_variance_ratio_)

0.7769962185993791

In [15]:
# Write the combined metadata table (category plus the index columns added
# above) to the output folder, without the pandas row index.
metadata_csv_path = os.path.join(orig_wd, 'sketches_metadata.csv')
df.to_csv(metadata_csv_path, index=False)