# Demo for Image Gen Project

### JSON Vectorization to Improve Performance

In [1]:
from collections.abc import MutableMapping
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial import distance
import scipy.sparse
import json
import pickle

### Load Mock Database of JSON Descriptions of Images

This loads a JSON file containing 1 million objects, each describing the mock attributes of an image.

In [2]:
# Load the mock image-description database into memory as a list of dicts.
# The encoding is stated explicitly so the read does not depend on the
# platform's default text encoding (JSON interchange text is UTF-8).
with open("sample_json_1000000.json", "r", encoding="utf-8") as fin:
    dict_list = json.load(fin)

In [3]:
# Sanity check: number of records loaded from the JSON file.
len(dict_list)

1000000

In [4]:
# Peek at the first record to see the nested structure that has to be flattened.
dict_list[0]

{'sex': 'female',
 'age': 74,
 'skin': {'wrinkles': True},
 'hair': {'colour': 'blonde', 'length': 'short', 'texture': 'straight'},
 'emotion': 'angry',
 'ears': 'big',
 'eyebrows': 'arched',
 'accessories': 'earrings'}

### Flatten the json objects & convert to strings

In order to vectorize these objects, first we need them to be flat and converted into categorical strings.  Example:

#### { 'hair' : { 'colour' : 'brown' } } becomes 'hair.colour.brown'





In [5]:
def flatten_dict_gen(d, parent_key, sep):
    """Yield ``(flat_key, value)`` pairs for every leaf of a nested mapping.

    Args:
        d: Mapping to walk. Values that are ``MutableMapping`` instances are
            descended into; every other value is treated as a leaf.
        parent_key: Prefix for the keys at this level ('' at the top level).
        sep: Separator placed between the components of a key path.

    Yields:
        Tuples of the joined key path and the leaf value found there.
        Empty nested mappings contribute nothing.
    """
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            # Recurse into the generator itself: no temporary dict is built per
            # nesting level and the function does not depend on any helper
            # defined in a later cell.
            yield from flatten_dict_gen(v, new_key, sep)
        else:
            yield new_key, v

In [6]:
def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.'):
    """Collapse a nested mapping into a single-level dict.

    Args:
        d: The (possibly nested) mapping to flatten.
        parent_key: Prefix prepended to every produced key; empty by default.
        sep: Separator used to join the components of a key path.

    Returns:
        A new dict whose keys are the joined key paths and whose values are
        the leaf values yielded by ``flatten_dict_gen``.
    """
    flattened = {}
    for flat_key, leaf in flatten_dict_gen(d, parent_key, sep):
        flattened[flat_key] = leaf
    return flattened

In [7]:
def flat_to_string(in_dict):
    """Render a flat dict as one whitespace-separated string of tokens.

    Each entry becomes a ``key.value`` token. Boolean flags are special-cased:
    a ``True`` flag is emitted as the bare key, and a ``False`` flag is
    omitted so that an absent feature is not encoded the same as a present one.

    Only genuine ``bool`` values are treated as flags. Integers 0 and 1 keep
    their value (``age.1``), because ``0 == False`` and ``1 == True`` in
    Python and a plain membership test would silently drop them.

    Args:
        in_dict: Flat mapping of feature name to value.

    Returns:
        The tokens joined by single spaces ('' for an empty mapping).
    """
    tokens = []
    for k, v in in_dict.items():
        if isinstance(v, bool):
            if v:
                tokens.append(f"{k}")
            # False flag: feature is absent, emit no token.
            continue
        tokens.append(f"{k}.{v}")
    return " ".join(tokens)

In [8]:
%%time
string_list = []
for item in dict_list:
    flat = flatten_dict(item)
    dict_string = flat_to_string(flat)
    string_list.append(dict_string)

CPU times: user 8.37 s, sys: 79.7 ms, total: 8.45 s
Wall time: 8.45 s


In [9]:
# Sanity check: one string per input record.
len(string_list)

1000000

#### Compare the original dict and the new string

In [10]:
# Original nested record, shown again for side-by-side comparison with its string form.
dict_list[0]

{'sex': 'female',
 'age': 74,
 'skin': {'wrinkles': True},
 'hair': {'colour': 'blonde', 'length': 'short', 'texture': 'straight'},
 'emotion': 'angry',
 'ears': 'big',
 'eyebrows': 'arched',
 'accessories': 'earrings'}

In [11]:
# The same record after flattening and string conversion.
string_list[0]

'sex.female age.74 skin.wrinkles hair.colour.blonde hair.length.short hair.texture.straight emotion.angry ears.big eyebrows.arched accessories.earrings'

#### Create a sparse matrix of the strings

This only has to be done once at startup, not every time you compare a user input.

In [12]:
# Tokens are whitespace-delimited, so 'hair.colour.blonde' stays one token.
# The pattern is a raw string: '\S' in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
# binary=True records presence/absence instead of occurrence counts.
vectorizer = CountVectorizer(token_pattern=r'\S+', binary=True)

In [13]:
%%time
# Learn the vocabulary from the database strings and encode them in one pass.
db_matrix = vectorizer.fit_transform(string_list)

CPU times: user 6.14 s, sys: 233 ms, total: 6.37 s
Wall time: 6.36 s


In [14]:
# Show the shape, dtype and sparsity summary of the encoded database.
db_matrix

<1000000x102 sparse matrix of type '<class 'numpy.int64'>'
	with 6205181 stored elements in Compressed Sparse Row format>

In [15]:
# Densify the sparse matrix for use with scipy's cdist below.
# NOTE: a dense 1,000,000 x 102 int64 array is roughly 816 MB of memory.
db_array = db_matrix.toarray()

In [16]:
# Display the dense array (numpy abbreviates the output).
db_array

array([[1, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0]])

In [17]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use `get_feature_names_out` when it exists and fall back on older versions.
# `.tolist()` keeps `db_features` a plain Python list, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    db_features = vectorizer.get_feature_names_out().tolist()
else:
    db_features = vectorizer.get_feature_names()

In [18]:
# List the vocabulary; position i is the meaning of column i in db_array.
db_features

['accessories.earrings',
 'accessories.glasses',
 'accessories.hat',
 'age.10',
 'age.11',
 'age.12',
 'age.13',
 'age.14',
 'age.15',
 'age.16',
 'age.17',
 'age.18',
 'age.19',
 'age.20',
 'age.21',
 'age.22',
 'age.23',
 'age.24',
 'age.25',
 'age.26',
 'age.27',
 'age.28',
 'age.29',
 'age.30',
 'age.31',
 'age.32',
 'age.33',
 'age.34',
 'age.35',
 'age.36',
 'age.37',
 'age.38',
 'age.39',
 'age.40',
 'age.41',
 'age.42',
 'age.43',
 'age.44',
 'age.45',
 'age.46',
 'age.47',
 'age.48',
 'age.49',
 'age.50',
 'age.51',
 'age.52',
 'age.53',
 'age.54',
 'age.55',
 'age.56',
 'age.57',
 'age.58',
 'age.59',
 'age.60',
 'age.61',
 'age.62',
 'age.63',
 'age.64',
 'age.65',
 'age.66',
 'age.67',
 'age.68',
 'age.69',
 'age.70',
 'age.71',
 'age.72',
 'age.73',
 'age.74',
 'age.75',
 'age.76',
 'age.77',
 'age.78',
 'age.79',
 'age.80',
 'ears.big',
 'ears.droopy',
 'ears.huge',
 'emotion.angry',
 'emotion.happy',
 'emotion.sad',
 'ethnicity.asian',
 'ethnicity.black',
 'ethnicity.cau

#### Save the vectorizer for future use

In [19]:
# Persist the fitted vectorizer so the vocabulary can be reused without refitting.
with open("vectorizer.pickle", "wb") as pickle_file:
    pickle.dump(vectorizer, pickle_file)

In [None]:
# To reload the vectorizer later, uncomment the lines below.
# Only unpickle files you created yourself: pickle.load can execute
# arbitrary code embedded in an untrusted file.
#with open("vectorizer.pickle", "rb") as fin3:
#    test =pickle.load(fin3)

### Vectorize the user input

In [20]:
# Simulated user input JSON: a partial description using the same nested
# layout as the database records (only some attributes are specified).

input_json = {
    'sex' : 'male',
    'age' : 55,
    'ears' : 'big',
    'hair' : {'colour':'blonde' }
}

#### Flatten and convert to string just like json from the database

In [21]:
# Apply exactly the same flatten + stringify steps used for the database rows
# so the query produces tokens from the same vocabulary.
flat_input = flatten_dict(input_json)
input_string = flat_to_string(flat_input)

In [22]:
# Show the token string generated for the query.
input_string

'sex.male age.55 ears.big hair.colour.blonde'

#### Note: the input string needs to be inside a list (done below)

In [23]:
# transform() takes a collection of documents, hence the single-element list.
# The already-fitted vocabulary is reused, so columns line up with db_matrix.
input_matrix = vectorizer.transform([input_string])

In [24]:
# Summary of the encoded query: one row, same number of columns as the database.
input_matrix

<1x102 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [25]:
# Dense 2-D copy of the query row, matching the dense db_array for cdist.
input_array = input_matrix.toarray()

### Find the five nearest matching rows in the db_array

In [26]:
%%time
distances = distance.cdist(input_array, db_array, "cosine")[0]
five_closest = np.argsort(distances)[:5]  # get N closest matches
#closest_match = np.argmin(distances) # this gives index of closest match


CPU times: user 578 ms, sys: 479 ms, total: 1.06 s
Wall time: 1.06 s


### Display indexes of five closest matching rows

In [27]:
# Row indices into db_array (and dict_list) of the five best matches.
five_closest

array([849770, 356286, 882888, 572881, 753306])

#### Compare input array and closest match

In [28]:
# Encoded query vector: a 1 marks each feature present in the input.
input_array

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]])

In [29]:
# Encoded database row of the best match, for visual comparison with the query.
db_array[five_closest[0]]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0])

### Verify that features/values are close to input

In [30]:
# Pair every vocabulary term with the corresponding column value of the
# best-matching database row.
verify_dict = {
    feature: value
    for feature, value in zip(db_features, db_array[five_closest[0]].tolist())
}

In [31]:
# Keep only the features that are present (non-zero) in the closest row.
{feature: present for feature, present in verify_dict.items() if present != 0}

{'age.55': 1,
 'ears.big': 1,
 'hair.colour.blonde': 1,
 'hair.texture.curly': 1,
 'sex.male': 1}

In [32]:
# Show the input JSON again for comparison with the matched features above.
input_json

{'sex': 'male', 'age': 55, 'ears': 'big', 'hair': {'colour': 'blonde'}}

#### Distance to closest matches

In [33]:
# Cosine distance of the closest match (0 would mean identical direction).
distances[five_closest[0]]

0.10557280900008414

In [34]:
# Cosine distance of the fifth closest match; an identical value to the
# closest match means the top five results are tied.
distances[five_closest[4]]

0.10557280900008414