# Demo for Image Gen Project

### JSON Vectorization to Improve Performance

In [35]:
from collections.abc import MutableMapping
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial import distance
import scipy.sparse
import json
import pickle

### Load Mock Database of JSON Descriptions of Images

This loads 1 million JSON objects from a single file, each describing the mock attributes of an image.

In [2]:
# Load the mock database: a single JSON document holding a list of image-description dicts.
# The encoding is given explicitly so decoding does not depend on the platform's
# locale default (JSON interchange files are UTF-8).
with open("sample_json_1000000.json", "r", encoding="utf-8") as fin:
    dict_list = json.load(fin)

In [11]:
# Sanity check: number of records loaded from the JSON file.
len(dict_list)

1000000

### Flatten the json objects & convert to strings

In order to vectorize these objects, first we need them to be flat and converted into categorical strings.  Example:

#### { 'hair' : { 'color' : 'brown' } } becomes 'hair.color.brown'





In [8]:
def flatten_dict_gen(d, parent_key, sep):
    """Lazily yield ``(flat_key, value)`` pairs for a possibly nested mapping.

    Args:
        d: Mapping to flatten; keys are concatenated as strings.
        parent_key: Prefix for every emitted key ('' for the top level).
        sep: Separator placed between nesting levels.

    Yields:
        Tuples of (flattened key, leaf value) in depth-first insertion order.
        If keys that already contain ``sep`` collide after flattening, each
        pair is yielded in order; building a ``dict`` keeps the last value.
    """
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            # Recurse on the generator itself rather than going through a
            # dict-building wrapper: no intermediate dict per nesting level,
            # and this function no longer depends on a later definition.
            yield from flatten_dict_gen(v, new_key, sep)
        else:
            yield new_key, v

In [9]:
def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.'):
    """Return a one-level dict whose keys are the ``sep``-joined paths of ``d``.

    Args:
        d: Possibly nested mapping to flatten.
        parent_key: Prefix applied to every key ('' means no prefix).
        sep: Separator inserted between nesting levels.

    Returns:
        A new dict mapping each flattened key to its leaf value.
    """
    flattened = {}
    for flat_key, leaf in flatten_dict_gen(d, parent_key, sep):
        flattened[flat_key] = leaf
    return flattened

In [16]:
def flat_to_string(in_dict):
    """Serialise a flat dict into a whitespace-separated string of feature tokens.

    Flag values (``True``/``False`` and their integer equivalents ``1``/``0``)
    are presence markers: a truthy flag emits the bare key, a falsy flag emits
    nothing. Every other value emits ``"<key>.<value>"``.

    Previously a falsy flag also emitted the bare key, which made
    ``{'skin.wrinkles': False}`` indistinguishable from ``{'skin.wrinkles': True}``.

    Args:
        in_dict: Flat mapping of feature key to value.

    Returns:
        The tokens joined by single spaces ('' for an empty mapping).
    """
    tokens = []
    for k, v in in_dict.items():
        # bool is an int subclass, so this membership test also matches True/False.
        # NOTE(review): genuine numeric values equal to 0 or 1 are treated as
        # flags too, as in the original — confirm that is intended.
        if v in (0, 1):
            if v:
                tokens.append(f"{k}")
        else:
            tokens.append(f"{k}.{v}")
    return " ".join(tokens)

In [17]:
%%time
# Flatten each record and serialise it to a token string.
# A comprehension keeps the per-record temporaries out of the kernel namespace;
# the old loop left `dict_string` bound to the last database record, which a
# later cell reuses for the user input and could silently pick up if run out of order.
string_list = [flat_to_string(flatten_dict(record)) for record in dict_list]

CPU times: user 8.31 s, sys: 66 ms, total: 8.38 s
Wall time: 8.38 s


In [21]:
# Sanity check: one token string per input record.
len(string_list)

1000000

#### Compare the original dict and the new string

In [3]:
# First record as loaded (nested dict form).
dict_list[0]

{'sex': 'female',
 'age': 74,
 'skin': {'wrinkles': True},
 'hair': {'colour': 'blonde', 'length': 'short', 'texture': 'straight'},
 'emotion': 'angry',
 'ears': 'big',
 'eyebrows': 'arched',
 'accessories': 'earrings'}

In [22]:
# Same record after flattening and conversion to a token string.
string_list[0]

'sex.female age.74 skin.wrinkles hair.colour.blonde hair.length.short hair.texture.straight emotion.angry ears.big eyebrows.arched accessories.earrings'

#### Create a sparse matrix of the strings

This only has to be done once at startup, not every time you compare a user input.

In [29]:
# Each whitespace-separated chunk (e.g. 'hair.colour.blonde') is one token, and
# binary=True records presence rather than counts.
# The pattern is a raw string so `\S` reaches the regex engine intact; in a
# plain string literal it is an invalid escape sequence and triggers a warning.
vectorizer = CountVectorizer(token_pattern=r'\S+', binary=True)

In [30]:
%%time
# Learn the vocabulary from all token strings and build the sparse
# record-by-token matrix in one call.
X = vectorizer.fit_transform(string_list)

CPU times: user 6.06 s, sys: 224 ms, total: 6.28 s
Wall time: 6.28 s


In [31]:
# Show the sparse-matrix summary (shape, dtype, stored elements).
X

<1000000x102 sparse matrix of type '<class 'numpy.int64'>'
	with 6205181 stored elements in Compressed Sparse Row format>

In [45]:
# Densify the first row to see which vocabulary columns are set for record 0.
X[0].toarray()

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1]])

In [46]:
# `X2` is not defined in any earlier cell, so this cell raised NameError on a
# fresh Restart & Run All. Rebuild it by re-transforming the first token string
# with the already-fitted vectorizer; the row should match X[0].
# NOTE(review): the original source of X2 is unknown (possibly a reloaded matrix) — confirm.
X2 = vectorizer.transform(string_list[:1])
X2[0].toarray()

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1]])

In [32]:
# Vocabulary terms in column order of the fitted vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    Y = vectorizer.get_feature_names_out().tolist()  # keep Y a plain list, as before
else:
    Y = vectorizer.get_feature_names()

#### Save the vectorizer for future use

In [52]:
# Persist the fitted vectorizer so it can be reloaded later instead of refitting.
# Pickle files can execute code when loaded — only unpickle this file from a trusted location.
with open("vectorizer.pickle", "wb") as fout:
    pickle.dump(vectorizer,fout)

### Vectorize the user input

In [47]:
# Simulated user query: a partial description using the same (possibly nested)
# attribute layout as the database records.
input_json = {
    'sex': 'male',
    'age': 55,
    'ears': 'big',
    'hair': {'colour': 'blonde'},
}

#### Flatten and convert to string just like json from the database

In [48]:
# Run the query through the same two helpers used for the database records so
# its tokens are produced in the same format as the fitted vocabulary.
flat = flatten_dict(input_json)
dict_string = flat_to_string(flat)

In [49]:
# Token string produced for the user query.
dict_string

'sex.male age.55 ears.big hair.colour.blonde'

#### Note: the input string needs to be inside a list (done below)

In [50]:
# transform() takes an iterable of documents, hence the single-element list.
# Using transform (not fit_transform) reuses the vocabulary fitted on the
# database strings.
input_matrix = vectorizer.transform([dict_string])

In [51]:
# Show the sparse-matrix summary for the vectorized query.
input_matrix

<1x102 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>