In [1]:
import os, math
from collections import defaultdict, namedtuple
import io
import itertools
import sqlite3
import logging
import logging.handlers
import pickle
import datetime
import multiprocessing as mp

from PIL import Image, ImageDraw
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import scipy
import numpy as np
import cupy as cp
import pandas as pd

from utils import *
from hdc import *
import cortical_column_host as ccp

In [2]:
# Notebook-wide accumulator of per-image test results; one row per tested image,
# appended to by every test run.
DF_TEST_RESULTS = pd.DataFrame(
    columns=[
        'test_run_id',
        'config_variant',
        'cortical_columns',
        'train_images_count',
        'train_runs',
        'source_ds',
        'test_image_id',
        'true_value',
        'infer_value',
        'sim',
    ]
)

# Monotonic counters bumped by the training and test cells respectively.
TRAIN_RUN_ID = 0
TEST_RUN_ID = 0

In [3]:
LOG = Logging()

config_var = 'SQLITE3_NORMAL_GRID_RADIAL_7_256'
config = Config(config_var)
RNG = np.random.default_rng()

# Pick the array module (cupy when CUDA is usable, numpy otherwise) and define the two
# host<->device transfer helpers accordingly. On the CPU path both helpers are identity functions.
if cp.cuda.is_available():
    xp = cp.get_array_module(cp.empty(1))

    def xp_array_from_gpu(a):
        """Return a host copy of `a` if it is a cupy array, otherwise `a` unchanged."""
        return a.get() if isinstance(a, cp.ndarray) else a

    def xp_array_to_gpu(a):
        """Return a device copy of `a` if it is a numpy array, otherwise `a` unchanged."""
        return cp.asarray(a) if isinstance(a, np.ndarray) else a
else:
    xp = cp.get_array_module(np.empty(1))

    def xp_array_from_gpu(a):
        """No GPU in use: return `a` unchanged."""
        return a

    def xp_array_to_gpu(a):
        """No GPU in use: return `a` unchanged."""
        return a

print(f'xp = {xp.__name__}')

def get_full_db_file_name(db_file_name):
    """Build the path of a dataset DB file.

    The configured file-name prefix is prepended to `db_file_name`, and the result is
    joined onto the configured dataset directory.
    """
    prefixed_file_name = config.db_file_name_prefix + db_file_name
    return os.path.join(config.dataset_path, prefixed_file_name)

# Open one SQLite connection per dataset (train / test).
train_db_path = get_full_db_file_name(config.train_db_file_name)
train_db_con = sqlite3.connect(train_db_path)
test_db_path = get_full_db_file_name(config.test_db_file_name)
test_db_con = sqlite3.connect(test_db_path)

# req-d for CUPY to work, fork method leads to failures in bootstrap somewhere around CUDA
mp_ctx = mp.get_context('spawn')

xp = cupy


In [4]:
# Shared Hdc helper, constructed with the array module (`xp`) chosen in the setup cell.
# NOTE(review): 10_000 is presumably the hypervector dimensionality — confirm against Hdc's constructor.
hdc = Hdc(10_000, xp)

In [5]:
%time
df_train_images = pd.read_sql_query('SELECT * FROM images', con=train_db_con, index_col='image_id')
df_test_images = pd.read_sql_query('SELECT * FROM images', con=test_db_con, index_col='image_id')
df_train_images.shape, df_test_images.shape

CPU times: user 8 μs, sys: 1e+03 ns, total: 9 μs
Wall time: 18.6 μs


((10000, 3), (2000, 3))

In [6]:
# Number of random images to preview from each dataset; also used as the preview grid's column count.
count = 8

def get_some_random_images_and_labels(df, count):
    """Sample `count` distinct images from `df` and return them with their labels.

    Args:
        df: DataFrame indexed by image id, with a 'png' column (encoded image bytes)
            and a 'value' column (the label).
        count: number of distinct image ids to draw (without replacement).

    Returns:
        (images, labels): a list of opened PIL images and a list of the matching
        'value' entries, in the same order.

    Raises:
        ValueError: from `RNG.choice` when `count` exceeds the number of unique ids.
    """
    image_ids = RNG.choice(df.index.unique(), count, replace=False)
    # Look the sampled rows up once and read both columns from the same selection.
    sampled_rows = df.loc[image_ids]
    images = [Image.open(io.BytesIO(png_bytes)) for png_bytes in sampled_rows['png']]
    return images, list(sampled_rows['value'])

# Draw a random preview sample from each dataset.
train_images, train_labels = get_some_random_images_and_labels(df_train_images, count)
test_images, test_labels = get_some_random_images_and_labels(df_test_images, count)

# Show train images first, then test images, each captioned with its label.
preview_images = train_images + test_images
preview_captions = train_labels + test_labels
display_images_grid(preview_images, captions=preview_captions, col_count=count)

In [7]:
class CorticalColumnHost(object):
    """Handle to a child process running `ccp.live` for a set of cortical columns.

    Requests are dicts with an 'OP' key placed on `task_queue`; every blocking call
    here reads exactly one item from `result_queue` and discards it.
    """

    def __init__(self, host_id, column_ids):
        """Create both queues and immediately start the worker process."""
        self.host_id = host_id
        self.column_ids = column_ids
        self.task_queue = mp_ctx.Queue()
        self.result_queue = mp_ctx.Queue()
        worker_args = (config.section_name, host_id, column_ids, self.task_queue, self.result_queue)
        self.process = mp_ctx.Process(target=ccp.live, args=worker_args)
        self.process.start()

    def _send(self, op, **fields):
        """Queue a request dict: the 'OP' key first, then `fields` in call order."""
        self.task_queue.put({'OP': op, **fields})

    def _wait(self):
        """Block until one item arrives on the result queue; the item is dropped."""
        self.result_queue.get()

    def die(self):
        """Ask the worker to terminate, wait for its reply, then join the process."""
        self._send('TERMINATE')
        self._wait()
        self.process.join()

    def healthcheck(self):
        """Send a HEALTHCHECK request and block until the worker replies."""
        self._send('HEALTHCHECK')
        self._wait()

    def train_start(self, train_run_id, image_ids, consolidation_threshold, attempts_to_get_no_mistakes):
        """Queue a TRAIN request without waiting for it; pair with `train_finish`."""
        self._send('TRAIN',
                   train_run_id=train_run_id,
                   image_ids=image_ids,
                   consolidation_threshold=consolidation_threshold,
                   attempts_to_get_no_mistakes=attempts_to_get_no_mistakes)

    def train_finish(self):
        """Block until one result is available on the result queue."""
        self._wait()

In [8]:
# Spawn one host process per cortical column; host i serves exactly column i.
COLUMN_HOSTS = {
    host_index: CorticalColumnHost(host_index, [host_index])
    for host_index in range(config.cortical_columns_count)
}

# Block until every host has answered a healthcheck.
for column_host in COLUMN_HOSTS.values():
    column_host.healthcheck()
    print(f'Column host {column_host.host_id} is ready')

Column host 0 is ready
Column host 1 is ready
Column host 2 is ready
Column host 3 is ready
Column host 4 is ready
Column host 5 is ready
Column host 6 is ready


In [None]:
%%time

train_runs = 10
attempts_to_get_no_mistakes = 10
count = min(1000, len(df_train_images))
consolidation_threshold = 1000

for _ in tqdm(range(train_runs), desc='Train run'):
    TRAIN_RUN_ID += 1
    image_ids = RNG.choice(df_train_images.index.unique(), count, replace=False)
    
    for column_id, column in COLUMN_HOSTS.items():
        column.train_start(TRAIN_RUN_ID, image_ids, consolidation_threshold, attempts_to_get_no_mistakes)

    for column_id, column in tqdm(COLUMN_HOSTS.items(), desc='Column', leave=False):
        column.train_finish()

In [None]:
def softmax(x):
    """Return the softmax of `x`, computed over all of its elements.

    The maximum is subtracted before exponentiating so large inputs do not overflow.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    return weights / np.sum(weights)

def conflate(pdfs):
    """Combine several discrete distributions into one by normalized elementwise product.

    Args:
        pdfs: array-like whose first axis enumerates the distributions; they are
            multiplied together along that axis.

    Returns:
        The product divided by its sum. If the product sums to (almost) zero, an
        all-zero float array with the same shape as the product is returned instead.
    """
    n = np.prod(pdfs, axis=0)
    d = n.sum()

    if np.isclose(d, 0):
        # Size the fallback by the outcomes (shape of the product), not by the number
        # of input distributions, so both return paths have the same shape.
        return np.zeros(np.shape(n))

    return n / d

In [None]:
# TEST RUN
TEST_RUN_ID += 1

# (images dataframe, sqlite connection holding its encodings, dataset label recorded in the results)
test_run_source = (df_test_images, test_db_con, 'test')
# test_run_source = (df_train_images, train_db_con, 'train')

count = min(2000, len(test_run_source[0]))
test_image_ids = RNG.choice(test_run_source[0].index.unique(), count, replace=False)
test_result_rows = defaultdict(list)

# Columns allowed to vote in this run; every whitelisted id must exist in COLUMNS.
# NOTE(review): COLUMNS is not defined in the cells visible here (only COLUMN_HOSTS is) — confirm its origin.
column_id_whitelist = COLUMNS.keys()
# column_id_whitelist = [3]
assert all(column_id in COLUMNS for column_id in column_id_whitelist)

# Classify every sampled test image: each whitelisted column votes for the label of its
# best-matching engram, weighted by that engram's summed cosine similarity, and one result
# row is recorded per image.
# NOTE(review): COLUMNS and the column attributes read below (engram_norms, engrams, images_seen)
# are not defined in the cells visible in this notebook, which only build COLUMN_HOSTS — confirm
# where they come from before running this cell on a fresh kernel.
for image_id in tqdm(test_image_ids):
    column_votes_vector = np.zeros(10)  # one vote accumulator per label index 0..9

    # The true label depends only on the image, so read it once per image. Reading it inside
    # the column loop left it stale (or undefined) whenever every column was skipped.
    image_value = test_run_source[0].loc[image_id]['value']

    for column_id, column in COLUMNS.items():
        if column_id not in column_id_whitelist:
            continue

        max_cos_sim_index = -1 # aka engram id
        max_similar_engram_image_value = ''
        max_cos_sim = 0

        # Stored encodings of this image for this column (at least one is required).
        df_image_encodings = pd.read_sql('SELECT hdv FROM image_encodings WHERE image_id=:image_id AND column_id=:column_id',
                                         params={'image_id': int(image_id), 'column_id': column_id}, con=test_run_source[1])
        assert len(df_image_encodings) > 0
        image_encoding_hdvs = list(map(lambda h: np.frombuffer(h, dtype='b'), df_image_encodings['hdv']))
        image_encoding_hdvs_norm = hdc.normalize(image_encoding_hdvs)
        image_encoding_hdvs_norm = xp_array_to_gpu(image_encoding_hdvs_norm)

        # Rows: active engrams, columns: encodings of this image. Similarities below the
        # threshold are zeroed so they do not contribute to the per-engram sum.
        cos_sim_matrix = column.engram_norms.array_active @ image_encoding_hdvs_norm.T
        cos_sim_matrix[cos_sim_matrix < Hdc.COS_SIM_THRESHOLD] = 0
        cos_sim_vector = xp_array_from_gpu(xp.sum(cos_sim_matrix, axis=1)) # how each mem recall (sum cos sim) is close to current image

        assert cos_sim_vector.shape == (column.engram_norms.array_active.shape[0],)
        engram_ids_by_match_score = np.argsort(-cos_sim_vector) # sorted desc

        if engram_ids_by_match_score.shape[0] > 0:
            engram_id = engram_ids_by_match_score[0]
            cos_sim_value = cos_sim_vector[engram_id]

            # Only a strictly positive match casts a vote for the engram's label.
            if cos_sim_value > 0:
                max_cos_sim_index = engram_id
                max_similar_engram_image_value = column.engrams[engram_id].image_value
                max_cos_sim = cos_sim_value
                column_votes_vector[int(max_similar_engram_image_value)] += max_cos_sim

    infer_value = ''

    # Infer only when at least one column cast a positive vote; the label is then SAMPLED
    # from the softmax of the votes (not taken as the argmax).
    if np.any(column_votes_vector > 0):
        digit_probabilities = softmax(column_votes_vector)
        infer_value = str(RNG.choice(10, p=digit_probabilities))

    test_result_rows['test_run_id'].append(TEST_RUN_ID)
    test_result_rows['config_variant'].append(config.section_name)
    test_result_rows['cortical_columns'].append(sorted(column_id_whitelist))
    # NOTE(review): `column` is whatever the column loop iterated last (whitelisted or not);
    # this presumably assumes all columns have seen the same number of images — confirm.
    test_result_rows['train_images_count'].append(column.images_seen)
    test_result_rows['train_runs'].append(TRAIN_RUN_ID)
    test_result_rows['source_ds'].append(test_run_source[2])
    test_result_rows['test_image_id'].append(image_id)
    test_result_rows['true_value'].append(image_value)
    test_result_rows['infer_value'].append(infer_value)
    test_result_rows['sim'].append(column_votes_vector)
    

# The collected rows must provide exactly the accumulator's columns; on mismatch the
# assertion message shows the offending column names.
collected_keys = set(test_result_rows.keys())
expected_columns = set(DF_TEST_RESULTS.columns)
assert collected_keys == expected_columns, collected_keys ^ expected_columns

df_test_results_for_run = pd.DataFrame(test_result_rows, columns=DF_TEST_RESULTS.columns)

# Append this run to the accumulator, leaving an empty accumulator out of the concat.
if len(DF_TEST_RESULTS) > 0:
    concat_list = [DF_TEST_RESULTS, df_test_results_for_run]
else:
    concat_list = [df_test_results_for_run]

DF_TEST_RESULTS = pd.concat(concat_list, ignore_index=True)

In [None]:
# Per-test-run summary: how many images were tested, how many got an inference at all,
# how many inferences were correct, and the ratios derived from those counts.
df_test_results2 = (
    DF_TEST_RESULTS
    .assign(
        is_infer=lambda df: df['infer_value'] != '',
        is_correct_infer=lambda df: df['true_value'] == df['infer_value'],
    )
    .groupby(by=['test_run_id'])
    .agg({
        'config_variant': 'first',
        'source_ds': 'first',
        'cortical_columns': 'first',
        'train_images_count': 'first',
        'train_runs': 'first',
        'test_image_id': 'count',
        'is_infer': 'sum',
        'is_correct_infer': 'sum',
    })
    .rename(columns=dict(test_image_id='count', is_infer='infers', is_correct_infer='correct_infers'),
            errors='raise')
    .assign(
        infer_ratio=lambda df: df['infers'] / df['count'],
        accuracy_count=lambda df: df['correct_infers'] / df['count'],
        # A run with no inferences gives 0/0 = NaN; report it as 0.
        accuracy_infers=lambda df: (df['correct_infers'] / df['infers']).fillna(0),
    )
)

# One bar chart per summary ratio, side by side; x-axis is the test run id, bars are
# labelled with the ratio as a whole-number percentage.
run_labels = list(map(str, df_test_results2.index))
panels = [
    ('infer_ratio', 'Infers, %'),
    ('accuracy_count', 'Accuracy (Count), %'),
    ('accuracy_infers', 'Accuracy (Infers), %'),
]

plt.figure(figsize=(16, 4))

for position, (metric, title) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    bars = plt.bar(run_labels, list(df_test_results2[metric]))
    plt.bar_label(bars, [f'{p*100:.0f}' for p in df_test_results2[metric]], label_type='center')
    plt.ylim(0, 1)
    plt.title(title)
    plt.xlabel('Test run')
    plt.grid()

plt.show()
df_test_results2