In [2]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xadic_memory.mem_sdrsdm2 import DiadicMemory_Orig, DiadicMemory_Counters, DiadicMemory_SdmCounters, DiadicMemory_SdmCountersAndRetrieval
import xadic_memory.mem_sdrsdm2
import xadic_memory.sdr_util
from tqdm.notebook import tqdm

In [3]:
def clean_text(text):
    text = text.lower()
    text = text.replace(' ', '_')
    text = re.sub(r'[^a-z_]', '', text)
    return text

In [4]:
with open(os.path.join('text', 'text_sh.txt'), 'r', encoding='cp1251') as file:
    content = file.read().replace('\n', '')
    content = clean_text(content)

In [5]:
PERIOD_OF_IDS = 10
WINDOW_SIZE = 6
CODE_SIZE = 10000
SENTENCE_ACTIVE_BITS = 12
assert SENTENCE_ACTIVE_BITS % WINDOW_SIZE == 0
WORD_ACTIVE_BITS = SENTENCE_ACTIVE_BITS // WINDOW_SIZE

In [6]:
VOCAB = dict()
syms = [chr(x) for x in range(ord('a'), ord('z') + 1)]
syms += ['_']
#codes = set()
blum_filter = np.zeros(CODE_SIZE, dtype=int)

for sym in tqdm(syms):
    for p in tqdm(range(PERIOD_OF_IDS), desc=sym, leave=False):
        vocab_item_id = f'{sym}:{p}'
        vocab_item_code = np.zeros(CODE_SIZE, dtype=int)
        available_positions = np.argwhere(blum_filter == 0).reshape(-1)
        assert len(available_positions) >= WORD_ACTIVE_BITS
        fill_positions = np.random.choice(available_positions, WORD_ACTIVE_BITS, replace=False)
        vocab_item_code[fill_positions] = 1
        assert blum_filter @ vocab_item_code == 0
        blum_filter = blum_filter + vocab_item_code
        
        # while True:
        #     vocab_item_code = np.random.permutation(vocab_item_code)
        #     assert (vocab_item_code == 1).sum() == CODE_ACTIVE_BITS
        #     vocab_item_code_str = ''.join(map(str, vocab_item_code))

        #     if not vocab_item_code_str in codes:
        #         codes.add(vocab_item_code_str)
        #         break

        VOCAB[vocab_item_id] = vocab_item_code

# assert len(codes) == len(syms) * PERIOD_OF_IDS, (len(codes), len(syms))

  0%|          | 0/27 [00:00<?, ?it/s]

a:   0%|          | 0/10 [00:00<?, ?it/s]

b:   0%|          | 0/10 [00:00<?, ?it/s]

c:   0%|          | 0/10 [00:00<?, ?it/s]

d:   0%|          | 0/10 [00:00<?, ?it/s]

e:   0%|          | 0/10 [00:00<?, ?it/s]

f:   0%|          | 0/10 [00:00<?, ?it/s]

g:   0%|          | 0/10 [00:00<?, ?it/s]

h:   0%|          | 0/10 [00:00<?, ?it/s]

i:   0%|          | 0/10 [00:00<?, ?it/s]

j:   0%|          | 0/10 [00:00<?, ?it/s]

k:   0%|          | 0/10 [00:00<?, ?it/s]

l:   0%|          | 0/10 [00:00<?, ?it/s]

m:   0%|          | 0/10 [00:00<?, ?it/s]

n:   0%|          | 0/10 [00:00<?, ?it/s]

o:   0%|          | 0/10 [00:00<?, ?it/s]

p:   0%|          | 0/10 [00:00<?, ?it/s]

q:   0%|          | 0/10 [00:00<?, ?it/s]

r:   0%|          | 0/10 [00:00<?, ?it/s]

s:   0%|          | 0/10 [00:00<?, ?it/s]

t:   0%|          | 0/10 [00:00<?, ?it/s]

u:   0%|          | 0/10 [00:00<?, ?it/s]

v:   0%|          | 0/10 [00:00<?, ?it/s]

w:   0%|          | 0/10 [00:00<?, ?it/s]

x:   0%|          | 0/10 [00:00<?, ?it/s]

y:   0%|          | 0/10 [00:00<?, ?it/s]

z:   0%|          | 0/10 [00:00<?, ?it/s]

_:   0%|          | 0/10 [00:00<?, ?it/s]

In [7]:
def encode(content, offsets):
    df = pd.DataFrame(columns=['text'])
    
    for offset in offsets:
        window_texts = []
        window_offset_texts = []
        cumcodes = []
        
        window = []
        window_text = []
        window_offset_text = []
        
        for i, ch in enumerate(content):
            p = (i + offset) % PERIOD_OF_IDS
            q = f'{ch}:{p}'
            code = VOCAB[q]
            window.append(code)
            window_text.append(ch)
            window_offset_text.append(q)
        
            while len(window) > WINDOW_SIZE:
                window.pop(0)
                window_text.pop(0)
                window_offset_text.pop(0)
        
            if len(window) == WINDOW_SIZE:
                cumcode = np.sum(window, axis=0)
                cumcode[cumcode > 1] = 1
                assert (cumcode == 1).sum() == SENTENCE_ACTIVE_BITS
                cumcodes.append(cumcode)
    
                window_texts.append(''.join(window_text))
                window_offset_texts.append(''.join(window_offset_text))
        
        df['text'] = window_texts
        df[f'offset_{offset}_text'] = window_offset_texts
        df[f'offset_{offset}_code'] = cumcodes

    return df

In [8]:
encode(clean_text('hello_'), [0, 8])

Unnamed: 0,text,offset_0_text,offset_0_code,offset_8_text,offset_8_code
0,hello_,h:0e:1l:2l:3o:4_:5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",h:8e:9l:0l:1o:2_:3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [9]:
df = encode(content, [0, 1])
df.head()

Unnamed: 0,text,offset_0_text,offset_0_code,offset_1_text,offset_1_code
0,the_so,t:0h:1e:2_:3s:4o:5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",t:1h:2e:3_:4s:5o:6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,he_son,h:1e:2_:3s:4o:5n:6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",h:2e:3_:4s:5o:6n:7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,e_sonn,e:2_:3s:4o:5n:6n:7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",e:3_:4s:5o:6n:7n:8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,_sonne,_:3s:4o:5n:6n:7e:8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",_:4s:5o:6n:7n:8e:9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,sonnet,s:4o:5n:6n:7e:8t:9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",s:5o:6n:7n:8e:9t:0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
df_train, df_test = train_test_split(df[:10000])

In [11]:
mem = DiadicMemory_Counters(CODE_SIZE, SENTENCE_ACTIVE_BITS)

In [12]:
for _, df_train_row in tqdm(df_train.iterrows(), total=df_train.shape[0]):
    sdr_x = np.argwhere(df_train_row['offset_0_code'] > 0).reshape(-1)
    sdr_y = np.argwhere(df_train_row['offset_1_code'] > 0).reshape(-1)
    mem.store(sdr_x, sdr_y)

  0%|          | 0/7500 [00:00<?, ?it/s]

In [13]:
hamming_dists = []

for _, df_test_row in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    sdr_x = np.argwhere(df_test_row['offset_0_code'] > 0).reshape(-1)
    sdr_y = np.argwhere(df_test_row['offset_1_code'] > 0).reshape(-1)
    sdr_y_q = mem.query(sdr_x)
    #hamming_dist = np.count_nonzero(sdr_y != sdr_y_q)
    hamming_dist = len(set(sdr_y) ^ set(sdr_y_q))
    hamming_dists.append(hamming_dist)

hamming_dists = np.array(hamming_dists)
hamming_dists.mean()

  0%|          | 0/2500 [00:00<?, ?it/s]

np.float64(2.3936)

In [14]:
sdr_x, sdr_y = encode(clean_text('hello_'), [0, 1]).iloc[0][['offset_0_code', 'offset_1_code']]
sdr_x = np.argwhere(sdr_x > 0).reshape(-1)
sdr_y = np.argwhere(sdr_y > 0).reshape(-1)
mem.query(sdr_x), sdr_y

(array([  43,  763, 1645, 2850, 3254, 4450, 4609, 6487, 7074, 7879, 8504,
        9372]),
 array([  43,  763, 2850, 3254, 3348, 4450, 4609, 6487, 7016, 7879, 8504,
        9372]))

In [27]:
M = np.zeros(CODE_SIZE)
vec_from = df_train.loc[0]['offset_0_code']
vec_to = df_train.loc[0]['offset_1_code']
M = np.logical_or(M, np.logical_xor(vec_from, vec_to)).astype(int)
assert np.array_equal(np.logical_xor(vec_from, M).astype(int), vec_to)

In [34]:
M = np.zeros(CODE_SIZE)
hamming_dists = []

for _, row in df_train[:100].iterrows():
    vec_from = row['offset_0_code']
    vec_to = row['offset_1_code']
    M = np.logical_or(M, np.logical_xor(vec_from, vec_to)).astype(int)
    #assert np.array_equal(np.logical_xor(vec_from, M).astype(int), vec_to)
    vec_to_m = np.logical_xor(vec_from, M).astype(int)
    hamming_dist = np.count_nonzero(vec_to_m != vec_to)
    hamming_dists.append(hamming_dist)

np.array(hamming_dists).mean()

np.float64(312.0)

In [15]:
P = np.zeros((6,6), dtype=int)
pool = list(range(6))
i = 0

while pool:
    p = pool.pop(np.random.randint(len(pool)))
    P[i, p] = 1
    i += 1

v = np.random.randint(2, size=6)
print(f'1) {v}')
v_hat = P @ v
print(f'2) {v_hat}')
v_hat_hat = P.T @ v_hat
print(f'3) {v_hat_hat}')

1) [1 1 1 1 0 0]
2) [1 0 0 1 1 1]
3) [1 1 1 1 0 0]
