In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import random
import numpy as np
import itertools

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.ticker as ticker


In [3]:
def kolakoski_gen(start_items):
    """Yield, one by one, the terms of the Kolakoski sequence over ``start_items``.

    The symbols in ``start_items`` are consumed cyclically.  Each step writes
    the next symbol into the sequence, reads the term at the current read
    position as a run length, yields that term, and then pads the run with
    further copies of the symbol.  For a non-empty ``start_items`` the
    generator loops forever, so callers have to slice it.
    """
    symbols = itertools.cycle(start_items)
    terms = []  # everything written so far; runs ahead of what has been yielded
    for read_pos in itertools.count():
        symbol = next(symbols)
        terms.append(symbol)
        run_length = terms[read_pos]
        yield run_length
        # One copy of the run is already in place; add the remaining ones.
        if run_length > 1:
            terms.extend([symbol] * (run_length - 1))

def kolakoski(start_items=(1, 2), length=22):
    """Return the first ``length`` terms yielded by ``kolakoski_gen(start_items)`` as a list."""
    term_stream = kolakoski_gen(start_items)
    prefix = itertools.islice(term_stream, length)
    return list(prefix)

def binary_to_uint(seq):
    """Pack the terms of ``seq`` into integers, eight terms per integer.

    Every term contributes the digit ``str(term - 1)``.  Each group of eight
    consecutive digits is parsed as a base-2 number (first term of the group
    is the most significant digit) and appended to the result.  Terms left
    over after the last complete group of eight are dropped.
    """
    packed = []
    digits = []  # digits collected for the group currently being filled
    for term in seq:
        digits.append(str(term - 1))
        if len(digits) == 8:
            packed.append(int("".join(digits), 2))
            digits.clear()
    return packed

def insert_pair(ds_pairs, counts, start_idxs, key, key_value, s_idx):
    """Record one observation of ``key`` being followed by ``key_value``.

    The three dictionaries are updated in place and hold parallel lists per key:

    * ``ds_pairs[key]``   -- the distinct values seen after ``key``
    * ``counts[key]``     -- how many times each of those values was seen
    * ``start_idxs[key]`` -- the ``s_idx`` at which each distinct value was
      first seen

    Args:
        ds_pairs: dict mapping key -> list of distinct values.
        counts: dict mapping key -> list of occurrence counts.
        start_idxs: dict mapping key -> list of first-seen start indices.
        key: hashable identifier of the context.
        key_value: the value observed after ``key``.
        s_idx: start index of this observation.
    """
    if key not in ds_pairs:
        ds_pairs[key] = [key_value]
        counts[key] = [1]
        start_idxs[key] = [s_idx]
        return

    values = ds_pairs[key]
    try:
        idx_value = values.index(key_value)
    except ValueError:
        # First time this value follows the key: extend all three lists so
        # they stay index-aligned (start_idxs used to be left behind here).
        values.append(key_value)
        counts[key].append(1)
        start_idxs.setdefault(key, []).append(s_idx)
    else:
        counts[key][idx_value] += 1

def str_to_array(str_key):
    """Parse the ``str()`` form of a flat list of ints back into a list of ints.

    Square brackets are stripped and the remainder is split on commas, so
    ``"[1, 2, 3]"`` becomes ``[1, 2, 3]``.  The string form of an empty list
    (``"[]"``) yields ``[]``.

    Raises:
        ValueError: if any comma-separated item is not an integer literal.
    """
    body = str_key.replace("[", "").replace("]", "").strip()
    if not body:
        # "[]" leaves nothing to split; int("") would raise otherwise.
        return []
    return [int(item) for item in body.split(",")]


In [None]:
# check duplicates in train/valid
# NOTE(review): this cell reads bytes_list, idxs_train and idxs_valid, which are
# only assigned by cells further down the notebook, so it cannot run on a fresh
# kernel in its current position.
# NOTE(review): the split cell builds idxs_train / idxs_valid as row indices of
# unique_dataset_np, while here they are used as start offsets into bytes_list
# -- confirm which of the two is intended.
pair_seq_len = 40

# Dedicated tables, so the pairs/counts/start_idxs built by the unique-pairs
# cell are not overwritten when this check is run afterwards.
train_pairs = {}
train_counts = {}
train_start_idxs = {}

# generate pairs
for s_idx in idxs_train:
    e_idx = s_idx + pair_seq_len
    insert_pair(train_pairs, train_counts, train_start_idxs,
                str(bytes_list[s_idx:e_idx]), bytes_list[e_idx], s_idx)

new_key = 0      # validation windows never seen in train
old_key = 0      # validation windows that also occur in train
diff_values = 0  # ... of which the following byte was never seen in train
for s_idx in idxs_valid:
    e_idx = s_idx + pair_seq_len
    key = str(bytes_list[s_idx:e_idx])

    #  exists?
    if key not in train_pairs:
        new_key += 1
    else:
        if bytes_list[e_idx] not in train_pairs[key]:
            diff_values += 1
        old_key += 1
#     different_ending?
print("new keys in valid: ", new_key)
print("repeating keys in train and valid:", old_key, ". With different values:", diff_values)
print("percent same values: %.2f" % ((old_key-diff_values)*100/len(idxs_valid)))
# kolakoski_len = int(8 * 1e6) => percent same values: 91%
# kolakoski_len = int(8 * 1e7) => percent same values: 95%
# kolakoski_len = int(8 * 1e8) => percent same values: 100%

In [10]:
kolakoski_len = int(8 * 1e8)  # number of sequence terms to generate
start_items = (1, 2)

# generate sequence
# start_items is passed explicitly so that editing it above actually changes the
# generated sequence (it used to be assigned but never used).
# binary_to_uint turns each term into the binary digit str(term - 1), so
# symbols other than 1 and 2 will not pack into valid base-2 digits.
full_kola_seq = kolakoski(start_items=start_items, length=kolakoski_len)
bytes_list = binary_to_uint(full_kola_seq)

In [11]:
# Build, for every window of pair_seq_len packed values, the set of values
# that follow it (pairs), how often each one follows (counts) and where the
# window was first seen (start_idxs).
pair_seq_len = 40
results = []

pairs, counts, start_idxs = {}, {}, {}

# generate pairs: slide the window one position at a time; the last start index
# is chosen so that the value right after the window still exists.
n_windows = len(bytes_list) - pair_seq_len
for s_idx in range(n_windows):
    e_idx = s_idx + pair_seq_len
    window_key = str(bytes_list[s_idx:e_idx])
    next_value = bytes_list[e_idx]
    insert_pair(pairs, counts, start_idxs, window_key, next_value, s_idx)

# stats - count pairs
n_keys = len(pairs)
first_key = list(pairs)[0]
length_key = first_key.count(", ") + 1  # number of items in a stringified key

# Note: the first figure printed is len(bytes_list), i.e. the packed length.
print("Kolakoski seq len:", len(bytes_list))
print("Number of unique sequences:", n_keys)



Kolakoski seq len: 100000000
Number of unique sequences: 308862


In [12]:
# Keep only the windows that are always followed by one and the same value.
# Each kept row is the concatenation of three lists: the recorded start
# index, the window parsed back from its string key, and the single
# following value.
unique_dataset = [
    start_idxs[pair_key] + str_to_array(pair_key) + next_values
    for pair_key, next_values in pairs.items()
    if len(next_values) == 1
]
unique_dataset_np = np.array(unique_dataset)


In [13]:
dir_path = "dataset/1e8_unique"
# Create the output directory (and any missing parents) without spawning a shell.
os.makedirs(dir_path, exist_ok=True)

# Split train/valid
# The permutation is drawn from NumPy's global RNG, so that is the generator
# that must be seeded for the split to be reproducible; seeding only the
# stdlib `random` module left the split different on every run.
random.seed(42)
np.random.seed(42)
perm = np.random.permutation(unique_dataset_np.shape[0])
train_len = int(len(perm) * 0.8)  # 80% of the rows go to train
idxs_train = perm[:train_len]
idxs_valid = perm[train_len:]

np.save("%s/kolakosky.npy" % dir_path, unique_dataset_np)
np.save("%s/train_idxs.npy" % dir_path, idxs_train)
np.save("%s/valid_idxs.npy" % dir_path, idxs_valid)

In [14]:
# Sanity check: rows are kept windows; columns are 1 start index +
# pair_seq_len window values + 1 following value.
unique_dataset_np.shape

(290474, 42)