In [None]:
# Check whether the results depend on shuffling.
# Tested on the smallest dataset.
%matplotlib inline
import matplotlib.pyplot as plt
import subprocess
import os
import random
import shutil
from collections import Counter
import statistics

# Input dataset and scratch-directory configuration for the whole notebook.
DATASET_DIR = "../tests/input_files/test_100_25_25/"
TEMP_DIR = "temp"
# os.listdir() returns entries in arbitrary order; sort them so that any
# slice of DATASET_FILES selects the same files on every run.
DATASET_FILES = sorted(os.listdir(DATASET_DIR))
ANSWER = 25   # value handed to the CSP binary and used as the baseline answer
STRNUM = 50   # exclusive upper bound of the shuffle-argument loop
STRLEN = 100  # number of columns inspected per line in the statistics stage
# Create the scratch directory; this is a no-op when it already exists.
os.makedirs(TEMP_DIR, exist_ok=True)

In [None]:
# Create shuffled replicates of the first 25 dataset files.
# Every replicate is written to TEMP_DIR and its path is collected in rep_files.
rep_files = []
# replicate index -> minimal answer; filled in by the CSP stage.
# Defined once, before the loop, so it exists even when the dataset is empty.
replicate_min = {}
print("Overall {} files in the dataset".format(len(DATASET_FILES)))
for num, to_check in enumerate(DATASET_FILES[:25]):
    to_check_path = os.path.join(DATASET_DIR, to_check)

    for i in range(1, STRNUM):
        out_file = os.path.join(TEMP_DIR, "rep_{}_{}.txt".format(num, i))
        # Pass the arguments as a list (no shell), so paths containing spaces
        # or shell metacharacters cannot break or alter the command.
        # `i` is the second positional argument of shuffle_input.py; its exact
        # meaning is not visible from this notebook.
        shuffle_cmd = ["../shuffle_input.py", to_check_path, str(i),
                       "-o", out_file, "-c"]
        # check_call raises CalledProcessError on a non-zero exit status, so a
        # failed shuffle is reported here rather than in the later CSP stage.
        subprocess.check_call(shuffle_cmd)
        rep_files.append(out_file)

In [None]:
# Run CSP on every shuffled replicate and record the minimal answer for each.
# This is a long stage.
# Reset the result mapping here so the cell does not depend on state left
# behind by another cell and a re-run always starts from an empty dict.
replicate_min = {}  # replicate index -> minimal answer
print("Overall {} files to go".format(len(rep_files)))
for num, rep in enumerate(rep_files):
    # Argument list instead of a shell string: no quoting problems with paths.
    csp_cmd = ["../CSP", rep, str(ANSWER), "-v", "1"]
    csp_out = subprocess.check_output(csp_cmd).decode("utf-8").split("\n")
    # The third line from the end is compared with the literal "True".
    answ = csp_out[-3]
    if answ == "True":
        replicate_min[num] = ANSWER
    else:
        # Otherwise the fifth line from the end is split on whitespace; its
        # last and third-from-last tokens are parsed as integers and their
        # difference is added to ANSWER.
        # NOTE(review): the names suggest "expected" vs "real" coverage -
        # confirm against the CSP output format.
        max_cov_line = csp_out[-5].split()
        exp = int(max_cov_line[-1])
        real = int(max_cov_line[-3])
        diff = exp - real
        replicate_min[num] = ANSWER + diff

In [None]:
# Text histogram: for every value between the smallest and the largest
# minimal answer, print the value followed by one asterisk per replicate.
min_times = Counter(replicate_min.values())
times_sort = sorted(min_times)
first_value = times_sort[0]
last_value = times_sort[-1]
for t in range(first_value, last_value + 1):
    # A Counter yields 0 for values that never occurred.
    stars = "*" * min_times[t]
    print("{}:\n{}".format(t, stars))
# Definitely, the algorithm depends on the initial string.

In [None]:
# Collect per-replicate statistics on the number of "0" characters,
# both per line and per column, to compare them with the minimal answer K.
rep_zeros = {}           # replicate index -> sorted per-line zero counts, smallest dropped
rep_col_zeros = {}       # replicate index -> zero count of each of the STRLEN columns
rep_k = {}               # replicate index -> minimal answer taken from replicate_min
nums = range(len(rep_files))
maxes = []               # largest per-line zero count of each replicate
mins = []                # second-smallest per-line zero count of each replicate
ks = []                  # minimal answer of each replicate, aligned with the lists above
deltas = []              # maxes[i] - mins[i]
aves = []                # mean of the per-line zero counts
medians = []             # median of the per-line zero counts
stds = []                # sample standard deviation of the per-line zero counts
fline_to_pat_sizes = {}  # replicate index -> count of "1" in each column

for num, rep in enumerate(rep_files):
    # The context manager closes the file even if reading fails.
    with open(rep, "r") as f:
        # Strip only the newline: slicing off the last character would drop a
        # real symbol when the final line has no trailing newline.
        lines = [l.rstrip("\n") for l in f]

    k = replicate_min[num]
    zeros = sorted(line.count("0") for line in lines)
    # The smallest count is skipped (original note: "because in 0 -> 0");
    # presumably one line contains no zeros at all - TODO confirm.
    rep_zeros[num] = zeros[1:]
    rep_k[num] = k
    ks.append(k)
    max_zeros = zeros[-1]
    min_zeros = zeros[1]
    maxes.append(max_zeros)
    mins.append(min_zeros)
    delta = max_zeros - min_zeros
    deltas.append(delta)
    # NOTE(review): mean/median/stdev are taken over all counts, including the
    # smallest one that min_zeros skips - confirm this is intended.
    aves.append(statistics.mean(zeros))
    medians.append(statistics.median(zeros))
    stds.append(statistics.stdev(zeros))
    fline_to_pat_sizes[num] = []
    column_z = []
    for i in range(STRLEN):
        # Column i across all lines of the replicate.
        i_col = [line[i] for line in lines]
        size_ = i_col.count("1")
        c_zeros = i_col.count("0")
        column_z.append(c_zeros)
        fline_to_pat_sizes[num].append(size_)
    rep_col_zeros[num] = column_z
        

In [None]:
# Scatter every per-replicate zero statistic against K, one subplot per
# statistic, and save the figure as an SVG file.
fig = plt.figure(figsize=(30, 20))
rows = 2
cols = 3

datasets = [maxes, mins, deltas, aves, medians, stds]
names = ["Max_zeros", "Min_zeros", "Min_max_delta",
         "Mean", "Median", "Std"]
colors = ["red", "green", "blue", "magenta", "cyan", "#555555"]
assert len(datasets) == len(names) == len(colors)  # just in case

for num, (values, label, color) in enumerate(zip(datasets, names, colors), 1):
    ax = fig.add_subplot(rows, cols, num)
    ax.scatter(ks, values, s=10, color=color)
    ax.set_xlabel("K")
    ax.set_ylabel(label)
    ax.grid(which="both")

# savefig() does not create missing directories, so make sure the target
# directory exists before writing the file.
plot_path = "plots/shuffle_params.svg"
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
plt.savefig(plot_path)
plt.show()
# somehow depends on max zeros and delta?

In [None]:
# Scatter of max zeros vs. min zeros, one gray shade per distinct K value.
from collections import defaultdict

fig = plt.figure(figsize=(10, 10))
plt.grid()

K_range = list(range(min(ks), max(ks) + 1))
K_range_size = len(K_range)
# Bounds of the channel value used for the shade; 16 keeps it two hex digits.
max_R = 255
min_R = 16
G = "1"
B = "1"
C_ = "#{}{}{}"

# K -> list of (max zeros, min zeros) pairs of the replicates with that K.
k_to_m_m = defaultdict(list)
for k_value, zeros_hi, zeros_lo in zip(ks, maxes, mins):
    k_to_m_m[k_value].append((zeros_hi, zeros_lo))

k_keys = sorted(set(ks))
step = int((max_R - min_R) / K_range_size)
for shade_idx, k_ in enumerate(k_keys):
    # The same hex pair is used for all three channels, giving a gray tone.
    R = format(min_R + step * shade_idx, "X")
    pairs = k_to_m_m[k_]
    xs = [pair[0] for pair in pairs]
    ys = [pair[1] for pair in pairs]
    plt.scatter(xs, ys, s=40, color=C_.format(R, R, R), alpha=0.5)
plt.xlabel("Max zeros")
plt.ylabel("Min zeros")
plt.show()

In [None]:
# Clean up: remove the scratch directory with the shuffled replicates, if present.
if os.path.isdir(TEMP_DIR):
    shutil.rmtree(TEMP_DIR)