# Load the data

In [25]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Activation, Dense, Flatten, Reshape, Conv1D, Conv2D, Add, BatchNormalization, MaxPooling1D)
import numpy as np
from random import sample, shuffle

K = 5  # number of folds the dataset is cut into further down

# The amino alphabet (20 one-letter residue codes) and its letter -> index lookup
aa1 = list("ACDEFGHIKLMNPQRSTVWY")
aa2int = {c: i for i, c in enumerate(aa1)}

# Parsing of filtered dataset file: one "<sequence>;<label>" record per line.
# The file is read inside a `with` block so the handle is closed right away.
# Only the line terminator is stripped (never a blind last character), so a
# final line without a trailing newline keeps its full label.
with open("../data/nodups.fasta", 'r') as fasta_file:
    raw = [line.rstrip("\n") for line in fasta_file]
raw = [line for line in raw if line]  # ignore blank lines (e.g. at end of file)
labels = np.array([int(i.split(";")[1]) for i in raw])
seq = [i.split(";")[0] for i in raw]
data = list(zip(seq, labels))
longest = max(len(s) for s in seq)  # Currently 8797

# Round the longest sequence length up to the next power of two.
# Pure integer arithmetic: ceil(log(n) / log(2)) can land one step too high
# for exact powers of two because of floating-point rounding in the quotient.
longest = np.int64(1 << (int(longest) - 1).bit_length())

In [26]:
# Peek at the first and the last parsed (sequence, label) record.
display(data[0], data[-1])

('MVQINGSYKLEKSDNFDAFLKELGLNFVTRNLAKSATPTVEVSVNGDSYTIKTASTLKNTEISFKLGEEFEEARADGKTVKTVVNKESDTKFVQVQQGDKEVTIVREFSDEGLTVTATVNGVTSVRFYKRQ',
 1)

('MNNEEDLLQEDSTRDEGNETEANSMNTLRRTRKKVTKPYVCSTEVGETDMSNSNDCMRDSSQILTPPQLSSRMKHIRQAMAKNRLQFVRFEATDLHGVSRSKTIPAHFFQEKVSHGVCMPRGYLEVIPNPKDNEMNNIRATCFNSDIVLMPELSTFRVLPWADRTARVICDTFTVTGEPLLTSPRYIAKRQLSHLQASGFSLLSAFIYDFCIFGVPEILNSKIISFPALTFLNNHDQPFMQELVDGLYHTGANVESFSSSTRPGQMEISFLPEFGISSADNAFTLRTGVKEVARKYNYIASFFIETGFCDSGILSHSLWDVDRKKNMFCSTSGTEQLTITGKKWLAGLLKHSAALSCLMAPSVSCRKRYSKDRKDLKKSVPTTWGYNDNSCIFNIKCHGEKGTRIENKLGSATANPYLVLAATVAAGLDGLHSSNEVLAGPDESTDFYQVEPSEIPLKLEDALVALEEDQCLRQALGETFIRYFVAMKKYELENEEIAAERNKFLEYFI',
 0)

## Divide into allergens / non-allergens and perform the K-fold split

In [27]:
# Partition the records by their label: 1 goes to `allergens`,
# 0 goes to `non_allergens`; any other label is left out of both lists.
allergens, non_allergens = [], []
for record in data:
    label = record[1]
    if label == 1:
        allergens.append(record)
    elif label == 0:
        non_allergens.append(record)

In [28]:
# Randomise the order inside each class, in place, before cutting folds.
for class_records in (allergens, non_allergens):
    shuffle(class_records)

In [45]:
def chunkIt(seq, num):
    """Split ``seq`` into exactly ``num`` contiguous chunks of near-equal size.

    Chunk ``i`` covers ``seq[i * len(seq) // num : (i + 1) * len(seq) // num]``,
    so chunk sizes differ by at most one element and concatenating the chunks
    in order reproduces ``seq``.

    The boundaries are computed with integer arithmetic. Accumulating a float
    step instead can leave the running offset a hair below ``len(seq)`` after
    ``num`` chunks (ten additions of 0.1 give 0.9999999999999999), which
    produces a spurious extra chunk.

    Args:
        seq: Any sized, sliceable sequence (list, range, str, ...).
        num: Number of chunks to produce; must be a positive integer.

    Returns:
        A list of ``num`` slices of ``seq``. Chunks may be empty when
        ``len(seq) < num``.

    Raises:
        ValueError: If ``num`` is smaller than 1.
    """
    if num < 1:
        raise ValueError(f"num must be a positive integer, got {num!r}")

    total = len(seq)
    # num + 1 cut points; the first is always 0 and the last always len(seq).
    bounds = [(i * total) // num for i in range(num + 1)]
    return [seq[lo:hi] for lo, hi in zip(bounds, bounds[1:])]

# Cut each class into K contiguous folds separately, so every combined fold
# receives its share of both allergens and non-allergens.
allergens_split = [list(part) for part in chunkIt(allergens, K)]
non_allergens_split = [list(part) for part in chunkIt(non_allergens, K)]

# Fail loudly if either class did not yield exactly K folds: indexing with
# range(K) below would otherwise silently drop the records of any extra fold
# (or raise an opaque IndexError when folds are missing).
if len(allergens_split) != K or len(non_allergens_split) != K:
    raise ValueError(
        f"expected {K} folds per class, got {len(allergens_split)} allergen "
        f"and {len(non_allergens_split)} non-allergen folds"
    )

# Combine the splits
data_split = [allergens_split[k] + non_allergens_split[k] for k in range(K)]

# Shuffle each entry so the two classes are interleaved inside every fold
for fold in data_split:
    shuffle(fold)

# Store to file

In [69]:
import os
import datetime
def _write_records(path, records):
    """Write ``records`` to ``path``, one ``<sequence>;<label>`` line per record."""
    with open(path, "w") as handle:
        for sequence, label in records:
            handle.write(f"{sequence};{label}\n")


# One timestamp shared by every file written in this run.
the_timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Dump each of the K folds to its own file.
for k in range(K):
    _write_records(f".//data_{k}_{the_timestamp}.dat", data_split[k])

# Store the different splits directly.
# Every ordered pair (v, t) of distinct folds gives one split: fold v is the
# validation set, fold t the test set, and the remaining folds are the training
# set, i.e. K * (K - 1) splits in total, numbered from 1.
ksplit = 0
for v in range(K):
    for t in range(K):
        if t != v:
            # Pool the records of every fold that is held out neither for
            # validation nor for testing, then mix them.
            train = [
                (x, y)
                for i in range(K)
                if i not in (v, t)
                for x, y in data_split[i]
            ]
            shuffle(train)

            _write_records(f".//data_split_{ksplit+1}_train_{the_timestamp}.dat", train)
            _write_records(f".//data_split_{ksplit+1}_test_{the_timestamp}.dat", data_split[t])
            _write_records(f".//data_split_{ksplit+1}_valid_{the_timestamp}.dat", data_split[v])

            ksplit += 1
    

#onehot_coded_data = np.array(
#    list(map(onehot, list([i for i in seq])))
#)

#display(onehot_coded_data[0])
#display(onehot_coded_data[-1])