# Preprocessing mini scripts useful for the multiple LSTM sub-pipelines

# Disclaimer:

##### This notebook is a sandbox that evolved with the project; code snippets were added progressively. Using this code requires an understanding of the pipeline presented in the report.
##### It remains a sandbox, partly because no positive classification accuracy was obtained.
##### This script contains many steps that are not necessarily interdependent and do not necessarily follow a fixed order.
##### Each part has its own purpose, allowing flexible experimentation and avoiding redundancy across multiple scripts.
##### CHECKPOINTS are, as the name suggests, points at which processed data is saved to or loaded from disk. This is useful for saving time.

In [None]:
# !pip install keras
# !pip install tensorflow
# !pip install numpy
# !pip install Bio 
# !pip install tensorflow_addons


In [1]:
import numpy as np
import pickle
# import swifter
import gc
import timeit
import pandas as pd
from Bio import SeqIO
import time
%matplotlib inline
import pickle
from tqdm.auto import tqdm

from keras.models import Sequential
from keras.layers import Dense, Bidirectional
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
from sklearn.model_selection import train_test_split
import tensorflow as tf

## some toy code to test the numpy.pad() method

In [68]:
# Toy check of numpy.pad(): the ragged rows are stored as Python lists inside
# a 1-D object array (more rows of other lengths can be added to experiment).
sequences = np.array(
    [[1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 2, 3], [1, 'a', 'hi']],
    dtype=object,
)
# Length of the longest row; every row is padded on the right up to it.
maxlen = max(map(len, sequences))
strSeq = [
    np.pad(seq, (0, maxlen - len(seq)), mode='reflect')
    for seq in sequences
]
strSeq

[array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([1, 2, 3, 2, 1, 2, 3, 2, 1]),
 array(['1', 'a', 'hi', 'a', '1', 'a', 'hi', 'a', '1'], dtype='<U11')]

## Sequence processing and Padding (repeat vs zero padding)
The Colab session crashes when training the model on sequences of length 14999.

In [71]:
# # Read fasta sequence
# from Bio import SeqIO
# # chromid = []
# sequences = []
# # sequences = np.empty(shape=(0),dtype=object)
# for sample in SeqIO.parse(r'D:\DataSet\Data\sequences.fasta', "fasta"):
#     # chromid.append(sample.id)
#     sequences.append(list(str(sample.seq)))
# #     sequences = np.append(sequences,str(sample.seq))



## Zero padding

In [None]:
# Post-pad the sequences with the placeholder symbol "Z" (not a numeric zero;
# toNum maps every symbol other than A/C/G/T to 0 when encoding).
strSeq = sequence.pad_sequences(
    sequences,
    dtype=object,
    padding='post',
    value="Z",
)

In [4]:
# Small integer example: post-pad the rows with 0.
sequences = [
    [1, 3, 5, 3, 5, 1, 7, 7, 2],
    [2, 4, 32, 2, 3],
]
strSeq = sequence.pad_sequences(
    sequences,
    value=0,
    padding='post',
    dtype=object,
)
strSeq

array([[1, 3, 5, 3, 5, 1, 7, 7, 2],
       [2, 4, 32, 2, 3, 0, 0, 0, 0]], dtype=object)

## Repeat padding: padding with the sequence itself

In [72]:
# Pad every sequence on the right, up to the longest length, with its own
# values. mode='reflect' mirrors the sequence around its last element
# (e.g. [1, 2, 3] -> [1, 2, 3, 2, 1, 2, 3, ...]).
maxlen = max(map(len, sequences))
strSeq = [
    np.pad(seq, (0, maxlen - len(seq)), mode='reflect')
    for seq in sequences
]


In [None]:
# Wrap the sequences in a DataFrame with a single 'Sequences' column and a
# 1-based index.
row_labels = np.arange(1, len(sequences) + 1)
df = pd.DataFrame(sequences, columns=['Sequences'], index=row_labels)
# strSeq is bound to the same object as sequences for the following cells.
strSeq = sequences

## Sequential encoding of raw sequences composed only of A, C, G, T and Z

In [75]:
def toNum(seq):
    """Encode a sequence of nucleotide symbols as a list of integer codes.

    'A' -> 1, 'C' -> 2, 'G' -> 3, 'T' -> 4; every other symbol (for example
    the "Z" padding symbol or a lowercase letter) is encoded as 0.
    """
    codes = {'A': 1, 'C': 2, 'G': 3, 'T': 4}
    return [codes.get(symbol, 0) for symbol in seq]

In [76]:
# Encode every (padded) symbol sequence into its list of integer codes.
sequences = list(map(toNum, strSeq))

### Transform the list into a NumPy array for a smaller storage size

In [None]:
# Pack the encoded sequences into a NumPy array with object dtype.
# NOTE(review): if every row has the same length, a compact integer dtype
# (e.g. np.uint8 for the codes 0-4) would presumably be much smaller - confirm.
sequences = np.array(sequences, dtype=object)

## Checkpoint: Read or write padded/ encoded sequences to disk

## Write to disk

In [8]:
# Checkpoint: serialise the encoded sequences to disk with pickle.
with open(r'D:\DataSet\LstmData\repeatPad-numSeq-2.pkl', mode='wb') as out_file:
    pickle.dump(sequences, out_file)

## Load from disk 

In [3]:
# Checkpoint: restore previously encoded sequences from disk.
# NOTE(review): this reads repeatPad-numSeq.pkl, whereas the write cell saves
# repeatPad-numSeq-2.pkl - confirm which file is intended.
with open(r'D:\DataSet\LstmData\repeatPad-numSeq.pkl', mode='rb') as in_file:
    sequences = pickle.load(in_file)


In [4]:
# display(sequences[1])



## test print a single encoded sequence 

In [5]:
# Bare expression: the notebook echoes the encoded sequence at index 3 as the
# cell output.
sequences[3]

array([4, 2, 2, ..., 1, 3, 1], dtype=object)

In [None]:
# with open(r'D:\DataSet\LstmData\repeatPad-numSeq.pkl','wb') as f:
#     pickle.dump(sequences, f)

# load data, optionally trimming sequence length

In [5]:
# Alternative checkpoints on Colab (sliced variants), kept for reference:
# with open('/content/drive/MyDrive/Colab Notebooks/Data/numSeq-sliced.pkl','rb') as f:
#     sequences = pd.DataFrame(pickle.load(f))
# with open('/content/drive/MyDrive/Colab Notebooks/Data/numSeq-sliced-5000.pkl','rb') as f:
#     sequences = np.asarray(pickle.load(f))

# Load the encoded sequences from the local checkpoint and expose them as a
# NumPy array (use `sequences = pickle.load(...)` directly to keep the raw object).
with open(r'D:\DataSet\LstmData\numSeq.pkl', mode='rb') as pkl_file:
    sequences = np.asarray(pickle.load(pkl_file))

In [6]:
# Inspect the type of the first loaded element (echoed as the cell output).
type(sequences[0])
# Other quick inspections, left disabled:
# sequences.shape
# sequences

numpy.ndarray

## Checkpoint: Split the sequences into five chunks, to upload later to the Colab drive

In [13]:
# Split the sequences into five chunks along the first axis.
# np.array_split is used rather than np.split: it returns the same chunks when
# the number of rows is a multiple of 5, and otherwise produces chunks whose
# sizes differ by at most one row instead of raising ValueError.
s1, s2, s3, s4, s5 = np.array_split(sequences, 5)

In [21]:
# Split the labels into five chunks along the first axis.
# np.array_split is used rather than np.split: it returns the same chunks when
# the number of labels is a multiple of 5, and otherwise produces chunks whose
# sizes differ by at most one item instead of raising ValueError.
# NOTE(review): `labels` is not defined in the cells shown here; it must
# already exist in the session before this cell runs.
l1, l2, l3, l4, l5 = np.array_split(labels, 5)

## Save chunks to disk

In [30]:
# Checkpoint: write each of the five sequence chunks to its own pickle file.
for chunk_path, chunk in (
    (r'D:\DataSet\LstmData\chuncks\chunck-1.pkl', s1),
    (r'D:\DataSet\LstmData\chuncks\chunck-2.pkl', s2),
    (r'D:\DataSet\LstmData\chuncks\chunck-3.pkl', s3),
    (r'D:\DataSet\LstmData\chuncks\chunck-4.pkl', s4),
    (r'D:\DataSet\LstmData\chuncks\chunck-5.pkl', s5),
):
    with open(chunk_path, 'wb') as f:
        pickle.dump(chunk, f)
    
  

In [32]:
# Render the first chunk as a DataFrame to inspect it (echoed as the cell
# output).
pd.DataFrame(s1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14989,14990,14991,14992,14993,14994,14995,14996,14997,14998
0,3,3,1,3,2,3,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,4,3,1,2,3,2,3,3,2,4,...,0,0,0,0,0,0,0,0,0,0
2,1,2,4,4,4,2,3,2,1,2,...,0,0,0,0,0,0,0,0,0,0
3,4,2,2,2,2,1,1,3,3,4,...,0,0,0,0,0,0,0,0,0,0
4,2,4,2,2,4,2,1,1,3,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,2,2,3,3,1,2,4,1,3,4,...,0,0,0,0,0,0,0,0,0,0
11996,4,1,4,2,2,3,4,1,2,3,...,0,0,0,0,0,0,0,0,0,0
11997,4,4,2,2,4,4,4,1,2,3,...,0,0,0,0,0,0,0,0,0,0
11998,2,1,2,1,2,2,4,4,4,3,...,0,0,0,0,0,0,0,0,0,0
