In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import re
import sklearn
from sklearn.model_selection import train_test_split
import dill
import tensorflow.keras.layers as tf_ke_l
from tensorflow import keras as ke

2024-02-21 17:16:19.213428: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Prepare dataset

In [2]:
# Load the full dataset from local pickle files.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
X = pd.read_pickle('./bprnra_one_hot_sequences.pkl')  # one-hot sequence
with open("./bprnra_pairing_matricies.pkl", mode="rb") as pairing_file:
    y = dill.load(pairing_file)  # pairing matrix

KeyboardInterrupt: 

In [None]:
# Print the shape of the one-hot array stored in each of rows 10-19 of X.
# Iterating a DataFrame directly yields its column labels (strings), so the
# loop walks the 'one_hot_sequence' column to reach the arrays themselves.
X_ten_test = X.iloc[10:20]
for one_hot in X_ten_test['one_hot_sequence']:
    print(one_hot.shape)

In [3]:
# Build a small working subset of X: drop the record named 'bpRNA_CRW_3732',
# then keep the first 10 remaining rows.
x_keep_mask = X['name'] != 'bpRNA_CRW_3732'
X_ten = X.loc[x_keep_mask].head(10)
X_ten.head()

Unnamed: 0,name,one_hot_sequence
0,bpRNA_RFAM_30074,"[[0, 0, 0, 1], [1, 0, 0, 0], [0, 0, 1, 0], [0,..."
1,bpRNA_RFAM_17909,"[[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [1,..."
2,bpRNA_RFAM_10080,"[[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [1,..."
4,bpRNA_RFAM_10379,"[[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,..."
5,bpRNA_RFAM_23588,"[[0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1], [1,..."


In [4]:
# Same subset for the pairing matrices: drop 'bpRNA_CRW_3732', keep the first 10 rows.
y_keep_mask = y['name'] != 'bpRNA_CRW_3732'
y_ten = y.loc[y_keep_mask].head(10)
y_ten.head()

Unnamed: 0,name,pairing_matrix
0,bpRNA_RFAM_30074,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,bpRNA_RFAM_17909,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,bpRNA_RFAM_10080,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,bpRNA_RFAM_10379,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
5,bpRNA_RFAM_23588,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [5]:
# Remove the 'name' column from both frames.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because 'name' has already been dropped.
X_ten = X_ten.drop(columns=['name'], errors='ignore')
y_ten = y_ten.drop(columns=['name'], errors='ignore')

In [6]:
# Shape of the first pairing matrix in the subset.
# Positional .iloc[0] is used because the filtered frame keeps its original
# index labels (which have gaps); label-based [0] would raise KeyError whenever
# the row labelled 0 is not part of the subset.
y_ten['pairing_matrix'].iloc[0].shape

(38, 38)

In [7]:
# Write the 10-row subsets to disk as pickles.
subset_outputs = {
    './bprnra_one_hot_sequences_ten.pkl': X_ten,
    './bprnra_pairing_matricies_ten.pkl': y_ten,
}
for subset_path, subset_frame in subset_outputs.items():
    subset_frame.to_pickle(subset_path)


In [8]:
# Display basic info

def basic_eda(df):
    """Print a quick exploratory summary of a DataFrame.

    Sections are printed in this order: first five rows, ``df.info()``,
    ``df.describe()``, column labels, dtypes, per-column missing-value
    counts, and the frame's shape.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("----------TOP 5 RECORDS--------")
    print(df.head(5))
    print("----------INFO-----------------")
    # DataFrame.info() prints its own report and returns None, so wrapping it
    # in print() would emit a stray "None" line after the report.
    df.info()
    print("----------Describe-------------")
    print(df.describe())
    print("----------Columns--------------")
    print(df.columns)
    print("----------Data Types-----------")
    print(df.dtypes)
    # isnull() and isna() are aliases, so a single missing-value section
    # carries all the information the two original sections printed.
    print("-------Missing Values----------")
    print(df.isna().sum())
    print("-----Shape Of Data-------------")
    print(df.shape)

## Creating padded input and output

In [13]:
PADDED_LENGTH = 178  # common length every sequence / matrix is zero-padded to
m = PADDED_LENGTH    # original name, kept in case other cells still read `m`


def pad_one_hot(sequence, target_length=PADDED_LENGTH):
    """Append all-zero rows to a one-hot array until it has `target_length` rows.

    Parameters
    ----------
    sequence : array-like, 2-D
        One row per position; rows are appended after the last position.
    target_length : int
        Number of rows in the result.

    Returns
    -------
    numpy.ndarray
        Array with `target_length` rows and the original number of columns.

    Raises
    ------
    ValueError
        If the input already has more than `target_length` rows.
    """
    sequence = np.asarray(sequence)
    missing = target_length - sequence.shape[0]
    if missing < 0:
        raise ValueError(
            f"sequence has {sequence.shape[0]} rows, more than target_length={target_length}"
        )
    # One np.pad call replaces the original row-by-row vstack loop.
    return np.pad(sequence, ((0, missing), (0, 0)), mode='constant', constant_values=0)


def pad_pairing_matrix(matrix, target_length=PADDED_LENGTH):
    """Zero-pad a square pairing matrix on the bottom/right to `target_length`.

    Padding is added only after the last row/column so that entry (i, j) still
    refers to positions i and j of the end-padded one-hot sequence. Padding
    symmetrically (as the previous version did) shifts every index by the
    leading pad width and misaligns targets with inputs.

    Raises
    ------
    ValueError
        If the matrix is already larger than `target_length`.
    """
    matrix = np.asarray(matrix)
    missing = target_length - matrix.shape[0]
    if missing < 0:
        raise ValueError(
            f"matrix has {matrix.shape[0]} rows, more than target_length={target_length}"
        )
    return np.pad(matrix, ((0, missing), (0, missing)), mode='constant', constant_values=0)


# Padded copies; the unpadded frames are left untouched.
X_ten_padded = X_ten.copy(deep=True)
X_ten_padded['one_hot_sequence'] = X_ten['one_hot_sequence'].map(pad_one_hot)

y_ten_padded = y_ten.copy(deep=True)
y_ten_padded['pairing_matrix'] = y_ten['pairing_matrix'].map(pad_pairing_matrix)

# masks = []
# for i in X_ten['one_hot_sequence']:
#     padding_number = 178 - i.shape[0]
#     mask = [1] * i.shape[0] + [0] * padding_number
#     masks.append(mask)

In [14]:
# Write the padded frames to disk as pickles.
padded_outputs = {
    './bprnra_one_hot_sequences_ten_padded.pkl': X_ten_padded,
    './bprnra_pairing_matricies_ten_padded.pkl': y_ten_padded,
}
for padded_path, padded_frame in padded_outputs.items():
    padded_frame.to_pickle(padded_path)

# Creating the model

In [11]:
from utils.OneHotEncoding import get_one_hot_sequence
from utils.BuildMatrixFromDotBracket import build_matrix, get_couples

# A single worked example: structure string and its nucleotide sequence.
example_structure = "...(((((((..((((((.........))))))......).((((((.......))))))..))))))..."
example_sequence = "CGCUUCAUAUAAUCCUAAUGAUAUGGUUUGGGAGUUUCUACCAAGAGCCUUAAACUCUUGAUUAUGAAGUG"
example_length = len(example_sequence)

# Model input (X): one-hot encoded sequence.
example_one_hot = get_one_hot_sequence(example_sequence)

# Model target (y): adjacency matrix built from the couples extracted from the structure.
example_couples = get_couples(example_structure)
example_matrix = build_matrix(example_couples, example_length)

In [12]:
# Show the shapes of the example input and target, one per line.
print(example_one_hot.shape, example_matrix.shape, sep="\n")

(71, 4)
(71, 71)


In [35]:
# Symbolic model input: one sample is example_length positions x 4 one-hot channels.
example_input_shape = (example_length, 4)
input_layer = tf.keras.layers.Input(shape=example_input_shape)

In [36]:
# Recurrent encoder; return_sequences=True keeps the LSTM output for every position.
sequence_encoder = tf.keras.layers.LSTM(units=64, return_sequences=True)
lstm_layer = sequence_encoder(input_layer)

In [37]:
# Collapse the per-position LSTM outputs into one vector per sample.
flatten_op = tf.keras.layers.Flatten()
flattened = flatten_op(lstm_layer)

In [None]:
# One sigmoid unit per (i, j) entry of the example_length x example_length matrix.
n_matrix_entries = example_length * example_length
output_layer = tf.keras.layers.Dense(n_matrix_entries, activation='sigmoid')(flattened)

In [13]:
# Reshape the flat Dense output into an (example_length, example_length) matrix
# per sample. A Keras Reshape layer is used instead of a raw tf.reshape call so
# the result is still the output of a Keras layer, which a functional
# tf.keras.Model requires of its `outputs`. Reshape's target shape excludes the
# batch dimension.
adjacency_matrix = tf.keras.layers.Reshape((example_length, example_length))(output_layer)

In [None]:
# Construct the functional model from the symbolic graph endpoints.
# `inputs`/`outputs` must be the Keras tensors created above (input_layer and
# adjacency_matrix), not the numpy example data - the data is only supplied
# later, to model.fit().
model = tf.keras.Model(inputs=input_layer, outputs=adjacency_matrix)

In [10]:
# Training configuration: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [12]:
# Add a leading batch axis of size 1 so the single example matches the model's
# (batch, length, ...) input/target layout. Sizes come from example_length
# rather than a hard-coded 71, so the cell keeps working if the example changes.
example_one_hot = example_one_hot.reshape(1, example_length, 4)
example_matrix = example_matrix.reshape(1, example_length, example_length)
print(example_one_hot.shape, example_matrix.shape)

(1, 71, 4) (1, 71, 71)


In [None]:
# Fit on the single example for 10 epochs; `history` records the per-epoch metrics.
history = model.fit(x=example_one_hot, y=example_matrix, epochs=10)

In [None]:
# Tabulate the training history: one row per epoch, one column per recorded metric.
history_df = pd.DataFrame.from_dict(history.history)
history_df.head()

In [None]:
# Draw loss and accuracy per epoch, each in its own figure.
history_df[['loss']].plot()
history_df[['accuracy']].plot()

# Making a model with variable input

## Padding and masking

- Train in mini-batches, looping over the data manually, so that each batch can hold sequences of a different length.
- Mask the insignificant (padded) values so they are ignored by the model.

In [10]:
# Reload the padded 10-row subsets.
# Both files were written with DataFrame.to_pickle, so both are read back with
# the matching pandas reader instead of mixing pd.read_pickle and dill.load.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
X_pad = pd.read_pickle('./bprnra_one_hot_sequences_ten_padded.pkl')  # one-hot sequence
y_pad = pd.read_pickle('./bprnra_pairing_matricies_ten_padded.pkl')  # pairing matrix

In [11]:
# Frame shapes (rows, columns) of the reloaded inputs and targets, one per line.
for reloaded_frame in (X_pad, y_pad):
    print(reloaded_frame.shape)

(10, 1)
(10, 1)


In [12]:
# Gather the per-row one-hot arrays into a plain Python list.
arrays = list(X_pad['one_hot_sequence'])

In [14]:
# Stack the per-row one-hot arrays along a new leading (sample) axis.
# The column is read directly instead of the scratch list `arrays`, because
# `arrays` is reused for the pairing matrices further down - running cells out
# of order would otherwise silently stack the wrong data here.
X_tensor = np.stack(list(X_pad['one_hot_sequence']))
X_tensor.shape

(10, 178, 4)

In [15]:
# float32 copy of the stacked inputs, used as the training data.
X_tensor_float32 = np.array(X_tensor, dtype=np.float32)
type(X_tensor_float32)

numpy.ndarray

In [16]:
# Gather the per-row pairing matrices into a plain Python list.
arrays = list(y_pad['pairing_matrix'])

In [18]:
# Stack the per-row pairing matrices along a new leading (sample) axis.
# The column is read directly instead of the scratch list `arrays`, because
# `arrays` is also used for the one-hot sequences above - running cells out of
# order would otherwise silently stack the wrong data here.
y_tensor = np.stack(list(y_pad['pairing_matrix']))
y_tensor.shape

(10, 178, 178)

In [19]:
# float32 copy of the stacked targets, used as the training labels.
y_tensor_float32 = np.array(y_tensor, dtype=np.float32)

In [20]:
# Padded sequence length, read from the stacked inputs (axis 1) so it cannot
# drift from the data; equals 178 for the padded files loaded above.
length = X_tensor.shape[1]

In [21]:
# Symbolic model input: one sample is `length` positions x 4 one-hot channels.
padded_input_shape = (length, 4)
input_layer = tf.keras.layers.Input(shape=padded_input_shape)

In [22]:
# Recurrent encoder; return_sequences=True keeps the LSTM output for every position.
sequence_encoder = tf.keras.layers.LSTM(units=64, return_sequences=True)
lstm_layer = sequence_encoder(input_layer)

In [23]:
# Collapse the per-position LSTM outputs into one vector per sample.
flatten_op = tf.keras.layers.Flatten()
flattened = flatten_op(lstm_layer)

In [24]:
# One output unit per (i, j) entry of the length x length pairing matrix.
# Sigmoid keeps every output in [0, 1], which the binary cross-entropy loss
# used at compile time expects; an unbounded ReLU output is not a valid
# probability. This also matches the single-example model earlier in the notebook.
output_layer = tf.keras.layers.Dense(length * length, activation='sigmoid')(flattened)

: 

In [138]:
# Reshape the flat Dense output into a (length, length) matrix per sample.
# A Keras Reshape layer is used instead of a raw tf.reshape call so the result
# is still the output of a Keras layer, which a functional tf.keras.Model
# requires of its `outputs`. Reshape's target shape excludes the batch dimension.
adjacency_matrix = tf.keras.layers.Reshape((length, length))(output_layer)

In [None]:
# Assemble the functional model from the symbolic input to the reshaped output.
# Keras requires every tensor passed as `outputs` to be the output of a Keras
# layer, because it relies on the layer metadata attached to it.
model = ke.Model(inputs=input_layer, outputs=adjacency_matrix)

In [160]:
# Training configuration: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [161]:
# Print the layer-by-layer summary of the current `model`.
# NOTE(review): the saved output for this cell lists two InputLayers named
# 'one_hot_sequence' and 'pairing_matrix' with 0 parameters, which does not
# match the LSTM/Dense model defined above - it appears to be stale output from
# a different model; re-run after Restart & Run All to confirm.
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 one_hot_sequence (InputLay  [(None, 1)]                  0         []                            
 er)                                                                                              
                                                                                                  
 pairing_matrix (InputLayer  [(None, 1)]                  0         []                            
 )                                                                                                
                                                                                                  
Total params: 0 (0.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_______________________________________________________________________________________________

In [None]:
# Fit on the padded inputs/targets for 10 epochs; `history` records the per-epoch metrics.
history = model.fit(x=X_tensor_float32, y=y_tensor_float32, epochs=10)

In [None]:
# Tabulate the training history: one row per epoch, one column per recorded metric.
history_df = pd.DataFrame.from_dict(history.history)
history_df.head()

In [None]:
# Draw loss and accuracy per epoch, each in its own figure.
history_df[['loss']].plot()
history_df[['accuracy']].plot()

## Batch = 1

In [None]:
# Variable-length input: `None` leaves the sequence axis unspecified, 4 one-hot channels.
# The bare name `Input` is never imported in this notebook, so the layer is
# referenced through the already-imported `tf` namespace to avoid a NameError.
inputs = tf.keras.layers.Input(shape=(None, 4))