In [1]:
import numpy as np
import pickle
import gc
import tensorflow as tf
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import pad_sequences
from keras.losses import categorical_crossentropy

2024-03-31 12:17:35.057129: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-31 12:17:35.271557: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-31 12:17:35.271597: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-31 12:17:35.313506: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-31 12:17:35.402601: I tensorflow/core/platform/cpu_feature_guar

In [2]:
import tensorflow as tf
# Report how many physical GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
from tensorflow.python.client import device_lib


def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'.

    Returns:
        list: the ``name`` attribute of every entry reported by
        ``device_lib.list_local_devices()`` with ``device_type == 'GPU'``;
        empty when no such device is reported.
    """
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names


print(get_available_gpus())

Num GPUs Available:  1
['/device:GPU:0']


2024-03-31 12:17:42.340206: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-31 12:17:42.463018: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-31 12:17:42.463049: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-31 12:17:42.465462: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-31 12:17:42.465497: I external/local_xla/xla/stream_executor

In [3]:
# Load data
def load_dataset(path):
    """Unpickle the object stored at ``path`` and return its ``dataset`` attribute.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserialising,
    so this must only be pointed at files from a trusted source.

    Args:
        path: location of the pickle file to read.

    Returns:
        The ``dataset`` attribute of the unpickled object.
    """
    with open(path, 'rb') as f:
        return pickle.load(f).dataset


data_train = load_dataset('data/train_2p5M_struct.pkl')
data_valid = load_dataset('data/valid_2p5M_struct.pkl')
data_test = load_dataset('data/test_2p5M_struct.pkl')

In [4]:
# Extract sequences, energies, structures and hairpins.
# zip(*records) transposes the records of each split into four parallel
# tuples, in the order: sequence, energy, structure, hairpins.
sequences_train, energies_train, struct_train, hairpins_train = zip(*data_train)
sequences_valid, energies_valid, struct_valid, hairpins_valid = zip(*data_valid)
sequences_test, energies_test, struct_test, hairpins_test = zip(*data_test)

# Numeric targets become NumPy arrays. The test split is converted as well so
# that all three splits expose the same type to downstream code.
energies_train = np.asarray(energies_train)
hairpins_train = np.asarray(hairpins_train)
energies_valid = np.asarray(energies_valid)
hairpins_valid = np.asarray(hairpins_valid)
energies_test = np.asarray(energies_test)
hairpins_test = np.asarray(hairpins_test)

# Free memory: the raw record containers are no longer needed.
del data_train, data_valid, data_test
gc.collect()

0

In [5]:
# Convert sequences to n grams
def seq2ngrams(seqs, n=1):
    """Split every sequence into its overlapping n-grams, one per start position.

    One n-gram is produced for each index of the sequence, so the output has
    the same length as the input sequence; the trailing n-grams are shorter
    than ``n`` when ``n > 1``.

    The result is always a 1-D object array whose elements are Python lists.
    ``np.array(list_of_lists, dtype=object)`` would instead yield a 2-D array
    whenever all sequences share one length, making the element type depend on
    the data (rows would be ndarrays instead of lists).

    Args:
        seqs: iterable of sliceable sequences (e.g. strings).
        n: n-gram width, must be >= 1.

    Returns:
        numpy.ndarray of shape ``(len(seqs),)`` and dtype ``object``; element
        ``k`` is the list of n-grams of the k-th sequence.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be >= 1")
    grams = [[seq[i:i + n] for i in range(len(seq))] for seq in seqs]
    out = np.empty(len(grams), dtype=object)
    # Element-wise assignment keeps each list intact; a slice assignment would
    # try to broadcast equally long lists into a 2-D block.
    for idx, gram_list in enumerate(grams):
        out[idx] = gram_list
    return out

# Fixed sequence length that inputs and labels are padded to later on.
maxlen_seq = 50

# Tokenise every split with the default n-gram width of seq2ngrams.
input_grams_train, input_grams_valid, input_grams_test = (
    seq2ngrams(split)
    for split in (sequences_train, sequences_valid, sequences_test)
)

# Free memory
del sequences_train, sequences_valid, sequences_test
gc.collect()

0

In [6]:
# Prepare for embedding
def _encode_and_pad(tokenizer, texts):
    """Map ``texts`` to integer ids with ``tokenizer`` and post-pad to ``maxlen_seq``."""
    return pad_sequences(tokenizer.texts_to_sequences(texts),
                         maxlen=maxlen_seq, padding='post')


# Input side: the vocabulary is fitted on the training split only and then
# reused unchanged for the validation and test splits.
tokenizer_encoder = Tokenizer()
tokenizer_encoder.fit_on_texts(input_grams_train)
input_data_train = _encode_and_pad(tokenizer_encoder, input_grams_train)
input_data_valid = _encode_and_pad(tokenizer_encoder, input_grams_valid)
input_data_test = _encode_and_pad(tokenizer_encoder, input_grams_test)
# +1 because index 0 is never assigned to a token; pad_sequences pads with 0.
n_words = len(tokenizer_encoder.word_index) + 1

# Label side: character-level vocabulary fitted on the training structures.
tokenizer_decoder = Tokenizer(char_level=True)
tokenizer_decoder.fit_on_texts(struct_train)
# Pin the one-hot width to the decoder vocabulary (+1 for padding index 0).
# Without num_classes, to_categorical infers the width from the largest id
# present in each array, so a split that lacks the highest-numbered symbol
# would come out narrower than the training labels.
n_tags = len(tokenizer_decoder.word_index) + 1

# NOTE: the struct_* names are rebound from raw structures to one-hot arrays,
# so this cell is not safe to re-run without re-running the extraction cell.
struct_train = to_categorical(_encode_and_pad(tokenizer_decoder, struct_train), num_classes=n_tags)
struct_valid = to_categorical(_encode_and_pad(tokenizer_decoder, struct_valid), num_classes=n_tags)
struct_test = to_categorical(_encode_and_pad(tokenizer_decoder, struct_test), num_classes=n_tags)

# Free memory
del input_grams_train
del input_grams_valid
del input_grams_test
gc.collect()

0

In [7]:
@tf.keras.saving.register_keras_serializable()
def weighted_categorical_crossentropy(y_true, y_pred):
    """Categorical cross-entropy scaled by a fixed per-class weight.

    The weight vector has four entries, ``[1.0, 1.0, 2.0, 2.0]``, indexed by
    class id. Which structure symbol each id stands for is decided by the
    label tokenizer, not here -- TODO confirm the intended classes are the
    ones weighted 2.0.

    Args:
        y_true: target tensor whose last axis has four entries; multiplying it
            by the weight vector and summing over that axis yields the weight
            applied to each position (for one-hot targets, the weight of the
            true class).
        y_pred: predicted class probabilities, same shape as ``y_true``.

    Returns:
        Tensor of per-position losses: ``categorical_crossentropy(y_true,
        y_pred)`` multiplied element-wise by the per-position weight.
    """
    per_class = tf.constant([1.0, 1.0, 2.0, 2.0])
    position_weight = tf.reduce_sum(per_class * y_true, axis=-1)
    base_loss = categorical_crossentropy(y_true, y_pred)
    return base_loss * position_weight

In [8]:
# Layers
# Three-output network: a shared embedding + five stacked bidirectional LSTMs
# feed two scalar regression heads (MFE, Hairpins) and one per-position
# classification head (Struct).
input_seq = Input(shape=(maxlen_seq,))
shared_embedding_layer = Embedding(input_dim=n_words, output_dim=128)(input_seq)
# Every recurrent layer returns its full output sequence (one vector per time
# step), which the per-position Struct head needs.
shared_lstm_layer1 = Bidirectional(LSTM(128, return_sequences=True))(shared_embedding_layer)
shared_lstm_layer2 = Bidirectional(LSTM(128, return_sequences=True))(shared_lstm_layer1)
shared_lstm_layer3 = Bidirectional(LSTM(128, return_sequences=True))(shared_lstm_layer2)
shared_lstm_layer4 = Bidirectional(LSTM(128, return_sequences=True))(shared_lstm_layer3)
shared_lstm_layer5 = Bidirectional(LSTM(128, return_sequences=True))(shared_lstm_layer4)

# Outputs
# The two regression heads read only the final time step ([:, -1, :]).
# NOTE(review): inputs are post-padded with 0 and the Embedding does not set
# mask_zero, so for sequences shorter than maxlen_seq this final step is a
# padding position. Presumably the backward half of a Bidirectional output at
# the last step has also only processed that one step -- confirm this is the
# intended summary vector.
output_mfe = Dense(1, activation='linear', name='MFE')(shared_lstm_layer5[:, -1, :])  # Take only the last output
output_hairpins = Dense(1, activation='linear', name='Hairpins')(shared_lstm_layer5[:, -1, :])  # Take only the last output
# Softmax over 4 classes at every position; 4 must equal the last dimension of
# the one-hot struct_* label arrays -- TODO confirm against tokenizer_decoder.
output_seq = TimeDistributed(Dense(4, activation='softmax'), name='Struct')(shared_lstm_layer5)

# Define the model
# Loss and loss-weight dictionaries are keyed by the output layer names above;
# the total loss is the weighted sum 1.0*MFE + 2.0*Hairpins + 2.0*Struct.
model = Model(inputs=input_seq, outputs=[output_mfe, output_hairpins, output_seq])
model.compile(optimizer='adam',
              loss={'MFE': 'mean_squared_error', 'Hairpins': 'mean_squared_error', 'Struct': weighted_categorical_crossentropy},
              loss_weights={'MFE': 1.0, 'Hairpins': 2.0, 'Struct': 2.0})
model.summary()

2024-03-31 12:20:13.036588: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-31 12:20:13.039948: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-31 12:20:13.039980: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-31 12:20:13.079184: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-31 12:20:13.079407: I tensorflow/core/common_runtime/gpu/gpu

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 50, 128)              640       ['input_1[0][0]']             
                                                                                                  
 bidirectional (Bidirection  (None, 50, 256)              263168    ['embedding[0][0]']           
 al)                                                                                              
                                                                                                  
 bidirectional_1 (Bidirecti  (None, 50, 256)              394240    ['bidirectional[0][0]']   

In [11]:
# Train and save model

# Stop once val_loss has not improved for 3 consecutive epochs and roll the
# weights back to the best epoch seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Targets are listed in the same order as the model outputs:
# MFE, Hairpins, Struct.
his = model.fit(
    x=input_data_train,
    y=[energies_train, hairpins_train, struct_train],
    validation_data=(
        input_data_valid,
        [energies_valid, hairpins_valid, struct_valid],
    ),
    epochs=40,
    batch_size=128,
    callbacks=[early_stopping],
    verbose=1,
)

model.save('full_2p5M.keras')

Epoch 1/40


2024-03-30 02:03:39.662601: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
2024-03-30 02:03:40.706864: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f3b140d16a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-03-30 02:03:40.706894: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4090, Compute Capability 8.9
2024-03-30 02:03:40.814124: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1711778621.004820    6078 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [9]:
# Shared layers
# Two-output variant of the network: same embedding and recurrent stack, but
# only the MFE and Hairpins regression heads (no per-position Struct output).
# NOTE(review): this cell rebinds input_seq, the shared_* layer names and
# `model`, replacing the three-output model object. The recorded execution
# counts are non-sequential (this cell is In [9], the training cell before it
# is In [11]), so which model a later cell trains depends on run order.
input_seq = Input(shape=(maxlen_seq,))
shared_embedding_layer = Embedding(input_dim=n_words, output_dim=128)(input_seq)
shared_lstm_layer1 = Bidirectional(LSTM(128, return_sequences=True))(shared_embedding_layer)
shared_lstm_layer2 = Bidirectional(LSTM(128, return_sequences=True))(shared_lstm_layer1)
shared_lstm_layer3 = Bidirectional(LSTM(128, return_sequences=True))(shared_lstm_layer2)
shared_lstm_layer4 = Bidirectional(LSTM(128, return_sequences=True))(shared_lstm_layer3)
# The last recurrent layer omits return_sequences, so it emits one vector per
# input sequence instead of one per time step.
shared_lstm_layer5 = Bidirectional(LSTM(128))(shared_lstm_layer4)

# Outputs
# Both scalar heads read the same sequence-level vector.
output_mfe = Dense(1, activation='linear', name='MFE')(shared_lstm_layer5)
output_hairpins = Dense(1, activation='linear', name='Hairpins')(shared_lstm_layer5)

# Define the model
# Loss and loss-weight dictionaries are keyed by the output layer names above;
# the total loss is the weighted sum 1.0*MFE + 2.0*Hairpins.
model = Model(inputs=input_seq, outputs=[output_mfe, output_hairpins])
model.compile(optimizer='adam',
              loss={'MFE': 'mean_squared_error', 'Hairpins': 'mean_squared_error'},
              loss_weights={'MFE': 1.0, 'Hairpins': 2.0})
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 50, 128)              640       ['input_2[0][0]']             
                                                                                                  
 bidirectional_5 (Bidirecti  (None, 50, 256)              263168    ['embedding_1[0][0]']         
 onal)                                                                                            
                                                                                                  
 bidirectional_6 (Bidirecti  (None, 50, 256)              394240    ['bidirectional_5[0][0]'

In [14]:
# Train and save the model
# Stop once val_loss has not improved for 3 consecutive epochs and roll the
# weights back to the best epoch seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Targets are listed in the same order as the model outputs: MFE, Hairpins.
his = model.fit(
    x=input_data_train,
    y=[energies_train, hairpins_train],
    validation_data=(
        input_data_valid,
        [energies_valid, hairpins_valid],
    ),
    epochs=40,
    batch_size=128,
    callbacks=[early_stopping],
    verbose=1,
)

model.save('partial_2p5M.keras')

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
