In [11]:
import numpy as np
import pickle
import os
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [12]:
# Disable debug info
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [13]:
# Load data
path = 'data_2p5M.pkl'
with open(path, 'rb') as f:
    data = pickle.load(f)

In [14]:
# Extract sequences and energies
sequences, energies = zip(*data)

# 1-hot encode sequences
nucleotides = ['A', 'C', 'G', 'T']
sequence_length = max(len(seq) for seq in sequences)
num_nucleotides = len(nucleotides)

def one_hot_encode(sequence):
    encoding = np.zeros((len(sequence), num_nucleotides))
    for i, nucleotide in enumerate(sequence):
        encoding[i, nucleotides.index(nucleotide)] = 1
    return encoding

encoded_sequences = np.array([one_hot_encode(seq) for seq in sequences])

  encoded_sequences = np.array([one_hot_encode(seq) for seq in sequences])


In [15]:
# Pad sequences to a fixed length
padded_sequences = np.zeros((len(sequences), sequence_length, num_nucleotides))
for i, seq in enumerate(encoded_sequences):
    padded_sequences[i, :len(seq), :] = seq

In [16]:
# Standardize energies
energies = np.array(energies).reshape(-1, 1)
scaler = StandardScaler()
energies = scaler.fit_transform(energies)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, energies, test_size=0.2)

In [17]:
# Build LSTM model
model = Sequential()
model.add(LSTM(50, input_shape=(sequence_length, num_nucleotides)))
model.add(Dense(1, activation='linear'))

# Compile model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1, callbacks=[early_stopping])

2024-03-03 11:45:56.004161: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-03-03 11:45:56.004543: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-03-03 11:45:56.005104: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 1/100


2024-03-03 11:45:57.130252: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-03-03 11:45:57.130782: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-03-03 11:45:57.131252: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2024-03-03 11:50:58.889498: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-03-03 11:50:58.889947: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-03-03 11:50:58.890350: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100


In [18]:
# Calculate MSE
mse = model.evaluate(X_test, y_test)
print(f'MSE on test data: {mse}')

MSE on test data: 0.28893133997917175


In [31]:
# Make predictions on new data (replace new_sequence with desired sequence)
new_sequence = 'ATCGGAGCAAGTAAAGTGGAACGTTGTAACGGTTGTTAACTCA'
encoded_new_sequence = np.array([one_hot_encode(new_sequence)])
padded_new_sequence = np.zeros((1, sequence_length, num_nucleotides))
padded_new_sequence[0, :len(encoded_new_sequence[0]), :] = encoded_new_sequence[0]
predicted_energy = model.predict(padded_new_sequence)
predicted_energy = scaler.inverse_transform(predicted_energy)
print(f'Predicted Energy for new sequence: {predicted_energy[0, 0]}')

Predicted Energy for new sequence: -2.5390784740448


In [26]:
model.save('lstm.keras')