In [8]:
from nn import io, preprocess, nn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

# 1. Load the two sequence sets from disk
# (`io` here is the project's `nn.io` module imported above, not the stdlib `io`.)

# Positive examples: plain-text file.
positive_seqs = 'data/rap1-lieb-positives.txt'
pos_seqs = io.read_text_file(positive_seqs)

# Negative examples: FASTA file.
negative_seqs = 'data/yeast-upstream-1k-negative.fa'
neg_seqs = io.read_fasta_file(negative_seqs)

# 2. Cut every negative sequence into all of its overlapping 17-character
#    windows so the negatives have the same length as the positives
#    (assumes the positive sequences are 17 characters long -- TODO confirm).
processed_neg_seqs = []
for neg in neg_seqs:
    # Last valid window start; sequences shorter than 17 yield no windows.
    last_start = len(neg) - 17
    processed_neg_seqs.extend(
        neg[start:start + 17] for start in range(last_start + 1)
    )

# 3. Balance the classes using the sample_seqs function
# Labels follow the concatenation order: positives (True) first, then
# negatives (False).
n_pos = len(pos_seqs)
n_neg = len(processed_neg_seqs)
all_seqs = pos_seqs + processed_neg_seqs
all_labels = [True] * n_pos + [False] * n_neg
sampled_seqs, sampled_labels = preprocess.sample_seqs(all_seqs, all_labels)

# Balancing is delegated to preprocess.sample_seqs. The intent is for both
# classes to end up with an equal number of samples, so the model is not
# biased towards the majority (negative) class.

# 4. One-hot encode the data
encoded_seqs = preprocess.one_hot_encode_seqs(sampled_seqs)

# 5. Hold out 20% of the examples for validation (fixed seed for a
#    reproducible split)
X_train, X_val, y_train, y_val = train_test_split(
    encoded_seqs,
    sampled_labels,
    test_size=0.2,
    random_state=42,
)

# Transpose the feature matrices; the intent is a
# (number_of_features, number_of_samples) layout.
X_train, X_val = X_train.T, X_val.T

# Turn the labels into column vectors of shape (number_of_samples, 1).
# NOTE(review): after these two steps the features are laid out
# (features, samples) while the labels are (samples, 1). Confirm which
# orientation nn.NeuralNetwork.fit expects -- this cell's recorded output is
# an AssertionError, and a shape mismatch is one possible cause.
y_train, y_val = (
    np.array(labels).reshape(-1, 1) for labels in (y_train, y_val)
)

# 6. Configure the network: 68 inputs -> 32 hidden units (ReLU) -> 1 output
#    (sigmoid). 68 is presumably the 17-character window times 4 one-hot
#    channels -- confirm against preprocess.one_hot_encode_seqs.
nn_arch = [
    dict(input_dim=68, output_dim=32, activation='relu'),
    dict(input_dim=32, output_dim=1, activation='sigmoid'),
]
lr, seed, batch_size, epochs = 0.01, 42, 32, 50
loss_function = "binary_cross_entropy"

rap1_nn = nn.NeuralNetwork(nn_arch, lr, seed, batch_size, epochs, loss_function)

# 7. Train the neural network on the training data; the two returned loss
#    histories are plotted below.
train_losses, val_losses = rap1_nn.fit(X_train, y_train, X_val, y_val)

# 8. Plot training and validation loss by epoch
loss_curves = (
    (train_losses, 'Training loss'),
    (val_losses, 'Validation loss'),
)
for curve, curve_label in loss_curves:
    plt.plot(curve, label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# 9. Report the accuracy of the classifier on the validation dataset
y_val_pred = rap1_nn.predict(X_val)

# Compare predictions and labels as flat 1-D vectors. y_val is already a
# column vector at this point, so adding another axis to it would make `==`
# broadcast every prediction against every label, and the mean of that
# pairwise grid is not an accuracy. Flattening also makes the comparison
# independent of whether predict() returns a row, a column, or a 1-D array.
predicted_labels = np.asarray(y_val_pred).ravel() > 0.5
true_labels = np.asarray(y_val).ravel().astype(bool)

# Fail loudly rather than silently comparing arrays of different sizes.
if predicted_labels.shape != true_labels.shape:
    raise ValueError(
        f'predict() returned {predicted_labels.size} values '
        f'for {true_labels.size} validation labels'
    )

accuracy = np.mean(predicted_labels == true_labels)
print(f'Validation accuracy: {accuracy}')

# Explanation for the choice of loss function and hyperparameters:
# Loss function: Binary cross-entropy is used because it's a standard loss function for binary classification problems.
# It measures the dissimilarity between the predicted probabilities and the true labels.
# Hyperparameters: The learning rate, batch size, and epochs are chosen based on common practice and can be further fine-tuned
# using grid search or other hyperparameter optimization techniques to improve the performance of the model.


AssertionError: 