## importing module and libraries needed

In [1]:
import numpy as np
import matplotlib as plt
from nn import io, preprocess, nn

## Reading in both positive and negative classes of data

In [2]:
from nn import io

# reading in positive sequences
positive = 'data/rap1-lieb-positives.txt'
pos_seqs = io.read_text_file(positive)

# Print the number of sequences in pos_seqs
print(f"Number of sequences read from {positive}: {len(pos_seqs)}")


# reading in the negative sequences
negative = 'data/yeast-upstream-1k-negative.fa'
neg_seqs = io.read_fasta_file(negative)

# Print the number of sequences read
print(f"Number of sequences read from {negative}: {len(neg_seqs)}")

Number of sequences read from data/rap1-lieb-positives.txt: 137
Number of sequences read from data/yeast-upstream-1k-negative.fa: 3163


## Processing negative sequences to have same length as positive sequences (note additional function added in preprocess.py)

In [8]:
# Process negative examples to the same length as positive examples
pos_seq_length = len(pos_seqs[0])
negative_examples = preprocess.process_negative_sequences(neg_seqs, pos_seq_length)

print(len(negative_examples))

3109689


## Combining positive and negative examples

In [9]:
# Combine positive and negative examples
sequences = pos_seqs + negative_examples
labels = [True] * len(pos_seqs) + [False] * len(negative_examples)

## Balancing Sequences using the sample_seqs function
This sampling scheme ensures that both classes have an equal number of samples, which helps to prevent the model from being biased towards the majority class.


In [12]:
all_seqs = pos_seqs + negative_examples
all_labels = [True] * len(pos_seqs) + [False] * len(negative_examples)
sampled_seqs, sampled_labels = preprocess.sample_seqs(all_seqs, all_labels)

## one-hot encoding sequences

In [13]:
X = preprocess.one_hot_encode_seqs(sampled_seqs)
y = np.array(sampled_labels, dtype=int)

## splitting the data into training and validation sets

In [14]:
X_train, X_val, y_train, y_val = preprocess.train_test_split_custom(X, y, test_size=0.2, random_state=42)

# defining nn architecture creating an instance of the NeuralNetwork class

In [15]:
# Define the neural network architecture
nn_arch = [
    {"input_dim": 4 * pos_seq_length, "output_dim": 32, "activation": "relu"},
    {"input_dim": 32, "output_dim": 1, "activation": "sigmoid"}
]

# Hyperparameters
lr = 0.01
seed = 42
batch_size = 32
epochs = 100
loss_function = "binary_crossentropy"

# Create NeuralNetwork instance
nn = nn.NeuralNetwork(nn_arch, lr, seed, batch_size, epochs, loss_function)

## Training the model

In [16]:
# Train the neural network
train_loss, val_loss = nn.fit(X_train, y_train, X_val, y_val)

AssertionError: 

## Plot training and validation loss by epoch

In [None]:
# Plot training and validation loss by epoch
plt.plot(train_loss, label="Training Loss")
plt.plot(val_loss, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

## Report the accuracy on the validation dataset

In [None]:
y_val_pred = (nn.predict(X_val) > 0.5).astype(int)
accuracy = np.mean(y_val_pred == y_val)
print("Accuracy on the validation dataset:", accuracy)

Accuracy on the validation dataset: 0.5781818181818181


## Explain your choice of loss function and hyperparameters:
 The loss function used is binary_crossentropy, which is suitable for binary classification problems.It measures the dissimilarity between the predicted probabilities and the true labels.
 The learning rate is set to 0.01, which is a common choice to balance convergence speed and stability.
 The number of epochs is set to 100, which should provide sufficient training without overfitting.
 The batch size is set to 32, which is a common choice to balance training speed and model performance.