In [14]:
#general libraries needed
import numpy as np
from sklearn.model_selection import train_test_split

## Reading in the 137 positive Rap1 motif examples

In [15]:
from nn import io

# Read in the sequences from the file
filename = 'data/rap1-lieb-positives.txt'
pos_seqs = io.read_text_file(filename)

# Print the number of sequences read
print(f"Number of sequences read from {filename}: {len(pos_seqs)}")


Number of sequences read from data/rap1-lieb-positives.txt: 137


## Reading in the negative examples

In [16]:
# Read in the sequences from the file
filename = 'data/yeast-upstream-1k-negative.fa'
neg_seqs = io.read_text_file(filename)

# Print the number of sequences read
print(f"Number of sequences read from {filename}: {len(neg_seqs)}")

Number of sequences read from data/yeast-upstream-1k-negative.fa: 56908


In [17]:
from nn import preprocess
from typing import List

# Sample negative sequences to match number of positive sequences
sampled_neg_seqs, _ = preprocess.sample_seqs(neg_seqs, [False] * len(neg_seqs))

# Randomly sample 17-base-long substrings from each negative sequence
processed_neg_seqs = []
for seq in sampled_neg_seqs:
    start_idx = np.random.randint(0, len(seq) - 16)
    processed_neg_seqs.append(seq[start_idx:start_idx+17])


In [18]:
# Read in positive sequences
pos_seqs = io.read_text_file("data/rap1-lieb-positives.txt")

# Read in negative sequences
neg_seqs = io.read_fasta_file("data/yeast-upstream-1k-negative.fa")

# Sample negative sequences to balance dataset
neg_seqs, _ = preprocess.sample_seqs(neg_seqs, [False]*len(neg_seqs))
seqs = pos_seqs + neg_seqs
labels = [True]*len(pos_seqs) + [False]*len(neg_seqs)

# One-hot encode sequences
encoded_seqs = preprocess.one_hot_encode_seqs(seqs)
print(encoded_seqs)

# Create a list of labels for the balanced dataset
y = [True] * len(X_balanced_pos) + [False] * len(X_balanced_neg)

# Convert y to a numpy array
y = np.array(y)

[[1 0 0 ... 0 0 1]
 [1 0 0 ... 0 1 0]
 [0 0 1 ... 0 1 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 1 0 ... 1 0 0]
 [1 0 0 ... 0 1 0]]


NameError: name 'X_balanced_pos' is not defined

In [19]:
def sample_seqs(seqs: List[str], labels: List[bool]) -> Tuple[List[str], List[bool]]:
    """
    This function should sample the given sequences to account for class imbalance. 
    Consider this a sampling scheme with replacement.
    
    Args:
        seqs: List[str]
            List of all sequences.
        labels: List[bool]
            List of positive/negative labels

    Returns:
        sampled_seqs: List[str]
            List of sampled sequences which reflect a balanced class size
        sampled_labels: List[bool]
            List of labels for the sampled sequences
    """
    pos_seqs = [seq for seq, label in zip(seqs, labels) if label]
    neg_seqs = [seq for seq, label in zip(seqs, labels) if not label]
    
    # Calculate number of sequences to sample from each class
    n_pos = len(pos_seqs)
    n_neg = len(neg_seqs)
    n_samples = min(n_pos, n_neg)
    
    # Sample sequences with replacement
    pos_samples = np.random.choice(pos_seqs, n_samples, replace=True)
    neg_samples = np.random.choice(neg_seqs, n_samples, replace=True)
    
    # Combine the sampled sequences and labels
    sampled_seqs = list(pos_samples) + list(neg_samples)
    sampled_labels = [True] * n_samples + [False] * n_samples
    
    # Shuffle the sequences and labels
    shuffle_idx = np.random.permutation(len(sampled_seqs))
    sampled_seqs = [sampled_seqs[i] for i in shuffle_idx]
    sampled_labels = [sampled_labels[i] for i in shuffle_idx]
    
    return sampled_seqs, sampled_labels


In [21]:
# Imports
import numpy as np
from typing import List, Tuple
from numpy.typing import ArrayLike
from nn import io
from nn import preprocess
from nn import nn
import numpy as np
import matplotlib.pyplot as plt

# Define the maximum sequence length
MAX_SEQ_LENGTH = 17

# Define the data file paths
POSITIVE_FILE = 'data/rap1-lieb-positives.txt'
NEGATIVE_FILE = 'data/yeast-upstream-1k-negative.fa'

# Define the train/test split ratio
TRAIN_RATIO = 0.8

# Read in the positive examples
pos_seqs = io.read_text_file(POSITIVE_FILE)

# Read in the negative examples and preprocess them
neg_seqs = io.read_fasta_file(NEGATIVE_FILE)
neg_seqs = [seq[:MAX_SEQ_LENGTH] for seq in neg_seqs]

# Balance the classes using the sample_seq function
labels = [True] * len(pos_seqs) + [False] * len(neg_seqs)
seqs = pos_seqs + neg_seqs
seqs, labels = sample_seqs(seqs, labels)

# One-hot encode the sequences
X = preprocess.one_hot_encode_seqs(seqs)
y = np.array(labels, dtype=int)

# Shuffle the data
shuffle_idx = np.random.permutation(len(X))
X = X[shuffle_idx]
y = y[shuffle_idx]

# Split the data into training and validation sets
split_idx = int(len(X) * TRAIN_RATIO)
X_train, X_val = X[:split_idx], X[split_idx:]
y_train, y_val = y[:split_idx], y[split_idx:]

# Print the shape of the data
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")

X_train shape: (219, 68)
X_val shape: (55, 68)
y_train shape: (219,)
y_val shape: (55,)


In [12]:
from nn import io
from nn import preprocess
from nn import nn
import numpy as np
import matplotlib.pyplot as plt

# Reading in data
positive_seqs = io.read_text_file("data/rap1-lieb-positives.txt")
negative_seqs_raw = io.read_fasta_file("data/yeast-upstream-1k-negative.fa")

# Process negative examples to the same length as positive examples:
seq_length = len(positive_seqs[0])
negative_seqs = [seq[i:i+seq_length] for seq in negative_seqs_raw for i in range(0, len(seq) - seq_length + 1)]

# Balance classes using the sample_seqs function
all_seqs, all_labels = preprocess.sample_seqs(positive_seqs + negative_seqs, [True] * len(positive_seqs) + [False] * len(negative_seqs))

# Encode sequences
encoded_seqs = preprocess.one_hot_encode_seqs(all_seqs)

# Split the data into training and validation sets
split_idx = int(0.8 * len(all_labels))
X_train, y_train = encoded_seqs[:split_idx], np.array(all_labels[:split_idx]).reshape(-1, 1)
X_val, y_val = encoded_seqs[split_idx:], np.array(all_labels[split_idx:]).reshape(-1, 1)



# Define the neural network architecture

nn_arch = [    {"input_dim": 4 * seq_length, "output_dim": 64, "activation": "relu"},
               {"input_dim": 64, "output_dim": 64, "activation": "relu"},
               {"input_dim": 64, "output_dim": 1, "activation": "sigmoid"}]


# nn_arch = [
#     {"input_dim": 4 * seq_length, "output_dim": 64, "activation": "relu"},
#     {"input_dim": 64, "output_dim": 32, "activation": "relu"},
#     {"input_dim": 32, "output_dim": 1, "activation": "sigmoid"}
# ]

# Hyperparameters
lr = 0.01
seed = 42
batch_size = 64
epochs = 100
loss_function = "binary_cross_entropy"

# Create NeuralNetwork instance
nn = nn.NeuralNetwork(nn_arch, lr, seed, batch_size, epochs, loss_function)

# Train the neural network
train_loss, val_loss = nn.fit(X_train, y_train, X_val, y_val)

# Plot training and validation loss by epoch
plt.plot(train_loss, label="Training Loss")
plt.plot(val_loss, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Report the accuracy of the classifier on the validation dataset
y_val_pred = (nn.predict(X_val) > 0.5).astype(int)
accuracy = np.mean(y_val_pred == y_val)
print(f"Validation accuracy: {accuracy:.4f}")

ValueError: shapes (64,1) and (64,64) not aligned: 1 (dim 1) != 64 (dim 0)

In [24]:
from nn import io, preprocess, nn
import numpy as np
import matplotlib.pyplot as plt

# Read in the positive examples
positive_seqs = io.read_text_file("data/rap1-lieb-positives.txt")

# Read in the negative examples and preprocess them to the same length as positive examples
negative_seqs_raw = io.read_fasta_file("data/yeast-upstream-1k-negative.fa")
seq_length = len(positive_seqs[0])
negative_seqs = [seq[i:i+seq_length] for seq in negative_seqs_raw for i in range(0, len(seq) - seq_length + 1)]

# Balance classes using the sample_seqs function
all_seqs, all_labels = preprocess.sample_seqs(positive_seqs + negative_seqs, [True] * len(positive_seqs) + [False] * len(negative_seqs))

# One-hot encode sequences
encoded_seqs = preprocess.one_hot_encode_seqs(all_seqs)

# Split the data into training and validation sets
split_idx = int(0.8 * encoded_seqs.shape[1])
X_train, y_train = encoded_seqs[:, :split_idx], np.array(all_labels[:split_idx]).reshape(-1, 1)
X_val, y_val = encoded_seqs[:, split_idx:], np.array(all_labels[split_idx:]).reshape(-1, 1)

# Define the neural network architecture
nn_arch = [
    {"input_dim": 4 * seq_length, "output_dim": 64, "activation": "relu"},
    {"input_dim": 64, "output_dim": 32, "activation": "relu"},
    {"input_dim": 32, "output_dim": 1, "activation": "sigmoid"}
]

# Hyperparameters
lr = 0.01
seed = 42
batch_size = 64
epochs = 100
loss_function = "binary_cross_entropy"

# Create NeuralNetwork instance
nn = nn.NeuralNetwork(nn_arch, lr, seed, batch_size, epochs, loss_function)

# Check the shapes of X_train and y_train
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

# Check the shapes of X_val and y_val
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

# Train the neural network
train_loss, val_loss = nn.fit(X_train, y_train, X_val, y_val)

# Plot training and validation loss by epoch
plt.plot(train_loss, label="Training Loss")
plt.plot(val_loss, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Report the accuracy of the classifier on the validation dataset
y_val_pred = (nn.predict(X_val) > 0.5).astype(int)
accuracy = np.mean(y_val_pred == y_val)
print(f"Validation accuracy: {accuracy:.4f}")


X_train shape: (274, 54)
y_train shape: (54, 1)
X_val shape: (274, 14)
y_val shape: (220, 1)


AssertionError: 