In [2]:
import os
import pandas as pd
from pymongo import MongoClient
from tqdm import tqdm

# Setup
CSV_DIR = "E:/DataTesis/Final_Epigenomic"  # E:\DataTesis\Final_Epigenomic
MONGO_URI = "mongodb://localhost:27017"
DB_NAME = "epigenomic_db"

client = MongoClient(MONGO_URI)
db = client[DB_NAME]
region_col = db.Region
feature_col = db.EpigenomicFeatures

region_counter = 0

# Iterate over files
for filename in tqdm(os.listdir(CSV_DIR)):
    if not filename.endswith(".csv"):
        continue

    # Parse metadata from filename: A549_1024_promoter_epigenomic.csv
    try:
        base = filename.replace("_epigenomic.csv", "")
        cell_line, window_size, region_type = base.split("_")
    except ValueError:
        print(f"Skipping file with bad name: {filename}")
        continue

    # Load CSV
    filepath = os.path.join(CSV_DIR, filename)
    df = pd.read_csv(filepath)

    for idx, row in df.iterrows():
        region_id = f"{cell_line}_{region_type}_{window_size}_{region_counter}"

        # 1. Insert into Region
        region_doc = {
            "id": region_id,
            "chrom": row["chrom"],
            "start": int(row["start"]),
            "end": int(row["end"]),
            "strand": row["strand"],
            "cell_line": cell_line,
            "region_type": region_type,
            "window_size": int(window_size)
        }
        region_col.insert_one(region_doc)

        # 2. Insert into EpigenomicFeatures
        feature_doc = {
            "id": region_id + "_f",
            "region_id": region_id,
            "TPM": float(row["TPM"])
        }

        # Dynamically add the remaining marks
        for col in df.columns:
            if col not in ["chrom", "start", "end", "strand", "TPM"]:
                feature_doc[col] = float(row[col]) if pd.notna(row[col]) else None

        feature_col.insert_one(feature_doc)
        region_counter += 1

print("✅ Ingestion complete.")


100%|██████████| 70/70 [2:33:53<00:00, 131.90s/it]  

✅ Ingestion complete.





MODELLING

In [23]:
import time
import numpy as np
import pandas as pd
from pymongo import MongoClient
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, regularizers, optimizers, losses, metrics, Model
from tqdm import tqdm  # Added for progress tracking

# --- Configs ---
WINDOW_SIZE = 256
BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 0.001
LEARNING_RATE_DECAY = 0.1
L2_REG = 0.0
OPTIMIZER = 'sgd'
MONGO_BATCH_SIZE = 5000  # <-- NEW: batch size for querying MongoDB

# MongoDB connection
client = MongoClient('mongodb://localhost:27017')
db = client['epigenomic_reduction']
epigenomic_collection = db['EpigenomicFeatures']
region_collection = db['Region']

# -------------------------
# Step 1: Get region_ids with window_size=256
start_time = time.time()
region_cursor = region_collection.find({'window_size': WINDOW_SIZE}, {'id': 1})
region_ids_256 = [doc['id'] for doc in region_cursor]
print(f"Filtered {len(region_ids_256)} regions with window_size=256")
extract_time = time.time() - start_time
print(f"Data extraction preparation took {extract_time:.2f} sec")

# Step 2: Extract EpigenomicFeatures in batches
start_time = time.time()
projection = {'_id': 0, 'id': 0}  # Exclude large/irrelevant fields
feature_list = []

for i in tqdm(range(0, len(region_ids_256), MONGO_BATCH_SIZE), desc="Fetching features"):
    batch_ids = region_ids_256[i:i + MONGO_BATCH_SIZE]
    cursor = epigenomic_collection.find({'region_id': {'$in': batch_ids}}, projection).batch_size(1000)
    feature_list.extend(cursor)

print(f"Extracted {len(feature_list)} feature documents")
extract_time += (time.time() - start_time)
print(f"Total data extraction took {extract_time:.2f} sec")

# Step 3: Identify all possible feature keys (excluding metadata)
exclude_keys = {'_id', 'id', 'region_id', 'TPM'}
all_feature_keys = set()
for doc in feature_list:
    all_feature_keys.update(set(doc.keys()) - exclude_keys)

all_feature_keys = sorted(all_feature_keys)
print(f"Feature keys found: {all_feature_keys}")

# Step 4: Create dataset (X, y) with missing features = 0
X = []
y = []
region_types = []

# Create region_type map
region_map = {}
for doc in region_collection.find({'window_size': WINDOW_SIZE}, {'id': 1, 'region_type': 1}):
    region_map[doc['id']] = doc['region_type']

for doc in feature_list:
    features = [doc.get(key, 0) for key in all_feature_keys]
    X.append(features)
    y.append(doc['TPM'])
    region_types.append(region_map.get(doc['region_id'], None))

X = np.array(X, dtype=np.float32)
y = np.array(y, dtype=np.float32)
region_types = np.array(region_types)

print(f"X shape: {X.shape}, y shape: {y.shape}")

# --- Prepare classification tasks ---
def prepare_task_data(region_types, y, X, pos_class, neg_class, pos_region_type, neg_region_type):
    mask = (
        ((region_types == pos_region_type) & (y == pos_class)) |
        ((region_types == neg_region_type) & (y == neg_class))
    )
    X_task = X[mask]
    y_task = y[mask]
    y_task_bin = ((region_types[mask] == pos_region_type) & (y[mask] == pos_class)).astype(np.float32)
    return train_test_split(X_task, y_task_bin, test_size=0.2, random_state=42)

# Task 1: inactive enhancer vs inactive promoter
X_train_1, X_test_1, y_train_1, y_test_1 = prepare_task_data(
    region_types, y, X,
    pos_class=0, neg_class=0,
    pos_region_type='enhancer', neg_region_type='promoter'
)

# Task 2: active promoter vs inactive promoter
X_train_2, X_test_2, y_train_2, y_test_2 = prepare_task_data(
    region_types, y, X,
    pos_class=1, neg_class=0,
    pos_region_type='promoter', neg_region_type='promoter'
)

# Task 3: active enhancer vs inactive enhancer
X_train_3, X_test_3, y_train_3, y_test_3 = prepare_task_data(
    region_types, y, X,
    pos_class=1, neg_class=0,
    pos_region_type='enhancer', neg_region_type='enhancer'
)

# Task 4: active enhancer vs active promoter
X_train_4, X_test_4, y_train_4, y_test_4 = prepare_task_data(
    region_types, y, X,
    pos_class=1, neg_class=1,
    pos_region_type='enhancer', neg_region_type='promoter'
)

print(f"Task 1 train size: {len(y_train_1)}, test size: {len(y_test_1)}")
print(f"Task 2 train size: {len(y_train_2)}, test size: {len(y_test_2)}")
print(f"Task 3 train size: {len(y_train_3)}, test size: {len(y_test_3)}")
print(f"Task 4 train size: {len(y_train_4)}, test size: {len(y_test_4)}")

# --- Build the FFNN model ---
def create_model(input_dim):
    model = tf.keras.Sequential([
        layers.Dense(16, activation='relu', kernel_regularizer=regularizers.l2(L2_REG), input_shape=(input_dim,)),
        layers.Dense(4, activation='relu', kernel_regularizer=regularizers.l2(L2_REG)),
        layers.Dense(2, activation='relu', kernel_regularizer=regularizers.l2(L2_REG)),
        layers.Dense(1, activation='sigmoid')
    ])
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=LEARNING_RATE,
        decay_steps=1000,
        decay_rate=LEARNING_RATE_DECAY,
        staircase=True
    )
    optimizer = optimizers.SGD(learning_rate=lr_schedule)
    model.compile(optimizer=optimizer,
                  loss=losses.BinaryCrossentropy(),
                  metrics=[metrics.BinaryAccuracy()])
    return model

# --- Training function with timing ---
def train_and_evaluate(X_train, y_train, X_test, y_test):
    model = create_model(X_train.shape[1])
    
    # --- Training ---
    start_train = time.time()
    history = model.fit(X_train, y_train,
                        batch_size=BATCH_SIZE,
                        epochs=EPOCHS,
                        validation_data=(X_test, y_test),
                        verbose=2)
    train_time = time.time() - start_train
    print(f"Training took {train_time:.2f} seconds")
    
    # --- Evaluate on test set ---
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy: {test_accuracy:.4f}")

    return model, history, test_accuracy


Filtered 1142428 regions with window_size=256
Data extraction preparation took 7.56 sec


Fetching features: 100%|██████████| 229/229 [11:53<00:00,  3.12s/it]


Extracted 1142428 feature documents
Total data extraction took 722.26 sec
Feature keys found: ['ARID1B', 'ARID2', 'ARID3A', 'ARID3B', 'ARID4A', 'ARID4B', 'ARID5B', 'BRD4', 'CHD1', 'CHD2', 'CHD4', 'CHD7', 'CTCF', 'CTCFL', 'DNMT1', 'DNMT3B', 'EED', 'ESR1', 'ETS1', 'ETS2', 'EZH2', 'FOXA1', 'FOXA2', 'FOXA3', 'GATA1', 'GATA2', 'GATA3', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me2', 'H3K4me3', 'H3K79me2', 'H3K9ac', 'H3K9me2', 'H3K9me3', 'H4K20me1', 'HDAC1', 'HDAC2', 'HDAC3', 'HDAC6', 'HDAC8', 'KAT2A', 'KAT2B', 'KAT7', 'KAT8', 'KDM1A', 'KDM2A', 'KDM3A', 'KDM4B', 'KDM5A', 'KDM5B', 'KDM6A', 'NR3C1', 'SMARCA4', 'SMARCB1', 'SUZ12']
X shape: (1142428, 58), y shape: (1142428,)
Task 1 train size: 567203, test size: 141801
Task 2 train size: 284659, test size: 71165
Task 3 train size: 354396, test size: 88599
Task 4 train size: 71852, test size: 17963


In [24]:
# --- Configs ---
EPOCHS = 5
LEARNING_RATE = 0.001
LEARNING_RATE_DECAY = 0.1
L2_REG = 0.0
OPTIMIZER = 'sgd'

In [25]:
# --- Example: train task 1 ---
model_1, history_1 = train_and_evaluate(X_train_1, y_train_1, X_test_1, y_test_1)

Epoch 1/5
17726/17726 - 18s - 1ms/step - binary_accuracy: 0.5770 - loss: nan - val_binary_accuracy: 0.5656 - val_loss: nan
Epoch 2/5
17726/17726 - 17s - 958us/step - binary_accuracy: 0.5647 - loss: nan - val_binary_accuracy: 0.5656 - val_loss: nan
Epoch 3/5
17726/17726 - 17s - 974us/step - binary_accuracy: 0.5647 - loss: nan - val_binary_accuracy: 0.5656 - val_loss: nan
Epoch 4/5
17726/17726 - 17s - 981us/step - binary_accuracy: 0.5647 - loss: nan - val_binary_accuracy: 0.5656 - val_loss: nan
Epoch 5/5
17726/17726 - 17s - 942us/step - binary_accuracy: 0.5647 - loss: nan - val_binary_accuracy: 0.5656 - val_loss: nan
Training took 86.76 seconds
Test Accuracy: 0.5656


ValueError: too many values to unpack (expected 2)

In [None]:
# --- Train task 2 ---
model_2, history_2 = train_and_evaluate(X_train_2, y_train_2, X_test_2, y_test_2)

Epoch 1/5
8896/8896 - 10s - 1ms/step - binary_accuracy: 0.8669 - loss: nan - val_binary_accuracy: 0.8672 - val_loss: nan
Epoch 2/5
8896/8896 - 9s - 1ms/step - binary_accuracy: 0.8669 - loss: nan - val_binary_accuracy: 0.8672 - val_loss: nan
Epoch 3/5
8896/8896 - 9s - 960us/step - binary_accuracy: 0.8669 - loss: nan - val_binary_accuracy: 0.8672 - val_loss: nan
Epoch 4/5
8896/8896 - 9s - 980us/step - binary_accuracy: 0.8669 - loss: nan - val_binary_accuracy: 0.8672 - val_loss: nan
Epoch 5/5
8896/8896 - 9s - 1ms/step - binary_accuracy: 0.8669 - loss: nan - val_binary_accuracy: 0.8672 - val_loss: nan
Training took 44.97 seconds
Test Accuracy: 0.8672


ValueError: too many values to unpack (expected 2)

In [None]:

# --- Train task 3 ---
model_3, history_3 = train_and_evaluate(X_train_3, y_train_3, X_test_3, y_test_3)

Epoch 1/5
11075/11075 - 11s - 1ms/step - binary_accuracy: 0.9036 - loss: nan - val_binary_accuracy: 0.9057 - val_loss: nan
Epoch 2/5
11075/11075 - 11s - 955us/step - binary_accuracy: 0.9037 - loss: nan - val_binary_accuracy: 0.9057 - val_loss: nan
Epoch 3/5
11075/11075 - 10s - 878us/step - binary_accuracy: 0.9037 - loss: nan - val_binary_accuracy: 0.9057 - val_loss: nan
Epoch 4/5
11075/11075 - 11s - 986us/step - binary_accuracy: 0.9037 - loss: nan - val_binary_accuracy: 0.9057 - val_loss: nan
Epoch 5/5
11075/11075 - 13s - 1ms/step - binary_accuracy: 0.9037 - loss: nan - val_binary_accuracy: 0.9057 - val_loss: nan
Training took 55.86 seconds
Test Accuracy: 0.9057


ValueError: too many values to unpack (expected 2)

In [None]:
# --- Train task 4 ---
model_4, history_4 = train_and_evaluate(X_train_4, y_train_4, X_test_4, y_test_4)

Epoch 1/5
2246/2246 - 3s - 1ms/step - binary_accuracy: 0.5246 - loss: 0.7023 - val_binary_accuracy: 0.5255 - val_loss: 0.6919
Epoch 2/5
2246/2246 - 2s - 1ms/step - binary_accuracy: 0.5273 - loss: 0.6917 - val_binary_accuracy: 0.5255 - val_loss: 0.6919
Epoch 3/5
2246/2246 - 2s - 1ms/step - binary_accuracy: 0.5273 - loss: 0.6917 - val_binary_accuracy: 0.5255 - val_loss: 0.6919
Epoch 4/5
2246/2246 - 2s - 1ms/step - binary_accuracy: 0.5273 - loss: 0.6917 - val_binary_accuracy: 0.5255 - val_loss: 0.6919
Epoch 5/5
2246/2246 - 2s - 1ms/step - binary_accuracy: 0.5273 - loss: 0.6917 - val_binary_accuracy: 0.5255 - val_loss: 0.6919
Training took 12.16 seconds
Test Accuracy: 0.5255


ValueError: too many values to unpack (expected 2)

History

In [12]:

# --- Example: train task 1 ---
model_1, history_1 = train_and_evaluate(X_train_1, y_train_1, X_test_1, y_test_1)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/64
17726/17726 - 18s - 1ms/step - binary_accuracy: 0.5647 - loss: nan - val_binary_accuracy: 0.5656 - val_loss: nan
Epoch 2/64
17726/17726 - 16s - 894us/step - binary_accuracy: 0.5647 - loss: nan - val_binary_accuracy: 0.5656 - val_loss: nan
Epoch 3/64
17726/17726 - 15s - 872us/step - binary_accuracy: 0.5647 - loss: nan - val_binary_accuracy: 0.5656 - val_loss: nan
Epoch 4/64
17726/17726 - 16s - 875us/step - binary_accuracy: 0.5647 - loss: nan - val_binary_accuracy: 0.5656 - val_loss: nan
Epoch 5/64
17726/17726 - 15s - 872us/step - binary_accuracy: 0.5647 - loss: nan - val_binary_accuracy: 0.5656 - val_loss: nan
Epoch 6/64
17726/17726 - 15s - 869us/step - binary_accuracy: 0.5647 - loss: nan - val_binary_accuracy: 0.5656 - val_loss: nan
Epoch 7/64
17726/17726 - 15s - 865us/step - binary_accuracy: 0.5647 - loss: nan - val_binary_accuracy: 0.5656 - val_loss: nan
Epoch 8/64
17726/17726 - 15s - 866us/step - binary_accuracy: 0.5647 - loss: nan - val_binary_accuracy: 0.5656 - val_loss

In [13]:
# --- Train task 2 ---
model_2, history_2 = train_and_evaluate(X_train_2, y_train_2, X_test_2, y_test_2)

Epoch 1/64
8896/8896 - 9s - 960us/step - binary_accuracy: 0.8669 - loss: nan - val_binary_accuracy: 0.8672 - val_loss: nan
Epoch 2/64
8896/8896 - 8s - 876us/step - binary_accuracy: 0.8669 - loss: nan - val_binary_accuracy: 0.8672 - val_loss: nan
Epoch 3/64
8896/8896 - 8s - 878us/step - binary_accuracy: 0.8669 - loss: nan - val_binary_accuracy: 0.8672 - val_loss: nan
Epoch 4/64
8896/8896 - 8s - 880us/step - binary_accuracy: 0.8669 - loss: nan - val_binary_accuracy: 0.8672 - val_loss: nan
Epoch 5/64
8896/8896 - 8s - 862us/step - binary_accuracy: 0.8669 - loss: nan - val_binary_accuracy: 0.8672 - val_loss: nan
Epoch 6/64
8896/8896 - 8s - 893us/step - binary_accuracy: 0.8669 - loss: nan - val_binary_accuracy: 0.8672 - val_loss: nan
Epoch 7/64
8896/8896 - 8s - 861us/step - binary_accuracy: 0.8669 - loss: nan - val_binary_accuracy: 0.8672 - val_loss: nan
Epoch 8/64
8896/8896 - 8s - 866us/step - binary_accuracy: 0.8669 - loss: nan - val_binary_accuracy: 0.8672 - val_loss: nan
Epoch 9/64
8896/

In [14]:

# --- Train task 3 ---
model_3, history_3 = train_and_evaluate(X_train_3, y_train_3, X_test_3, y_test_3)

Epoch 1/64
11075/11075 - 10s - 942us/step - binary_accuracy: 0.9042 - loss: nan - val_binary_accuracy: 0.9057 - val_loss: nan
Epoch 2/64
11075/11075 - 10s - 872us/step - binary_accuracy: 0.9037 - loss: nan - val_binary_accuracy: 0.9057 - val_loss: nan
Epoch 3/64
11075/11075 - 10s - 867us/step - binary_accuracy: 0.9037 - loss: nan - val_binary_accuracy: 0.9057 - val_loss: nan
Epoch 4/64
11075/11075 - 10s - 868us/step - binary_accuracy: 0.9037 - loss: nan - val_binary_accuracy: 0.9057 - val_loss: nan
Epoch 5/64
11075/11075 - 10s - 874us/step - binary_accuracy: 0.9037 - loss: nan - val_binary_accuracy: 0.9057 - val_loss: nan
Epoch 6/64
11075/11075 - 10s - 873us/step - binary_accuracy: 0.9037 - loss: nan - val_binary_accuracy: 0.9057 - val_loss: nan
Epoch 7/64
11075/11075 - 10s - 872us/step - binary_accuracy: 0.9037 - loss: nan - val_binary_accuracy: 0.9057 - val_loss: nan
Epoch 8/64
11075/11075 - 10s - 865us/step - binary_accuracy: 0.9037 - loss: nan - val_binary_accuracy: 0.9057 - val_lo

In [15]:
# --- Train task 4 ---
model_4, history_4 = train_and_evaluate(X_train_4, y_train_4, X_test_4, y_test_4)

Epoch 1/64
2246/2246 - 2s - 1ms/step - binary_accuracy: 0.8310 - loss: 0.4385 - val_binary_accuracy: 0.8593 - val_loss: 0.3942
Epoch 2/64
2246/2246 - 2s - 919us/step - binary_accuracy: 0.8578 - loss: 0.3968 - val_binary_accuracy: 0.8594 - val_loss: 0.3937
Epoch 3/64
2246/2246 - 2s - 929us/step - binary_accuracy: 0.8580 - loss: 0.3963 - val_binary_accuracy: 0.8594 - val_loss: 0.3937
Epoch 4/64
2246/2246 - 2s - 922us/step - binary_accuracy: 0.8580 - loss: 0.3963 - val_binary_accuracy: 0.8594 - val_loss: 0.3937
Epoch 5/64
2246/2246 - 2s - 912us/step - binary_accuracy: 0.8580 - loss: 0.3963 - val_binary_accuracy: 0.8594 - val_loss: 0.3937
Epoch 6/64
2246/2246 - 2s - 903us/step - binary_accuracy: 0.8580 - loss: 0.3963 - val_binary_accuracy: 0.8594 - val_loss: 0.3937
Epoch 7/64
2246/2246 - 2s - 905us/step - binary_accuracy: 0.8580 - loss: 0.3963 - val_binary_accuracy: 0.8594 - val_loss: 0.3937
Epoch 8/64
2246/2246 - 2s - 893us/step - binary_accuracy: 0.8580 - loss: 0.3963 - val_binary_accura

Standard Scaler pytorch

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from pymongo import MongoClient
import numpy as np
import pandas as pd
from tqdm import tqdm

# -------------------------------
# MongoDB Setup
# -------------------------------
client = MongoClient("mongodb://localhost:27017/")
db = client["epigenomic_reduction"]
features_collection = db["EpigenomicFeatures"]
region_collection = db["Region"]

# -------------------------------
# Configs
# -------------------------------
BATCH_SIZE = 32
EPOCHS = 64
LEARNING_RATE = 0.001

# -------------------------------
# Step 1: Extract All Unique Feature Keys
# -------------------------------
unique_features = set()
for doc in features_collection.find({}, {"_id": 0}):
    for key in doc.keys():
        if key not in ["id", "region_id", "TPM"]:
            unique_features.add(key)
unique_features = sorted(list(unique_features))  # consistent order

print(f"Total unique features: {len(unique_features)}")

# -------------------------------
# Step 2: Define Dataset Builder
# -------------------------------
def build_dataset(tpm_1_type=None, tpm_0_type=None):
    X = []
    y = []

    def fetch_features(tpm_val, region_type):
        region_ids = region_collection.find(
            {"region_type": region_type, "window_size": 256},
            {"id": 1, "_id": 0}
        )
        region_ids = set([r["id"] for r in region_ids])

        cursor = features_collection.find({"TPM": tpm_val})
        for doc in cursor:
            if doc["region_id"] in region_ids:
                row = [doc.get(f, 0.0) for f in unique_features]
                label = 1 if tpm_val == 1 else 0
                X.append(row)
                y.append(label)

    if tpm_0_type:
        fetch_features(0, tpm_0_type)
    if tpm_1_type:
        fetch_features(1, tpm_1_type)

    return np.array(X), np.array(y)

# -------------------------------
# Step 3: Define FFNN
# -------------------------------
class FFNN(nn.Module):
    def __init__(self, input_dim):
        super(FFNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

# -------------------------------
# Step 4: Train Model
# -------------------------------
def train_model(X, y, task_name):
    print(f"\n=== Training: {task_name} ===")

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert to tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    model = FFNN(X.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Training loop
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        for xb, yb in train_loader:
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss/len(train_loader):.4f}")

    # Evaluate
    model.eval()
    with torch.no_grad():
        preds = model(X_test_tensor).round()
        acc = (preds.eq(y_test_tensor)).float().mean().item()
        print(f"{task_name} Accuracy: {acc*100:.2f}%")

Total unique features: 58


In [12]:
# Task 1: Inactive Enhancer vs Inactive Promoter
X1, y1 = build_dataset(tpm_0_type="enhancer")
X2, y2 = build_dataset(tpm_0_type="promoter")
X = np.concatenate([X1, X2])
y = np.concatenate([y1, y2])
train_model(X, y, "Task 1: Inactive Enhancer vs Inactive Promoter")


=== Training: Task 1: Inactive Enhancer vs Inactive Promoter ===


RuntimeError: all elements of input should be between 0 and 1

In [None]:
# Task 2: Active Promoter vs Inactive Promoter
X1, y1 = build_dataset(tpm_1_type="promoter")
X2, y2 = build_dataset(tpm_0_type="promoter")
X = np.concatenate([X1, X2])
y = np.concatenate([y1, y2])
train_model(X, y, "Task 2: Active Promoter vs Inactive Promoter")

In [None]:
# Task 3: Active Enhancer vs Inactive Enhancer
X1, y1 = build_dataset(tpm_1_type="enhancer")
X2, y2 = build_dataset(tpm_0_type="enhancer")
X = np.concatenate([X1, X2])
y = np.concatenate([y1, y2])
train_model(X, y, "Task 3: Active Enhancer vs Inactive Enhancer")

In [None]:
# Task 4: Active Enhancer vs Active Promoter
X1, y1 = build_dataset(tpm_1_type="enhancer")
X2, y2 = build_dataset(tpm_1_type="promoter")
X = np.concatenate([X1, X2])
y = np.concatenate([y1, y2])
train_model(X, y, "Task 4: Active Enhancer vs Active Promoter")
