In [1]:
pip install cassandra-driver

Collecting cassandra-driver
  Downloading cassandra_driver-3.29.2-cp311-cp311-win_amd64.whl.metadata (6.3 kB)
Collecting geomet<0.3,>=0.1 (from cassandra-driver)
  Using cached geomet-0.2.1.post1-py3-none-any.whl.metadata (1.0 kB)
Collecting click (from geomet<0.3,>=0.1->cassandra-driver)
  Downloading click-8.2.0-py3-none-any.whl.metadata (2.5 kB)
Downloading cassandra_driver-3.29.2-cp311-cp311-win_amd64.whl (348 kB)
Using cached geomet-0.2.1.post1-py3-none-any.whl (18 kB)
Downloading click-8.2.0-py3-none-any.whl (102 kB)
Installing collected packages: click, geomet, cassandra-driver

   -------------------------- ------------- 2/3 [cassandra-driver]
   -------------------------- ------------- 2/3 [cassandra-driver]
   ---------------------------------------- 3/3 [cassandra-driver]

Successfully installed cassandra-driver-3.29.2 click-8.2.0 geomet-0.2.1.post1
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.2.5-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ------------------------------------- -- 10.7/11.6 MB 55.8 MB/s eta 0:00:01
   ---------------------------------------- 11.6/11.6 MB 42.7 MB/s eta 0:00:00
Downloading numpy-2.2.5-cp311-cp311-win_amd64.whl (12.9 MB)
   ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
   ---------------------------------- ----- 11.3/12.9 MB 58.6 MB/s eta 0:00:01
   ---------------------------------------- 12.9/12.9 MB 35.3 MB/s eta 0:00:00
Downloading pytz-2

In [1]:
import os
import pandas as pd
import uuid
from cassandra.cluster import Cluster

# Connect to Cassandra
cluster = Cluster(['127.0.0.1'])
session = cluster.connect()
session.set_keyspace('bioinfo')

# Create tables (optional if already created)
session.execute("""
CREATE TABLE IF NOT EXISTS region (
    id TEXT PRIMARY KEY,
    chrom TEXT,
    start INT,
    end INT,
    strand TEXT,
    cell_line TEXT,
    region_type TEXT,
    window_size INT
);
""")

session.execute("""
CREATE TABLE IF NOT EXISTS epigenomic_features (
    id TEXT PRIMARY KEY,
    region_id TEXT,
    tpm FLOAT,
    features MAP<TEXT, FLOAT>
);
""")

# Load CSVs
folder = "E:/DataTesis/Final_Epigenomic"

for filename in os.listdir(folder):
    if not filename.endswith(".csv"):
        continue

    # Parse metadata from filename
    parts = filename.replace(".csv", "").split("_")
    cell_line, window_size, region_type = parts[0], int(parts[1]), parts[2]

    # Read CSV
    df = pd.read_csv(os.path.join(folder, filename))

    for _, row in df.iterrows():
        # Generate UUID for region 
        region_id = str(uuid.uuid4())

        # Insert into region table
        session.execute("""
            INSERT INTO region (id, chrom, start, end, strand, cell_line, region_type, window_size)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """, (region_id, row['chrom'], int(row['start']), int(row['end']),
              row['strand'], cell_line, region_type, window_size))

        # Prepare epigenomic features
        feature_cols = [col for col in df.columns if col not in ['chrom', 'start', 'end', 'strand', 'TPM']]
        features = {col: float(row[col]) for col in feature_cols if pd.notnull(row[col])}

        # Insert into epigenomic_features table
        session.execute("""
            INSERT INTO epigenomic_features (id, region_id, tpm, features)
            VALUES (%s, %s, %s, %s)
        """, (str(uuid.uuid4()), region_id, float(row['TPM']), features))


KeyboardInterrupt: 

In [4]:
from cassandra.cluster import Cluster
import pandas as pd

cluster = Cluster(['127.0.0.1'])
session = cluster.connect('bioinfo')

rows = session.execute("SELECT cell_line, region_type, window_size FROM region")
df = pd.DataFrame(rows, columns=['cell_line', 'region_type', 'window_size'])

# Get distinct combinations
distinct_combinations = df.drop_duplicates()
print(distinct_combinations)


    cell_line region_type  window_size
0          H1    promoter          512
1          H1    promoter          256
2        K562    promoter          512
3        A549    enhancer         1024
4        A549    enhancer          128
..        ...         ...          ...
203      K562    promoter          256
207    HEK293    enhancer          128
227     HepG2    enhancer          512
258      K562    enhancer          128
267    HEK293    enhancer         1024

[62 rows x 3 columns]


In [5]:
import os
import pandas as pd
import uuid
from cassandra.cluster import Cluster
from tqdm import tqdm

# Connect to Cassandra
cluster = Cluster(['127.0.0.1'])
session = cluster.connect()
session.set_keyspace('bioinfo')

# Create tables (optional)
session.execute("""
CREATE TABLE IF NOT EXISTS region (
    id TEXT PRIMARY KEY,
    chrom TEXT,
    start INT,
    end INT,
    strand TEXT,
    cell_line TEXT,
    region_type TEXT,
    window_size INT
);
""")

session.execute("""
CREATE TABLE IF NOT EXISTS epigenomic_features (
    id TEXT PRIMARY KEY,
    region_id TEXT,
    tpm FLOAT,
    features MAP<TEXT, FLOAT>
);
""")

# Load list of already processed files
processed_files_path = "processed_files.txt"
if os.path.exists(processed_files_path):
    with open(processed_files_path, "r") as f:
        processed_files = set(line.strip() for line in f)
else:
    processed_files = set()

# Folder containing CSVs
folder = "E:/DataTesis/Final_Epigenomic"
all_files = [f for f in os.listdir(folder) if f.endswith(".csv") and f not in processed_files]

# Loop through unprocessed files with progress bar
for filename in tqdm(all_files, desc="Processing files"):
    parts = filename.replace(".csv", "").split("_")
    cell_line, window_size, region_type = parts[0], int(parts[1]), parts[2]

    df = pd.read_csv(os.path.join(folder, filename))

    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Inserting rows from {filename}", leave=False):
        region_id = str(uuid.uuid4())

        # Insert into region table
        session.execute("""
            INSERT INTO region (id, chrom, start, end, strand, cell_line, region_type, window_size)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """, (region_id, row['chrom'], int(row['start']), int(row['end']),
              row['strand'], cell_line, region_type, window_size))

        # Prepare feature map
        feature_cols = [col for col in df.columns if col not in ['chrom', 'start', 'end', 'strand', 'TPM']]
        features = {col: float(row[col]) for col in feature_cols if pd.notnull(row[col])}

        session.execute("""
            INSERT INTO epigenomic_features (id, region_id, tpm, features)
            VALUES (%s, %s, %s, %s)
        """, (str(uuid.uuid4()), region_id, float(row['TPM']), features))

    # Mark the file as processed
    with open(processed_files_path, "a") as f:
        f.write(filename + "\n")


Processing files: 100%|██████████| 9/9 [8:45:52<00:00, 3505.79s/it]  


Modelling

In [24]:
import time
from cassandra.cluster import Cluster
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# --- Connect to Cassandra ---
cluster = Cluster(['127.0.0.1'])
session = cluster.connect()
session.set_keyspace('bioinfo')

# --- Step 1: Extract data with time measurement ---
start_extract = time.time()

# Query to get region ids for window_size=256
region_query = "SELECT id, region_type FROM region WHERE window_size=256 ALLOW FILTERING"
regions = session.execute(region_query)

region_dict = {row.id: row.region_type for row in regions}
print(f"Regions with window_size=256: {len(region_dict)}")

features_list = []
labels_list = []

# Query epigenomic_features (no filtering in query; filter in code)
query = "SELECT features, region_id, tpm FROM epigenomic_features"
future = session.execute_async(query)
result = future.result()

count = 0
for row in result:
    if row.region_id not in region_dict:
        continue

    region_type = region_dict[row.region_id]
    if region_type not in ('promoter', 'enhancer'):
        continue

    # Only consider inactive regions (TPM==0) for binary classification
    if row.tpm == 0.0:
        label = 0 if region_type == 'enhancer' else 1  # enhancer=0, promoter=1

        # Append features and label together here
        feat_dict = dict(row.features)
        features_list.append(feat_dict)
        labels_list.append(label)

        count += 1
        if count % 10000 == 0:
            print(f"Processed {count} samples")
    else:
        # Skip active regions (TPM!=0)
        continue

print(f"Total samples collected: {len(labels_list)}")

# Get union of all feature keys to create fixed feature vector columns
all_features = set()
for d in features_list:
    all_features.update(d.keys())
all_features = sorted(all_features)

print(f"Total unique features: {len(all_features)}")

# Build numpy array X of shape (samples, features)
X = np.zeros((len(labels_list), len(all_features)), dtype=np.float32)
for i, feat_dict in enumerate(features_list):
    for j, key in enumerate(all_features):
        X[i, j] = feat_dict.get(key, 0.0)

y = np.array(labels_list, dtype=np.int64)

end_extract = time.time()
print(f"Data extraction time: {end_extract - start_extract:.2f} seconds")

# --- Step 2: Prepare data for training ---
start_train = time.time()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to torch tensors
X_train_t = torch.tensor(X_train)
y_train_t = torch.tensor(y_train)
X_test_t = torch.tensor(X_test)
y_test_t = torch.tensor(y_test)

# Dataset and DataLoader
train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)

# --- Define FFNN model ---
class FFNN(nn.Module):
    def __init__(self, input_dim):
        super(FFNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = FFNN(input_dim=X.shape[1])

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for xb, yb in train_dl:
        optimizer.zero_grad()
        preds = model(xb).squeeze()
        loss = criterion(preds, yb.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * xb.size(0)

    epoch_loss = running_loss / len(train_dl.dataset)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

# Evaluate on test set
model.eval()
with torch.no_grad():
    preds = model(X_test_t).squeeze()
    preds_label = (preds > 0.5).long()
    accuracy = (preds_label == y_test_t).float().mean().item()
    print(f"Test Accuracy: {accuracy:.4f}")

end_train = time.time()
print(f"Training time: {end_train - start_train:.2f} seconds")


Regions with window_size=256: 1142428
Processed 10000 samples
Processed 20000 samples
Processed 30000 samples
Processed 40000 samples
Processed 50000 samples
Processed 60000 samples
Processed 70000 samples
Processed 80000 samples
Processed 90000 samples
Processed 100000 samples
Processed 110000 samples
Processed 120000 samples
Processed 130000 samples
Processed 140000 samples
Processed 150000 samples
Processed 160000 samples
Processed 170000 samples
Processed 180000 samples
Processed 190000 samples
Processed 200000 samples
Processed 210000 samples
Processed 220000 samples
Processed 230000 samples
Processed 240000 samples
Processed 250000 samples
Processed 260000 samples
Processed 270000 samples
Processed 280000 samples
Processed 290000 samples
Processed 300000 samples
Processed 310000 samples
Processed 320000 samples
Processed 330000 samples
Processed 340000 samples
Processed 350000 samples
Processed 360000 samples
Processed 370000 samples
Processed 380000 samples
Processed 390000 samp

Model FIX

IE vs IP

In [27]:
import time
from cassandra.cluster import Cluster
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# --- Connect to Cassandra ---
cluster = Cluster(['127.0.0.1'])
session = cluster.connect()
session.set_keyspace('bioinfo')

# --- Step 1: Extract data with time measurement ---
start_extract = time.time()

# Query to get region ids for window_size=256
region_query = "SELECT id, region_type FROM region WHERE window_size=256 ALLOW FILTERING"
regions = session.execute(region_query)

region_dict = {row.id: row.region_type for row in regions}
print(f"Regions with window_size=256: {len(region_dict)}")

features_list = []
labels_list = []

# Query epigenomic_features (no filtering in query; filter in code)
query = "SELECT features, region_id, tpm FROM epigenomic_features"
future = session.execute_async(query)
result = future.result()

count = 0
for row in result:
    if row.region_id not in region_dict:
        continue

    region_type = region_dict[row.region_id]
    if region_type not in ('promoter', 'enhancer'):
        continue

    # Only consider inactive regions (TPM==0) for binary classification
    if row.tpm == 0.0:
        label = 0 if region_type == 'enhancer' else 1  # enhancer=0, promoter=1

        # Append features and label together here
        feat_dict = dict(row.features)
        features_list.append(feat_dict)
        labels_list.append(label)

        count += 1
        if count % 10000 == 0:
            print(f"Processed {count} samples")
    else:
        # Skip active regions (TPM!=0)
        continue

print(f"Total samples collected: {len(labels_list)}")

# Get union of all feature keys to create fixed feature vector columns
all_features = set()
for d in features_list:
    all_features.update(d.keys())
all_features = sorted(all_features)

print(f"Total unique features: {len(all_features)}")

# Build numpy array X of shape (samples, features)
X = np.zeros((len(labels_list), len(all_features)), dtype=np.float32)
for i, feat_dict in enumerate(features_list):
    for j, key in enumerate(all_features):
        X[i, j] = feat_dict.get(key, 0.0)

y = np.array(labels_list, dtype=np.int64)

end_extract = time.time()
print(f"Data extraction time: {end_extract - start_extract:.2f} seconds")

# --- Step 2: Prepare data for training ---
start_train = time.time()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to torch tensors
X_train_t = torch.tensor(X_train)
y_train_t = torch.tensor(y_train)
X_test_t = torch.tensor(X_test)
y_test_t = torch.tensor(y_test)

# Dataset and DataLoader
train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)

# --- Define FFNN model with updated config ---
class FFNN(nn.Module):
    def __init__(self, input_dim):
        super(FFNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 4),
            nn.ReLU(),
            nn.Linear(4, 2),
            nn.ReLU(),
            nn.Linear(2, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = FFNN(input_dim=X.shape[1])

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.5, weight_decay=0.0)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.1)  # decay learning rate

# Use batch size 32
train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)

# Training loop
epochs = 64
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for xb, yb in train_dl:
        optimizer.zero_grad()
        preds = model(xb).squeeze()
        loss = criterion(preds, yb.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * xb.size(0)

    scheduler.step()  # Apply learning rate decay
    epoch_loss = running_loss / len(train_dl.dataset)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, LR: {scheduler.get_last_lr()[0]:.5f}")


# Evaluate on test set
model.eval()
with torch.no_grad():
    preds = model(X_test_t).squeeze()
    preds_label = (preds > 0.5).long()
    accuracy = (preds_label == y_test_t).float().mean().item()
    print(f"Test Accuracy: {accuracy:.4f}")

end_train = time.time()
print(f"Training time: {end_train - start_train:.2f} seconds")


Regions with window_size=256: 1142428
Processed 10000 samples
Processed 20000 samples
Processed 30000 samples
Processed 40000 samples
Processed 50000 samples
Processed 60000 samples
Processed 70000 samples
Processed 80000 samples
Processed 90000 samples
Processed 100000 samples
Processed 110000 samples
Processed 120000 samples
Processed 130000 samples
Processed 140000 samples
Processed 150000 samples
Processed 160000 samples
Processed 170000 samples
Processed 180000 samples
Processed 190000 samples
Processed 200000 samples
Processed 210000 samples
Processed 220000 samples
Processed 230000 samples
Processed 240000 samples
Processed 250000 samples
Processed 260000 samples
Processed 270000 samples
Processed 280000 samples
Processed 290000 samples
Processed 300000 samples
Processed 310000 samples
Processed 320000 samples
Processed 330000 samples
Processed 340000 samples
Processed 350000 samples
Processed 360000 samples
Processed 370000 samples
Processed 380000 samples
Processed 390000 samp

AP vs IP

In [2]:
import time
from cassandra.cluster import Cluster
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# --- Connect to Cassandra ---
cluster = Cluster(['127.0.0.1'])
session = cluster.connect()
session.set_keyspace('bioinfo')

# --- Step 1: Extract data with time measurement ---
start_extract = time.time()

# Query to get region ids for window_size=256
region_query = "SELECT id, region_type FROM region WHERE window_size=256 ALLOW FILTERING"
regions = session.execute(region_query)

region_dict = {row.id: row.region_type for row in regions}
print(f"Regions with window_size=256: {len(region_dict)}")

features_list = []
labels_list = []

# Query epigenomic_features (no filtering in query; filter in code)
query = "SELECT features, region_id, tpm FROM epigenomic_features"
future = session.execute_async(query)
result = future.result()

count = 0
for row in result:
    if row.region_id not in region_dict:
        continue

    region_type = region_dict[row.region_id]
    if region_type !='promoter':
        continue

    # Label inactive (TPM == 0) as 0, active (TPM > 0) as 1
    label = 0 if row.tpm == 0.0 else 1

    # Append features and label together here
    feat_dict = dict(row.features)
    features_list.append(feat_dict)
    labels_list.append(label)

    count += 1
    if count % 10000 == 0:
        print(f"Processed {count} samples")

print(f"Total samples collected: {len(labels_list)}")

# Get union of all feature keys to create fixed feature vector columns
all_features = set()
for d in features_list:
    all_features.update(d.keys())
all_features = sorted(all_features)

print(f"Total unique features: {len(all_features)}")

# Build numpy array X of shape (samples, features)
X = np.zeros((len(labels_list), len(all_features)), dtype=np.float32)
for i, feat_dict in enumerate(features_list):
    for j, key in enumerate(all_features):
        X[i, j] = feat_dict.get(key, 0.0)

y = np.array(labels_list, dtype=np.int64)

end_extract = time.time()
print(f"Data extraction time: {end_extract - start_extract:.2f} seconds")

# --- Step 2: Prepare data for training ---
start_train = time.time()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to torch tensors
X_train_t = torch.tensor(X_train)
y_train_t = torch.tensor(y_train)
X_test_t = torch.tensor(X_test)
y_test_t = torch.tensor(y_test)

# Dataset and DataLoader
train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)

# --- Define FFNN model with updated config ---
class FFNN(nn.Module):
    def __init__(self, input_dim):
        super(FFNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 4),
            nn.ReLU(),
            nn.Linear(4, 2),
            nn.ReLU(),
            nn.Linear(2, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = FFNN(input_dim=X.shape[1])

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.5, weight_decay=0.0)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.1)  # decay learning rate

# Use batch size 32
train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)

# Training loop
epochs = 64
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for xb, yb in train_dl:
        optimizer.zero_grad()
        preds = model(xb).squeeze()
        loss = criterion(preds, yb.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * xb.size(0)

    scheduler.step()  # Apply learning rate decay
    epoch_loss = running_loss / len(train_dl.dataset)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, LR: {scheduler.get_last_lr()[0]:.5f}")


# Evaluate on test set
model.eval()
with torch.no_grad():
    preds = model(X_test_t).squeeze()
    preds_label = (preds > 0.5).long()
    accuracy = (preds_label == y_test_t).float().mean().item()
    print(f"Test Accuracy: {accuracy:.4f}")

end_train = time.time()
print(f"Training time: {end_train - start_train:.2f} seconds")


Regions with window_size=256: 1142428
Processed 10000 samples
Processed 20000 samples
Processed 30000 samples
Processed 40000 samples
Processed 50000 samples
Processed 60000 samples
Processed 70000 samples
Processed 80000 samples
Processed 90000 samples
Processed 100000 samples
Processed 110000 samples
Processed 120000 samples
Processed 130000 samples
Processed 140000 samples
Processed 150000 samples
Processed 160000 samples
Processed 170000 samples
Processed 180000 samples
Processed 190000 samples
Processed 200000 samples
Processed 210000 samples
Processed 220000 samples
Processed 230000 samples
Processed 240000 samples
Processed 250000 samples
Processed 260000 samples
Processed 270000 samples
Processed 280000 samples
Processed 290000 samples
Processed 300000 samples
Processed 310000 samples
Processed 320000 samples
Processed 330000 samples
Processed 340000 samples
Processed 350000 samples
Processed 360000 samples
Processed 370000 samples
Processed 380000 samples
Processed 390000 samp

AE vs IE

In [3]:
import time
from cassandra.cluster import Cluster
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# --- Connect to Cassandra ---
cluster = Cluster(['127.0.0.1'])
session = cluster.connect()
session.set_keyspace('bioinfo')

# --- Step 1: Extract data with time measurement ---
start_extract = time.time()

# Query to get region ids for window_size=256
region_query = "SELECT id, region_type FROM region WHERE window_size=256 ALLOW FILTERING"
regions = session.execute(region_query)

region_dict = {row.id: row.region_type for row in regions}
print(f"Regions with window_size=256: {len(region_dict)}")

features_list = []
labels_list = []

# Query epigenomic_features (no filtering in query; filter in code)
query = "SELECT features, region_id, tpm FROM epigenomic_features"
future = session.execute_async(query)
result = future.result()

count = 0
for row in result:
    if row.region_id not in region_dict:
        continue

    region_type = region_dict[row.region_id]
    if region_type != 'enhancer':
        continue

    # Label inactive (TPM == 0) as 0, active (TPM > 0) as 1
    label = 0 if row.tpm == 0.0 else 1

    feat_dict = dict(row.features)
    features_list.append(feat_dict)
    labels_list.append(label)

    count += 1
    if count % 10000 == 0:
        print(f"Processed {count} samples")

print(f"Total samples collected: {len(labels_list)}")

# Get union of all feature keys to create fixed feature vector columns
all_features = set()
for d in features_list:
    all_features.update(d.keys())
all_features = sorted(all_features)

print(f"Total unique features: {len(all_features)}")

# Build numpy array X of shape (samples, features)
X = np.zeros((len(labels_list), len(all_features)), dtype=np.float32)
for i, feat_dict in enumerate(features_list):
    for j, key in enumerate(all_features):
        X[i, j] = feat_dict.get(key, 0.0)

y = np.array(labels_list, dtype=np.int64)

end_extract = time.time()
print(f"Data extraction time: {end_extract - start_extract:.2f} seconds")

# --- Step 2: Prepare data for training ---
start_train = time.time()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to torch tensors
X_train_t = torch.tensor(X_train)
y_train_t = torch.tensor(y_train)
X_test_t = torch.tensor(X_test)
y_test_t = torch.tensor(y_test)

# Dataset and DataLoader
train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)

# --- Define FFNN model with updated config ---
class FFNN(nn.Module):
    def __init__(self, input_dim):
        super(FFNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 4),
            nn.ReLU(),
            nn.Linear(4, 2),
            nn.ReLU(),
            nn.Linear(2, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = FFNN(input_dim=X.shape[1])

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.5, weight_decay=0.0)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.1)  # decay learning rate

# Use batch size 32
train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)

# Training loop
epochs = 64
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for xb, yb in train_dl:
        optimizer.zero_grad()
        preds = model(xb).squeeze()
        loss = criterion(preds, yb.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * xb.size(0)

    scheduler.step()  # Apply learning rate decay
    epoch_loss = running_loss / len(train_dl.dataset)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, LR: {scheduler.get_last_lr()[0]:.5f}")


# Evaluate on test set
model.eval()
with torch.no_grad():
    preds = model(X_test_t).squeeze()
    preds_label = (preds > 0.5).long()
    accuracy = (preds_label == y_test_t).float().mean().item()
    print(f"Test Accuracy: {accuracy:.4f}")

end_train = time.time()
print(f"Training time: {end_train - start_train:.2f} seconds")


Regions with window_size=256: 1142428
Processed 10000 samples
Processed 20000 samples
Processed 30000 samples
Processed 40000 samples
Processed 50000 samples
Processed 60000 samples
Processed 70000 samples
Processed 80000 samples
Processed 90000 samples
Processed 100000 samples
Processed 110000 samples
Processed 120000 samples
Processed 130000 samples
Processed 140000 samples
Processed 150000 samples
Processed 160000 samples
Processed 170000 samples
Processed 180000 samples
Processed 190000 samples
Processed 200000 samples
Processed 210000 samples
Processed 220000 samples
Processed 230000 samples
Processed 240000 samples
Processed 250000 samples
Processed 260000 samples
Processed 270000 samples
Processed 280000 samples
Processed 290000 samples
Processed 300000 samples
Processed 310000 samples
Processed 320000 samples
Processed 330000 samples
Processed 340000 samples
Processed 350000 samples
Processed 360000 samples
Processed 370000 samples
Processed 380000 samples
Processed 390000 samp

AE vs AP

In [4]:
import time
from cassandra.cluster import Cluster
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# --- Connect to Cassandra ---
cluster = Cluster(['127.0.0.1'])
session = cluster.connect()
session.set_keyspace('bioinfo')

# --- Step 1: Extract data with time measurement ---
start_extract = time.time()

# Query to get region ids for window_size=256
region_query = "SELECT id, region_type FROM region WHERE window_size=256 ALLOW FILTERING"
regions = session.execute(region_query)

region_dict = {row.id: row.region_type for row in regions}
print(f"Regions with window_size=256: {len(region_dict)}")

features_list = []
labels_list = []

# Query epigenomic_features (no filtering in query; filter in code)
query = "SELECT features, region_id, tpm FROM epigenomic_features"
future = session.execute_async(query)
result = future.result()

count = 0
for row in result:
    if row.region_id not in region_dict:
        continue

    region_type = region_dict[row.region_id]
    if region_type not in ('enhancer', 'promoter'):
        continue

    # Only consider active regions (TPM > 0)
    if row.tpm > 0.0:
        label = 0 if region_type == 'enhancer' else 1  # enhancer=0, promoter=1

        feat_dict = dict(row.features)
        features_list.append(feat_dict)
        labels_list.append(label)

        count += 1
        if count % 10000 == 0:
            print(f"Processed {count} samples")
    else:
        continue  # Skip inactive

print(f"Total samples collected: {len(labels_list)}")

# Get union of all feature keys to create fixed feature vector columns
all_features = set()
for d in features_list:
    all_features.update(d.keys())
all_features = sorted(all_features)

print(f"Total unique features: {len(all_features)}")

# Build numpy array X of shape (samples, features)
X = np.zeros((len(labels_list), len(all_features)), dtype=np.float32)
for i, feat_dict in enumerate(features_list):
    for j, key in enumerate(all_features):
        X[i, j] = feat_dict.get(key, 0.0)

y = np.array(labels_list, dtype=np.int64)

end_extract = time.time()
print(f"Data extraction time: {end_extract - start_extract:.2f} seconds")

# --- Step 2: Prepare data for training ---
start_train = time.time()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to torch tensors
X_train_t = torch.tensor(X_train)
y_train_t = torch.tensor(y_train)
X_test_t = torch.tensor(X_test)
y_test_t = torch.tensor(y_test)

# Dataset and DataLoader
train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)

# --- Define FFNN model with updated config ---
class FFNN(nn.Module):
    def __init__(self, input_dim):
        super(FFNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 4),
            nn.ReLU(),
            nn.Linear(4, 2),
            nn.ReLU(),
            nn.Linear(2, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = FFNN(input_dim=X.shape[1])

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.5, weight_decay=0.0)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.1)  # decay learning rate

# Use batch size 32
train_ds = TensorDataset(X_train_t, y_train_t)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)

# Training loop
epochs = 64
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for xb, yb in train_dl:
        optimizer.zero_grad()
        preds = model(xb).squeeze()
        loss = criterion(preds, yb.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * xb.size(0)

    scheduler.step()  # Apply learning rate decay
    epoch_loss = running_loss / len(train_dl.dataset)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, LR: {scheduler.get_last_lr()[0]:.5f}")


# Evaluate on test set
model.eval()
with torch.no_grad():
    preds = model(X_test_t).squeeze()
    preds_label = (preds > 0.5).long()
    accuracy = (preds_label == y_test_t).float().mean().item()
    print(f"Test Accuracy: {accuracy:.4f}")

end_train = time.time()
print(f"Training time: {end_train - start_train:.2f} seconds")


Regions with window_size=256: 1142428
Processed 10000 samples
Processed 20000 samples
Processed 30000 samples
Processed 40000 samples
Processed 50000 samples
Processed 60000 samples
Processed 70000 samples
Processed 80000 samples
Total samples collected: 89815
Total unique features: 58
Data extraction time: 474.99 seconds
Epoch 1/64, Loss: 0.3499, LR: 0.05000
Epoch 2/64, Loss: 0.2919, LR: 0.00500
Epoch 3/64, Loss: 0.2850, LR: 0.00050
Epoch 4/64, Loss: 0.2839, LR: 0.00005
Epoch 5/64, Loss: 0.2838, LR: 0.00001
Epoch 6/64, Loss: 0.2838, LR: 0.00000
Epoch 7/64, Loss: 0.2838, LR: 0.00000
Epoch 8/64, Loss: 0.2838, LR: 0.00000
Epoch 9/64, Loss: 0.2838, LR: 0.00000
Epoch 10/64, Loss: 0.2838, LR: 0.00000
Epoch 11/64, Loss: 0.2838, LR: 0.00000
Epoch 12/64, Loss: 0.2838, LR: 0.00000
Epoch 13/64, Loss: 0.2838, LR: 0.00000
Epoch 14/64, Loss: 0.2838, LR: 0.00000
Epoch 15/64, Loss: 0.2838, LR: 0.00000
Epoch 16/64, Loss: 0.2838, LR: 0.00000
Epoch 17/64, Loss: 0.2838, LR: 0.00000
Epoch 18/64, Loss: 0.2