In [13]:
# 📦 Required Libraries
import pandas as pd
import os
import time
from glob import glob
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
import gc
import joblib

# 📁 Path to All CSVs
BASE_PATH = r"I:\Final2\Max Data"
all_csv_files = glob(os.path.join(BASE_PATH, "*.csv"))

# ✅ Settings
CELL_LINES = ["A549", "GM12878", "H1", "HEK293", "HepG2", "K562", "MCF-7"]
REGION_TASKS = {
    "IE vs IP": (["enhancer", "promoter"], [0.0]),
    "AP vs IP": (["promoter"], [1.0, 0.0]),
    "AE vs IE": (["enhancer"], [1.0, 0.0]),
    "AE vs AP": (["enhancer", "promoter"], [1.0])
}
SELECTED_TASK = "AP vs IP"
REGION_TYPES, TPM_VALUES = REGION_TASKS[SELECTED_TASK]

# --- Feature Columns ---
FIXED_COLUMNS = ["chrom", "start", "end", "strand", "TPM"]

# --- Data Loader Function ---
def load_all_data_from_csv(cell_lines, region_types, tpm_values):
    print("🔍 Scanning CSVs for all features...")
    all_feature_names = set()

    # 1. Collect all feature names
    for file_path in all_csv_files:
        filename = os.path.basename(file_path)
        parts = filename.split("_")
        if len(parts) < 3:
            continue
        cell_line, _, region_type = parts[0], parts[1], parts[2].replace("_processed.csv", "")
        if cell_line not in cell_lines or region_type not in region_types:
            continue
        df = pd.read_csv(file_path, nrows=1)  # only read header
        feature_cols = [col for col in df.columns if col not in FIXED_COLUMNS]
        all_feature_names.update(feature_cols)

    all_feature_names = sorted(all_feature_names)
    print(f"📈 Total unified features: {len(all_feature_names)}")

    # 2. Load all data and align to same features
    X, y = [], []
    for file_path in all_csv_files:
        filename = os.path.basename(file_path)
        parts = filename.split("_")
        if len(parts) < 3:
            continue
        cell_line, _, region_type = parts[0], parts[1], parts[2].replace("_processed.csv", "")

        if cell_line not in cell_lines or region_type not in region_types:
            continue

        df = pd.read_csv(file_path)
        df = df[df["TPM"].isin(tpm_values)]
        if df.empty:
            continue

        feature_df = df.drop(columns=FIXED_COLUMNS, errors="ignore")
        aligned_features = feature_df.reindex(columns=all_feature_names, fill_value=0).values
        X.append(aligned_features)
        y.append((df["TPM"] > 0).astype(int).values)

    if not X:
        raise ValueError("No matching data found for selected filters.")

    X_all = np.vstack(X)
    y_all = np.concatenate(y)
    return X_all, y_all, all_feature_names

# --- Main Training ---
print("🚀 Starting CSV-based FFNN Training")

start_data_time = time.time()
X, y, used_features = load_all_data_from_csv(CELL_LINES, REGION_TYPES, TPM_VALUES)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)
data_time = time.time() - start_data_time

print(f"📊 Data loaded: {X.shape[0]} samples, {X.shape[1]} features")
print(f"🕒 Data load time: {data_time:.2f} sec")

start_train_time = time.time()
model = build_model(X.shape[1])
model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=10, verbose=1, validation_data=(X_val, y_val))
train_time = time.time() - start_train_time

# --- Evaluation ---
val_pred = (model.predict(X_val) > 0.5).astype(int)
val_acc = accuracy_score(y_val, val_pred)

test_pred = (model.predict(X_test) > 0.5).astype(int)
test_acc = accuracy_score(y_test, test_pred)

print(f"✅ Validation Accuracy: {val_acc:.4f}")
print(f"🔍 Test Accuracy: {test_acc:.4f}")
print(f"🧠 Training time: {train_time:.2f} sec")

# --- Save Model
model_path = "trained_ffnn_model_csv.pkl"
joblib.dump(model, model_path)
print(f"💾 Model saved to: {model_path}")

# --- Cleanup
K.clear_session()
gc.collect()


🚀 Starting CSV-based FFNN Training
🔍 Scanning CSVs for all features...
📈 Total unified features: 947


MemoryError: Unable to allocate 12.5 GiB for an array with shape (1772960, 947) and data type float64

In [15]:
# 📦 Required Libraries
import pandas as pd
import os
import time
from glob import glob
import numpy as np
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
import gc
import joblib

# 📁 Path to All CSVs
BASE_PATH = r"I:\Final2\Max Data"
all_csv_files = glob(os.path.join(BASE_PATH, "*.csv"))

# ✅ Settings
CELL_LINES = ["A549", "GM12878", "H1", "HEK293", "HepG2", "K562", "MCF-7"]
REGION_TASKS = {
    "IE vs IP": (["enhancer", "promoter"], [0.0]),
    "AP vs IP": (["promoter"], [1.0, 0.0]),
    "AE vs IE": (["enhancer"], [1.0, 0.0]),
    "AE vs AP": (["enhancer", "promoter"], [1.0])
}
SELECTED_TASK = "AP vs IP"
REGION_TYPES, TPM_VALUES = REGION_TASKS[SELECTED_TASK]
FIXED_COLUMNS = ["chrom", "start", "end", "strand", "TPM"]
NUM_BATCHES = 10

# 🔢 Count total rows and store file info
def count_total_rows(cell_lines, region_types, tpm_values):
    total = 0
    file_info = []

    for file_path in all_csv_files:
        filename = os.path.basename(file_path)
        parts = filename.split("_")
        if len(parts) < 3:
            continue
        cell_line, _, region_type = parts[0], parts[1], parts[2].replace("_processed.csv", "")
        if cell_line in cell_lines and region_type in region_types:
            try:
                df = pd.read_csv(file_path, usecols=["TPM"])
                count = df[df["TPM"].isin(tpm_values)].shape[0]
                total += count
                file_info.append((file_path, count))
            except Exception as e:
                print(f"⚠️ Error reading {file_path}: {e}")
    return total, file_info

# 🧬 Get unified feature list
def get_unified_feature_names(cell_lines, region_types):
    feature_names = set()
    for file_path in all_csv_files:
        filename = os.path.basename(file_path)
        parts = filename.split("_")
        if len(parts) < 3:
            continue
        cell_line, _, region_type = parts[0], parts[1], parts[2].replace("_processed.csv", "")
        if cell_line in cell_lines and region_type in region_types:
            try:
                df = pd.read_csv(file_path, nrows=1)
                feature_names.update([col for col in df.columns if col not in FIXED_COLUMNS])
            except Exception as e:
                print(f"⚠️ Skipping {file_path} due to error: {e}")
    return sorted(feature_names)

# 📥 Load next batch of rows
def load_next_batch(file_info, feature_list, tpm_values, batch_size, offset):
    X_batch, y_batch = [], []
    rows_loaded = 0

    for file_path, row_count in file_info:
        if rows_loaded >= batch_size:
            break

        df = pd.read_csv(file_path)
        df = df[df["TPM"].isin(tpm_values)]

        if offset > 0:
            if offset >= len(df):
                offset -= len(df)
                continue
            df = df.iloc[offset:]
            offset = 0

        to_take = min(batch_size - rows_loaded, len(df))
        df = df.iloc[:to_take]
        aligned = df.drop(columns=FIXED_COLUMNS, errors="ignore").reindex(columns=feature_list, fill_value=0).values
        labels = (df["TPM"] > 0).astype(int).values

        X_batch.append(aligned)
        y_batch.append(labels)
        rows_loaded += len(df)

    if not X_batch:
        return None, None

    return np.vstack(X_batch), np.concatenate(y_batch)

# 🧠 Build a simple FFNN model
def build_model(input_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model

# 🏁 MAIN PROCESS
print("🚀 Starting Batch-based FFNN Training")
start_time = time.time()

# Step 1: Count rows and get info
total_rows, file_info = count_total_rows(CELL_LINES, REGION_TYPES, TPM_VALUES)
rows_per_batch = total_rows // NUM_BATCHES
print(f"📊 Total matched rows: {total_rows:,}")
print(f"📦 Batching into {NUM_BATCHES} batches of ~{rows_per_batch:,} rows")

# Step 2: Get feature list
ALL_FEATURES = get_unified_feature_names(CELL_LINES, REGION_TYPES)
print(f"🧬 Total unified features: {len(ALL_FEATURES)}")

# Step 3: Initialize and compile model
model = build_model(len(ALL_FEATURES))
model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Step 4: Train in batches
offset = 0
for batch in range(NUM_BATCHES):
    print(f"\n🔁 Loading batch {batch+1}/{NUM_BATCHES} ...")
    X_batch, y_batch = load_next_batch(file_info, ALL_FEATURES, TPM_VALUES, rows_per_batch, offset)
    offset += rows_per_batch

    if X_batch is None:
        print("⚠️ No data for batch, skipping...")
        continue

    print(f"🧠 Training on {X_batch.shape[0]} samples")
    model.fit(X_batch, y_batch, epochs=10, verbose=1)

# 🧪 Optional: evaluate on a final held-out portion
# (This can be improved by separating out a small validation set in `load_next_batch`)
# For now, you could save the model:

model_path = "trained_ffnn_model_csv_batches.h5"
model.save(model_path)
print(f"\n💾 Model saved to: {model_path}")

# 🧹 Cleanup
K.clear_session()
gc.collect()

print(f"✅ Total training time: {time.time() - start_time:.2f} sec")


🚀 Starting Batch-based FFNN Training
📊 Total matched rows: 1,772,960
📦 Batching into 10 batches of ~177,296 rows
🧬 Total unified features: 947


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



🔁 Loading batch 1/10 ...
🧠 Training on 177296 samples
Epoch 1/10
[1m5541/5541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - accuracy: 0.8648 - loss: 0.4140
Epoch 2/10
[1m5541/5541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - accuracy: 0.8644 - loss: 0.3971
Epoch 3/10
[1m5541/5541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - accuracy: 0.8652 - loss: 0.3957
Epoch 4/10
[1m5541/5541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - accuracy: 0.8644 - loss: 0.3971
Epoch 5/10
[1m5541/5541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - accuracy: 0.8650 - loss: 0.3960
Epoch 6/10
[1m5541/5541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - accuracy: 0.8634 - loss: 0.3989
Epoch 7/10
[1m5541/5541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - accuracy: 0.8640 - loss: 0.3979
Epoch 8/10
[1m5541/5541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - a




💾 Model saved to: trained_ffnn_model_csv_batches.h5
✅ Total training time: 1358.34 sec


In [None]:
# 📦 Required Libraries
import pandas as pd
import os
import time
from glob import glob
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
import gc
import joblib

# 📁 Path to All CSVs
BASE_PATH = r"I:\Final2\Max Data"
all_csv_files = glob(os.path.join(BASE_PATH, "*.csv"))

# ✅ Settings
CELL_LINES = ["A549", "GM12878", "H1", "HEK293", "HepG2", "K562", "MCF-7"]
REGION_TASKS = {
    "IE vs IP": (["enhancer", "promoter"], [0.0]),
    "AP vs IP": (["promoter"], [1.0, 0.0]),
    "AE vs IE": (["enhancer"], [1.0, 0.0]),
    "AE vs AP": (["enhancer", "promoter"], [1.0])
}
SELECTED_TASK = "AP vs IP"
REGION_TYPES, TPM_VALUES = REGION_TASKS[SELECTED_TASK]
FIXED_COLUMNS = ["chrom", "start", "end", "strand", "TPM"]

# 🔢 Count total rows and store file info
def count_total_rows(cell_lines, region_types, tpm_values):
    total = 0
    file_info = []
    for file_path in all_csv_files:
        filename = os.path.basename(file_path)
        parts = filename.split("_")
        if len(parts) < 3:
            continue
        cell_line, _, region_type = parts[0], parts[1], parts[2].replace("_processed.csv", "")
        if cell_line in cell_lines and region_type in region_types:
            try:
                df = pd.read_csv(file_path, usecols=["TPM"])
                count = df[df["TPM"].isin(tpm_values)].shape[0]
                total += count
                file_info.append((file_path, count))
            except Exception as e:
                print(f"⚠️ Error reading {file_path}: {e}")
    return total, file_info

# 🧬 Get unified feature list
def get_unified_feature_names(cell_lines, region_types):
    feature_names = set()
    for file_path in all_csv_files:
        filename = os.path.basename(file_path)
        parts = filename.split("_")
        if len(parts) < 3:
            continue
        cell_line, _, region_type = parts[0], parts[1], parts[2].replace("_processed.csv", "")
        if cell_line in cell_lines and region_type in region_types:
            try:
                df = pd.read_csv(file_path, nrows=1)
                feature_names.update([col for col in df.columns if col not in FIXED_COLUMNS])
            except Exception as e:
                print(f"⚠️ Skipping {file_path} due to error: {e}")
    return sorted(feature_names)

# 📥 Load all filtered data
def load_all_data(file_info, feature_list, tpm_values):
    X_all, y_all = [], []
    for file_path, _ in file_info:
        try:
            df = pd.read_csv(file_path)
            df = df[df["TPM"].isin(tpm_values)]
            X = df.drop(columns=FIXED_COLUMNS, errors="ignore").reindex(columns=feature_list, fill_value=0).values
            y = (df["TPM"] > 0).astype(int).values
            X_all.append(X)
            y_all.append(y)
        except Exception as e:
            print(f"⚠️ Error loading {file_path}: {e}")
    return np.vstack(X_all), np.concatenate(y_all)

# 🧠 Build a simple FFNN model
def build_model(input_dim):
    model = Sequential([
        Dense(128, input_dim=input_dim, activation='relu'),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    return model

# 🏁 MAIN PROCESS
print("🚀 Starting FFNN Training (CSV Version)")

start_data_time = time.time()

# Step 1: Count rows and get info
total_rows, file_info = count_total_rows(CELL_LINES, REGION_TYPES, TPM_VALUES)
print(f"📊 Total matched rows: {total_rows:,}")

# Step 2: Get feature list
ALL_FEATURES = get_unified_feature_names(CELL_LINES, REGION_TYPES)
print(f"🧬 Total unified features: {len(ALL_FEATURES)}")

# Step 3: Load all filtered data
X, y = load_all_data(file_info, ALL_FEATURES, TPM_VALUES)
data_load_time = time.time() - start_data_time
print(f"📥 Data loaded in {data_load_time:.2f} seconds")

# Step 4: Split data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Step 5: Train the model
start_train_time = time.time()
model = build_model(X.shape[1])
model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=512, verbose=1)
train_time = time.time() - start_train_time
print(f"🧠 Training completed in {train_time:.2f} seconds")

# Step 6: Evaluate
val_pred = (model.predict(X_val) > 0.5).astype(int)
val_acc = accuracy_score(y_val, val_pred)
print(f"✅ Validation Accuracy: {val_acc:.4f}")

test_pred = (model.predict(X_test) > 0.5).astype(int)
test_acc = accuracy_score(y_test, test_pred)
print(f"🔍 Test Accuracy: {test_acc:.4f}")

# Save model
model_path = "trained_ffnn_model_csv_full.h5"
model.save(model_path)
print(f"💾 Model saved to: {model_path}")

# Cleanup
K.clear_session()
gc.collect()

print(f"🏁 All done! Total time: {time.time() - start_data_time:.2f} sec")


🚀 Starting FFNN Training (CSV Version)
📊 Total matched rows: 1,772,960
🧬 Total unified features: 947
📥 Data loaded in 222.70 seconds


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
