In [1]:
import torch
from torch.utils.data import DataLoader
import numpy as np
from utils import EnhancerDataset, split_dataset, train_model, regression_model_plot,EnhancerDatasetWithID
import pandas as pd
from tqdm import tqdm
import glob
import torch.nn as nn
import torch.nn.modules.activation as activation
import sys
from sklearn.model_selection import train_test_split
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score
import interpretation
import seaborn as sns
import matplotlib.pyplot as plt

sys.path.append('../model')  
from model import ConvNetDeep, DanQ, ExplaiNN,ConvNetDeep2, ExplaiNN2, ExplaiNN3,DeepSTARR


In [4]:
# Run configuration for the two-target (GFP+ / GFP-) ExplaiNN model.
seed = 42
batch = 168
num_cnns = 70
learning_rate = 1e-2
target_labels = ["GFP+", "GFP-"]
# Directory handed to train_model as dir_path
output_dir = '/pmglocal/ty2514/Enhancer/Enhancer/data/ExplaiNN_GFP_70NN'

df = pd.read_csv('/pmglocal/ty2514/Enhancer/Enhancer/data/filtered_input_data.csv')

# Random split with split_pattern 0.7 / 0.15 / 0.15, driven by the run seed.
# Alternative that was tried: split by fragment instead of at random.
#train, test = split_dataset(df, split_type='fragment', key= 0, seed = 42)
train_df, val_df, test_df = split_dataset(df, split_type='random', split_pattern=[0.7, 0.15, 0.15], seed=seed)

# Process datasets: both 'G+' and 'G-' are used as features/targets, without scaling
train = EnhancerDatasetWithID(train_df, feature_list=['G+', 'G-'], scale_mode='none')
test = EnhancerDatasetWithID(test_df, feature_list=['G+', 'G-'], scale_mode='none')
validation = EnhancerDatasetWithID(val_df, feature_list=['G+', 'G-'], scale_mode='none')

# DataLoader setup: only the training data is shuffled; every loader uses the
# same configured batch size so changing `batch` affects all three consistently.
train_loader = DataLoader(dataset=train, batch_size=batch, shuffle=True)
test_loader = DataLoader(dataset=test, batch_size=batch, shuffle=False)
val_loader = DataLoader(dataset=validation, batch_size=batch, shuffle=False)

# Model: ExplaiNN3 with `num_cnns` units and two outputs, no pre-trained weights
input_model = ExplaiNN3(num_cnns=num_cnns, input_length=608, num_classes=2,
                        filter_size=19, num_fc=2, pool_size=7, pool_stride=7,
                        drop_out=0.3, weight_path=None)

# Training: train_model returns an 11-tuple; the first two entries are discarded here.
# save_model=False, so this run is not asked to write checkpoints to output_dir.
_, _, model, train_losses_by_batch, test_losses_by_batch, results, best_pearson_epoch, best_r2_epoch, peasron_metric, r2_metric, device = train_model(
    input_model, train_loader, val_loader, test_loader, target_labels=target_labels,
    num_epochs=200,
    batch_size=batch, learning_rate=learning_rate,
    criteria='mse', optimizer_type="adam", patience=15,
    seed=seed, save_model=False, dir_path=output_dir)

Using device: cuda
Model is on device: cuda:0
Epoch 1/200, Step 1/60, Loss: 73.9672
Epoch 1/200 -- Train Loss: 11.5048 , Validation Loss: 0.3904
------------------------Evaluation------------------------
Label 1: MSE=0.4032, RMSE=0.6350, MAE=0.5204, R^2=-5.6400, Pearson=0.2230, Spearman=0.2222
Label 2: MSE=0.3763, RMSE=0.6134, MAE=0.5003, R^2=-9.9570, Pearson=0.0926, Spearman=0.0899
Overall (Flattened): MSE=0.3898, RMSE=0.6243, MAE=0.5103, R^2=-6.7794, Pearson=0.1852, Spearman=0.1729
----------------------------------------------------------
Epoch 2/200, Step 1/60, Loss: 0.2187
Epoch 2/200 -- Train Loss: 0.1489 , Validation Loss: 0.1087
------------------------Evaluation------------------------
Label 1: MSE=0.1165, RMSE=0.3413, MAE=0.2713, R^2=-0.9189, Pearson=0.2733, Spearman=0.2544
Label 2: MSE=0.1011, RMSE=0.3180, MAE=0.2553, R^2=-1.9437, Pearson=0.2098, Spearman=0.1871
Overall (Flattened): MSE=0.1088, RMSE=0.3299, MAE=0.2633, R^2=-1.1718, Pearson=0.2726, Spearman=0.2441
-----------

In [6]:
# Load the dataset
import os
df = pd.read_csv('/pmglocal/ty2514/Enhancer/Enhancer/data/filtered_input_data.csv')
target_labels = ["GFP+","GFP-"]

# Per-run bookkeeping: every list below receives exactly one entry per
# (seed, batch, dropout, learning rate) run, so they stay aligned row by row.
seed_list = []
batch_list = []
lr_list = []
dropout_list = []

# Metrics taken from the `pearson_metrics` dict returned by train_model
mse_list_p = []
rmse_list_p = []
mae_list_p = []
r2_list_p = []
pearson_corr_list_p = []
spearman_corr_list_p = []

# Metrics taken from the `r2_metrics` dict returned by train_model
mse_list_r = []
rmse_list_r = []
mae_list_r = []
r2_list_r = []
pearson_corr_list_r = []
spearman_corr_list_r = []

best_pearson_epochs = []
best_r2_epochs = []

# Search grid (single-point at the moment)
batches = [96]
seeds = [9]
learning_rates = [5e-3]

output_dir = '/pmglocal/ty2514/Enhancer/Enhancer/data/ConvNetDeep_G+G-'
os.makedirs(output_dir, exist_ok=True)
# Destination CSV for the collected metrics (written once, after the sweep)
filename = os.path.join(output_dir, 'ConvNetDeep_G+G-_Metrics.csv')

for seed in seeds: 
    for batch in batches:
        # Split the dataset; the split depends on the seed only
        train_df, val_df, test_df = split_dataset(df, split_type='random', split_pattern=[0.7, 0.15, 0.15], seed=seed)

        # Process datasets
        train = EnhancerDatasetWithID(train_df, feature_list=['G+','G-'], scale_mode='none')
        val = EnhancerDatasetWithID(val_df, feature_list=['G+','G-'], scale_mode='none')
        test = EnhancerDatasetWithID(test_df, feature_list=['G+','G-'], scale_mode='none')

        # DataLoader setup
        train_loader = DataLoader(dataset=train, batch_size=batch, shuffle=True)
        val_loader = DataLoader(dataset=val, batch_size=batch, shuffle=False)
        test_loader = DataLoader(dataset=test, batch_size=batch, shuffle=False)

        # Hyperparameter search
        for dropout in [0.3]:
            for learning_rate in learning_rates:
                # Model setup: build a fresh network for every run so that a later
                # learning rate never starts from weights trained by an earlier one.
                input_model = ConvNetDeep(num_classes=2, drop_out=dropout)

                formatted_lr = "{:.5f}".format(learning_rate)
                print(f"dropout{dropout}_ba{batch}_lr{formatted_lr}_seed{seed}")

                _, _, model, train_losses_by_batch, test_losses_by_batch, results, best_pearson_epoch, best_r2_epoch,  pearson_metrics, r2_metrics, device  = train_model(
                    input_model, train_loader, val_loader, test_loader,target_labels=target_labels, num_epochs=200, batch_size=batch, learning_rate=learning_rate, 
                    criteria='mse',optimizer_type = "adam", patience=15, seed = seed, save_model= False, dir_path=output_dir)
                
                # Keep the last entry of each metric list, once for the best-Pearson
                # model and once for the best-R2 model.
                # NOTE(review): the last entry is presumably the overall (flattened)
                # score rather than a per-label one — confirm against train_model.
                mse_list_p.append(pearson_metrics['mse'][-1])
                rmse_list_p.append(pearson_metrics['rmse'][-1])
                mae_list_p.append(pearson_metrics['mae'][-1])
                r2_list_p.append(pearson_metrics['r2'][-1])
                pearson_corr_list_p.append(pearson_metrics['pearson_corr'][-1])
                spearman_corr_list_p.append(pearson_metrics['spearman_corr'][-1])
                
                mse_list_r.append(r2_metrics['mse'][-1])
                rmse_list_r.append(r2_metrics['rmse'][-1])
                mae_list_r.append(r2_metrics['mae'][-1])
                r2_list_r.append(r2_metrics['r2'][-1])
                pearson_corr_list_r.append(r2_metrics['pearson_corr'][-1])
                spearman_corr_list_r.append(r2_metrics['spearman_corr'][-1])

                # Hyperparameters identifying this run
                seed_list.append(seed)
                batch_list.append(batch)
                lr_list.append(formatted_lr)
                dropout_list.append(dropout)
                best_pearson_epochs.append(best_pearson_epoch)
                best_r2_epochs.append(best_r2_epoch)

# One row per run: hyperparameters, then "_p" (best-Pearson) and "_r" (best-R2) metrics
results_df = pd.DataFrame({
    "batch": batch_list,
    "lr": lr_list,
    "drop_out": dropout_list,
    "seed": seed_list,
    "mse_p":mse_list_p,
    "rmse_p":rmse_list_p,
    "mae_p":mae_list_p,
    "r2_p":r2_list_p,
    "pearson_corr_p":pearson_corr_list_p,
    "spearman_corr_p":spearman_corr_list_p,
    "mse_r":mse_list_r,
    "rmse_r":rmse_list_r,
    "mae_r":mae_list_r,
    "r2_r":r2_list_r,
    "pearson_corr_r":pearson_corr_list_r,
    "spearman_corr_r":spearman_corr_list_r,
    "best_pearson_epoch": best_pearson_epochs,
    "best_r2_epoch": best_r2_epochs
})
results_df.to_csv(filename, index=False)
print(f"R_square values saved to {filename}")

dropout0.3_ba96_lr0.00500_seed9
Using device: cuda
Model is on device: cuda:0
Epoch 1/200, Step 1/105, Loss: 77.4678
Epoch 1/200 -- Train Loss: 2.8041 , Validation Loss: 0.2304
------------------------Evaluation------------------------
Label 1: MSE=0.2537, RMSE=0.5037, MAE=0.3887, R^2=-3.3235, Pearson=0.1795, Spearman=0.1797
Label 2: MSE=0.2083, RMSE=0.4564, MAE=0.3534, R^2=-4.8316, Pearson=0.1390, Spearman=0.1427
Overall (Flattened): MSE=0.2310, RMSE=0.4806, MAE=0.3710, R^2=-3.7020, Pearson=0.1828, Spearman=0.1814
----------------------------------------------------------
Epoch 2/200, Step 1/105, Loss: 0.3827
Epoch 2/200 -- Train Loss: 0.2950 , Validation Loss: 0.1602
------------------------Evaluation------------------------
Label 1: MSE=0.1789, RMSE=0.4230, MAE=0.3357, R^2=-2.0489, Pearson=0.2065, Spearman=0.2109
Label 2: MSE=0.1405, RMSE=0.3748, MAE=0.2970, R^2=-2.9319, Pearson=0.1302, Spearman=0.1289
Overall (Flattened): MSE=0.1597, RMSE=0.3996, MAE=0.3163, R^2=-2.2502, Pearson=0.

In [None]:
# Checkpoints to evaluate. glob returns matches in arbitrary order, so sort them
# to make the processing order (and therefore which checkpoint is handled last)
# reproducible between runs.
file_list = sorted(glob.glob('/pmglocal/ty2514/Enhancer/Enhancer/data/ExplaiNN_GFP_70NN/best_r2*.pth'))

# Function to evaluate the model with a specific set of weights
def evaluate_model_with_weights(model, test_loader, weight_file, device=None):
    """Load a checkpoint into ``model`` and run inference over ``test_loader``.

    Args:
        model: ``torch.nn.Module`` whose architecture matches the saved state dict.
        test_loader: iterable yielding ``(inputs, labels, fragment_ids)`` batches.
        weight_file: path to a state dict readable by ``torch.load``.
        device: device to run inference on. When omitted, the device the model's
            parameters already live on is used (CPU for a parameter-free model),
            so the function does not rely on a notebook-level ``device`` variable.
            When given, the model is moved to it first.

    Returns:
        Tuple ``(predictions, labels, fragment_ids)`` of numpy arrays, each
        concatenated over all batches along axis 0.

    Raises:
        ValueError: if ``test_loader`` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
    else:
        model.to(device)

    # map_location restores the tensors onto the device in use, so a checkpoint
    # saved on a GPU can still be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(weight_file, map_location=device))

    # Set model to evaluation mode
    model.eval()

    all_predictions = []
    all_labels = []
    all_fragment_ids = []

    # Disable gradient calculation for inference
    with torch.no_grad():
        for inputs, labels, fragment_ids in test_loader:
            outputs = model(inputs.to(device))

            # Store predictions, true labels, and fragment IDs as numpy arrays
            all_predictions.append(outputs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
            # IDs may arrive as a tensor or as a sequence, depending on the dataset
            if torch.is_tensor(fragment_ids):
                fragment_ids = fragment_ids.cpu().numpy()
            all_fragment_ids.append(np.asarray(fragment_ids))

    if not all_predictions:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    # Convert lists of per-batch arrays into single arrays
    all_predictions = np.concatenate(all_predictions, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    all_fragment_ids = np.concatenate(all_fragment_ids, axis=0)

    return all_predictions, all_labels, all_fragment_ids

# Loop over each weight file, evaluate the model on the test set and export the result.
# NOTE(review): `input_model` must have the architecture the checkpoints were saved
# from, and `test_loader` the matching targets — confirm before running this cell.
# Every checkpoint is written to the same CSV, so the file ends up holding the
# predictions of the last checkpoint processed.
output_file = '/pmglocal/ty2514/Enhancer/Enhancer/data/ExplaiNN_GFP_70NN/prediction_results.csv'

for weight_file in file_list:
    print(f"Evaluating model with weights from: {weight_file}")

    # Call the evaluation function
    predictions, labels, fragment_ids = evaluate_model_with_weights(input_model, test_loader, weight_file)

    # A model with several outputs yields several values per fragment, so the
    # flattened predictions are longer than the ID list. Repeat each ID once per
    # output (long format) to keep all columns the same length.
    n_samples = len(fragment_ids)
    n_outputs = predictions.size // n_samples if n_samples else 1

    columns = {'Fragment_ID': np.repeat(fragment_ids, n_outputs)}
    if n_outputs > 1:
        # Position of the value within the model output, in output order
        columns['Output_Index'] = np.tile(np.arange(n_outputs), n_samples)
    columns['Predictions'] = predictions.flatten()
    columns['True Labels'] = labels.flatten()
    df_predictions = pd.DataFrame(columns)

    # Save results to a CSV file
    df_predictions.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")


# Train ExplaiNN to predict GFP+

In [None]:
# Run configuration for the single-target (GFP+) ExplaiNN model.
seed = 42
batch = 168
num_cnns = 90
learning_rate = 5e-5
target_labels = ["GFP+"]
# Directory handed to train_model as dir_path (save_model=True below)
output_dir = '/pmglocal/ty2514/Enhancer/Enhancer/data/ExplaiNN_GFP+_90NN'

df = pd.read_csv('/pmglocal/ty2514/Enhancer/Enhancer/data/input_data.csv')

# Build the split and the loaders from the dataframe loaded above, so this cell
# does not silently reuse loaders left behind by an earlier cell (those were
# built with two targets and would not match a single-output model).
train_df, val_df, test_df = split_dataset(df, split_type='random', split_pattern=[0.7, 0.15, 0.15], seed=seed)

# NOTE(review): 'G+' is assumed to be the column behind the "GFP+" target, mirroring
# the two-target cells that use ['G+', 'G-'] — confirm it exists in input_data.csv.
train = EnhancerDatasetWithID(train_df, feature_list=['G+'], scale_mode='none')
validation = EnhancerDatasetWithID(val_df, feature_list=['G+'], scale_mode='none')
test = EnhancerDatasetWithID(test_df, feature_list=['G+'], scale_mode='none')

train_loader = DataLoader(dataset=train, batch_size=batch, shuffle=True)
val_loader = DataLoader(dataset=validation, batch_size=batch, shuffle=False)
test_loader = DataLoader(dataset=test, batch_size=batch, shuffle=False)

# Model: ExplaiNN3 with `num_cnns` units and a single output, no pre-trained weights
input_model = ExplaiNN3(num_cnns=num_cnns, input_length=608, num_classes=1,
                        filter_size=19, num_fc=2, pool_size=7, pool_stride=7,
                        drop_out=0.3, weight_path=None)

# Training: train_model is called the same way as in the executed two-target cells,
# i.e. with train/validation/test loaders and an 11-value return.
_, _, model, train_losses_by_batch, test_losses_by_batch, results, best_pearson_epoch, best_r2_epoch, pearson_metrics, r2_metrics, device = train_model(
    input_model, train_loader, val_loader, test_loader, target_labels=target_labels,
    num_epochs=200,
    batch_size=batch, learning_rate=learning_rate,
    criteria='mse', optimizer_type="adam", patience=15,
    seed=seed, save_model=True, dir_path=output_dir)

In [None]:
# Checkpoints to evaluate. glob returns matches in arbitrary order, so sort them
# to make the processing order (and therefore which checkpoint is handled last)
# reproducible between runs.
file_list = sorted(glob.glob('/pmglocal/ty2514/Enhancer/Enhancer/data/ExplaiNN_GFP+_90NN/best_r2*.pth'))

# Function to evaluate the model with a specific set of weights
# NOTE(review): this notebook defines the same function in more than one cell;
# consider keeping a single copy (e.g. in utils) and importing it.
def evaluate_model_with_weights(model, test_loader, weight_file, device=None):
    """Load a checkpoint into ``model`` and run inference over ``test_loader``.

    Args:
        model: ``torch.nn.Module`` whose architecture matches the saved state dict.
        test_loader: iterable yielding ``(inputs, labels, fragment_ids)`` batches.
        weight_file: path to a state dict readable by ``torch.load``.
        device: device to run inference on. When omitted, the device the model's
            parameters already live on is used (CPU for a parameter-free model),
            so the function does not rely on a notebook-level ``device`` variable.
            When given, the model is moved to it first.

    Returns:
        Tuple ``(predictions, labels, fragment_ids)`` of numpy arrays, each
        concatenated over all batches along axis 0.

    Raises:
        ValueError: if ``test_loader`` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
    else:
        model.to(device)

    # map_location restores the tensors onto the device in use, so a checkpoint
    # saved on a GPU can still be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(weight_file, map_location=device))

    # Set model to evaluation mode
    model.eval()

    all_predictions = []
    all_labels = []
    all_fragment_ids = []

    # Disable gradient calculation for inference
    with torch.no_grad():
        for inputs, labels, fragment_ids in test_loader:
            outputs = model(inputs.to(device))

            # Store predictions, true labels, and fragment IDs as numpy arrays
            all_predictions.append(outputs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
            # IDs may arrive as a tensor or as a sequence, depending on the dataset
            if torch.is_tensor(fragment_ids):
                fragment_ids = fragment_ids.cpu().numpy()
            all_fragment_ids.append(np.asarray(fragment_ids))

    if not all_predictions:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    # Convert lists of per-batch arrays into single arrays
    all_predictions = np.concatenate(all_predictions, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    all_fragment_ids = np.concatenate(all_fragment_ids, axis=0)

    return all_predictions, all_labels, all_fragment_ids

# Loop over each weight file, evaluate the model on the test set and export the result.
# NOTE(review): `input_model` must have the architecture the checkpoints were saved
# from, and `test_loader` the matching targets — confirm before running this cell.
# Every checkpoint is written to the same CSV, so the file ends up holding the
# predictions of the last checkpoint processed.
output_file = '/pmglocal/ty2514/Enhancer/Enhancer/data/ExplaiNN_GFP+_90NN/prediction_results.csv'

for weight_file in file_list:
    print(f"Evaluating model with weights from: {weight_file}")

    # Call the evaluation function
    predictions, labels, fragment_ids = evaluate_model_with_weights(input_model, test_loader, weight_file)

    # Keep every column the same length even if the model emits more than one
    # value per fragment: each ID is repeated once per output (long format).
    # With a single output this produces the plain three-column table.
    n_samples = len(fragment_ids)
    n_outputs = predictions.size // n_samples if n_samples else 1

    columns = {'Fragment_ID': np.repeat(fragment_ids, n_outputs)}
    if n_outputs > 1:
        # Position of the value within the model output, in output order
        columns['Output_Index'] = np.tile(np.arange(n_outputs), n_samples)
    columns['Predictions'] = predictions.flatten()
    columns['True Labels'] = labels.flatten()
    df_predictions = pd.DataFrame(columns)

    # Save results to a CSV file
    df_predictions.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")


# Train DeepSTARR

In [None]:
# Run configuration for the single-output DeepSTARR model.
seed = 42
batch = 168
learning_rate = 5e-5
target_labels = ["GFP"]
# Directory handed to train_model as dir_path (save_model=True below)
output_dir = '/pmglocal/ty2514/Enhancer/Enhancer/data/DeepSTARR_GFP'


input_model = DeepSTARR(num_classes=1)

# NOTE(review): train_loader / val_loader / test_loader are not built in this cell;
# they come from whichever cell ran last and must yield single-target labels to
# match num_classes=1 — confirm before running.
# train_model is called the same way as in the executed two-target cells, i.e.
# with train/validation/test loaders and an 11-value return.
_, _, model_deepstarr, train_losses_by_batch, test_losses_by_batch, results, best_pearson_epoch, best_r2_epoch, pearson_metrics, r2_metrics, device = train_model(
    input_model, train_loader, val_loader, test_loader, target_labels=target_labels,
    num_epochs=200,
    batch_size=batch, learning_rate=learning_rate,
    criteria='mse', optimizer_type="adam", patience=15,
    seed=seed, save_model=True, dir_path=output_dir)

In [None]:
# Checkpoints to evaluate. glob returns matches in arbitrary order, so sort them
# to make it reproducible which checkpoint's predictions remain in
# `df_predictions_d` after the loop.
file_list = sorted(glob.glob('/pmglocal/ty2514/Enhancer/Enhancer/data/DeepSTARR_GFP/best_r2*.pth'))

for weight_file in file_list:
    print(f"Evaluating model with weights from: {weight_file}")

    # Call the evaluation function (defined in an earlier cell)
    predictions_d, labels_d, fragment_ids_d = evaluate_model_with_weights(input_model, test_loader, weight_file)

    # Keep every column the same length even if the model emits more than one
    # value per fragment: each ID is repeated once per output (long format).
    # With a single output this produces the plain three-column table.
    n_samples_d = len(fragment_ids_d)
    n_outputs_d = predictions_d.size // n_samples_d if n_samples_d else 1

    columns_d = {'Fragment_ID': np.repeat(fragment_ids_d, n_outputs_d)}
    if n_outputs_d > 1:
        # Position of the value within the model output, in output order
        columns_d['Output_Index'] = np.tile(np.arange(n_outputs_d), n_samples_d)
    columns_d['Predictions'] = predictions_d.flatten()
    columns_d['True Labels'] = labels_d.flatten()
    df_predictions_d = pd.DataFrame(columns_d)

    # Saving is currently disabled.
    # NOTE(review): the disabled lines below point at the ExplaiNN output directory
    # and at `df_predictions` rather than `df_predictions_d` — fix both before
    # re-enabling them.
    #output_file = f'/pmglocal/ty2514/Enhancer/Enhancer/data/ExplaiNN_GFP_70NN/prediction_results.csv'
    #df_predictions.to_csv(output_file, index=False)
    #print(f"Results saved to {output_file}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Compare the 'Predictions' columns of the two result tables in one scatter plot.
# The two columns are passed as-is, so the tables are expected to describe the
# same samples in the same order. If their indices differ, merge them first, e.g.:
# combined_df = pd.merge(df_predictions_d[['Predictions']], df_predictions[['Predictions']], left_index=True, right_index=True, suffixes=('_model1', '_model2'))

# Explicit figure/axes pair instead of the implicit pyplot state
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=df_predictions_d['Predictions'],
    y=df_predictions['Predictions'],
    ax=ax,
)

# Axis labels and title
ax.set_xlabel('Predictions from Model 1')
ax.set_ylabel('Predictions from Model 2')
ax.set_title('Correlation between Predictions of Model 1 and Model 2')

# Render the figure
plt.show()
