In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import networkx as nx
from sklearn.decomposition import PCA
from sklearn.neighbors import kneighbors_graph
from GraphRicciCurvature.OllivierRicci import OllivierRicci
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, classification_report
from scipy.stats import ttest_ind

# Load scRNA data
data_path = '/users/barmanjy/Desktop/Persister Cell/GSE150949_scRNA.csv'
data = pd.read_csv(data_path,engine='python',encoding='utf-8',nrows=100)

# Ensure 'persister_label' column exists
if 'persister_label' not in data.columns:
    np.random.seed(42)  # For reproducibility
    data['persister_label'] = np.random.randint(0, 2, size=len(data))

actual_labels = data['persister_label'].values

# Drop the label column from the data
data = data.drop(columns=['persister_label'])

# Preprocess the scRNA data
def preprocess_scRNA_data(data):
    adata = sc.AnnData(data)
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    return adata.X

# Preprocess scRNA data
X_data = preprocess_scRNA_data(data)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=50)
X_pca = pca.fit_transform(X_data)

# Construct k-NN graph
def construct_knn_graph(X, k):
    knn_graph = kneighbors_graph(X, k, mode='connectivity', include_self=False)
    coo_graph = knn_graph.tocoo()  # Convert to COO format
    G = nx.from_edgelist(zip(coo_graph.row, coo_graph.col))  # Create graph from COO format
    return G

# Calculate Ollivier-Ricci curvature
def compute_ricci_curvature(G, alpha=0.5):
    orc = OllivierRicci(G, alpha=alpha, verbose="INFO")
    orc.compute_ricci_curvature()
    return nx.get_edge_attributes(orc.G, "ricciCurvature")

# Identify high proliferation index cells with alternative curvature aggregation methods
def identify_high_proliferation_nodes(ricci_curvatures, method='threshold', threshold=None, top_percentile=None, centrality_measure=None):
    if method == 'threshold':
        high_proliferation_edges = [(u, v) for (u, v), curvature in ricci_curvatures.items() if curvature > threshold]
    elif method == 'top_percentile':
        sorted_curvatures = sorted(ricci_curvatures.items(), key=lambda x: x[1], reverse=True)
        top_n = int(len(sorted_curvatures) * top_percentile / 100)
        high_proliferation_edges = [(u, v) for (u, v), _ in sorted_curvatures[:top_n]]
    elif method == 'centrality':
        centrality_scores = nx.betweenness_centrality(G, weight='ricciCurvature')
        sorted_centrality = sorted(centrality_scores.items(), key=lambda x: x[1], reverse=True)
        top_n = int(len(sorted_centrality) * top_percentile / 100)
        high_proliferation_nodes = [node for node, _ in sorted_centrality[:top_n]]
        high_proliferation_edges = [(u, v) for (u, v) in G.edges() if u in high_proliferation_nodes or v in high_proliferation_nodes]
    else:
        raise ValueError("Invalid method. Choose from 'threshold', 'top_percentile', or 'centrality'.")
    
    high_proliferation_nodes = set([node for edge in high_proliferation_edges for node in edge])
    return high_proliferation_nodes

# Differential Expression Analysis
def differential_expression_analysis(data, high_proliferation_nodes):
    if 'persister_label' in data.columns:
        persister_labels = data['persister_label'].values
        high_proliferation_labels = np.zeros_like(persister_labels)
        high_proliferation_labels[list(high_proliferation_nodes)] = 1

        # Perform differential expression analysis
        differential_genes = []
        for gene_index in range(data.shape[1]):
            gene_expr_high_prolif = data[high_proliferation_labels == 1, gene_index]
            gene_expr_non_high_prolif = data[high_proliferation_labels == 0, gene_index]
            _, p_value = ttest_ind(gene_expr_high_prolif, gene_expr_non_high_prolif)
            if p_value < 0.05:  # Adjust significance level as needed
                differential_genes.append(gene_index)
        return differential_genes
    else:
        print("Ground truth labels ('persister_label') not available. Skipping differential expression analysis.")
        return None

# Usage
k_values = [5, 10, 15, 20]
threshold = 0.5  # Threshold for identifying high proliferation cells
top_percentile = 5  # Top percentile for alternative aggregation methods
centrality_percentile = 5  # Percentile of nodes for centrality-based aggregation

# Experiment with different k values for kNN graph
for k in k_values:
    G = construct_knn_graph(X_pca, k)
    ricci_curvatures = compute_ricci_curvature(G)
    
    # Identify high proliferation cells using thresholding method
    high_proliferation_nodes_threshold = identify_high_proliferation_nodes(ricci_curvatures, method='threshold', threshold=threshold)
    print(f"High Proliferation Cells with threshold method and k={k}:", high_proliferation_nodes_threshold)
    
    # Identify high proliferation cells using top percentile method
    high_proliferation_nodes_percentile = identify_high_proliferation_nodes(ricci_curvatures, method='top_percentile', top_percentile=top_percentile)
    print(f"High Proliferation Cells with top percentile method and k={k}:", high_proliferation_nodes_percentile)
    
    # Identify high proliferation cells using centrality method
    high_proliferation_nodes_centrality = identify_high_proliferation_nodes(ricci_curvatures, method='centrality', top_percentile=centrality_percentile)
    print(f"High Proliferation Cells with centrality method and k={k}:", high_proliferation_nodes_centrality)

    # Differential expression analysis
    differential_genes = differential_expression_analysis(X_data, high_proliferation_nodes_threshold)
    if differential_genes is not None:
        print("Differential Genes with threshold method:", differential_genes)
    
    differential_genes = differential_expression_analysis(X_data, high_proliferation_nodes_percentile)
    if differential_genes is not None:
        print("Differential Genes with top percentile method:", differential_genes)
    
    differential_genes = differential_expression_analysis(X_data, high_proliferation_nodes_centrality)
    if differential_genes is not None:
        print("Differential Genes with centrality method:", differential_genes)

    # Visualize curvature distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(list(ricci_curvatures.values()), bins=50, kde=True)
    plt.axvline(np.mean(list(ricci_curvatures.values())), color='r', linestyle='--')
    plt.title(f'Distribution of Ricci Curvatures with k={k}')
    plt.xlabel('Ricci Curvature')
    plt.ylabel('Frequency')
    plt.show()

    # Visualize high proliferation nodes on the PCA plot
    plt.figure(figsize=(10, 6))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c='gray', alpha=0.5, label='All Cells')
    plt.scatter(X_pca[list(high_proliferation_nodes_threshold), 0], X_pca[list(high_proliferation_nodes_threshold), 1], c='red', label='High Proliferation Cells (Threshold)')
    plt.scatter(X_pca[list(high_proliferation_nodes_percentile), 0], X_pca[list(high_proliferation_nodes_percentile), 1], c='blue', label='High Proliferation Cells (Percentile)')
    plt.scatter(X_pca[list(high_proliferation_nodes_centrality), 0], X_pca[list(high_proliferation_nodes_centrality), 1], c='green', label='High Proliferation Cells (Centrality)')
    plt.title(f'PCA of scRNA-seq Data with k={k}')
    plt.xlabel('PCA 1')
    plt.ylabel('PCA 2')
    plt.legend()
    plt.show()




In [7]:
import numpy as np
import pandas as pd
import scanpy as sc
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.neighbors import kneighbors_graph
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Load scRNA data
data_path = '/users/barmanjy/Desktop/Persister Cell/GSE150949_scRNA.csv'
data = pd.read_csv(data_path)

# Ensure 'persister_label' column exists
if 'persister_label' not in data.columns:
    np.random.seed(42)  # For reproducibility
    data['persister_label'] = np.random.randint(0, 2, size=len(data))

actual_labels = data['persister_label'].values

# Drop the label column from the data
data = data.drop(columns=['persister_label'])

# Preprocess the scRNA data
def preprocess_scRNA_data(data):
    adata = sc.AnnData(data)
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    return adata.X

# Preprocess scRNA data
X_data = preprocess_scRNA_data(data)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_data)

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, actual_labels, test_size=0.2, random_state=42)

# Define custom dataset for DataLoader
class RNASeqDataset(Dataset):
    def __init__(self, X, y, tokenizer, model_type='bert', max_length=512):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sequence = self.X[idx]
        label = self.y[idx]

        if self.model_type == 'bert':
            inputs = self.tokenizer(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
            inputs = {key: tensor.squeeze(0) for key, tensor in inputs.items()}
        elif self.model_type == 'gpt':
            inputs = self.tokenizer.encode(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)[0]

        return inputs, label

# Define scBERT model
class SCBERTClassifier(nn.Module):
    def __init__(self, pretrained_model='bert-base-uncased', num_classes=2):
        super(SCBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Define scGPT model
class SCGPTClassifier(nn.Module):
    def __init__(self, pretrained_model='gpt2', num_classes=2):
        super(SCGPTClassifier, self).__init__()
        self.gpt = GPT2Model.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.gpt.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Instantiate tokenizer and models
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
scbert_model = SCBERTClassifier()
scgpt_model = SCGPTClassifier()

# Define training parameters
batch_size = 32
num_epochs = 10
learning_rate = 1e-4

# Create datasets and dataloaders
train_dataset_bert = RNASeqDataset(X_train, y_train, bert_tokenizer, model_type='bert')
test_dataset_bert = RNASeqDataset(X_test, y_test, bert_tokenizer, model_type='bert')
train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=batch_size)

train_dataset_gpt = RNASeqDataset(X_train, y_train, gpt_tokenizer, model_type='gpt')
test_dataset_gpt = RNASeqDataset(X_test, y_test, gpt_tokenizer, model_type='gpt')
train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size, shuffle=True)
test_dataloader_gpt = DataLoader(test_dataset_gpt, batch_size=batch_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer_bert = optim.AdamW(scbert_model.parameters(), lr=learning_rate)
optimizer_gpt = optim.AdamW(scgpt_model.parameters(), lr=learning_rate)

# Training loop for scBERT model
for epoch in range(num_epochs):
    scbert_model.train()
    running_loss = 0.0
    for inputs, labels in train_dataloader_bert:
        optimizer_bert.zero_grad()

        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        outputs = scbert_model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_bert.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1} - Loss: {running_loss / len(train_dataloader_bert)}")

# Evaluation loop for scBERT model
scbert_model.eval()
correct_bert = 0
total_bert = 0
with torch.no_grad():
    for inputs, labels in test_dataloader_bert:
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        outputs = scbert_model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, 1)
        total_bert += labels.size(0)
        correct_bert += (predicted == labels).sum().item()

print('Accuracy of scBERT model on test set:', (correct_bert / total_bert))

# Training loop for scGPT model
for epoch in range(num_epochs):
    scgpt_model.train()
    running_loss = 0.0
    for inputs, labels in train_dataloader_gpt:
        optimizer_gpt.zero_grad()

        input_ids = inputs
        outputs = scgpt_model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_gpt.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1} - Loss: {running_loss / len(train_dataloader_gpt)}")

# Evaluation loop for scGPT model
scgpt_model.eval()
correct_gpt = 0
total_gpt = 0
with torch.no_grad():
    for inputs, labels in test_dataloader_gpt:
        input_ids = inputs
        outputs = scgpt_model(input_ids)
        _, predicted = torch.max(outputs, 1)
        total_gpt += labels.size(0)
        correct_gpt += (predicted == labels).sum().item()

print('Accuracy of scGPT model on test set:', (correct_gpt / total_gpt))


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
29/05/2024

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import scipy.sparse as sparse

# Load scRNA data
data_path = '/users/barmanjy/Desktop/Persister Cell/GSE150949_scRNA.csv'
data = pd.read_csv(data_path,nrows=100)

# Ensure 'persister_label' column exists
if 'persister_label' not in data.columns:
    np.random.seed(42)  # For reproducibility
    data['persister_label'] = np.random.randint(0, 2, size=len(data))

actual_labels = data['persister_label'].values

# Drop the label column from the data
data = data.drop(columns=['persister_label'])

# Preprocess the scRNA data
def preprocess_scRNA_data(data):
    adata = sc.AnnData(sparse.csr_matrix(data.values))  # Convert to sparse matrix
    sc.pp.normalize_total(adata, target_sum=1e4)  # Normalize
    sc.pp.log1p(adata)  # Logarithmic transformation
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)  # Select highly variable genes
    adata.obs['batch'] = np.random.randint(0, 2, size=adata.shape[0])  # Dummy batch column for batch correction
    sc.pp.combat(adata, key='batch')  # Batch correction
    return adata.X

# Preprocess scRNA data
X_data = preprocess_scRNA_data(data)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_data)

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, actual_labels, test_size=0.2, random_state=42)

# Define custom dataset for DataLoader
class RNASeqDataset(Dataset):
    def __init__(self, X, y, tokenizer, model_type='bert', max_length=512):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sequence = ' '.join(map(str, self.X[idx]))
        label = self.y[idx]

        if self.model_type == 'bert':
            inputs = self.tokenizer(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
            inputs = {key: tensor.squeeze(0) for key, tensor in inputs.items()}
        elif self.model_type == 'gpt':
            inputs = self.tokenizer.encode(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length).squeeze(0)

        return inputs, label

# Define scBERT model
class SCBERTClassifier(nn.Module):
    def __init__(self, pretrained_model='bert-base-uncased', num_classes=2):
        super(SCBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Define scGPT model
class SCGPTClassifier(nn.Module):
    def __init__(self, pretrained_model='gpt2', num_classes=2):
        super(SCGPTClassifier, self).__init__()
        self.gpt = GPT2Model.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.gpt.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Instantiate tokenizer and models
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
scbert_model = SCBERTClassifier()
scgpt_model = SCGPTClassifier()

# Define training parameters
batch_size = 32
num_epochs = 10
learning_rate = 1e-4

# Create datasets and dataloaders
train_dataset_bert = RNASeqDataset(X_train, y_train, bert_tokenizer, model_type='bert')
train_dataset_gpt = RNASeqDataset(X_train, y_train, gpt_tokenizer, model_type='gpt')

test_dataset_bert = RNASeqDataset(X_test, y_test, bert_tokenizer, model_type='bert')
test_dataset_gpt = RNASeqDataset(X_test, y_test, gpt_tokenizer, model_type='gpt')

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size, shuffle=True)
test_dataloader_gpt = DataLoader(test_dataset_gpt, batch_size=batch_size, shuffle=False)

# Define training and evaluation functions
criterion = nn.CrossEntropyLoss()

def train_model(model, optimizer, train_dataloader, criterion, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_dataloader:
            optimizer.zero_grad()
            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(train_dataloader)}')

def evaluate_model(model, test_dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

# Hyperparameter tuning with Optuna for scBERT
def objective(trial):
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    scbert_model = SCBERTClassifier()
    scbert_model.dropout = nn.Dropout(dropout_rate)
    optimizer_bert = optim.AdamW(scbert_model.parameters(), lr=learning_rate)

    train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)

    best_loss = float('inf')
    early_stop_count = 0
    for epoch in range(num_epochs):
        scbert_model.train()
        running_loss = 0.0
        for inputs, labels in train_dataloader_bert:
            optimizer_bert.zero_grad()
            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            outputs = scbert_model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer_bert.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_dataloader_bert)
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= 3:
                break
    return best_loss

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
best_params = study.best_params

batch_size = best_params['batch_size']
learning_rate = best_params['learning_rate']
dropout_rate = best_params['dropout_rate']

# Train final scBERT model with best hyperparameters
scbert_model = SCBERTClassifier()
scbert_model.dropout = nn.Dropout(dropout_rate)
optimizer_bert = optim.AdamW(scbert_model.parameters(), lr=learning_rate)
train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
train_model(scbert_model, optimizer_bert, train_dataloader_bert, criterion, num_epochs)

# Evaluate scBERT model on test set
test_accuracy_bert = evaluate_model(scbert_model, test_dataloader_bert)
print('Accuracy of scBERT model on test set:', test_accuracy_bert)

# Hyperparameter tuning with Optuna for scGPT
def objective_gpt(trial):
    batch_size_gpt = trial.suggest_categorical('batch_size_gpt', [16, 32, 64])
    learning_rate_gpt = trial.suggest_float('learning_rate_gpt', 1e-5, 1e-3, log=True)
    dropout_rate_gpt = trial.suggest_float('dropout_rate_gpt', 0.1, 0.5)

    scgpt_model = SCGPTClassifier()
    scgpt_model.dropout = nn.Dropout(dropout_rate_gpt)
    optimizer_gpt = optim.AdamW(scgpt_model.parameters(), lr=learning_rate_gpt)
    train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size_gpt, shuffle=True)

    best_loss_gpt = float('inf')
    early_stop_count_gpt = 0
    for epoch in range(num_epochs):
        scgpt_model.train()
        running_loss_gpt = 0.0
        for inputs, labels in train_dataloader_gpt:
            optimizer_gpt.zero_grad()
            input_ids = inputs
            outputs = scgpt_model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer_gpt.step()
            running_loss_gpt += loss.item()
        epoch_loss_gpt = running_loss_gpt / len(train_dataloader_gpt)
        if epoch_loss_gpt < best_loss_gpt:
            best_loss_gpt = epoch_loss_gpt
            early_stop_count_gpt = 0
        else:
            early_stop_count_gpt += 1
            if early_stop_count_gpt >= 3:
                break
    return best_loss_gpt

study_gpt = optuna.create_study(direction='minimize')
study_gpt.optimize(objective_gpt, n_trials=50)
best_params_gpt = study_gpt.best_params

batch_size_gpt = best_params_gpt['batch_size_gpt']
learning_rate_gpt = best_params_gpt['learning_rate_gpt']
dropout_rate_gpt = best_params_gpt['dropout_rate_gpt']

# Train final scGPT model with best hyperparameters
scgpt_model = SCGPTClassifier() 
scgpt_model.dropout = nn.Dropout(dropout_rate_gpt)
optimizer_gpt = optim.AdamW(scgpt_model.parameters(), lr=learning_rate_gpt)
train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size_gpt, shuffle=True)
train_model(scgpt_model, optimizer_gpt, train_dataloader_gpt, criterion, num_epochs)

# Evaluate scGPT model on test set
test_accuracy_gpt = evaluate_model(scgpt_model, test_dataloader_gpt)
print('Accuracy of scGPT model on test set:', test_accuracy_gpt)



  return _pandas_is_categorical_dtype(dt)
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
[I 2024-05-31 19:44:39,948] A new study created in memory with name: no-name-0b0af047-c31c-4005-b676-f8ad60cf71b3
[I 2024-05-31 20:13:10,376] Trial 0 finished with value: 0.6778001487255096 and parameters: {'batch_size': 64, 'learning_rate': 0.0003695669422931764, 'dropout_rate': 0.1523855534373104}. Best is trial 0 with value: 0.6778001487255096.
[I 2024-05-31 20:30:33,620] Trial 1 finished with value: 0.683274507522583 and parameters: {'batch_size': 64, 'learning_rate': 1.843960872354099e-05, 'dropout_rate': 0.3438750063315392}. Best is trial 0 with value: 0.6778001487255096.
[I 2024-05-31 21:02:00,670] Trial 2 finished with value: 0.7191212475299835 and parameters: {'batch_size': 64, 'learning_rate': 0.0008053380662886468, 'dropout_rate': 0.49462457821963646}. Best is trial 0 with value: 0.6778001487255096.
[I 2024-05-31 21:19:16,700] Trial 3 finished with value: 0.6739

In [None]:
01/06/2024

In [3]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import scipy.sparse as sparse

# Load scRNA data
data_path = '/users/barmanjy/Desktop/Persister Cell/GSE150949_scRNA.csv'
data = pd.read_csv(data_path, nrows=100)

# Ensure 'persister_label' column exists
if 'persister_label' not in data.columns:
    np.random.seed(42)  # For reproducibility
    data['persister_label'] = np.random.randint(0, 2, size=len(data))

actual_labels = data['persister_label'].values

# Drop the label column from the data
data = data.drop(columns=['persister_label'])

# Preprocess the scRNA data
def preprocess_scRNA_data(data):
    adata = sc.AnnData(sparse.csr_matrix(data.values))  # Convert to sparse matrix
    sc.pp.normalize_total(adata, target_sum=1e4)  # Normalize
    sc.pp.log1p(adata)  # Logarithmic transformation
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)  # Select highly variable genes
    adata.obs['batch'] = np.random.randint(0, 2, size=adata.shape[0])  # Dummy batch column for batch correction
    sc.pp.combat(adata, key='batch')  # Batch correction
    return adata.X

# Preprocess scRNA data
X_data = preprocess_scRNA_data(data)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_data)

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, actual_labels, test_size=0.2, random_state=42)

# Define custom dataset for DataLoader
class RNASeqDataset(Dataset):
    def __init__(self, X, y, tokenizer, model_type='bert', max_length=512):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sequence = ' '.join(map(str, self.X[idx]))
        label = self.y[idx]

        if self.model_type == 'bert':
            inputs = self.tokenizer(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
            inputs = {key: tensor.squeeze(0) for key, tensor in inputs.items()}
        elif self.model_type == 'gpt':
            inputs = self.tokenizer.encode(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length).squeeze(0)

        return inputs, label

# Define scBERT model
class SCBERTClassifier(nn.Module):
    def __init__(self, pretrained_model='bert-base-uncased', num_classes=2):
        super(SCBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Define scGPT model
class SCGPTClassifier(nn.Module):
    def __init__(self, pretrained_model='gpt2', num_classes=2):
        super(SCGPTClassifier, self).__init__()
        self.gpt = GPT2Model.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.gpt.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Instantiate tokenizer and models
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
scbert_model = SCBERTClassifier()
scgpt_model = SCGPTClassifier()

# Define training parameters
batch_size = 16  # Reduced batch size
num_epochs = 5  # Reduced number of epochs
learning_rate = 1e-4

# Create datasets and dataloaders
train_dataset_bert = RNASeqDataset(X_train, y_train, bert_tokenizer, model_type='bert')
train_dataset_gpt = RNASeqDataset(X_train, y_train, gpt_tokenizer, model_type='gpt')

test_dataset_bert = RNASeqDataset(X_test, y_test, bert_tokenizer, model_type='bert')
test_dataset_gpt = RNASeqDataset(X_test, y_test, gpt_tokenizer, model_type='gpt')

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size, shuffle=True)
test_dataloader_gpt = DataLoader(test_dataset_gpt, batch_size=batch_size, shuffle=False)

# Define training and evaluation functions
criterion = nn.CrossEntropyLoss()

def train_model(model, optimizer, train_dataloader, criterion, num_epochs, patience=3):
    model.train()
    best_loss = float('inf')
    early_stop_count = 0
    
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_dataloader:
            optimizer.zero_grad()
            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_dataloader)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}')
        
        # Early stopping
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

def evaluate_model(model, test_dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

# Hyperparameter tuning with Optuna for scBERT
def objective(trial):
    batch_size = trial.suggest_categorical('batch_size', [16, 32])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    scbert_model = SCBERTClassifier()
    scbert_model.dropout = nn.Dropout(dropout_rate)
    optimizer_bert = optim.AdamW(scbert_model.parameters(), lr=learning_rate)

    train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)

    best_loss = float('inf')
    early_stop_count = 0
    for epoch in range(num_epochs):
        scbert_model.train()
        running_loss = 0.0
        for inputs, labels in train_dataloader_bert:
            optimizer_bert.zero_grad()
            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            outputs = scbert_model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer_bert.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_dataloader_bert)
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= 3:
                break
    return best_loss

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # Reduced number of trials
best_params = study.best_params

batch_size = best_params['batch_size']
learning_rate = best_params['learning_rate']
dropout_rate = best_params['dropout_rate']

# Train final scBERT model with best hyperparameters
scbert_model = SCBERTClassifier()
scbert_model.dropout = nn.Dropout(dropout_rate)
optimizer_bert = optim.AdamW(scbert_model.parameters(), lr=learning_rate)
train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
train_model(scbert_model, optimizer_bert, train_dataloader_bert, criterion, num_epochs)

# Evaluate scBERT model on test set
test_accuracy_bert = evaluate_model(scbert_model, test_dataloader_bert)
print('Accuracy of scBERT model on test set:', test_accuracy_bert)

# Hyperparameter tuning with Optuna for scGPT
def objective_gpt(trial):
    batch_size_gpt = trial.suggest_categorical('batch_size_gpt', [16, 32])
    learning_rate_gpt = trial.suggest_float('learning_rate_gpt', 1e-5, 1e-3, log=True)
    dropout_rate_gpt = trial.suggest_float('dropout_rate_gpt', 0.1, 0.5)

    scgpt_model = SCGPTClassifier()
    scgpt_model.dropout = nn.Dropout(dropout_rate_gpt)
    optimizer_gpt = optim.AdamW(scgpt_model.parameters(), lr=learning_rate_gpt)
    train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size_gpt, shuffle=True)

    best_loss_gpt = float('inf')
    early_stop_count_gpt = 0
    for epoch in range(num_epochs):
        scgpt_model.train()
        running_loss_gpt = 0.0
        for inputs, labels in train_dataloader_gpt:
            optimizer_gpt.zero_grad()
            input_ids = inputs
            outputs = scgpt_model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer_gpt.step()
            running_loss_gpt += loss.item()
        epoch_loss_gpt = running_loss_gpt / len(train_dataloader_gpt)
        if epoch_loss_gpt < best_loss_gpt:
            best_loss_gpt = epoch_loss_gpt
            early_stop_count_gpt = 0
        else:
            early_stop_count_gpt += 1
            if early_stop_count_gpt >= 3:
                break
    return best_loss_gpt

study_gpt = optuna.create_study(direction='minimize')
study_gpt.optimize(objective_gpt, n_trials=20)  # Reduced number of trials
best_params_gpt = study_gpt.best_params

batch_size_gpt = best_params_gpt['batch_size_gpt']
learning_rate_gpt = best_params_gpt['learning_rate_gpt']
dropout_rate_gpt = best_params_gpt['dropout_rate_gpt']

# Train final scGPT model with best hyperparameters
scgpt_model = SCGPTClassifier() 
scgpt_model.dropout = nn.Dropout(dropout_rate_gpt)
optimizer_gpt = optim.AdamW(scgpt_model.parameters(), lr=learning_rate_gpt)
train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size_gpt, shuffle=True)
train_model(scgpt_model, optimizer_gpt, train_dataloader_gpt, criterion, num_epochs)

# Evaluate scGPT model on test set
test_accuracy_gpt = evaluate_model(scgpt_model, test_dataloader_gpt)
print('Accuracy of scGPT model on test set:', test_accuracy_gpt)


FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import scipy.sparse as sparse

# Load scRNA data
data_path = '/users/barmanjy/Desktop/Persister Cell/GSE150949_scRNA.csv'
data = pd.read_csv(data_path, nrows=100)

# Ensure 'persister_label' column exists
if 'persister_label' not in data.columns:
    np.random.seed(42)  # For reproducibility
    data['persister_label'] = np.random.randint(0, 2, size=len(data))

actual_labels = data['persister_label'].values

# Drop the label column from the data
data = data.drop(columns=['persister_label'])

# Preprocess the scRNA data
def preprocess_scRNA_data(data):
    adata = sc.AnnData(sparse.csr_matrix(data.values))  # Convert to sparse matrix
    sc.pp.normalize_total(adata, target_sum=1e4)  # Normalize
    sc.pp.log1p(adata)  # Logarithmic transformation
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)  # Select highly variable genes
    adata.obs['batch'] = np.random.randint(0, 2, size=adata.shape[0])  # Dummy batch column for batch correction
    sc.pp.combat(adata, key='batch')  # Batch correction
    return adata.X

# Preprocess scRNA data
X_data = preprocess_scRNA_data(data)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_data)

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, actual_labels, test_size=0.2, random_state=42)

# Define custom dataset for DataLoader
class RNASeqDataset(Dataset):
    def __init__(self, X, y, tokenizer, model_type='bert', max_length=512):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sequence = ' '.join(map(str, self.X[idx]))
        label = self.y[idx]

        if self.model_type == 'bert':
            inputs = self.tokenizer(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
            inputs = {key: tensor.squeeze(0) for key, tensor in inputs.items()}
        elif self.model_type == 'gpt':
            inputs = self.tokenizer.encode(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length).squeeze(0)

        return inputs, label

# Define scBERT model
class SCBERTClassifier(nn.Module):
    def __init__(self, pretrained_model='bert-base-uncased', num_classes=2):
        super(SCBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Define scGPT model
class SCGPTClassifier(nn.Module):
    def __init__(self, pretrained_model='gpt2', num_classes=2):
        super(SCGPTClassifier, self).__init__()
        self.gpt = GPT2Model.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.gpt.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Add a padding token to the GPT2 tokenizer
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

# Instantiate tokenizer and models
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
scbert_model = SCBERTClassifier()
scgpt_model = SCGPTClassifier()

# Define training parameters
batch_size = 16  # Reduced batch size
num_epochs = 5  # Reduced number of epochs
learning_rate = 1e-4

# Create datasets and dataloaders
train_dataset_bert = RNASeqDataset(X_train, y_train, bert_tokenizer, model_type='bert')
train_dataset_gpt = RNASeqDataset(X_train, y_train, gpt_tokenizer, model_type='gpt')

test_dataset_bert = RNASeqDataset(X_test, y_test, bert_tokenizer, model_type='bert')
test_dataset_gpt = RNASeqDataset(X_test, y_test, gpt_tokenizer, model_type='gpt')

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size, shuffle=True)
test_dataloader_gpt = DataLoader(test_dataset_gpt, batch_size=batch_size, shuffle=False)

# Define training and evaluation functions
criterion = nn.CrossEntropyLoss()

def train_model(model, optimizer, train_dataloader, criterion, num_epochs, patience=3):
    model.train()
    best_loss = float('inf')
    early_stop_count = 0
    
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_dataloader:
            optimizer.zero_grad()
            if isinstance(inputs, dict):  # For BERT
                input_ids = inputs['input_ids']
                attention_mask = inputs['attention_mask']
                outputs = model(input_ids, attention_mask)
            else:  # For GPT
                input_ids = inputs
                outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_dataloader)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}')
        
        # Early stopping
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

def evaluate_model(model, test_dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            if isinstance(inputs, dict):  # For BERT
                input_ids = inputs['input_ids']
                attention_mask = inputs['attention_mask']
                outputs = model(input_ids, attention_mask)
            else:  # For GPT
                input_ids = inputs
                outputs = model(input_ids)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

# Hyperparameter tuning with Optuna for scBERT
def objective(trial):
    batch_size = trial.suggest_categorical('batch_size', [16, 32])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    scbert_model = SCBERTClassifier()
    scbert_model.dropout = nn.Dropout(dropout_rate)
    optimizer_bert = optim.AdamW(scbert_model.parameters(), lr=learning_rate)

    train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)

    best_loss = float('inf')
    early_stop_count = 0
    for epoch in range(num_epochs):
        scbert_model.train()
        running_loss = 0.0
        for inputs, labels in train_dataloader_bert:
            optimizer_bert.zero_grad()
            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            outputs = scbert_model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer_bert.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_dataloader_bert)

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= 3:
                break
    return best_loss

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# Train and evaluate the scBERT model with optimized hyperparameters
best_params = study.best_params
scbert_model = SCBERTClassifier()
scbert_model.dropout = nn.Dropout(best_params['dropout_rate'])
optimizer_bert = optim.AdamW(scbert_model.parameters(), lr=best_params['learning_rate'])

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=best_params['batch_size'], shuffle=True)
train_model(scbert_model, optimizer_bert, train_dataloader_bert, criterion, num_epochs)
accuracy_bert = evaluate_model(scbert_model, test_dataloader_bert)
print(f'scBERT Test Accuracy: {accuracy_bert}')

# Train and evaluate the scGPT model
optimizer_gpt = optim.AdamW(scgpt_model.parameters(), lr=learning_rate)
train_model(scgpt_model, optimizer_gpt, train_dataloader_gpt, criterion, num_epochs)
accuracy_gpt = evaluate_model(scgpt_model, test_dataloader_gpt)
print(f'scGPT Test Accuracy: {accuracy_gpt}')


  return _pandas_is_categorical_dtype(dt)
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
[I 2024-06-01 11:16:29,609] A new study created in memory with name: no-name-5358ac86-67c1-4aeb-a511-4d5431ece3ef
[I 2024-06-01 11:31:35,849] Trial 0 finished with value: 0.6889887094497681 and parameters: {'batch_size': 16, 'learning_rate': 0.0007606855527820207, 'dropout_rate': 0.21249282974002137}. Best is trial 0 with value: 0.6889887094497681.
[I 2024-06-01 11:46:40,141] Trial 1 finished with value: 0.6890106558799743 and parameters: {'batch_size': 16, 'learning_rate': 3.56352398770645e-05, 'dropout_rate': 0.23395271216020364}. Best is trial 0 with value: 0.6889887094497681.
[I 2024-06-01 12:01:15,211] Trial 2 finished with value: 0.6768436312675477 and parameters: {'batch_size': 16, 'learning_rate': 5.567404829001719e-05, 'dropout_rate': 0.2123694068744817}. Best is trial 2 with value: 0.6768436312675477.
[I 2024-06-01 12:16:57,781] Trial 3 finished with value: 0.737

Epoch 1/5, Loss: 0.7756344079971313
Epoch 2/5, Loss: 0.7250105857849121
Epoch 3/5, Loss: 0.7645009517669678
Epoch 4/5, Loss: 0.6893216490745544
Epoch 5/5, Loss: 0.7204572916030884
scBERT Test Accuracy: 0.6
Epoch 1/5, Loss: 2.0250087141990663
Epoch 2/5, Loss: 1.107263731956482
Epoch 3/5, Loss: 0.8096065640449523
Epoch 4/5, Loss: 0.7738061070442199
Epoch 5/5, Loss: 0.6968650698661805
scGPT Test Accuracy: 0.4


In [4]:
import time
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import scipy.sparse as sparse

# Load scRNA data
data_path = '/users/barmanjy/Desktop/Persister Cell/GSE150949_scRNA.csv'
data = pd.read_csv(data_path, nrows=100)

# Ensure 'persister_label' column exists
if 'persister_label' not in data.columns:
    np.random.seed(42)  # For reproducibility
    data['persister_label'] = np.random.randint(0, 2, size=len(data))

actual_labels = data['persister_label'].values

# Drop the label column from the data
data = data.drop(columns=['persister_label'])

# Preprocess the scRNA data
def preprocess_scRNA_data(data):
    adata = sc.AnnData(sparse.csr_matrix(data.values))  # Convert to sparse matrix
    sc.pp.normalize_total(adata, target_sum=1e4)  # Normalize
    sc.pp.log1p(adata)  # Logarithmic transformation
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)  # Select highly variable genes
    adata.obs['batch'] = np.random.randint(0, 2, size=adata.shape[0])  # Dummy batch column for batch correction
    sc.pp.combat(adata, key='batch')  # Batch correction
    return adata.X

# Preprocess scRNA data
X_data = preprocess_scRNA_data(data)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_data)

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, actual_labels, test_size=0.2, random_state=42)

# Define custom dataset for DataLoader
class RNASeqDataset(Dataset):
    def __init__(self, X, y, tokenizer, model_type='bert', max_length=512):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sequence = ' '.join(map(str, self.X[idx]))
        label = self.y[idx]

        if self.model_type == 'bert':
            inputs = self.tokenizer(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
            inputs = {key: tensor.squeeze(0) for key, tensor in inputs.items()}
        elif self.model_type == 'gpt':
            inputs = self.tokenizer.encode(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length).squeeze(0)

        return inputs, label

# Define scBERT model
class SCBERTClassifier(nn.Module):
    def __init__(self, pretrained_model='bert-base-uncased', num_classes=2):
        super(SCBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Define scGPT model
class SCGPTClassifier(nn.Module):
    def __init__(self, pretrained_model='gpt2', num_classes=2):
        super(SCGPTClassifier, self).__init__()
        self.gpt = GPT2Model.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.gpt.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Add a padding token to the GPT2 tokenizer
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

# Instantiate tokenizer and models
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
scbert_model = SCBERTClassifier()
scgpt_model = SCGPTClassifier()

# Define training parameters
batch_size = 16  # Reduced batch size
num_epochs = 5  # Reduced number of epochs
learning_rate = 1e-4

# Create datasets and dataloaders
train_dataset_bert = RNASeqDataset(X_train, y_train, bert_tokenizer, model_type='bert')
train_dataset_gpt = RNASeqDataset(X_train, y_train, gpt_tokenizer, model_type='gpt')

test_dataset_bert = RNASeqDataset(X_test, y_test, bert_tokenizer, model_type='bert')
test_dataset_gpt = RNASeqDataset(X_test, y_test, gpt_tokenizer, model_type='gpt')

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size, shuffle=True)
test_dataloader_gpt = DataLoader(test_dataset_gpt, batch_size=batch_size, shuffle=False)

# Define training and evaluation functions
criterion = nn.CrossEntropyLoss()

def train_model(model, optimizer, train_dataloader, criterion, num_epochs, patience=3):
    model.train()
    best_loss = float('inf')
    early_stop_count = 0
    total_training_time = 0

    for epoch in range(num_epochs):
        start_time = time.time()
        running_loss = 0.0
        for inputs, labels in train_dataloader:
            optimizer.zero_grad()
            if isinstance(inputs, dict):  # For BERT
                input_ids = inputs['input_ids']
                attention_mask = inputs['attention_mask']
                outputs = model(input_ids, attention_mask)
            else:  # For GPT
                input_ids = inputs
                outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_dataloader)
        end_time = time.time()
        epoch_time = end_time - start_time
        total_training_time += epoch_time
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}, Time: {epoch_time:.2f}s')

        # Early stopping
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    print(f'Total Training Time: {total_training_time:.2f}s')

def evaluate_model(model, test_dataloader):
    model.eval()
    correct = 0
    total = 0
    total_evaluation_time = 0

    with torch.no_grad():
        for inputs, labels in test_dataloader:
            start_time = time.time()
            if isinstance(inputs, dict):  # For BERT
                input_ids = inputs['input_ids']
                attention_mask = inputs['attention_mask']
                outputs = model(input_ids, attention_mask)
            else:  # For GPT
                input_ids = inputs
                outputs = model(input_ids)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            end_time = time.time()
            total_evaluation_time += (end_time - start_time)

    accuracy = correct / total
    print(f'Total Evaluation Time: {total_evaluation_time:.2f}s')
    return accuracy

# Hyperparameter tuning with Optuna for scBERT
def objective(trial):
    batch_size = trial.suggest_categorical('batch_size', [16, 32])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    scbert_model = SCBERTClassifier()
    scbert_model.dropout = nn.Dropout(dropout_rate)
    optimizer_bert = optim.AdamW(scbert_model.parameters(), lr=learning_rate)

    train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)

    best_loss = float('inf')
    early_stop_count = 0

    for epoch in range(num_epochs):
        scbert_model.train()
        running_loss = 0.0
        for inputs, labels in train_dataloader_bert:
            optimizer_bert.zero_grad()
            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            outputs = scbert_model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer_bert.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_dataloader_bert)

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= 3:
                break
    return best_loss

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# Train and evaluate the scBERT model with optimized hyperparameters
best_params = study.best_params
scbert_model = SCBERTClassifier()
scbert_model.dropout = nn.Dropout(best_params['dropout_rate'])
optimizer_bert = optim.AdamW(scbert_model.parameters(), lr=best_params['learning_rate'])

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=best_params['batch_size'], shuffle=True)
train_start_time = time.time()
train_model(scbert_model, optimizer_bert, train_dataloader_bert, criterion, num_epochs)
train_end_time = time.time()

test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=best_params['batch_size'], shuffle=False)
test_start_time = time.time()
accuracy_bert = evaluate_model(scbert_model, test_dataloader_bert)
test_end_time = time.time()

print(f'scBERT Test Accuracy: {accuracy_bert}')
print(f'Training Time: {train_end_time - train_start_time:.2f}s')
print(f'Test Time: {test_end_time - test_start_time:.2f}s')

# Train and evaluate the scGPT model
optimizer_gpt = optim.AdamW(scgpt_model.parameters(), lr=learning_rate)

train_start_time = time.time()
train_model(scgpt_model, optimizer_gpt, train_dataloader_gpt, criterion, num_epochs)
train_end_time = time.time()

test_start_time = time.time()
accuracy_gpt = evaluate_model(scgpt_model, test_dataloader_gpt)
test_end_time = time.time()

print(f'scGPT Test Accuracy: {accuracy_gpt}')
print(f'Training Time: {train_end_time - train_start_time:.2f}s')
print(f'Test Time: {test_end_time - test_start_time:.2f}s')



  return _pandas_is_categorical_dtype(dt)
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
[I 2024-06-01 20:28:43,299] A new study created in memory with name: no-name-0ef83726-2d1b-4e55-9bb7-3291e85ed4e8
[I 2024-06-01 20:43:53,611] Trial 0 finished with value: 0.7099897623062134 and parameters: {'batch_size': 16, 'learning_rate': 0.00013957952949142743, 'dropout_rate': 0.4683348289467878}. Best is trial 0 with value: 0.7099897623062134.
[I 2024-06-01 20:59:58,904] Trial 1 finished with value: 0.6867982149124146 and parameters: {'batch_size': 32, 'learning_rate': 3.102497578731916e-05, 'dropout_rate': 0.14048672967982923}. Best is trial 1 with value: 0.6867982149124146.
[I 2024-06-01 21:12:03,396] Trial 2 finished with value: 0.6781678080558777 and parameters: {'batch_size': 16, 'learning_rate': 0.0007219338176082561, 'dropout_rate': 0.4184725725831453}. Best is trial 2 with value: 0.6781678080558777.
[I 2024-06-01 21:28:17,696] Trial 3 finished with value: 0.70

Epoch 1/5, Loss: 0.7949920694033304, Time: 189.37s
Epoch 2/5, Loss: 0.7408809463183085, Time: 191.53s
Epoch 3/5, Loss: 0.7009690006573995, Time: 188.27s
Epoch 4/5, Loss: 0.7023180921872457, Time: 190.60s
Epoch 5/5, Loss: 0.6998785336812338, Time: 191.97s
Total Training Time: 951.74s
Total Evaluation Time: 12.07s
scBERT Test Accuracy: 0.6
Training Time: 951.74s
Test Time: 15.05s
Epoch 1/5, Loss: 1.9480788826942443, Time: 217.04s
Epoch 2/5, Loss: 0.8745065212249756, Time: 214.57s
Epoch 3/5, Loss: 0.8125643372535706, Time: 215.01s
Epoch 4/5, Loss: 0.7565965414047241, Time: 215.47s
Epoch 5/5, Loss: 0.7439954400062561, Time: 216.46s
Total Training Time: 1078.54s
Total Evaluation Time: 14.78s
scGPT Test Accuracy: 0.6
Training Time: 1078.54s
Test Time: 15.60s


In [None]:
02/10/2024

In [5]:
import time
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import scipy.sparse as sparse

# Load scRNA data
data_path = '/users/barmanjy/Desktop/Persister Cell/GSE150949_scRNA.csv'
data = pd.read_csv(data_path, nrows=100)

# Ensure 'persister_label' column exists
if 'persister_label' not in data.columns:
    np.random.seed(42)  # For reproducibility
    data['persister_label'] = np.random.randint(0, 2, size=len(data))

actual_labels = data['persister_label'].values

# Drop the label column from the data
data = data.drop(columns=['persister_label'])

# Preprocess the scRNA data
def preprocess_scRNA_data(data):
    adata = sc.AnnData(sparse.csr_matrix(data.values))  # Convert to sparse matrix
    sc.pp.normalize_total(adata, target_sum=1e4)  # Normalize
    sc.pp.log1p(adata)  # Logarithmic transformation
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)  # Select highly variable genes
    adata.obs['batch'] = np.random.randint(0, 2, size=adata.shape[0])  # Dummy batch column for batch correction
    sc.pp.combat(adata, key='batch')  # Batch correction
    return adata.X

# Preprocess scRNA data
X_data = preprocess_scRNA_data(data)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_data)

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, actual_labels, test_size=0.2, random_state=42)

# Define custom dataset for DataLoader
class RNASeqDataset(Dataset):
    def __init__(self, X, y, tokenizer, model_type='bert', max_length=512):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sequence = ' '.join(map(str, self.X[idx]))
        label = self.y[idx]

        if self.model_type == 'bert':
            inputs = self.tokenizer(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
            inputs = {key: tensor.squeeze(0) for key, tensor in inputs.items()}
        elif self.model_type == 'gpt':
            inputs = self.tokenizer.encode(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length).squeeze(0)

        return inputs, label

# Define scBERT model
class SCBERTClassifier(nn.Module):
    def __init__(self, pretrained_model='bert-base-uncased', num_classes=2):
        super(SCBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Define scGPT model
class SCGPTClassifier(nn.Module):
    def __init__(self, pretrained_model='gpt2', num_classes=2):
        super(SCGPTClassifier, self).__init__()
        self.gpt = GPT2Model.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.gpt.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Add a padding token to the GPT2 tokenizer
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

# Instantiate tokenizer and models
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
scbert_model = SCBERTClassifier()
scgpt_model = SCGPTClassifier()

# Define training parameters
batch_size = 16  # Reduced batch size
num_epochs = 5  # Reduced number of epochs
learning_rate = 1e-4

# Create datasets and dataloaders
train_dataset_bert = RNASeqDataset(X_train, y_train, bert_tokenizer, model_type='bert')
train_dataset_gpt = RNASeqDataset(X_train, y_train, gpt_tokenizer, model_type='gpt')

test_dataset_bert = RNASeqDataset(X_test, y_test, bert_tokenizer, model_type='bert')
test_dataset_gpt = RNASeqDataset(X_test, y_test, gpt_tokenizer, model_type='gpt')

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size, shuffle=True)
test_dataloader_gpt = DataLoader(test_dataset_gpt, batch_size=batch_size, shuffle=False)

# Define training and evaluation functions
criterion = nn.CrossEntropyLoss()

def train_model(model, optimizer, train_dataloader, criterion, num_epochs, patience=3):
    model.train()
    best_loss = float('inf')
    early_stop_count = 0
    total_training_time = 0

    for epoch in range(num_epochs):
        start_time = time.time()
        running_loss = 0.0
        for inputs, labels in train_dataloader:
            optimizer.zero_grad()
            if isinstance(inputs, dict):  # For BERT
                input_ids = inputs['input_ids']
                attention_mask = inputs['attention_mask']
                outputs = model(input_ids, attention_mask)
            else:  # For GPT
                input_ids = inputs
                outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_dataloader)
        end_time = time.time()
        epoch_time = end_time - start_time
        total_training_time += epoch_time
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}, Time: {epoch_time:.2f}s')

        # Early stopping
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    print(f'Total Training Time: {total_training_time:.2f}s')

def evaluate_model(model, test_dataloader):
    model.eval()
    correct = 0
    total = 0
    total_evaluation_time = 0

    with torch.no_grad():
        for inputs, labels in test_dataloader:
            start_time = time.time()
            if isinstance(inputs, dict):  # For BERT
                input_ids = inputs['input_ids']
                attention_mask = inputs['attention_mask']
                outputs = model(input_ids, attention_mask)
            else:  # For GPT
                input_ids = inputs
                outputs = model(input_ids)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            end_time = time.time()
            total_evaluation_time += (end_time - start_time)

    accuracy = correct / total
    print(f'Total Evaluation Time: {total_evaluation_time:.2f}s')
    return accuracy

# Hyperparameter tuning with Optuna for scBERT
def objective(trial):
    batch_size = trial.suggest_categorical('batch_size', [16, 32])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    scbert_model = SCBERTClassifier()
    scbert_model.dropout = nn.Dropout(dropout_rate)
    optimizer_bert = optim.AdamW(scbert_model.parameters(), lr=learning_rate)

    train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)

    best_loss = float('inf')
    early_stop_count = 0

    for epoch in range(num_epochs):
        scbert_model.train()
        running_loss = 0.0
        for inputs, labels in train_dataloader_bert:
            optimizer_bert.zero_grad()
            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            outputs = scbert_model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer_bert.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_dataloader_bert)

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= 3:
                break
    return best_loss

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# Train and evaluate the scBERT model with optimized hyperparameters
best_params = study.best_params
scbert_model = SCBERTClassifier()
scbert_model.dropout = nn.Dropout(best_params['dropout_rate'])
optimizer_bert = optim.AdamW(scbert_model.parameters(), lr=best_params['learning_rate'])

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=best_params['batch_size'], shuffle=True)
train_start_time = time.time()
train_model(scbert_model, optimizer_bert, train_dataloader_bert, criterion, num_epochs)
train_end_time = time.time()

test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=best_params['batch_size'], shuffle=False)
test_start_time = time.time()
accuracy_bert = evaluate_model(scbert_model, test_dataloader_bert)
test_end_time = time.time()

print(f'scBERT Test Accuracy: {accuracy_bert}')
print(f'Training Time: {train_end_time - train_start_time:.2f}s')
print(f'Test Time: {test_end_time - test_start_time:.2f}s')

# Train and evaluate the scGPT model
optimizer_gpt = optim.AdamW(scgpt_model.parameters(), lr=learning_rate)

train_start_time = time.time()
train_model(scgpt_model, optimizer_gpt, train_dataloader_gpt, criterion, num_epochs)
train_end_time = time.time()

test_start_time = time.time()
accuracy_gpt = evaluate_model(scgpt_model, test_dataloader_gpt)
test_end_time = time.time()

print(f'scGPT Test Accuracy: {accuracy_gpt}')
print(f'Training Time: {train_end_time - train_start_time:.2f}s')
print(f'Test Time: {test_end_time - test_start_time:.2f}s')


  return _pandas_is_categorical_dtype(dt)
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
[I 2024-06-01 23:51:43,145] A new study created in memory with name: no-name-161f78c0-7cac-4b3e-8d44-9be182b5f44b
[I 2024-06-02 00:06:52,297] Trial 0 finished with value: 0.6884732246398926 and parameters: {'batch_size': 16, 'learning_rate': 0.0005801227582949602, 'dropout_rate': 0.34200829077452943}. Best is trial 0 with value: 0.6884732246398926.
[I 2024-06-02 00:21:55,879] Trial 1 finished with value: 0.7250295877456665 and parameters: {'batch_size': 16, 'learning_rate': 0.0008760008270346263, 'dropout_rate': 0.2576031911629467}. Best is trial 0 with value: 0.6884732246398926.
[I 2024-06-02 00:38:06,246] Trial 2 finished with value: 0.6748929619789124 and parameters: {'batch_size': 32, 'learning_rate': 0.0003391808215470797, 'dropout_rate': 0.2904514869515771}. Best is trial 2 with value: 0.6748929619789124.
[I 2024-06-02 00:53:16,047] Trial 3 finished with value: 0.689

Epoch 1/5, Loss: 0.9131436944007874, Time: 192.25s
Epoch 2/5, Loss: 0.8749574820200602, Time: 192.09s
Epoch 3/5, Loss: 0.739409883817037, Time: 191.20s
Epoch 4/5, Loss: 0.8021972974141439, Time: 193.72s
Epoch 5/5, Loss: 0.7857325077056885, Time: 194.10s
Total Training Time: 963.35s
Total Evaluation Time: 11.42s
scBERT Test Accuracy: 0.4
Training Time: 963.35s
Test Time: 14.42s
Epoch 1/5, Loss: 1.1051663517951966, Time: 222.40s
Epoch 2/5, Loss: 0.7276015043258667, Time: 223.12s
Epoch 3/5, Loss: 0.7035327911376953, Time: 220.05s
Epoch 4/5, Loss: 0.6972214102745056, Time: 216.57s
Epoch 5/5, Loss: 0.7177702903747558, Time: 218.97s
Total Training Time: 1101.12s
Total Evaluation Time: 14.76s
scGPT Test Accuracy: 0.6
Training Time: 1101.12s
Test Time: 15.59s


In [6]:
import time
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import scipy.sparse as sparse

# Load scRNA data
data_path = '/users/barmanjy/Desktop/Persister Cell/GSE150949_scRNA.csv'
data = pd.read_csv(data_path, nrows=100)

# Ensure 'persister_label' column exists
if 'persister_label' not in data.columns:
    np.random.seed(42)  # For reproducibility
    data['persister_label'] = np.random.randint(0, 2, size=len(data))

# Extract actual labels
actual_labels = data['persister_label'].values

# Drop the label column from the data
data = data.drop(columns=['persister_label'])

# Preprocess the scRNA data
def preprocess_scRNA_data(data):
    adata = sc.AnnData(sparse.csr_matrix(data.values))  # Convert to sparse matrix
    sc.pp.normalize_total(adata, target_sum=1e4)  # Normalize
    sc.pp.log1p(adata)  # Logarithmic transformation
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)  # Select highly variable genes
    adata.obs['batch'] = np.random.randint(0, 2, size=adata.shape[0])  # Dummy batch column for batch correction
    sc.pp.combat(adata, key='batch')  # Batch correction
    return adata.X

# Preprocess scRNA data
X_data = preprocess_scRNA_data(data)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_data)

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, actual_labels, test_size=0.2, random_state=42)

# Define custom dataset for DataLoader
class RNASeqDataset(Dataset):
    def __init__(self, X, y, tokenizer, model_type='bert', max_length=512):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sequence = ' '.join(map(str, self.X[idx]))
        label = self.y[idx]

        if self.model_type == 'bert':
            inputs = self.tokenizer(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
            inputs = {key: tensor.squeeze(0) for key, tensor in inputs.items()}
        elif self.model_type == 'gpt':
            inputs = self.tokenizer.encode(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length).squeeze(0)

        return inputs, label

# Define scBERT model
class SCBERTClassifier(nn.Module):
    def __init__(self, pretrained_model='bert-base-uncased', num_classes=2):
        super(SCBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Define scGPT model
class SCGPTClassifier(nn.Module):
    def __init__(self, pretrained_model='gpt2', num_classes=2):
        super(SCGPTClassifier, self).__init__()
        self.gpt = GPT2Model.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.gpt.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Add a padding token to the GPT2 tokenizer
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

# Instantiate tokenizer and models
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
scbert_model = SCBERTClassifier()
scgpt_model = SCGPTClassifier()

# Define training parameters
batch_size = 16  # Reduced batch size
num_epochs = 5  # Reduced number of epochs
learning_rate = 1e-4

# Create datasets and dataloaders
train_dataset_bert = RNASeqDataset(X_train, y_train, bert_tokenizer, model_type='bert')
train_dataset_gpt = RNASeqDataset(X_train, y_train, gpt_tokenizer, model_type='gpt')

test_dataset_bert = RNASeqDataset(X_test, y_test, bert_tokenizer, model_type='bert')
test_dataset_gpt = RNASeqDataset(X_test, y_test, gpt_tokenizer, model_type='gpt')

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size, shuffle=True)
test_dataloader_gpt = DataLoader(test_dataset_gpt, batch_size=batch_size, shuffle=False)

# Define training and evaluation functions
criterion = nn.CrossEntropyLoss()

def train_model(model, optimizer, train_dataloader, criterion, num_epochs, patience=3):
    model.train()
    best_loss = float('inf')
    early_stop_count = 0
    total_training_time = 0

    for epoch in range(num_epochs):
        start_time = time.time()
        running_loss = 0.0
        for inputs, labels in train_dataloader:
            optimizer.zero_grad()
            if isinstance(inputs, dict):  # For BERT
                input_ids = inputs['input_ids']
                attention_mask = inputs['attention_mask']
                outputs = model(input_ids, attention_mask)
            else:  # For GPT
                input_ids = inputs
                outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_dataloader)
        end_time = time.time()
        epoch_time = end_time - start_time
        total_training_time += epoch_time
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}, Time: {epoch_time:.2f}s')

        # Early stopping
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    print(f'Total Training Time: {total_training_time:.2f}s')

def evaluate_model(model, test_dataloader):
    model.eval()
    correct = 0
    total = 0
    total_evaluation_time = 0

    with torch.no_grad():
        for inputs, labels in test_dataloader:
            start_time = time.time()
            if isinstance(inputs, dict):  # For BERT
                input_ids = inputs['input_ids']
                attention_mask = inputs['attention_mask']
                outputs = model(input_ids, attention_mask)
            else:  # For GPT
                input_ids = inputs
                outputs = model(input_ids)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            end_time = time.time()
            total_evaluation_time += (end_time - start_time)

    accuracy = correct / total
    print(f'Total Evaluation Time: {total_evaluation_time:.2f}s')
    return accuracy

# Hyperparameter tuning with Optuna for scBERT
def objective(trial):
    batch_size = trial.suggest_categorical('batch_size', [16, 32])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    scbert_model = SCBERTClassifier()
    scbert_model.dropout = nn.Dropout(dropout_rate)
    optimizer = optim.Adam(scbert_model.parameters(), lr=learning_rate)

    train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
    test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

    train_model(scbert_model, optimizer, train_dataloader_bert, criterion, num_epochs=5)
    accuracy = evaluate_model(scbert_model, test_dataloader_bert)

    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

best_params = study.best_params
print(f'Best Parameters: {best_params}')

# Train and evaluate the models with the best hyperparameters
scbert_optimizer = optim.Adam(scbert_model.parameters(), lr=best_params['learning_rate'])
train_model(scbert_model, scbert_optimizer, train_dataloader_bert, criterion, num_epochs)
scbert_accuracy = evaluate_model(scbert_model, test_dataloader_bert)
print(f'scBERT Test Accuracy: {scbert_accuracy * 100:.2f}%')

scgpt_optimizer = optim.Adam(scgpt_model.parameters(), lr=learning_rate)
train_model(scgpt_model, scgpt_optimizer, train_dataloader_gpt, criterion, num_epochs)
scgpt_accuracy = evaluate_model(scgpt_model, test_dataloader_gpt)
print(f'scGPT Test Accuracy: {scgpt_accuracy * 100:.2f}%')

# Based on the best accuracy, classify the entire dataset
if scbert_accuracy > scgpt_accuracy:
    best_model = scbert_model
    best_tokenizer = bert_tokenizer
else:
    best_model = scgpt_model
    best_tokenizer = gpt_tokenizer

# Create a dataloader for the entire dataset
full_dataset = RNASeqDataset(X_scaled, actual_labels, best_tokenizer, model_type='bert' if best_model == scbert_model else 'gpt')
full_dataloader = DataLoader(full_dataset, batch_size=batch_size, shuffle=False)

# Classify the entire dataset
def classify_dataset(model, dataloader):
    model.eval()
    predictions = []

    with torch.no_grad():
        for inputs, _ in dataloader:
            if isinstance(inputs, dict):  # For BERT
                input_ids = inputs['input_ids']
                attention_mask = inputs['attention_mask']
                outputs = model(input_ids, attention_mask)
            else:  # For GPT
                input_ids = inputs
                outputs = model(input_ids)
            _, predicted = torch.max(outputs.data, 1)
            predictions.extend(predicted.cpu().numpy())

    return np.array(predictions)

# Get predictions
predictions = classify_dataset(best_model, full_dataloader)
data['predicted_label'] = predictions

# Save the classified dataset
data['persister_label'] = actual_labels  # Add the actual labels back to the data
data.to_csv('/users/barmanjy/Desktop/Persister Cell/GSE150949_scRNA_classified.csv', index=False)

print('Classification completed and saved to GSE150949_scRNA_classified.csv')


  return _pandas_is_categorical_dtype(dt)
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
[I 2024-06-02 10:26:09,472] A new study created in memory with name: no-name-aab75bb0-a22f-4637-b6aa-16e01287bb21


Epoch 1/5, Loss: 0.7172313531239828, Time: 218.94s
Epoch 2/5, Loss: 0.6894198656082153, Time: 205.49s
Epoch 3/5, Loss: 0.6763014793395996, Time: 206.48s
Epoch 4/5, Loss: 0.6869503657023112, Time: 207.51s
Epoch 5/5, Loss: 0.6914328535397848, Time: 205.59s
Total Training Time: 1044.00s


[I 2024-06-02 10:43:49,282] Trial 0 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 2.098253023730156e-05, 'dropout_rate': 0.19555086246733772}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 12.21s
Epoch 1/5, Loss: 0.8275733947753906, Time: 197.33s
Epoch 2/5, Loss: 1.1343012690544128, Time: 197.61s
Epoch 3/5, Loss: 0.8109597086906433, Time: 199.33s
Epoch 4/5, Loss: 0.7754833459854126, Time: 199.11s
Epoch 5/5, Loss: 0.752349579334259, Time: 199.21s
Total Training Time: 992.60s


[I 2024-06-02 11:00:37,180] Trial 1 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 0.0008388677688757641, 'dropout_rate': 0.4327451740040592}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.72s
Epoch 1/5, Loss: 0.6863109111785889, Time: 198.70s
Epoch 2/5, Loss: 0.7264795780181885, Time: 198.94s
Epoch 3/5, Loss: 0.6964643478393555, Time: 197.94s
Epoch 4/5, Loss: 0.7082552313804626, Time: 198.87s
Early stopping at epoch 4
Total Training Time: 794.45s


[I 2024-06-02 11:14:06,889] Trial 2 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 2.3286739200669534e-05, 'dropout_rate': 0.40626538961752445}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.68s
Epoch 1/5, Loss: 0.6752307415008545, Time: 197.31s
Epoch 2/5, Loss: 0.6860402822494507, Time: 197.78s
Epoch 3/5, Loss: 0.6839542388916016, Time: 197.77s
Epoch 4/5, Loss: 0.7701284766197205, Time: 199.22s
Early stopping at epoch 4
Total Training Time: 792.08s


[I 2024-06-02 11:27:34,217] Trial 3 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 1.3507522438771335e-05, 'dropout_rate': 0.3720618926402155}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.68s
Epoch 1/5, Loss: 0.8068356037139892, Time: 197.82s
Epoch 2/5, Loss: 0.6772196292877197, Time: 204.28s
Epoch 3/5, Loss: 0.7117452502250672, Time: 199.76s
Epoch 4/5, Loss: 0.7425118684768677, Time: 200.01s
Epoch 5/5, Loss: 0.7200944662094116, Time: 200.62s
Early stopping at epoch 5
Total Training Time: 1002.49s


[I 2024-06-02 11:44:32,148] Trial 4 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 2.4009374827438337e-05, 'dropout_rate': 0.4657248275799887}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.81s
Epoch 1/5, Loss: 0.7106072505315145, Time: 206.25s
Epoch 2/5, Loss: 0.6830528577168783, Time: 205.46s
Epoch 3/5, Loss: 0.6962872346242269, Time: 205.06s
Epoch 4/5, Loss: 0.6830102602640787, Time: 205.52s
Epoch 5/5, Loss: 0.6973384817441305, Time: 205.00s
Total Training Time: 1027.30s


[I 2024-06-02 12:01:58,968] Trial 5 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 5.345235488360063e-05, 'dropout_rate': 0.16607426917231832}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.69s
Epoch 1/5, Loss: 0.8741228699684143, Time: 199.69s
Epoch 2/5, Loss: 0.7158250570297241, Time: 198.81s
Epoch 3/5, Loss: 0.7003739953041077, Time: 200.52s
Epoch 4/5, Loss: 0.6879575967788696, Time: 198.80s
Epoch 5/5, Loss: 0.7203323006629944, Time: 199.33s
Total Training Time: 997.16s


[I 2024-06-02 12:18:51,450] Trial 6 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 6.403628387660492e-05, 'dropout_rate': 0.1620680100232344}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.69s
Epoch 1/5, Loss: 0.7190415461858114, Time: 205.26s
Epoch 2/5, Loss: 0.7090791463851929, Time: 205.42s
Epoch 3/5, Loss: 0.7228625814119974, Time: 205.11s
Epoch 4/5, Loss: 0.6798882285753886, Time: 205.19s
Epoch 5/5, Loss: 0.7156826257705688, Time: 204.64s
Total Training Time: 1025.63s


[I 2024-06-02 12:36:12,367] Trial 7 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 0.0002760435489246661, 'dropout_rate': 0.18976868622477316}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.68s
Epoch 1/5, Loss: 0.7654791275660197, Time: 206.58s
Epoch 2/5, Loss: 0.7141641775767008, Time: 205.31s
Epoch 3/5, Loss: 0.6947280168533325, Time: 206.03s
Epoch 4/5, Loss: 0.692451000213623, Time: 212.50s
Epoch 5/5, Loss: 0.6840793490409851, Time: 209.10s
Total Training Time: 1039.53s


[I 2024-06-02 12:53:47,638] Trial 8 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 2.3895994627989614e-05, 'dropout_rate': 0.2335692743329812}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 12.12s
Epoch 1/5, Loss: 0.7838709553082784, Time: 206.17s
Epoch 2/5, Loss: 0.7584385474522909, Time: 205.07s
Epoch 3/5, Loss: 0.6967947284380595, Time: 204.66s
Epoch 4/5, Loss: 0.7047132651011149, Time: 205.08s
Epoch 5/5, Loss: 0.676358699798584, Time: 206.14s
Total Training Time: 1027.12s


[I 2024-06-02 13:11:10,760] Trial 9 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 0.00014005290868161893, 'dropout_rate': 0.30871518847676394}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.69s
Best Parameters: {'batch_size': 32, 'learning_rate': 2.098253023730156e-05, 'dropout_rate': 0.19555086246733772}
Epoch 1/5, Loss: 0.7070582509040833, Time: 197.87s
Epoch 2/5, Loss: 0.7088041543960572, Time: 200.04s
Epoch 3/5, Loss: 0.7335042834281922, Time: 199.59s
Epoch 4/5, Loss: 0.7085669636726379, Time: 199.79s
Early stopping at epoch 4
Total Training Time: 797.28s
Total Evaluation Time: 11.67s
scBERT Test Accuracy: 60.00%
Epoch 1/5, Loss: 1.1985112190246583, Time: 253.46s
Epoch 2/5, Loss: 0.7165852546691894, Time: 251.75s
Epoch 3/5, Loss: 0.7542230725288391, Time: 248.90s
Epoch 4/5, Loss: 0.7002256631851196, Time: 247.27s
Epoch 5/5, Loss: 0.7284901022911072, Time: 248.42s
Total Training Time: 1249.78s
Total Evaluation Time: 17.04s
scGPT Test Accuracy: 60.00%
Classification completed and saved to GSE150949_scRNA_classified.csv


In [None]:
´02/06/2024

In [7]:
import time
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import scipy.sparse as sparse

# Load scRNA data
data_path = '/users/barmanjy/Desktop/Persister Cell/GSE150949_scRNA.csv'
data = pd.read_csv(data_path, nrows=100)

# Ensure 'persister_label' column exists
if 'persister_label' not in data.columns:
    np.random.seed(42)  # For reproducibility
    data['persister_label'] = np.random.randint(0, 2, size=len(data))

# Extract actual labels
actual_labels = data['persister_label'].values

# Drop the label column from the data
data = data.drop(columns=['persister_label'])

# Preprocess the scRNA data
def preprocess_scRNA_data(data):
    adata = sc.AnnData(sparse.csr_matrix(data.values))  # Convert to sparse matrix
    sc.pp.normalize_total(adata, target_sum=1e4)  # Normalize
    sc.pp.log1p(adata)  # Logarithmic transformation
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)  # Select highly variable genes
    adata.obs['batch'] = np.random.randint(0, 2, size=adata.shape[0])  # Dummy batch column for batch correction
    sc.pp.combat(adata, key='batch')  # Batch correction
    return adata.X

# Preprocess scRNA data
X_data = preprocess_scRNA_data(data)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_data)

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, actual_labels, test_size=0.2, random_state=42)

# Define custom dataset for DataLoader
class RNASeqDataset(Dataset):
    def __init__(self, X, y, tokenizer, model_type='bert', max_length=512):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sequence = ' '.join(map(str, self.X[idx]))
        label = self.y[idx]

        if self.model_type == 'bert':
            inputs = self.tokenizer(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
            inputs = {key: tensor.squeeze(0) for key, tensor in inputs.items()}
        elif self.model_type == 'gpt':
            inputs = self.tokenizer.encode(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length).squeeze(0)

        return inputs, label

# Define scBERT model
class SCBERTClassifier(nn.Module):
    def __init__(self, pretrained_model='bert-base-uncased', num_classes=2):
        super(SCBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Define scGPT model
class SCGPTClassifier(nn.Module):
    def __init__(self, pretrained_model='gpt2', num_classes=2):
        super(SCGPTClassifier, self).__init__()
        self.gpt = GPT2Model.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.gpt.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Add a padding token to the GPT2 tokenizer
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

# Instantiate tokenizer and models
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
scbert_model = SCBERTClassifier()
scgpt_model = SCGPTClassifier()

# Define training parameters
batch_size = 16
num_epochs = 5
learning_rate = 1e-4

# Create datasets and dataloaders
train_dataset_bert = RNASeqDataset(X_train, y_train, bert_tokenizer, model_type='bert')
train_dataset_gpt = RNASeqDataset(X_train, y_train, gpt_tokenizer, model_type='gpt')

test_dataset_bert = RNASeqDataset(X_test, y_test, bert_tokenizer, model_type='bert')
test_dataset_gpt = RNASeqDataset(X_test, y_test, gpt_tokenizer, model_type='gpt')

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size, shuffle=True)
test_dataloader_gpt = DataLoader(test_dataset_gpt, batch_size=batch_size, shuffle=False)

# Define training and evaluation functions
criterion = nn.CrossEntropyLoss()

def train_model(model, optimizer, train_dataloader, criterion, num_epochs, patience=3):
    model.train()
    best_loss = float('inf')
    early_stop_count = 0
    total_training_time = 0

    for epoch in range(num_epochs):
        start_time = time.time()
        running_loss = 0.0
        for inputs, labels in train_dataloader:
            optimizer.zero_grad()
            if isinstance(inputs, dict):  # For BERT
                input_ids = inputs['input_ids']
                attention_mask = inputs['attention_mask']
                outputs = model(input_ids, attention_mask)
            else:  # For GPT
                input_ids = inputs
                outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_dataloader)
        end_time = time.time()
        epoch_time = end_time - start_time
        total_training_time += epoch_time
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}, Time: {epoch_time:.2f}s')

        # Early stopping
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    print(f'Total Training Time: {total_training_time:.2f}s')

def evaluate_model(model, test_dataloader):
    model.eval()
    correct = 0
    total = 0
    total_evaluation_time = 0

    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for inputs, labels in test_dataloader:
            start_time = time.time()
            if isinstance(inputs, dict):  # For BERT
                input_ids = inputs['input_ids']
                attention_mask = inputs['attention_mask']
                outputs = model(input_ids, attention_mask)
            else:  # For GPT
                input_ids = inputs
                outputs = model(input_ids)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            end_time = time.time()
            total_evaluation_time += (end_time - start_time)

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    accuracy = correct / total
    f1 = f1_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)

    print(f'Total Evaluation Time: {total_evaluation_time:.2f}s')
    print(f'Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')
    return accuracy, f1, precision, recall

# Hyperparameter tuning with Optuna for scBERT
def objective(trial):
    batch_size = trial.suggest_categorical('batch_size', [16, 32])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    scbert_model = SCBERTClassifier()
    scbert_model.dropout = nn.Dropout(dropout_rate)
    optimizer = optim.Adam(scbert_model.parameters(), lr=learning_rate)

    train_dataloader = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

    train_model(scbert_model, optimizer, train_dataloader, criterion, num_epochs=3)  # Reduce epochs for tuning
    accuracy, _, _, _ = evaluate_model(scbert_model, test_dataloader)

    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # Increase number of trials

best_params = study.best_params
print(f'Best hyperparameters: {best_params}')

# Train and evaluate with best hyperparameters
best_scbert_model = SCBERTClassifier()
best_scbert_model.dropout = nn.Dropout(best_params['dropout_rate'])
best_optimizer = optim.Adam(best_scbert_model.parameters(), lr=best_params['learning_rate'])

train_dataloader_best = DataLoader(train_dataset_bert, batch_size=best_params['batch_size'], shuffle=True)
test_dataloader_best = DataLoader(test_dataset_bert, batch_size=best_params['batch_size'], shuffle=False)

train_model(best_scbert_model, best_optimizer, train_dataloader_best, criterion, num_epochs)
evaluate_model(best_scbert_model, test_dataloader_best)

# Ensemble Model
class EnsembleModel(nn.Module):
    def __init__(self, model1, model2):
        super(EnsembleModel, self).__init__()
        self.model1 = model1
        self.model2 = model2
        self.fc = nn.Linear(4, 2)  # Combining logits from two models

    def forward(self, inputs1, inputs2):
        logits1 = self.model1(**inputs1)
        logits2 = self.model2(inputs2)
        combined_logits = torch.cat((logits1, logits2), dim=1)
        return self.fc(combined_logits)

# Instantiate and train ensemble model
best_scgpt_model = SCGPTClassifier()
best_scgpt_model.dropout = nn.Dropout(best_params['dropout_rate'])
best_optimizer_gpt = optim.Adam(best_scgpt_model.parameters(), lr=best_params['learning_rate'])

ensemble_model = EnsembleModel(best_scbert_model, best_scgpt_model)
ensemble_optimizer = optim.Adam(ensemble_model.parameters(), lr=best_params['learning_rate'])

train_model(ensemble_model, ensemble_optimizer, zip(train_dataloader_best, train_dataloader_gpt), criterion, num_epochs)
evaluate_model(ensemble_model, zip(test_dataloader_best, test_dataloader_gpt))

# Save the models
torch.save(best_scbert_model.state_dict(), 'best_scbert_model.pth')
torch.save(best_scgpt_model.state_dict(), 'best_scgpt_model.pth')
torch.save(ensemble_model.state_dict(), 'ensemble_model.pth')


  return _pandas_is_categorical_dtype(dt)
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
[I 2024-06-02 15:54:40,990] A new study created in memory with name: no-name-f544af6d-538b-4225-8605-442b94ae1e4d


Epoch 1/3, Loss: 0.6780230601628622, Time: 218.71s
Epoch 2/3, Loss: 0.6905593673388163, Time: 209.72s
Epoch 3/3, Loss: 0.7033189733823141, Time: 211.26s
Total Training Time: 639.70s


[I 2024-06-02 16:05:36,822] Trial 0 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 1.8963127214339294e-05, 'dropout_rate': 0.1776864212297564}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 12.50s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.6987577875455221, Time: 208.31s
Epoch 2/3, Loss: 0.7386711835861206, Time: 209.64s
Epoch 3/3, Loss: 0.7096268534660339, Time: 208.14s
Total Training Time: 626.09s


[I 2024-06-02 16:16:18,258] Trial 1 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 5.488470193938184e-05, 'dropout_rate': 0.42130184772980217}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.72s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.70217231909434, Time: 207.83s
Epoch 2/3, Loss: 0.7106437484423319, Time: 207.39s
Epoch 3/3, Loss: 0.6777363220850626, Time: 207.69s
Total Training Time: 622.91s


[I 2024-06-02 16:26:56,432] Trial 2 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 3.303218334756295e-05, 'dropout_rate': 0.15382115938372817}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.69s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7346393267313639, Time: 199.22s
Epoch 2/3, Loss: 0.6964345375696818, Time: 201.69s
Epoch 3/3, Loss: 0.7583057085673014, Time: 199.21s
Total Training Time: 600.13s


[I 2024-06-02 16:37:11,121] Trial 3 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 1.0127508697564435e-05, 'dropout_rate': 0.47708033618956713}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 10.94s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7293882369995117, Time: 199.37s
Epoch 2/3, Loss: 0.735450804233551, Time: 198.52s
Epoch 3/3, Loss: 0.7473359704017639, Time: 201.01s
Total Training Time: 598.90s


[I 2024-06-02 16:47:24,587] Trial 4 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 2.4583251996602115e-05, 'dropout_rate': 0.3442859817851834}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 10.96s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7789265712102255, Time: 199.23s
Epoch 2/3, Loss: 0.7214029232660929, Time: 200.19s
Epoch 3/3, Loss: 0.7364567518234253, Time: 200.18s
Total Training Time: 599.60s


[I 2024-06-02 16:57:41,496] Trial 5 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 5.841198893993964e-05, 'dropout_rate': 0.3291469630481474}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.01s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7202614188194275, Time: 187.00s
Epoch 2/3, Loss: 0.8547983884811401, Time: 185.33s
Epoch 3/3, Loss: 0.7221110701560974, Time: 184.73s
Total Training Time: 557.06s


  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-06-02 17:07:13,249] Trial 6 finished with value: 0.4 and parameters: {'batch_size': 16, 'learning_rate': 0.000862713350175145, 'dropout_rate': 0.25593902467449203}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.02s
Accuracy: 0.4000, F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000
Epoch 1/3, Loss: 0.6983048915863037, Time: 201.74s
Epoch 2/3, Loss: 0.7184461951255798, Time: 198.30s
Epoch 3/3, Loss: 0.7039341330528259, Time: 206.14s
Total Training Time: 606.17s


[I 2024-06-02 17:17:36,919] Trial 7 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 5.3402766642799504e-05, 'dropout_rate': 0.3932810915551864}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.37s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.8009879231452942, Time: 187.90s
Epoch 2/3, Loss: 0.7198788166046143, Time: 185.66s
Epoch 3/3, Loss: 0.6847321629524231, Time: 186.62s
Total Training Time: 560.19s


[I 2024-06-02 17:27:12,706] Trial 8 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 1.449005987349287e-05, 'dropout_rate': 0.4444108593916093}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.01s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7537136077880859, Time: 196.75s
Epoch 2/3, Loss: 0.7983022729555765, Time: 198.78s
Epoch 3/3, Loss: 0.7587332924207052, Time: 195.67s
Total Training Time: 591.20s


[I 2024-06-02 17:37:20,786] Trial 9 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 0.0002903787488948635, 'dropout_rate': 0.3443461500554773}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 10.99s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7988574504852295, Time: 186.80s
Epoch 2/3, Loss: 0.7149532675743103, Time: 183.45s
Epoch 3/3, Loss: 0.727476167678833, Time: 185.51s
Total Training Time: 555.76s


[I 2024-06-02 17:46:51,147] Trial 10 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 0.00018588779964394118, 'dropout_rate': 0.125892721738189}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 10.98s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7186281879742941, Time: 198.35s
Epoch 2/3, Loss: 0.7152939438819885, Time: 196.71s
Epoch 3/3, Loss: 0.7310625910758972, Time: 198.44s
Total Training Time: 593.49s


[I 2024-06-02 17:56:59,264] Trial 11 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 0.0001068435676095799, 'dropout_rate': 0.22081780856714772}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.01s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.6890195608139038, Time: 199.41s
Epoch 2/3, Loss: 0.7587713400522867, Time: 199.36s
Epoch 3/3, Loss: 0.7292925914128622, Time: 198.52s
Total Training Time: 597.29s


  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-06-02 18:07:11,303] Trial 12 finished with value: 0.4 and parameters: {'batch_size': 32, 'learning_rate': 2.2982433487900423e-05, 'dropout_rate': 0.20995601508782558}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 10.95s
Accuracy: 0.4000, F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000
Epoch 1/3, Loss: 0.8025000691413879, Time: 199.02s
Epoch 2/3, Loss: 0.7087345719337463, Time: 202.34s
Epoch 3/3, Loss: 0.7082057396570841, Time: 200.38s
Total Training Time: 601.74s


[I 2024-06-02 18:17:27,576] Trial 13 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 6.021779112254828e-05, 'dropout_rate': 0.4083065663867058}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 10.92s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.8293628295262655, Time: 198.86s
Epoch 2/3, Loss: 0.682129979133606, Time: 204.57s
Epoch 3/3, Loss: 0.6906513770421346, Time: 201.14s
Total Training Time: 604.57s


[I 2024-06-02 18:27:46,873] Trial 14 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 3.42734524910798e-05, 'dropout_rate': 0.2759039831008053}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.12s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7036460041999817, Time: 186.43s
Epoch 2/3, Loss: 0.7658825397491456, Time: 186.66s
Epoch 3/3, Loss: 0.6866638422012329, Time: 188.30s
Total Training Time: 561.39s


[I 2024-06-02 18:37:23,292] Trial 15 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 0.0001205439860713912, 'dropout_rate': 0.17147119402704455}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.00s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.6978557308514913, Time: 198.68s
Epoch 2/3, Loss: 0.7352397044499716, Time: 201.14s
Epoch 3/3, Loss: 0.7074801127115885, Time: 198.45s
Total Training Time: 598.27s


[I 2024-06-02 18:47:36,234] Trial 16 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 1.572766744104568e-05, 'dropout_rate': 0.10928264087808867}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 10.97s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7576595346132914, Time: 198.72s
Epoch 2/3, Loss: 0.7842166423797607, Time: 201.07s
Epoch 3/3, Loss: 0.7398517529169718, Time: 199.40s
Total Training Time: 599.19s


[I 2024-06-02 18:57:50,074] Trial 17 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 0.00038390197175170837, 'dropout_rate': 0.4978866575936191}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.06s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.765293276309967, Time: 188.14s
Epoch 2/3, Loss: 0.7261101484298706, Time: 188.24s
Epoch 3/3, Loss: 0.7108487010002136, Time: 186.90s
Total Training Time: 563.28s


[I 2024-06-02 19:07:27,919] Trial 18 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 4.269425696292457e-05, 'dropout_rate': 0.391581084816316}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 10.97s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7436188062032064, Time: 197.42s
Epoch 2/3, Loss: 0.7019838492075602, Time: 196.13s
Epoch 3/3, Loss: 0.7225064039230347, Time: 197.42s
Total Training Time: 590.97s


[I 2024-06-02 19:17:33,458] Trial 19 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 7.709929351599386e-05, 'dropout_rate': 0.21635759857365777}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 10.95s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7517266472180685, Time: 199.09s
Epoch 2/3, Loss: 0.726882537206014, Time: 201.08s
Epoch 3/3, Loss: 0.6961719989776611, Time: 199.33s
Total Training Time: 599.51s


  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-06-02 19:27:47,557] Trial 20 finished with value: 0.4 and parameters: {'batch_size': 32, 'learning_rate': 2.1118254026193032e-05, 'dropout_rate': 0.2936073071180133}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.02s
Accuracy: 0.4000, F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000
Epoch 1/3, Loss: 0.7186567783355713, Time: 199.84s
Epoch 2/3, Loss: 0.7235923409461975, Time: 205.24s
Epoch 3/3, Loss: 0.7187032500902811, Time: 201.13s
Total Training Time: 606.20s


[I 2024-06-02 19:38:08,467] Trial 21 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 3.289985233675352e-05, 'dropout_rate': 0.16106600322438408}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.06s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.735877255598704, Time: 199.18s
Epoch 2/3, Loss: 0.6784356832504272, Time: 200.49s
Epoch 3/3, Loss: 0.7201308409372965, Time: 197.76s
Total Training Time: 597.43s


[I 2024-06-02 19:48:20,845] Trial 22 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 1.0183695912242028e-05, 'dropout_rate': 0.1592722856637663}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.00s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7513393759727478, Time: 198.87s
Epoch 2/3, Loss: 0.7036523421605428, Time: 198.70s
Epoch 3/3, Loss: 0.7002967596054077, Time: 198.14s
Total Training Time: 595.71s


[I 2024-06-02 19:58:31,238] Trial 23 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 3.171776769193733e-05, 'dropout_rate': 0.2395270918202026}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 10.94s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7008519967397054, Time: 199.01s
Epoch 2/3, Loss: 0.7074482043584188, Time: 199.92s
Epoch 3/3, Loss: 0.6965252757072449, Time: 199.99s
Total Training Time: 598.92s


[I 2024-06-02 20:08:44,688] Trial 24 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 1.668552378794278e-05, 'dropout_rate': 0.18612218040262074}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 10.97s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7588207523028055, Time: 199.07s
Epoch 2/3, Loss: 0.7170271476109823, Time: 199.80s
Epoch 3/3, Loss: 0.7246663769086202, Time: 199.74s
Total Training Time: 598.61s


[I 2024-06-02 20:18:57,876] Trial 25 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 8.41783904384156e-05, 'dropout_rate': 0.13904083091842243}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 10.98s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7641647060712179, Time: 199.93s
Epoch 2/3, Loss: 0.7293467322985331, Time: 200.51s
Epoch 3/3, Loss: 0.7164691090583801, Time: 199.97s
Total Training Time: 600.40s


[I 2024-06-02 20:29:13,017] Trial 26 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 3.9474189339080366e-05, 'dropout_rate': 0.10365509147368349}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.13s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.6896176218986512, Time: 186.91s
Epoch 2/3, Loss: 0.791819941997528, Time: 185.85s
Epoch 3/3, Loss: 0.7040169596672058, Time: 186.49s
Total Training Time: 559.25s


[I 2024-06-02 20:38:46,962] Trial 27 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 0.00018020541246474735, 'dropout_rate': 0.1877795398878549}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.03s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.8308999141057333, Time: 210.70s
Epoch 2/3, Loss: 0.7734154860178629, Time: 211.70s
Epoch 3/3, Loss: 0.7818954785664877, Time: 208.26s
Total Training Time: 630.66s


[I 2024-06-02 20:49:35,313] Trial 28 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 2.6739592273235555e-05, 'dropout_rate': 0.3107424928476582}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.96s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.6449164748191833, Time: 208.26s
Epoch 2/3, Loss: 0.7140778501828512, Time: 209.59s
Epoch 3/3, Loss: 0.7631835540135702, Time: 208.62s
Total Training Time: 626.46s


[I 2024-06-02 21:00:20,004] Trial 29 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 4.5836778258557655e-05, 'dropout_rate': 0.4536538187616566}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.76s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.675674041112264, Time: 209.26s
Epoch 2/3, Loss: 0.761301060517629, Time: 210.26s
Epoch 3/3, Loss: 0.6997840404510498, Time: 208.64s
Total Training Time: 628.15s


[I 2024-06-02 21:11:03,454] Trial 30 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 1.256459225869341e-05, 'dropout_rate': 0.37402420279287}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.70s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7297466397285461, Time: 208.50s
Epoch 2/3, Loss: 0.738485316435496, Time: 209.59s
Epoch 3/3, Loss: 0.7557408809661865, Time: 208.33s
Total Training Time: 626.42s


[I 2024-06-02 21:21:45,199] Trial 31 finished with value: 0.55 and parameters: {'batch_size': 32, 'learning_rate': 1.0210040555535325e-05, 'dropout_rate': 0.4908921124835622}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.67s
Accuracy: 0.5500, F1 Score: 0.7097, Precision: 0.5789, Recall: 0.9167
Epoch 1/3, Loss: 0.7661988337834676, Time: 209.03s
Epoch 2/3, Loss: 0.6905753016471863, Time: 208.75s
Epoch 3/3, Loss: 0.7472078998883566, Time: 208.83s
Total Training Time: 626.62s


[I 2024-06-02 21:32:27,158] Trial 32 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 1.906418098743823e-05, 'dropout_rate': 0.4541572502965262}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.70s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.799682100613912, Time: 209.83s
Epoch 2/3, Loss: 0.714357594648997, Time: 208.74s
Epoch 3/3, Loss: 0.7020119825998942, Time: 209.06s
Total Training Time: 627.63s


[I 2024-06-02 21:43:10,147] Trial 33 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 2.578264651593899e-05, 'dropout_rate': 0.42589860157697174}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.76s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7158677379290262, Time: 209.25s
Epoch 2/3, Loss: 0.7859827280044556, Time: 208.35s
Epoch 3/3, Loss: 0.7356532017389933, Time: 209.35s
Total Training Time: 626.96s


[I 2024-06-02 21:53:52,400] Trial 34 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 6.17786859647773e-05, 'dropout_rate': 0.47557982078478045}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.70s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7221959829330444, Time: 210.65s
Epoch 2/3, Loss: 0.7272068659464518, Time: 209.53s
Epoch 3/3, Loss: 0.6980534195899963, Time: 208.05s
Total Training Time: 628.23s


[I 2024-06-02 22:04:35,932] Trial 35 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 1.3288226988707822e-05, 'dropout_rate': 0.3417334746214215}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.70s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7867297649383544, Time: 201.15s
Epoch 2/3, Loss: 0.7513758897781372, Time: 201.20s
Epoch 3/3, Loss: 0.764035415649414, Time: 199.27s
Total Training Time: 601.62s


[I 2024-06-02 22:14:53,063] Trial 36 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 0.0008996523847039539, 'dropout_rate': 0.36859550216733095}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.85s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7293193538983663, Time: 206.03s
Epoch 2/3, Loss: 0.7136633992195129, Time: 206.98s
Epoch 3/3, Loss: 0.7072166800498962, Time: 206.69s
Total Training Time: 619.71s


[I 2024-06-02 22:25:28,376] Trial 37 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 1.888986068305369e-05, 'dropout_rate': 0.4277482034006551}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.76s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7404240369796753, Time: 200.04s
Epoch 2/3, Loss: 0.7131534099578858, Time: 197.82s
Epoch 3/3, Loss: 0.7132641315460205, Time: 198.30s
Total Training Time: 596.15s


[I 2024-06-02 22:35:39,987] Trial 38 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 2.8682714745587966e-05, 'dropout_rate': 0.254383284708576}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.67s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.718638559182485, Time: 223.20s
Epoch 2/3, Loss: 0.7038586735725403, Time: 213.10s
Epoch 3/3, Loss: 0.7099942962328593, Time: 209.11s
Total Training Time: 645.41s


[I 2024-06-02 22:46:41,723] Trial 39 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 1.2482917885082042e-05, 'dropout_rate': 0.1327426035210855}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 12.68s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7806624372800192, Time: 208.21s
Epoch 2/3, Loss: 0.9660837451616923, Time: 209.11s
Epoch 3/3, Loss: 0.8069780071576437, Time: 207.60s
Total Training Time: 624.93s


  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-06-02 22:57:25,512] Trial 40 finished with value: 0.4 and parameters: {'batch_size': 32, 'learning_rate': 0.0005721761462474053, 'dropout_rate': 0.46771592743627227}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.67s
Accuracy: 0.4000, F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000
Epoch 1/3, Loss: 0.7251812815666199, Time: 208.62s
Epoch 2/3, Loss: 0.6857437690099081, Time: 209.23s
Epoch 3/3, Loss: 0.8009778062502543, Time: 209.61s
Total Training Time: 627.46s


[I 2024-06-02 23:08:08,306] Trial 41 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 5.257439327942294e-05, 'dropout_rate': 0.4280649311166451}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.73s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7740238308906555, Time: 208.49s
Epoch 2/3, Loss: 0.7429904341697693, Time: 208.95s
Epoch 3/3, Loss: 0.7024082541465759, Time: 207.12s
Total Training Time: 624.56s


[I 2024-06-02 23:18:48,583] Trial 42 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 2.112027877132872e-05, 'dropout_rate': 0.40105142740511385}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.69s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.717708170413971, Time: 208.56s
Epoch 2/3, Loss: 0.6793968876202902, Time: 209.46s
Epoch 3/3, Loss: 0.7118041117986044, Time: 208.43s
Total Training Time: 626.46s


[I 2024-06-02 23:29:30,355] Trial 43 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 1.5969697712454295e-05, 'dropout_rate': 0.3745184687076213}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.75s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7693594296773275, Time: 209.14s
Epoch 2/3, Loss: 0.6955845952033997, Time: 211.13s
Epoch 3/3, Loss: 0.7101577520370483, Time: 210.16s
Total Training Time: 630.44s


[I 2024-06-02 23:40:16,362] Trial 44 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 3.8813891732356314e-05, 'dropout_rate': 0.30533620544368517}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.74s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.8932833870251974, Time: 209.25s
Epoch 2/3, Loss: 0.7134016752243042, Time: 208.78s
Epoch 3/3, Loss: 0.7088051438331604, Time: 209.57s
Total Training Time: 627.61s


[I 2024-06-02 23:50:59,279] Trial 45 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 0.00013155192501685339, 'dropout_rate': 0.330144348159911}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.73s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7697397112846375, Time: 209.88s
Epoch 2/3, Loss: 0.7378343939781189, Time: 195.23s
Epoch 3/3, Loss: 0.7536140203475952, Time: 193.45s
Total Training Time: 598.56s


  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-06-03 00:01:13,979] Trial 46 finished with value: 0.4 and parameters: {'batch_size': 16, 'learning_rate': 7.249077726730233e-05, 'dropout_rate': 0.35688882322312443}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 12.08s
Accuracy: 0.4000, F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000
Epoch 1/3, Loss: 0.7973875602086385, Time: 209.04s
Epoch 2/3, Loss: 0.7007066210110983, Time: 208.26s
Epoch 3/3, Loss: 0.707445482412974, Time: 208.99s
Total Training Time: 626.29s


[I 2024-06-03 00:11:56,774] Trial 47 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 2.416623922900222e-05, 'dropout_rate': 0.2882603195640222}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.71s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7847066720326742, Time: 209.18s
Epoch 2/3, Loss: 0.7196443875630697, Time: 208.49s
Epoch 3/3, Loss: 0.7231016953786215, Time: 209.10s
Total Training Time: 626.76s


[I 2024-06-03 00:22:38,834] Trial 48 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 4.75848864219207e-05, 'dropout_rate': 0.19662721825039392}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.67s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7092327872912089, Time: 210.78s
Epoch 2/3, Loss: 0.7518264452616373, Time: 208.16s
Epoch 3/3, Loss: 0.6895715991655985, Time: 208.64s
Total Training Time: 627.58s


[I 2024-06-03 00:33:21,732] Trial 49 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 6.264783949620875e-05, 'dropout_rate': 0.4163041036611652}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.71s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Best hyperparameters: {'batch_size': 32, 'learning_rate': 1.8963127214339294e-05, 'dropout_rate': 0.1776864212297564}
Epoch 1/5, Loss: 0.7181537946065267, Time: 207.93s
Epoch 2/5, Loss: 0.734636922677358, Time: 209.49s
Epoch 3/5, Loss: 0.7008355657259623, Time: 209.35s
Epoch 4/5, Loss: 0.7021482586860657, Time: 209.70s
Epoch 5/5, Loss: 0.6874314546585083, Time: 208.76s
Total Training Time: 1045.23s
Total Evaluation Time: 11.74s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000


TypeError: EnsembleModel.forward() missing 1 required positional argument: 'inputs2'

In [None]:
03/06/2024

In [None]:
import time
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import scipy.sparse as sparse

# Load scRNA data
data_path = '/users/barmanjy/Desktop/Persister Cell/GSE150949_scRNA.csv'
data = pd.read_csv(data_path, nrows=100)

# Ensure 'persister_label' column exists
if 'persister_label' not in data.columns:
    np.random.seed(42)  # For reproducibility
    data['persister_label'] = np.random.randint(0, 2, size=len(data))

# Extract actual labels
actual_labels = data['persister_label'].values

# Drop the label column from the data
data = data.drop(columns=['persister_label'])

# Preprocess the scRNA data
def preprocess_scRNA_data(data):
    adata = sc.AnnData(sparse.csr_matrix(data.values))  # Convert to sparse matrix
    sc.pp.normalize_total(adata, target_sum=1e4)  # Normalize
    sc.pp.log1p(adata)  # Logarithmic transformation
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)  # Select highly variable genes
    adata.obs['batch'] = np.random.randint(0, 2, size=adata.shape[0])  # Dummy batch column for batch correction
    sc.pp.combat(adata, key='batch')  # Batch correction
    return adata.X

# Preprocess scRNA data
X_data = preprocess_scRNA_data(data)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_data)

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, actual_labels, test_size=0.2, random_state=42)

# Define custom dataset for DataLoader
class RNASeqDataset(Dataset):
    def __init__(self, X, y, tokenizer, model_type='bert', max_length=512):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sequence = ' '.join(map(str, self.X[idx]))
        label = self.y[idx]

        if self.model_type == 'bert':
            inputs = self.tokenizer(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
            inputs = {key: tensor.squeeze(0) for key, tensor in inputs.items()}
        elif self.model_type == 'gpt':
            inputs = self.tokenizer.encode(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length).squeeze(0)

        return inputs, label

# Define scBERT model
class SCBERTClassifier(nn.Module):
    def __init__(self, pretrained_model='bert-base-uncased', num_classes=2):
        super(SCBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Define scGPT model
class SCGPTClassifier(nn.Module):
    def __init__(self, pretrained_model='gpt2', num_classes=2):
        super(SCGPTClassifier, self).__init__()
        self.gpt = GPT2Model.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.gpt.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Add a padding token to the GPT2 tokenizer
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

# Instantiate tokenizer and models
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
scbert_model = SCBERTClassifier()
scgpt_model = SCGPTClassifier()

# Define training parameters
batch_size = 1#16
num_epochs = 5
learning_rate = 1e-4

# Create datasets and dataloaders
train_dataset_bert = RNASeqDataset(X_train, y_train, bert_tokenizer, model_type='bert')
train_dataset_gpt = RNASeqDataset(X_train, y_train, gpt_tokenizer, model_type='gpt')

test_dataset_bert = RNASeqDataset(X_test, y_test, bert_tokenizer, model_type='bert')
test_dataset_gpt = RNASeqDataset(X_test, y_test, gpt_tokenizer, model_type='gpt')

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size, shuffle=True)
test_dataloader_gpt = DataLoader(test_dataset_gpt, batch_size=batch_size, shuffle=False)

# Define training and evaluation functions
criterion = nn.CrossEntropyLoss()

def train_model(model, optimizer, train_dataloader, criterion, num_epochs, patience=3):
    model.train()
    best_loss = float('inf')
    early_stop_count = 0
    total_training_time = 0

    for epoch in range(num_epochs):
        start_time = time.time()
        running_loss = 0.0
        for inputs, labels in train_dataloader:
            optimizer.zero_grad()
            if isinstance(inputs, dict):  # For BERT
                input_ids = inputs['input_ids']
                attention_mask = inputs['attention_mask']
                outputs = model(input_ids, attention_mask)
            else:  # For GPT
                input_ids = inputs
                outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_dataloader)
        end_time = time.time()
        epoch_time = end_time - start_time
        total_training_time += epoch_time
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}, Time: {epoch_time:.2f}s')

        # Early stopping
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    print(f'Total Training Time: {total_training_time:.2f}s')

def evaluate_model(model, test_dataloader):
    model.eval()
    correct = 0
    total = 0
    total_evaluation_time = 0

    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for inputs, labels in test_dataloader:
            start_time = time.time()
            if isinstance(inputs, dict):  # For BERT
                input_ids = inputs['input_ids']
                attention_mask = inputs['attention_mask']
                outputs = model(input_ids, attention_mask)
            else:  # For GPT
                input_ids = inputs
                outputs = model(input_ids)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            end_time = time.time()
            total_evaluation_time += (end_time - start_time)

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    accuracy = correct / total
    f1 = f1_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)

    print(f'Total Evaluation Time: {total_evaluation_time:.2f}s')
    print(f'Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')
    return accuracy, f1, precision, recall

# Hyperparameter tuning with Optuna for scBERT
def objective(trial):
    batch_size = trial.suggest_categorical('batch_size', [16, 32])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    scbert_model = SCBERTClassifier()
    scbert_model.dropout = nn.Dropout(dropout_rate)
    optimizer = optim.Adam(scbert_model.parameters(), lr=learning_rate)

    train_dataloader = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

    train_model(scbert_model, optimizer, train_dataloader, criterion, num_epochs=3)  # Reduce epochs for tuning
    accuracy, _, _, _ = evaluate_model(scbert_model, test_dataloader)

    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # Increase number of trials

best_params = study.best_params
print(f'Best hyperparameters: {best_params}')

# Train and evaluate with best hyperparameters
best_scbert_model = SCBERTClassifier()
best_scbert_model.dropout = nn.Dropout(best_params['dropout_rate'])
best_optimizer = optim.Adam(best_scbert_model.parameters(), lr=best_params['learning_rate'])

train_dataloader_best = DataLoader(train_dataset_bert, batch_size=best_params['batch_size'], shuffle=True)
test_dataloader_best = DataLoader(test_dataset_bert, batch_size=best_params['batch_size'], shuffle=False)

train_model(best_scbert_model, best_optimizer, train_dataloader_best, criterion, num_epochs)
evaluate_model(best_scbert_model, test_dataloader_best)

# Ensemble Model
class EnsembleModel(nn.Module):
    def __init__(self, model1, model2):
        super(EnsembleModel, self).__init__()
        self.model1 = model1
        self.model2 = model2
        self.fc = nn.Linear(4, 2)  # Combining logits from two models

    def forward(self, inputs1, inputs2):
        logits1 = self.model1(**inputs1)
        logits2 = self.model2(inputs2)
        combined_logits = torch.cat((logits1, logits2), dim=1)
        return self.fc(combined_logits)

# Instantiate and train ensemble model
best_scgpt_model = SCGPTClassifier()
best_scgpt_model.dropout = nn.Dropout(best_params['dropout_rate'])
best_optimizer_gpt = optim.Adam(best_scgpt_model.parameters(), lr=best_params['learning_rate'])

ensemble_model = EnsembleModel(best_scbert_model, best_scgpt_model)
ensemble_optimizer = optim.Adam(ensemble_model.parameters(), lr=best_params['learning_rate'])

train_model(ensemble_model, ensemble_optimizer, zip(train_dataloader_best, train_dataloader_gpt), criterion, num_epochs)
evaluate_model(ensemble_model, zip(test_dataloader_best, test_dataloader_gpt))

# Save the models
torch.save(best_scbert_model.state_dict(), 'best_scbert_model.pth')
torch.save(best_scgpt_model.state_dict(), 'best_scgpt_model.pth')
torch.save(ensemble_model.state_dict(), 'ensemble_model.pth')


  return _pandas_is_categorical_dtype(dt)
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
[I 2024-06-04 13:47:08,493] A new study created in memory with name: no-name-b8030d7f-91c5-492b-b9f1-b0b3796a3313


Epoch 1/3, Loss: 0.7010463873545328, Time: 225.95s
Epoch 2/3, Loss: 0.7061854799588522, Time: 206.63s
Epoch 3/3, Loss: 0.6527900894482931, Time: 202.96s
Total Training Time: 635.55s


[I 2024-06-04 13:58:08,812] Trial 0 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 8.782272962051413e-05, 'dropout_rate': 0.441188394919498}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.23s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7641678849856058, Time: 207.05s
Epoch 2/3, Loss: 0.8038041392962137, Time: 199.93s
Epoch 3/3, Loss: 0.7548534870147705, Time: 203.89s
Total Training Time: 610.87s


[I 2024-06-04 14:08:36,808] Trial 1 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 0.0005209071139540421, 'dropout_rate': 0.32882505099643244}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.05s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7437606851259867, Time: 217.27s
Epoch 2/3, Loss: 0.7165540655454, Time: 206.84s
Epoch 3/3, Loss: 0.7097341219584147, Time: 200.56s
Total Training Time: 624.66s


[I 2024-06-04 14:19:17,441] Trial 2 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 6.376323347192863e-05, 'dropout_rate': 0.11293828588690565}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 12.12s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.6624381343523661, Time: 207.61s
Epoch 2/3, Loss: 0.7225381135940552, Time: 200.55s
Epoch 3/3, Loss: 0.729839007059733, Time: 202.79s
Total Training Time: 610.96s


[I 2024-06-04 14:29:47,329] Trial 3 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 8.925534817242588e-05, 'dropout_rate': 0.370963610795752}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.05s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7582910299301148, Time: 186.95s
Epoch 2/3, Loss: 0.7165068507194519, Time: 177.35s
Epoch 3/3, Loss: 0.7481189370155334, Time: 178.78s
Total Training Time: 543.08s


  _warn_prf(average, modifier, msg_start, len(result))


Total Evaluation Time: 11.19s
Accuracy: 0.4000, F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000


[I 2024-06-04 14:39:05,334] Trial 4 finished with value: 0.4 and parameters: {'batch_size': 16, 'learning_rate': 0.0003400122231910321, 'dropout_rate': 0.14478625885724875}. Best is trial 0 with value: 0.6.


Epoch 1/3, Loss: 0.7639226198196412, Time: 185.75s
Epoch 2/3, Loss: 0.7270947337150574, Time: 178.50s
Epoch 3/3, Loss: 0.701966392993927, Time: 178.66s
Total Training Time: 542.91s


[I 2024-06-04 14:48:23,362] Trial 5 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 1.5781728619168713e-05, 'dropout_rate': 0.3927669632104295}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 11.09s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.8313389619191488, Time: 203.81s
Epoch 2/3, Loss: 0.7869373758633932, Time: 228.37s
Epoch 3/3, Loss: 0.8699963092803955, Time: 206.80s
Total Training Time: 638.98s


[I 2024-06-04 14:59:20,439] Trial 6 finished with value: 0.6 and parameters: {'batch_size': 32, 'learning_rate': 0.0006036824085327791, 'dropout_rate': 0.4367715100594728}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 12.38s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.7481949329376221, Time: 199.69s
Epoch 2/3, Loss: 0.7169508814811707, Time: 197.64s
Epoch 3/3, Loss: 0.6947306275367737, Time: 200.74s
Total Training Time: 598.07s


[I 2024-06-04 15:09:35,265] Trial 7 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 2.57978758279113e-05, 'dropout_rate': 0.13874409649615696}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 12.32s
Accuracy: 0.6000, F1 Score: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 1/3, Loss: 0.9380324482917786, Time: 221.56s
Epoch 2/3, Loss: 0.6679794589678446, Time: 206.62s
Epoch 3/3, Loss: 0.6652452150980631, Time: 205.09s
Total Training Time: 633.28s


  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-06-04 15:20:24,772] Trial 8 finished with value: 0.4 and parameters: {'batch_size': 32, 'learning_rate': 0.00029522730851772353, 'dropout_rate': 0.36382286656284335}. Best is trial 0 with value: 0.6.


Total Evaluation Time: 12.62s
Accuracy: 0.4000, F1 Score: 0.0000, Precision: 0.0000, Recall: 0.0000
Epoch 1/3, Loss: 0.7401591658592224, Time: 200.25s


In [None]:
04.06.2024 Ensemble Model

In [None]:
import time
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import scipy.sparse as sparse

# Load scRNA data
data_path = '/users/barmanjy/Desktop/Persister Cell/GSE150949_scRNA.csv'
data = pd.read_csv(data_path, nrows=100)

# Ensure 'persister_label' column exists
if 'persister_label' not in data.columns:
    np.random.seed(42)  # For reproducibility
    data['persister_label'] = np.random.randint(0, 2, size=len(data))

# Extract actual labels
actual_labels = data['persister_label'].values

# Drop the label column from the data
data = data.drop(columns=['persister_label'])

# Preprocess the scRNA data
def preprocess_scRNA_data(data):
    adata = sc.AnnData(sparse.csr_matrix(data.values))  # Convert to sparse matrix
    sc.pp.normalize_total(adata, target_sum=1e4)  # Normalize
    sc.pp.log1p(adata)  # Logarithmic transformation
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)  # Select highly variable genes
    adata.obs['batch'] = np.random.randint(0, 2, size=adata.shape[0])  # Dummy batch column for batch correction
    sc.pp.combat(adata, key='batch')  # Batch correction
    return adata.X

# Preprocess scRNA data
X_data = preprocess_scRNA_data(data)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_data)

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, actual_labels, test_size=0.2, random_state=42)

# Define custom dataset for DataLoader
class RNASeqDataset(Dataset):
    def __init__(self, X, y, tokenizer, model_type='bert', max_length=512):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sequence = ' '.join(map(str, self.X[idx]))
        label = self.y[idx]

        if self.model_type == 'bert':
            inputs = self.tokenizer(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
            inputs = {key: tensor.squeeze(0) for key, tensor in inputs.items()}
        elif self.model_type == 'gpt':
            inputs = self.tokenizer.encode(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length).squeeze(0)

        return inputs, label

# Define scBERT model
class SCBERTClassifier(nn.Module):
    def __init__(self, pretrained_model='bert-base-uncased', num_classes=2):
        super(SCBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Define scGPT model
class SCGPTClassifier(nn.Module):
    def __init__(self, pretrained_model='gpt2', num_classes=2):
        super(SCGPTClassifier, self).__init__()
        self.gpt = GPT2Model.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.gpt.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Add a padding token to the GPT2 tokenizer
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

# Instantiate tokenizer and models
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
scbert_model = SCBERTClassifier()
scgpt_model = SCGPTClassifier()

# Define training parameters
batch_size = 16
num_epochs = 5
learning_rate = 1e-4

# Create datasets and dataloaders
train_dataset_bert = RNASeqDataset(X_train, y_train, bert_tokenizer, model_type='bert')
train_dataset_gpt = RNASeqDataset(X_train, y_train, gpt_tokenizer, model_type='gpt')

test_dataset_bert = RNASeqDataset(X_test, y_test, bert_tokenizer, model_type='bert')
test_dataset_gpt = RNASeqDataset(X_test, y_test, gpt_tokenizer, model_type='gpt')

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size, shuffle=True)
test_dataloader_gpt = DataLoader(test_dataset_gpt, batch_size=batch_size, shuffle=False)

# Define training and evaluation functions
criterion = nn.CrossEntropyLoss()

def train_model(model, optimizer, train_dataloader, criterion, num_epochs, patience=3):
    model.train()
    best_loss = float('inf')
    early_stop_count = 0
    total_training_time = 0

    for epoch in range(num_epochs):
        start_time = time.time()
        running_loss = 0.0
        for (inputs1, labels1), (inputs2, labels2) in train_dataloader:
            optimizer.zero_grad()
            
            # Ensure labels are the same
            assert torch.equal(labels1, labels2), "Labels must be the same for both models"
            
            outputs = model(inputs1, inputs2)
            loss = criterion(outputs, labels1)  # or labels2 since they are the same
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        epoch_loss = running_loss / len(train_dataloader)
        end_time = time.time()
        epoch_time = end_time - start_time
        total_training_time += epoch_time
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}, Time: {epoch_time:.2f}s')

        # Early stopping
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    print(f'Total Training Time: {total_training_time:.2f}s')

def evaluate_model(model, test_dataloader):
    model.eval()
    correct = 0
    total = 0
    total_evaluation_time = 0

    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for (inputs1, labels1), (inputs2, labels2) in test_dataloader:
            start_time = time.time()
            
            # Ensure labels are the same
            assert torch.equal(labels1, labels2), "Labels must be the same for both models"
            
            outputs = model(inputs1, inputs2)
            _, predicted = torch.max(outputs.data, 1)
            total += labels1.size(0)
            correct += (predicted == labels1).sum().item()
            end_time = time.time()
            total_evaluation_time += (end_time - start_time)

            all_labels.extend(labels1.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    accuracy = correct / total
    f1 = f1_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)

    print(f'Total Evaluation Time: {total_evaluation_time:.2f}s')
    print(f'Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')
    return accuracy, f1, precision, recall

# Hyperparameter tuning with Optuna for scBERT
def objective(trial):
    batch_size = trial.suggest_categorical('batch_size', [16, 32])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    scbert_model = SCBERTClassifier()
    scbert_model.dropout = nn.Dropout(dropout_rate)
    optimizer = optim.Adam(scbert_model.parameters(), lr=learning_rate)

    train_dataloader = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

    train_model(scbert_model, optimizer, train_dataloader, criterion, num_epochs=3)  # Reduce epochs for tuning
    accuracy, _, _, _ = evaluate_model(scbert_model, test_dataloader)

    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # Increase number of trials

best_params = study.best_params
print(f'Best hyperparameters: {best_params}')

# Train and evaluate with best hyperparameters
best_scbert_model = SCBERTClassifier()
best_scbert_model.dropout = nn.Dropout(best_params['dropout_rate'])
best_optimizer = optim.Adam(best_scbert_model.parameters(), lr=best_params['learning_rate'])

train_dataloader_best = DataLoader(train_dataset_bert, batch_size=best_params['batch_size'], shuffle=True)
test_dataloader_best = DataLoader(test_dataset_bert, batch_size=best_params['batch_size'], shuffle=False)

train_model(best_scbert_model, best_optimizer, train_dataloader_best, criterion, num_epochs)
evaluate_model(best_scbert_model, test_dataloader_best)

# Ensemble Model
class EnsembleModel(nn.Module):
    def __init__(self, model1, model2):
        super(EnsembleModel, self).__init__()
        self.model1 = model1
        self.model2 = model2
        self.fc = nn.Linear(model1.fc.out_features + model2.fc.out_features, 2)  # Adjusted the input size

    def forward(self, inputs1, inputs2):
        logits1 = self.model1(**inputs1)
        logits2 = self.model2(inputs2)
        combined_logits = torch.cat((logits1, logits2), dim=1)
        return self.fc(combined_logits)

# Instantiate and train ensemble model
best_scgpt_model = SCGPTClassifier()
best_scgpt_model.dropout = nn.Dropout(best_params['dropout_rate'])
best_optimizer_gpt = optim.Adam(best_scgpt_model.parameters(), lr=best_params['learning_rate'])

ensemble_model = EnsembleModel(best_scbert_model, best_scgpt_model)
ensemble_optimizer = optim.Adam(ensemble_model.parameters(), lr=best_params['learning_rate'])

train_model(ensemble_model, ensemble_optimizer, zip(train_dataloader_best, train_dataloader_gpt), criterion, num_epochs)
evaluate_model(ensemble_model, zip(test_dataloader_best, test_dataloader_gpt))

# Save the models
torch.save(best_scbert_model.state_dict(), 'best_scbert_model.pth')
torch.save(best_scgpt_model.state_dict(), 'best_scgpt_model.pth')
torch.save(ensemble_model.state_dict(), 'ensemble_model.pth')


In [None]:
05-06-2024

In [27]:
import time
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import scipy.sparse as sparse

# Load scRNA data
data_path = '/users/barmanjy/Desktop/Persister Cell/GSE150949_scRNA.csv'
data = pd.read_csv(data_path, nrows=100)

# Ensure 'persister_label' column exists
if 'persister_label' not in data.columns:
    np.random.seed(42)  # For reproducibility
    data['persister_label'] = np.random.randint(0, 2, size=len(data))

# Extract actual labels
actual_labels = data['persister_label'].values

# Drop the label column from the data
data = data.drop(columns=['persister_label'])

# Preprocess the scRNA data
def preprocess_scRNA_data(data):
    adata = sc.AnnData(sparse.csr_matrix(data.values))  # Convert to sparse matrix
    sc.pp.normalize_total(adata, target_sum=1e4)  # Normalize
    sc.pp.log1p(adata)  # Logarithmic transformation
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)  # Select highly variable genes
    adata.obs['batch'] = np.random.randint(0, 2, size=adata.shape[0])  # Dummy batch column for batch correction
    sc.pp.combat(adata, key='batch')  # Batch correction
    return adata.X

# Preprocess scRNA data
X_data = preprocess_scRNA_data(data)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_data)

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, actual_labels, test_size=0.2, random_state=42)

# Define custom dataset for DataLoader
class RNASeqDataset(Dataset):
    def __init__(self, X, y, tokenizer, model_type='bert', max_length=512):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sequence = ' '.join(map(str, self.X[idx]))
        label = self.y[idx]

        if self.model_type == 'bert':
            inputs = self.tokenizer(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
            inputs = {key: tensor.squeeze(0) for key, tensor in inputs.items()}
        elif self.model_type == 'gpt':
            inputs = self.tokenizer.encode(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length).squeeze(0)

        return inputs, label  # Return a single tuple


# Define scBERT model
class SCBERTClassifier(nn.Module):
    def __init__(self, pretrained_model='bert-base-uncased', num_classes=2):
        super(SCBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Define scGPT model
class SCGPTClassifier(nn.Module):
    def __init__(self, pretrained_model='gpt2', num_classes=2):
        super(SCGPTClassifier, self).__init__()
        self.gpt = GPT2Model.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.gpt.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Add a padding token to the GPT2 tokenizer
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

# Instantiate tokenizer and models
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
scbert_model = SCBERTClassifier()
scgpt_model = SCGPTClassifier()

# Define training parameters
batch_size = 16
num_epochs = 5
learning_rate = 1e-4

# Create datasets and dataloaders
train_dataset_bert = RNASeqDataset(X_train, y_train, bert_tokenizer, model_type='bert')
train_dataset_gpt = RNASeqDataset(X_train, y_train, gpt_tokenizer, model_type='gpt')

test_dataset_bert = RNASeqDataset(X_test, y_test, bert_tokenizer, model_type='bert')
test_dataset_gpt = RNASeqDataset(X_test, y_test, gpt_tokenizer, model_type='gpt')

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size, shuffle=True)
test_dataloader_gpt = DataLoader(test_dataset_gpt, batch_size=batch_size, shuffle=False)

# Define training and evaluation functions
criterion = nn.CrossEntropyLoss()

def train_model(model, optimizer, train_dataloader, criterion, num_epochs, device, patience=3):
    model.train()
    best_loss = float('inf')
    early_stop_count = 0
    total_training_time = 0

    for epoch in range(num_epochs):
        start_time = time.time()
        running_loss = 0.0
        for batch in train_dataloader:
            optimizer.zero_grad()

            # Unpack batch and ensure inputs are properly formatted
            inputs, labels = batch
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        epoch_loss = running_loss / len(train_dataloader)
        end_time = time.time()
        epoch_time = end_time - start_time
        total_training_time += epoch_time
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}, Time: {epoch_time:.2f}s')

        # Early stopping
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    print(f'Total Training Time: {total_training_time:.2f}s')




def evaluate_model(model, test_dataloader):
    model.eval()
    correct = 0
    total = 0
    total_evaluation_time = 0

    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in test_dataloader:
            if not batch or not batch[0]:  # Check if batch or inputs are empty
                continue

            inputs, labels = batch  # Unpack batch

            if isinstance(inputs, tuple):
                inputs1 = inputs[0]  # Get inputs for first model
                inputs2 = inputs[1]  # Get inputs for second model
            else:
                inputs1 = inputs

            outputs = model(inputs1, inputs2) if isinstance(inputs, tuple) else model(inputs1)  # Model prediction
            _, predicted = torch.max(outputs.data, 1)  # Get predicted labels

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    if total == 0:
        print("No samples in the dataset. Skipping evaluation.")
        return 0, 0, 0, 0

    accuracy = correct / total
    f1 = f1_score(all_labels, all_predictions, average='macro')
    precision = precision_score(all_labels, all_predictions, average='macro')
    recall = recall_score(all_labels, all_predictions, average='macro')

    print(f'Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')
    return accuracy, f1, precision, recall





# Hyperparameter tuning with Optuna for scBERT
def objective(trial):
    batch_size = trial.suggest_categorical('batch_size', [16, 32])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    
    # Define device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    scbert_model = SCBERTClassifier()
    scbert_model.dropout = nn.Dropout(dropout_rate)
    optimizer = optim.Adam(scbert_model.parameters(), lr=learning_rate)

    train_dataloader = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)
    
    train_model(scbert_model, optimizer, train_dataloader, criterion, num_epochs=1, device='cpu')  # Pass device here

    
    accuracy, _, _, _ = evaluate_model(scbert_model, test_dataloader)

    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1)  # Reduce number of trials for quick testing

best_params = study.best_params
print(f'Best hyperparameters: {best_params}')

# Train and evaluate with best hyperparameters
best_scbert_model = SCBERTClassifier()
best_scbert_model.dropout = nn.Dropout(best_params['dropout_rate'])
best_optimizer = optim.Adam(best_scbert_model.parameters(), lr=best_params['learning_rate'])

train_dataloader_best = DataLoader(train_dataset_bert, batch_size=best_params['batch_size'], shuffle=True)
test_dataloader_best = DataLoader(test_dataset_bert, batch_size=best_params['batch_size'], shuffle=False)


train_model(best_scbert_model, best_optimizer, train_dataloader_best, criterion, num_epochs)
evaluate_model(best_scbert_model, test_dataloader_best)

# Ensemble Model
class EnsembleModel(nn.Module):
    def __init__(self, model1, model2):
        super(EnsembleModel, self).__init__()
        self.model1 = model1
        self.model2 = model2
        self.fc = nn.Linear(model1.fc.out_features + model2.fc.out_features, 2)  # Adjusted the input size

    def forward(self, inputs1, inputs2):
        logits1 = self.model1(**inputs1)
        logits2 = self.model2(inputs2)
        combined_logits = torch.cat((logits1, logits2), dim=1)
        return self.fc(combined_logits)

# Instantiate and train ensemble model
best_scgpt_model = SCGPTClassifier()
best_scgpt_model.dropout = nn.Dropout(best_params['dropout_rate'])
best_optimizer_gpt = optim.Adam(best_scgpt_model.parameters(), lr=best_params['learning_rate'])

ensemble_model = EnsembleModel(best_scbert_model, best_scgpt_model)
ensemble_optimizer = optim.Adam(ensemble_model.parameters(), lr=best_params['learning_rate'])

train_model(ensemble_model, ensemble_optimizer, zip(train_dataloader_best, train_dataloader_gpt), criterion, num_epochs)
evaluate_model(ensemble_model, zip(test_dataloader_best, test_dataloader_gpt))

# Save the models
torch.save(best_scbert_model.state_dict(), 'best_scbert_model.pth')
torch.save(best_scgpt_model.state_dict(), 'best_scgpt_model.pth')
torch.save(ensemble_model.state_dict(), 'ensemble_model.pth')



  return _pandas_is_categorical_dtype(dt)
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
[I 2024-06-05 00:49:18,935] A new study created in memory with name: no-name-668d32a6-c86c-40ad-9e0f-98718631555c


Epoch 1/1, Loss: 0.7687811454137167, Time: 218.52s
Total Training Time: 218.52s


[W 2024-06-05 00:53:01,649] Trial 0 failed with parameters: {'batch_size': 32, 'learning_rate': 1.0114738546174063e-05, 'dropout_rate': 0.3057514067490589} because of the following error: TypeError("unhashable type: 'slice'").
Traceback (most recent call last):
  File "/users/barmanjy/.local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/run/nvme/job_21849910/tmp/ipykernel_3973949/552733189.py", line 248, in objective
    accuracy, _, _, _ = evaluate_model(scbert_model, test_dataloader)
  File "/run/nvme/job_21849910/tmp/ipykernel_3973949/552733189.py", line 204, in evaluate_model
    outputs = model(inputs1, inputs2) if isinstance(inputs, tuple) else model(inputs1)  # Model prediction
  File "/users/barmanjy/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/users/barmanjy/.local/lib/python3.10/site-packages/to

TypeError: unhashable type: 'slice'

In [None]:
05/06/2023 v2

In [None]:
import time
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import scipy.sparse as sparse

# Load scRNA data
data_path = '/users/barmanjy/Desktop/Persister Cell/GSE150949_scRNA.csv'
data = pd.read_csv(data_path, nrows=100)

# Ensure 'persister_label' column exists
if 'persister_label' not in data.columns:
    np.random.seed(42)  # For reproducibility
    data['persister_label'] = np.random.randint(0, 2, size=len(data))

# Extract actual labels
actual_labels = data['persister_label'].values

# Drop the label column from the data
data = data.drop(columns=['persister_label'])

# Preprocess the scRNA data
def preprocess_scRNA_data(data):
    adata = sc.AnnData(sparse.csr_matrix(data.values))  # Convert to sparse matrix
    sc.pp.normalize_total(adata, target_sum=1e4)  # Normalize
    sc.pp.log1p(adata)  # Logarithmic transformation
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)  # Select highly variable genes
    adata.obs['batch'] = np.random.randint(0, 2, size=adata.shape[0])  # Dummy batch column for batch correction
    sc.pp.combat(adata, key='batch')  # Batch correction
    return adata.X

# Preprocess scRNA data
X_data = preprocess_scRNA_data(data)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_data)

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, actual_labels, test_size=0.2, random_state=42)

# Define custom dataset for DataLoader
class RNASeqDataset(Dataset):
    def __init__(self, X, y, tokenizer, model_type='bert', max_length=512):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.model_type = model_type
        self.max_length = max_length

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sequence = ' '.join(map(str, self.X[idx]))
        label = self.y[idx]

        if self.model_type == 'bert':
            inputs = self.tokenizer(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
            inputs = {key: tensor.squeeze(0) for key, tensor in inputs.items()}
        elif self.model_type == 'gpt':
            inputs = self.tokenizer.encode(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length).squeeze(0)

        return inputs, label  # Return a single tuple

# Define scBERT model
class SCBERTClassifier(nn.Module):
    def __init__(self, pretrained_model='bert-base-uncased', num_classes=2):
        super(SCBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Define scGPT model
class SCGPTClassifier(nn.Module):
    def __init__(self, pretrained_model='gpt2', num_classes=2):
        super(SCGPTClassifier, self).__init__()
        self.gpt = GPT2Model.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.gpt.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Add a padding token to the GPT2 tokenizer
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

# Instantiate tokenizer and models
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
scbert_model = SCBERTClassifier()
scgpt_model = SCGPTClassifier()

# Define training parameters
batch_size = 16
num_epochs = 5
learning_rate = 1e-4

# Create datasets and dataloaders
train_dataset_bert = RNASeqDataset(X_train, y_train, bert_tokenizer, model_type='bert')
train_dataset_gpt = RNASeqDataset(X_train, y_train, gpt_tokenizer, model_type='gpt')

test_dataset_bert = RNASeqDataset(X_test, y_test, bert_tokenizer, model_type='bert')
test_dataset_gpt = RNASeqDataset(X_test, y_test, gpt_tokenizer, model_type='gpt')

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)

train_dataloader_gpt = DataLoader(train_dataset_gpt, batch_size=batch_size, shuffle=True)
test_dataloader_gpt = DataLoader(test_dataset_gpt, batch_size=batch_size, shuffle=False)

# Define training and evaluation functions
criterion = nn.CrossEntropyLoss()

def train_model(model, optimizer, train_dataloader, criterion, num_epochs, device, patience=3):
    model.train()
    best_loss = float('inf')
    early_stop_count = 0
    total_training_time = 0

    for epoch in range(num_epochs):
        start_time = time.time()
        running_loss = 0.0
        for batch in train_dataloader:
            optimizer.zero_grad()

            # Unpack batch and ensure inputs are properly formatted
            inputs, labels = batch
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        epoch_loss = running_loss / len(train_dataloader)
        end_time = time.time()
        epoch_time = end_time - start_time
        total_training_time += epoch_time
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}, Time: {epoch_time:.2f}s')

        # Early stopping
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    print(f'Total Training Time: {total_training_time:.2f}s')

def evaluate_model(model, test_dataloader, device):
    model.eval()
    correct = 0
    total = 0

    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in test_dataloader:
            if not batch or not batch[0]:  # Check if batch or inputs are empty
                continue

            inputs, labels = batch  # Unpack batch
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)  # Model prediction
            _, predicted = torch.max(outputs.data, 1)  # Get predicted labels

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    if total == 0:
        print("No samples in the dataset. Skipping evaluation.")
        return 0, 0, 0, 0

    accuracy = correct / total
    f1 = f1_score(all_labels, all_predictions, average='macro')
    precision = precision_score(all_labels, all_predictions, average='macro')
    recall = recall_score(all_labels, all_predictions, average='macro')

    print(f'Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')
    return accuracy, f1, precision, recall

# Hyperparameter tuning with Optuna for scBERT
def objective(trial):
    batch_size = trial.suggest_categorical('batch_size', [16, 32])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    
    # Define device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    scbert_model = SCBERTClassifier()
    scbert_model.dropout = nn.Dropout(dropout_rate)
    optimizer = optim.Adam(scbert_model.parameters(), lr=learning_rate)

    train_dataloader = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)
    
    train_model(scbert_model, optimizer, train_dataloader, criterion, num_epochs=1, device=device)  # Pass device here

    
    accuracy, _, _, _ = evaluate_model(scbert_model, test_dataloader, device)

    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1)  # Reduce number of trials for quick testing

best_params = study.best_params
print(f'Best hyperparameters: {best_params}')

# Train and evaluate with best hyperparameters
best_scbert_model = SCBERTClassifier()
best_scbert_model.dropout = nn.Dropout(best_params['dropout_rate'])
best_optimizer = optim.Adam(best_scbert_model.parameters(), lr=best_params['learning_rate'])

train_dataloader_best = DataLoader(train_dataset_bert, batch_size=best_params['batch_size'], shuffle=True)
test_dataloader_best = DataLoader(test_dataset_bert, batch_size=best_params['batch_size'], shuffle=False)

train_model(best_scbert_model, best_optimizer, train_dataloader_best, criterion, num_epochs, device='cpu')
evaluate_model(best_scbert_model, test_dataloader_best, device='cpu')

# Ensemble Model
class EnsembleModel(nn.Module):
    def __init__(self, model1, model2):
        super(EnsembleModel, self).__init__()
        self.model1 = model1
        self.model2 = model2
        self.fc = nn.Linear(model1.fc.out_features + model2.fc.out_features, 2)  # Adjusted the input size

    def forward(self, inputs1, inputs2):
        logits1 = self.model1(inputs1['input_ids'].to(device), attention_mask=inputs1['attention_mask'].to(device))
        logits2 = self.model2(inputs2.to(device))
        combined_logits = torch.cat((logits1, logits2), dim=1)
        return self.fc(combined_logits)

# Instantiate and train ensemble model
best_scgpt_model = SCGPTClassifier()
best_scgpt_model.dropout = nn.Dropout(best_params['dropout_rate'])
best_optimizer_gpt = optim.Adam(best_scgpt_model.parameters(), lr=best_params['learning_rate'])

ensemble_model = EnsembleModel(best_scbert_model, best_scgpt_model)
ensemble_optimizer = optim.Adam(ensemble_model.parameters(), lr=best_params['learning_rate'])

train_ensemble_dataloader = list(zip(train_dataloader_best, train_dataloader_gpt))
test_ensemble_dataloader = list(zip(test_dataloader_best, test_dataloader_gpt))

def train_ensemble_model(model, optimizer, train_dataloader, criterion, num_epochs, device, patience=3):
    model.train()
    best_loss = float('inf')
    early_stop_count = 0
    total_training_time = 0

    for epoch in range(num_epochs):
        start_time = time.time()
        running_loss = 0.0
        for batch_bert, batch_gpt in train_dataloader:
            optimizer.zero_grad()

            # Unpack batches and ensure inputs are properly formatted
            inputs_bert, labels_bert = batch_bert
            inputs_gpt, labels_gpt = batch_gpt
            input_ids_bert = inputs_bert['input_ids'].to(device)
            attention_mask_bert = inputs_bert['attention_mask'].to(device)
            input_ids_gpt = inputs_gpt.to(device)
            labels = labels_bert.to(device)

            # Forward pass
            outputs = model({'input_ids': input_ids_bert, 'attention_mask': attention_mask_bert}, input_ids_gpt)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        epoch_loss = running_loss / len(train_dataloader)
        end_time = time.time()
        epoch_time = end_time - start_time
        total_training_time += epoch_time
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}, Time: {epoch_time:.2f}s')

        # Early stopping
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    print(f'Total Training Time: {total_training_time:.2f}s')

train_ensemble_model(ensemble_model, ensemble_optimizer, train_ensemble_dataloader, criterion, num_epochs, device='cpu')
evaluate_model(ensemble_model, test_ensemble_dataloader, device='cpu')

# Save the models
torch.save(best_scbert_model.state_dict(), 'best_scbert_model.pth')
torch.save(best_scgpt_model.state_dict(), 'best_scgpt_model.pth')
torch.save(ensemble_model.state_dict(), 'ensemble_model.pth')


  return _pandas_is_categorical_dtype(dt)
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
[I 2024-06-05 06:25:09,811] A new study created in memory with name: no-name-cdae0e37-26b2-4cf1-9aa4-3d768ce5c99a


Epoch 1/1, Loss: 0.7439374446868896, Time: 211.56s
Total Training Time: 211.56s


  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-06-05 06:28:58,038] Trial 0 finished with value: 0.6 and parameters: {'batch_size': 16, 'learning_rate': 0.0004757005425973431, 'dropout_rate': 0.4097687427988205}. Best is trial 0 with value: 0.6.


Accuracy: 0.6000, F1 Score: 0.3750, Precision: 0.3000, Recall: 0.5000
Best hyperparameters: {'batch_size': 16, 'learning_rate': 0.0004757005425973431, 'dropout_rate': 0.4097687427988205}
Epoch 1/5, Loss: 0.7737460374832154, Time: 213.84s
