In [3]:
# Add the directory to sys.path
import sys
sys.path.append('/scratch/project_2010376')

In [6]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, LSTM, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Function to load data in chunks
def load_data_in_chunks(file_path, chunk_size=1000):
    return pd.read_csv(file_path, engine='python', encoding='utf-8', chunksize=chunk_size)

# Preprocessing function
def preprocess_data(metadata_path, data_path, additional_datasets=None):
    metadata_df = pd.read_csv(metadata_path, sep='\t')
    scRNA_data = pd.concat(load_data_in_chunks(data_path)).transpose()

    metadata_df.index.rename('cell', inplace=True)
    metadata_df.reset_index(inplace=True)
    metadata_df['cell'] = metadata_df['cell'].astype(str).str.split('-').str[0].str.strip().str.upper()
    scRNA_data.reset_index(inplace=True)
    scRNA_data.rename(columns={'index': 'cell'}, inplace=True)
    scRNA_data['cell'] = scRNA_data['cell'].astype(str).str.split('-').str[0].str.strip().str.upper()

    common_cells = set(metadata_df['cell']).intersection(set(scRNA_data['cell']))

    filtered_metadata_df = metadata_df[metadata_df['cell'].isin(common_cells)]
    filtered_scRNA_data = scRNA_data[scRNA_data['cell'].isin(common_cells)]

    merged_data = pd.merge(filtered_scRNA_data, filtered_metadata_df[['cell', 'sample_name', 'sample_type']], on='cell', how='inner')

    if additional_datasets:
        for file_path, sample_type in additional_datasets:
            additional_data = pd.concat(load_data_in_chunks(file_path)).transpose()
            additional_data.reset_index(inplace=True)
            additional_data.rename(columns={'index': 'cell'}, inplace=True)
            additional_data['cell'] = additional_data['cell'].astype(str).str.split('-').str[0].str.strip().str.upper()
            additional_data['sample_type'] = sample_type
            merged_data = pd.concat([merged_data, additional_data], ignore_index=True, sort=False)

    gene_names = merged_data.columns[1:-2]  # Assuming the first column is 'cell', and last two are 'sample_name' and 'sample_type'

    X = merged_data.drop(columns=['cell', 'sample_name', 'sample_type'], errors='ignore')
    X.columns = X.columns.astype(str)
    X = X.apply(pd.to_numeric, errors='coerce')

    imputer = SimpleImputer(strategy='mean')
    X = imputer.fit_transform(X)

    y = merged_data['sample_type']
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

    return X, y, merged_data, label_encoder, gene_names

# CNN-RNN Hybrid Model
def create_cnn_rnn_model(input_shape):
    model = Sequential([
        Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.3),
        
        LSTM(128, return_sequences=True),
        Dropout(0.3),
        
        LSTM(64),
        Dropout(0.3),
        
        Dense(64, activation='relu'),
        Dropout(0.3),
        
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Learning Rate Scheduler
def lr_scheduler(epoch, lr):
    if epoch > 10:
        lr = lr * 0.85
    return lr

# Main Code Execution
metadata_path = '/scratch/project_2010376/GSE150949_metaData_with_lineage.txt'
data_path = '/scratch/project_2010376/normalized_GSE150949_pc9_count.csv'
additional_datasets = [
    ('/scratch/project_2010751/GSM8118468_ob_treated.csv', 'Persister'),
    ('/scratch/project_2010751/GSE134836_GSM3972651_PC9D0_untreated_filtered.csv', 'Non-Persister'),
    ('/scratch/project_2010751/GSE138693_GSM4116265_PC9_1_invitro_normalized_untreated.csv', 'Non-Persister'),
    ('/scratch/project_2010751/GSE134839_GSM3972657_PC90D0_untreated.dge.csv', 'Non-Persister')
]

# Preprocess the data
X, y, merged_data, label_encoder, gene_names = preprocess_data(metadata_path, data_path, additional_datasets)

# Confirm class 0 is non-persister and class 1 is persister
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Encoding Mapping:", label_mapping)

if label_mapping['Non-Persister'] == 0 and label_mapping['Persister'] == 1:
    print("Class 0 corresponds to Non-Persister, and class 1 corresponds to Persister.")
else:
    print("Class 0 corresponds to Persister, and class 1 corresponds to Non-Persister.")

# Apply PCA for dimensionality reduction
pca = PCA(n_components=100)
X_reduced = pca.fit_transform(X)

# Apply SMOTE for balancing the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_reduced, y)

# Extract features for non-persister (class 0) and persister (class 1)
non_persister_features = X_resampled[y_resampled == 0]
persister_features = X_resampled[y_resampled == 1]

# Ensure all gene names are strings
gene_names = [str(gene) for gene in gene_names]

# Save the features to text files with gene names as headers
np.savetxt('cnn_rnn_non_persister_features.txt', non_persister_features, delimiter=',', header=','.join(gene_names), fmt='%.6f', comments='')
np.savetxt('cnn_rnn_persister_features.txt', persister_features, delimiter=',', header=','.join(gene_names), fmt='%.6f', comments='')

print("Features for non-persister (class 0) saved to 'cnn_rnn_non_persister_features.txt'")
print("Features for persister (class 1) saved to 'cnn_rnn_persister_features.txt'")

# Train-validation-test split
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Reshape for CNN input
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_validation = X_validation.reshape(X_validation.shape[0], X_validation.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Fix input shape by making it a tuple
input_shape = (X_train.shape[1], 1)

# Train the CNN-RNN model
model = create_cnn_rnn_model(input_shape)

# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_cnn_rnn_model.keras', save_best_only=True, monitor='val_loss', mode='min')
lr_schedule = LearningRateScheduler(lr_scheduler)

# Model training
history = model.fit(X_train, y_train, validation_data=(X_validation, y_validation),
                    epochs=100, batch_size=64, callbacks=[early_stopping, model_checkpoint, lr_schedule])

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Generate predictions on the test set
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-Persister', 'Persister'], yticklabels=['Non-Persister', 'Persister'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# Save the final model
model.save('final_cnn_rnn_model.keras')

# Function to preprocess independent datasets
def preprocess_independent_data(file_path, train_columns, n_components):
    independent_data = pd.concat(load_data_in_chunks(file_path)).transpose()
    independent_data.reset_index(inplace=True)
    independent_data.rename(columns={'index': 'cell'}, inplace=True)

    if independent_data.shape[0] > independent_data.shape[1]:
        independent_data = independent_data.transpose()

    X_independent = independent_data.drop(columns=['cell', 'sample_name', 'sample_type'], errors='ignore')
    X_independent.columns = X_independent.columns.astype(str)
    train_columns = train_columns.astype(str)

    X_independent = X_independent.reindex(columns=train_columns, fill_value=0)
    X_independent = X_independent.apply(pd.to_numeric, errors='coerce')
    X_independent.fillna(0, inplace=True)

    pca = PCA(n_components=n_components)
    X_independent = pca.fit_transform(X_independent)
    
    return X_independent

# Function to predict on independent datasets
def predict_independent_dataset(model, file_path, train_columns, n_components):
    try:
        X_independent = preprocess_independent_data(file_path, train_columns, n_components)
        X_independent = X_independent.reshape(X_independent.shape[0], X_independent.shape[1], 1)
        predictions = model.predict(X_independent)
        predicted_labels = (predictions > 0.5).astype(int)
        counts = Counter(predicted_labels.flatten())
        return counts
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return Counter()

# Load the trained model
model = tf.keras.models.load_model('final_cnn_rnn_model.keras')

# Get the feature columns from merged_data used in training
train_columns = merged_data.drop(columns=['cell', 'sample_name', 'sample_type']).columns  # Exclude the target column

# Use the same number of PCA components as used in the model
n_components = model.input_shape[1]

# Independent datasets
independent_datasets = {
    "GSE134836_GSM3972651_PC9D0_untreated_filtered.csv": "/scratch/project_2010751/GSE134836_GSM3972651_PC9D0_untreated_filtered.csv",
    "GSM4869650_xCtrl": "/scratch/project_2010376/GSM4869650_xCtrl.dge.csv",
    "new_GSM4869650_xCtrl.dge.csv": "/scratch/project_2010376/new_GSM4869650_xCtrl.dge.csv",
    "GSE138693_GSM4116265_PC9_1_invitro_normalized_untreated.csv": "/scratch/project_2010751/GSE138693_GSM4116265_PC9_1_invitro_normalized_untreated.csv",
    "GSM4869653_xOsiCriz":"/scratch/project_2010751/GSE134839_GSM3972657_PC90D0_untreated.dge.csv",
    "GSE149383_GSM3972669_D0_untreated.dge.csv":"/scratch/project_2010751/GSE149383_GSM3972669_D0_untreated.dge.csv",
    "GSE160244_GSM4869650_day3_untreated.dge.csv":"/scratch/project_2010751/GSE160244_GSM4869650_day3_untreated.dge.csv",
    "GSE160244_GSM4869652_xOsi_day3_dge.csv":"/scratch/project_2010751/GSE160244_GSM4869652_xOsi_day3_dge.csv",
    "GSE260499_GSM8118463_Osi.RDS":"/scratch/project_2010751/GSE260499_GSM8118463_Osi.RDS",
    "normalized_GSE150949_pc9_count.csv":"/scratch/project_2010751/normalized_GSE150949_pc9_count.csv"
}

# Process each independent dataset and make predictions
results = {}
for dataset_name, file_path in independent_datasets.items():
    print(f"\nProcessing dataset: {dataset_name}")
    counts = predict_independent_dataset(model, file_path, train_columns, n_components)
    results[f"Transfer Learning - {dataset_name}"] = counts
    print(f"{dataset_name} - Transfer Learning Predictions: {counts}")

# Print final results
print("\nFinal Prediction Results:")
for model_dataset, counts in results.items():
    print(f"{model_dataset}: Predictions = {counts}")


Label Encoding Mapping: {'Non-Persister': 0, 'Persister': 1}
Class 0 corresponds to Non-Persister, and class 1 corresponds to Persister.
Features for non-persister (class 0) saved to 'cnn_rnn_non_persister_features.txt'
Features for persister (class 1) saved to 'cnn_rnn_persister_features.txt'


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-08-21 11:40:18.748353: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Epoch 1/100
[1m524/524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 51ms/step - accuracy: 0.6703 - loss: 0.5853 - val_accuracy: 0.6196 - val_loss: 0.6895 - learning_rate: 1.0000e-04
Epoch 2/100
[1m524/524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 50ms/step - accuracy: 0.9719 - loss: 0.0862 - val_accuracy: 0.5017 - val_loss: 0.7952 - learning_rate: 1.0000e-04
Epoch 3/100
[1m524/524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 51ms/step - accuracy: 0.9890 - loss: 0.0331 - val_accuracy: 0.5017 - val_loss: 0.8576 - learning_rate: 1.0000e-04
Epoch 4/100
[1m524/524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 50ms/step - accuracy: 0.9924 - loss: 0.0238 - val_accuracy: 0.4983 - val_loss: 0.7162 - learning_rate: 1.0000e-04
Epoch 5/100
[1m524/524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 50ms/step - accuracy: 0.9953 - loss: 0.0170 - val_accuracy: 0.9143 - val_loss: 0.6139 - learning_rate: 1.0000e-04
Epoch 6/100
[1m524/524[0m [32m━━

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



KeyboardInterrupt: 

In [10]:
# Extract gene names from the merged data
gene_names = merged_data.columns[1:]  # Assuming 'cell' is the first column and all others are gene names

# Convert gene names to strings (in case some aren't strings already)
gene_names = [str(gene) for gene in gene_names]

# Save gene names for non-persisters (this will be all gene names)
with open('non_persister_gene_names.txt', 'w') as file:
    for gene in gene_names:
        file.write(f"{gene}\n")

# Save gene names for persisters (this will be all gene names)
with open('persister_gene_names.txt', 'w') as file:
    for gene in gene_names:
        file.write(f"{gene}\n")

print("Gene names for non-persisters saved to 'non_persister_gene_names.txt'")
print("Gene names for persisters saved to 'persister_gene_names.txt'")


Gene names for non-persisters saved to 'non_persister_gene_names.txt'
Gene names for persisters saved to 'persister_gene_names.txt'


In [12]:
# Load the trained model
model = tf.keras.models.load_model('final_cnn_rnn_model.keras')

# Get the feature columns from merged_data used in training
train_columns = merged_data.drop(columns=['cell', 'sample_name', 'sample_type']).columns  # Exclude the target column

# Use the same number of PCA components as used in the model
n_components = model.input_shape[1]

# Independent datasets
independent_datasets = {
    "GSM4869653_xOsiCriz":"/scratch/project_2010751/GSE138693_GSM4116265_PC9_1_invitro_normalized_untreated.csv",
    
}

# Process each independent dataset and make predictions
results = {}
for dataset_name, file_path in independent_datasets.items():
    print(f"\nProcessing dataset: {dataset_name}")
    counts = predict_independent_dataset(model, file_path, train_columns, n_components)
    results[f"Transfer Learning - {dataset_name}"] = counts
    print(f"{dataset_name} - Transfer Learning Predictions: {counts}")

# Print final results
print("\nFinal Prediction Results:")
for model_dataset, counts in results.items():
    print(f"{model_dataset}: Predictions = {counts}")



Processing dataset: GSM4869653_xOsiCriz
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step
GSM4869653_xOsiCriz - Transfer Learning Predictions: Counter({0: 1274})

Final Prediction Results:
Transfer Learning - GSM4869653_xOsiCriz: Predictions = Counter({0: 1274})


In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, LSTM, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Function to load data in chunks
def load_data_in_chunks(file_path, chunk_size=1000):
    return pd.read_csv(file_path, engine='python', encoding='utf-8', chunksize=chunk_size)

# Preprocessing function
def preprocess_data(metadata_path, data_path, additional_datasets=None):
    metadata_df = pd.read_csv(metadata_path, sep='\t')
    scRNA_data = pd.concat(load_data_in_chunks(data_path)).transpose()

    metadata_df.index.rename('cell', inplace=True)
    metadata_df.reset_index(inplace=True)
    metadata_df['cell'] = metadata_df['cell'].astype(str).str.split('-').str[0].str.strip().str.upper()
    scRNA_data.reset_index(inplace=True)
    scRNA_data.rename(columns={'index': 'cell'}, inplace=True)
    scRNA_data['cell'] = scRNA_data['cell'].astype(str).str.split('-').str[0].str.strip().str.upper()

    common_cells = set(metadata_df['cell']).intersection(set(scRNA_data['cell']))

    if len(common_cells) == 0:
        raise ValueError("No common cells found between metadata and scRNA data.")

    filtered_metadata_df = metadata_df[metadata_df['cell'].isin(common_cells)]
    filtered_scRNA_data = scRNA_data[scRNA_data['cell'].isin(common_cells)]

    merged_data = pd.merge(filtered_scRNA_data, filtered_metadata_df[['cell', 'sample_name', 'sample_type']], on='cell', how='inner')

    if additional_datasets:
        for file_path, sample_type in additional_datasets:
            additional_data = pd.concat(load_data_in_chunks(file_path)).transpose()
            additional_data.reset_index(inplace=True)
            additional_data.rename(columns={'index': 'cell'}, inplace=True)
            additional_data['cell'] = additional_data['cell'].astype(str).str.split('-').str[0].str.strip().str.upper()
            additional_data['sample_type'] = sample_type
            merged_data = pd.concat([merged_data, additional_data], ignore_index=True, sort=False)

    gene_names = merged_data.columns[1:-2]  # Assuming the first column is 'cell', and last two are 'sample_name' and 'sample_type'

    X = merged_data.drop(columns=['cell', 'sample_name', 'sample_type'], errors='ignore')
    X.columns = X.columns.astype(str)
    X = X.apply(pd.to_numeric, errors='coerce')

    imputer = SimpleImputer(strategy='mean')
    X = imputer.fit_transform(X)

    # Update labels for the merged_data based on specific sample names
    persister_samples = ["7", "3", "14_high", "14_med", "14_low"]

    y_model1 = np.where(merged_data['sample_type'].isin(persister_samples), 1, 0)

    # Debug statements to check the unique values and their counts
    print("Unique values in y_model1:", np.unique(y_model1, return_counts=True))

    # Ensure there are samples for both classes
    if len(np.unique(y_model1)) != 2:
        print("Not enough classes in y_model1 for Model 1")
        return None, None, None, None

    return X, y_model1, merged_data, gene_names

# CNN-RNN Hybrid Model
def create_cnn_rnn_model(input_shape):
    model = Sequential([
        Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.3),
        
        LSTM(128, return_sequences=True),
        Dropout(0.3),
        
        LSTM(64),
        Dropout(0.3),
        
        Dense(64, activation='relu'),
        Dropout(0.3),
        
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Learning Rate Scheduler
def lr_scheduler(epoch, lr):
    if epoch > 10:
        lr = lr * 0.85
    return lr

# Main Code Execution
metadata_path = '/scratch/project_2010376/GSE150949_metaData_with_lineage.txt'
data_path = '/scratch/project_2010376/GSE150949_pc9_count_matrix.csv'
additional_datasets = [
    ('/scratch/project_2010751/GSM8118468_ob_treated.csv', 'Persister'),
    ('/scratch/project_2010751/GSE134836_GSM3972651_PC9D0_untreated_filtered.csv', 'Non-Persister'),
    ('/scratch/project_2010751/GSE138693_GSM4116265_PC9_1_invitro_normalized_untreated.csv', 'Non-Persister'),
    ('/scratch/project_2010751/GSE134839_GSM3972657_PC90D0_untreated.dge.csv', 'Non-Persister')
]

# Preprocess the data
X, y, merged_data, gene_names = preprocess_data(metadata_path, data_path, additional_datasets)

# Proceed only if both classes are present
if X is not None and y is not None:
    # Apply PCA for dimensionality reduction
    pca = PCA(n_components=100)
    X_reduced = pca.fit_transform(X)

    # Apply SMOTE for balancing the dataset
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_reduced, y)

    # Train-validation-test split
    X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
    X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Reshape for CNN input
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_validation = X_validation.reshape(X_validation.shape[0], X_validation.shape[1], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

    # Fix input shape by making it a tuple
    input_shape = (X_train.shape[1], 1)

    # Train the CNN-RNN model
    model = create_cnn_rnn_model(input_shape)

    # Define callbacks
    early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
    model_checkpoint = ModelCheckpoint('best_cnn_rnn_model.keras', save_best_only=True, monitor='val_loss', mode='min')
    lr_schedule = LearningRateScheduler(lr_scheduler)

    # Model training
    history = model.fit(X_train, y_train, validation_data=(X_validation, y_validation),
                        epochs=100, batch_size=64, callbacks=[early_stopping, model_checkpoint, lr_schedule])

    # Evaluate the model on the test set
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"Test Accuracy: {test_accuracy:.2f}")

    # Generate predictions on the test set
    y_pred = (model.predict(X_test) > 0.5).astype(int)

    # Generate the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Plot the confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-Persister', 'Persister'], yticklabels=['Non-Persister', 'Persister'])
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.show()

    # Save the final model
    model.save('final_cnn_rnn_model.keras')
else:
    print("Skipping model training due to insufficient classes in the labeled data.")


2024-08-21 16:55:34.873214: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-21 16:55:36.954816: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-21 16:55:37.363403: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-21 16:55:37.363441: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-21 16:55:40.168024: I tensorflow/core/platform/cpu_feature_gua

MemoryError: Unable to allocate 49.2 GiB for an array with shape (3298585886, 2) and data type int64

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, LSTM, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Function to load data in chunks
def load_data_in_chunks(file_path, chunk_size=1000):
    return pd.read_csv(file_path, engine='python', encoding='utf-8', chunksize=chunk_size)

# Preprocessing function
def preprocess_data(metadata_path, data_path, additional_datasets=None):
    metadata_df = pd.read_csv(metadata_path, sep='\t')
    scRNA_data = pd.concat(load_data_in_chunks(data_path)).transpose()

    metadata_df.index.rename('cell', inplace=True)
    metadata_df.reset_index(inplace=True)
    metadata_df['cell'] = metadata_df['cell'].astype(str).str.split('-').str[0].str.strip().str.upper()
    scRNA_data.reset_index(inplace=True)
    scRNA_data.rename(columns={'index': 'cell'}, inplace=True)
    scRNA_data['cell'] = scRNA_data['cell'].astype(str).str.split('-').str[0].str.strip().str.upper()

    common_cells = set(metadata_df['cell']).intersection(set(scRNA_data['cell']))

    filtered_metadata_df = metadata_df[metadata_df['cell'].isin(common_cells)]
    filtered_scRNA_data = scRNA_data[scRNA_data['cell'].isin(common_cells)]

    merged_data = pd.merge(filtered_scRNA_data, filtered_metadata_df[['cell', 'sample_name', 'sample_type']], on='cell', how='inner')

    if additional_datasets:
        for file_path, sample_type in additional_datasets:
            additional_data = pd.concat(load_data_in_chunks(file_path)).transpose()
            additional_data.reset_index(inplace=True)
            additional_data.rename(columns={'index': 'cell'}, inplace=True)
            additional_data['cell'] = additional_data['cell'].astype(str).str.split('-').str[0].str.strip().str.upper()
            additional_data['sample_type'] = sample_type
            merged_data = pd.concat([merged_data, additional_data], ignore_index=True, sort=False)

    gene_names = merged_data.columns[1:-2]  # Assuming the first column is 'cell', and last two are 'sample_name' and 'sample_type'

    X = merged_data.drop(columns=['cell', 'sample_name', 'sample_type'], errors='ignore')
    X.columns = X.columns.astype(str)
    X = X.apply(pd.to_numeric, errors='coerce')

    imputer = SimpleImputer(strategy='mean')
    X = imputer.fit_transform(X)

    y = merged_data['sample_type']
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

    return X, y, merged_data, label_encoder, gene_names

# CNN-RNN Hybrid Model
def create_cnn_rnn_model(input_shape):
    model = Sequential([
        Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.3),
        
        LSTM(128, return_sequences=True),
        Dropout(0.3),
        
        LSTM(64),
        Dropout(0.3),
        
        Dense(64, activation='relu'),
        Dropout(0.3),
        
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Learning Rate Scheduler
def lr_scheduler(epoch, lr):
    if epoch > 10:
        lr = lr * 0.85
    return lr

# Main Code Execution
metadata_path = '/scratch/project_2010376/GSE150949_metaData_with_lineage.txt'
data_path = '/scratch/project_2010376/normalized_GSE150949_pc9_count.csv'
additional_datasets = [
    ('/scratch/project_2010751/GSM8118468_ob_treated.csv', 'Persister'),
    ('/scratch/project_2010751/GSE134836_GSM3972651_PC9D0_untreated_filtered.csv', 'Non-Persister'),
    ('/scratch/project_2010751/GSE138693_GSM4116265_PC9_1_invitro_normalized_untreated.csv', 'Non-Persister'),
    ('/scratch/project_2010751/GSE134839_GSM3972657_PC90D0_untreated.dge.csv', 'Non-Persister')
]

# Preprocess the data
X, y, merged_data, label_encoder, gene_names = preprocess_data(metadata_path, data_path, additional_datasets)

# Confirm class 0 is non-persister and class 1 is persister
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Encoding Mapping:", label_mapping)

if label_mapping['Non-Persister'] == 0 and label_mapping['Persister'] == 1:
    print("Class 0 corresponds to Non-Persister, and class 1 corresponds to Persister.")
else:
    print("Class 0 corresponds to Persister, and class 1 corresponds to Non-Persister.")

# Apply PCA for dimensionality reduction
pca = PCA(n_components=100)
X_reduced = pca.fit_transform(X)

# Apply SMOTE for balancing the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_reduced, y)

# Extract features for non-persister (class 0) and persister (class 1)
non_persister_features = X_resampled[y_resampled == 0]
persister_features = X_resampled[y_resampled == 1]

# Ensure all gene names are strings
gene_names = [str(gene) for gene in gene_names]

# Save the features to text files with gene names as headers
np.savetxt('cnn_rnn_non_persister_features.txt', non_persister_features, delimiter=',', header=','.join(gene_names), fmt='%.6f', comments='')
np.savetxt('cnn_rnn_persister_features.txt', persister_features, delimiter=',', header=','.join(gene_names), fmt='%.6f', comments='')

print("Features for non-persister (class 0) saved to 'cnn_rnn_non_persister_features.txt'")
print("Features for persister (class 1) saved to 'cnn_rnn_persister_features.txt'")

# Train-validation-test split
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Reshape for CNN input
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_validation = X_validation.reshape(X_validation.shape[0], X_validation.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Fix input shape by making it a tuple
input_shape = (X_train.shape[1], 1)

# Train the CNN-RNN model
model = create_cnn_rnn_model(input_shape)

# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_cnn_rnn_model.keras', save_best_only=True, monitor='val_loss', mode='min')
lr_schedule = LearningRateScheduler(lr_scheduler)

# Model training
history = model.fit(X_train, y_train, validation_data=(X_validation, y_validation),
                    epochs=100, batch_size=64, callbacks=[early_stopping, model_checkpoint, lr_schedule])

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Generate predictions on the test set
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-Persister', 'Persister'], yticklabels=['Non-Persister', 'Persister'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# Save the final model
model.save('final_cnn_rnn_model.keras')

# Function to preprocess independent datasets
def preprocess_independent_data(file_path, train_columns, n_components):
    independent_data = pd.concat(load_data_in_chunks(file_path)).transpose()
    independent_data.reset_index(inplace=True)
    independent_data.rename(columns={'index': 'cell'}, inplace=True)

    if independent_data.shape[0] > independent_data.shape[1]:
        independent_data = independent_data.transpose()

    X_independent = independent_data.drop(columns=['cell', 'sample_name', 'sample_type'], errors='ignore')
    X_independent.columns = X_independent.columns.astype(str)
    train_columns = train_columns.astype(str)

    X_independent = X_independent.reindex(columns=train_columns, fill_value=0)
    X_independent = X_independent.apply(pd.to_numeric, errors='coerce')
    X_independent.fillna(0, inplace=True)

    pca = PCA(n_components=n_components)
    X_independent = pca.fit_transform(X_independent)
    
    return X_independent

# Function to predict on independent datasets
def predict_independent_dataset(model, file_path, train_columns, n_components):
    try:
        X_independent = preprocess_independent_data(file_path, train_columns, n_components)
        X_independent = X_independent.reshape(X_independent.shape[0], X_independent.shape[1], 1)
        predictions = model.predict(X_independent)
        predicted_labels = (predictions > 0.5).astype(int)
        counts = Counter(predicted_labels.flatten())
        return counts
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return Counter()

# Load the trained model
model = tf.keras.models.load_model('final_cnn_rnn_model.keras')

# Get the feature columns from merged_data used in training
train_columns = merged_data.drop(columns=['cell', 'sample_name', 'sample_type']).columns  # Exclude the target column

# Use the same number of PCA components as used in the model
n_components = model.input_shape[1]

# Independent datasets
independent_datasets = {
    "GSE134836_GSM3972651_PC9D0_untreated_filtered.csv": "/scratch/project_2010751/GSE134836_GSM3972651_PC9D0_untreated_filtered.csv",
    "GSM4869650_xCtrl": "/scratch/project_2010376/GSM4869650_xCtrl.dge.csv",
    "new_GSM4869650_xCtrl.dge.csv": "/scratch/project_2010376/new_GSM4869650_xCtrl.dge.csv",
    "GSE138693_GSM4116265_PC9_1_invitro_normalized_untreated.csv": "/scratch/project_2010751/GSE138693_GSM4116265_PC9_1_invitro_normalized_untreated.csv",
    "GSM4869653_xOsiCriz":"/scratch/project_2010751/GSE134839_GSM3972657_PC90D0_untreated.dge.csv",
    "GSE149383_GSM3972669_D0_untreated.dge.csv":"/scratch/project_2010751/GSE149383_GSM3972669_D0_untreated.dge.csv",
    "GSE160244_GSM4869650_day3_untreated.dge.csv":"/scratch/project_2010751/GSE160244_GSM4869650_day3_untreated.dge.csv",
    "GSE160244_GSM4869652_xOsi_day3_dge.csv":"/scratch/project_2010751/GSE160244_GSM4869652_xOsi_day3_dge.csv",
    "GSE260499_GSM8118463_Osi.RDS":"/scratch/project_2010751/GSE260499_GSM8118463_Osi.RDS",
    "normalized_GSE150949_pc9_count.csv":"/scratch/project_2010751/normalized_GSE150949_pc9_count.csv"
}

# Process each independent dataset and make predictions
results = {}
for dataset_name, file_path in independent_datasets.items():
    print(f"\nProcessing dataset: {dataset_name}")
    counts = predict_independent_dataset(model, file_path, train_columns, n_components)
    results[f"Transfer Learning - {dataset_name}"] = counts
    print(f"{dataset_name} - Transfer Learning Predictions: {counts}")

# Print final results
print("\nFinal Prediction Results:")
for model_dataset, counts in results.items():
    print(f"{model_dataset}: Predictions = {counts}")


2024-08-21 21:14:56.783916: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-21 21:14:58.972570: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-21 21:14:59.364475: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-21 21:14:59.364500: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-21 21:15:03.024119: I tensorflow/core/platform/cpu_feature_gua

Label Encoding Mapping: {'Non-Persister': 0, 'Persister': 1}
Class 0 corresponds to Non-Persister, and class 1 corresponds to Persister.
Features for non-persister (class 0) saved to 'cnn_rnn_non_persister_features.txt'
Features for persister (class 1) saved to 'cnn_rnn_persister_features.txt'


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-08-21 22:05:11.414148: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Epoch 1/100
[1m524/524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 51ms/step - accuracy: 0.6585 - loss: 0.6191 - val_accuracy: 0.5427 - val_loss: 0.8375 - learning_rate: 1.0000e-04
Epoch 2/100
[1m524/524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 51ms/step - accuracy: 0.9669 - loss: 0.1056 - val_accuracy: 0.5704 - val_loss: 0.6887 - learning_rate: 1.0000e-04
Epoch 3/100
[1m524/524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 51ms/step - accuracy: 0.9897 - loss: 0.0342 - val_accuracy: 0.5449 - val_loss: 0.6829 - learning_rate: 1.0000e-04
Epoch 4/100
[1m524/524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 50ms/step - accuracy: 0.9924 - loss: 0.0260 - val_accuracy: 0.5017 - val_loss: 0.9314 - learning_rate: 1.0000e-04
Epoch 5/100
[1m524/524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 51ms/step - accuracy: 0.9920 - loss: 0.0272 - val_accuracy: 0.5506 - val_loss: 0.6471 - learning_rate: 1.0000e-04
Epoch 6/100
[1m524/524[0m [32m━━

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
GSM4869653_xOsiCriz - Transfer Learning Predictions: Counter({1: 2468, 0: 26})

Processing dataset: GSE149383_GSM3972669_D0_untreated.dge.csv
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
GSE149383_GSM3972669_D0_untreated.dge.csv - Transfer Learning Predictions: Counter({0: 1365, 1: 239})

Processing dataset: GSE160244_GSM4869650_day3_untreated.dge.csv
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step
GSE160244_GSM4869650_day3_untreated.dge.csv - Transfer Learning Predictions: Counter({0: 4010, 1: 658})

Processing dataset: GSE160244_GSM4869652_xOsi_day3_dge.csv
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
GSE160244_GSM4869652_xOsi_day3_dge.csv - Transfer Learning Predictions: Counter({0: 2835, 1: 837})

Processing dataset: GSE260499_GSM8118463_Osi.RDS
Error processing /scratch/project_2010751/GSE260499_GSM8118463_Osi.RDS: 'utf-8

In [6]:


# Load the trained model
model = tf.keras.models.load_model('final_cnn_rnn_model.keras')

# Get the feature columns from merged_data used in training
train_columns = merged_data.drop(columns=['cell', 'sample_name', 'sample_type']).columns  # Exclude the target column

# Use the same number of PCA components as used in the model
n_components = model.input_shape[1]

# Independent datasets
independent_datasets = {

    "normalized_GSE150949_pc9_count.csv":"/scratch/project_2010376/GSE150949_pc9_count_matrix.csv"
}

# Process each independent dataset and make predictions
results = {}
for dataset_name, file_path in independent_datasets.items():
    print(f"\nProcessing dataset: {dataset_name}")
    counts = predict_independent_dataset(model, file_path, train_columns, n_components)
    results[f"Transfer Learning - {dataset_name}"] = counts
    print(f"{dataset_name} - Transfer Learning Predictions: {counts}")

# Print final results
print("\nFinal Prediction Results:")
for model_dataset, counts in results.items():
    print(f"{model_dataset}: Predictions = {counts}")



Processing dataset: normalized_GSE150949_pc9_count.csv
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step
normalized_GSE150949_pc9_count.csv - Transfer Learning Predictions: Counter({1: 20537, 0: 1630})

Final Prediction Results:
Transfer Learning - normalized_GSE150949_pc9_count.csv: Predictions = Counter({1: 20537, 0: 1630})
