In [1]:
import numpy as np
import pandas as pd
import os
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, roc_auc_score
import tensorflow as tf
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, roc_auc_score
import gc
# Reset TensorFlow graph
# Clears TensorFlow's default graph through the v1 compatibility API, then
# forces a garbage-collection pass.
# NOTE(review): presumably meant to drop state left over from earlier runs in
# the same kernel — confirm it is still needed.
tf.compat.v1.reset_default_graph()
# Last expression of the cell: the integer returned by gc.collect() is what the
# notebook shows as this cell's output.
gc.collect()

5

Paths and data

In [2]:
# Output locations for the best-model checkpoint and the generated plots.
checkpoint_path = "7000_if_new_best_model.h5"
save_path = "7000_if_new_prediction_plots/"

# NOTE(review): the mutated and non-mutated paths are identical, so both
# classes are read from the same archive — confirm the non-mutated file name.
mutated_file = "/content/E__datasets_processeddata_MUTATION_DATA_TRAINING_15000_1.npz"
nonmutated_file = "/content/E__datasets_processeddata_MUTATION_DATA_TRAINING_15000_1.npz"

# Open the archives through the path variables above so that changing a path
# in one place is enough.
# NOTE(security): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code; only use it with trusted files.
# NOTE(review): mmap_mode appears to have no effect for .npz archives — confirm.
mutated_data = np.load(mutated_file, allow_pickle=True, mmap_mode='r')
nonmutated_data = np.load(nonmutated_file, allow_pickle=True, mmap_mode='r')
csv_file = "cry1realvariations (1).csv"  # Assuming this CSV contains mutation data for anomaly detection

Anomaly detection using Isolation Forest

In [3]:
def load_and_get_anomaly_scores(csv_file, max_rows=6000, contamination=0.01, random_state=42):
    """Score intron-variant allele frequencies with an Isolation Forest.

    Reads ``csv_file``, keeps the rows whose ``variation_type`` contains
    "intron_variant" (case-insensitive, missing values excluded), truncates to
    the first ``max_rows`` rows, and fits an Isolation Forest on the ``AF``
    column treated as a single feature.

    Args:
        csv_file: Path to a CSV with ``variation_type``, ``_displayName`` and
            ``AF`` columns.
        max_rows: Maximum number of filtered rows to use; ``None`` uses all.
        contamination: Passed to ``IsolationForest`` as ``contamination``.
        random_state: Passed to ``IsolationForest`` as ``random_state``.

    Returns:
        A 4-tuple of tensors:
        ``(inverted_anomaly_scores, data_id_array, array_data, predictions)``.
        ``inverted_anomaly_scores`` is the negated ``decision_function``
        output (float32), ``data_id_array`` holds the ``_displayName`` values
        (string), ``array_data`` the ``AF`` values (float32) and
        ``predictions`` the ``IsolationForest.predict`` output (int32).

    Raises:
        ValueError: If no row matches the intron-variant filter.
    """
    df = pd.read_csv(csv_file)
    is_intron = df["variation_type"].str.contains("intron_variant", na=False, case=False)
    selected = df[is_intron].iloc[:max_rows]
    if selected.empty:
        raise ValueError(f"No intron_variant rows found in {csv_file}; cannot fit Isolation Forest.")

    data_id_array = tf.convert_to_tensor(selected['_displayName'].values, dtype=tf.string)
    array_data = tf.convert_to_tensor(selected['AF'].values, dtype=tf.float32)

    # sklearn needs a 2-D numpy array; build the (n_samples, 1) float32 feature
    # column once and reuse it for fit, scoring and prediction.
    features = array_data.numpy().reshape(-1, 1)

    # Isolation Forest anomaly detection
    clf = IsolationForest(contamination=contamination, random_state=random_state)
    clf.fit(features)
    anomaly_scores = clf.decision_function(features)
    predictions = clf.predict(features)
    # Negate the decision-function values before returning them.
    inverted_anomaly_scores = -anomaly_scores

    return (
        tf.convert_to_tensor(inverted_anomaly_scores, dtype=tf.float32),
        data_id_array,
        array_data,
        tf.convert_to_tensor(predictions, dtype=tf.int32),
    )

# Call the function with the CSV file path
# NOTE(review): csv_file is assigned this same value in the "Paths and data"
# cell; this reassignment looks redundant — confirm before removing.
csv_file = "cry1realvariations (1).csv"  # Update this path to your CSV file
# Unpack the four returned tensors: inverted anomaly scores, display-name IDs,
# allele-frequency values, and Isolation Forest predictions.
anomaly_scores, data_id_array, array_data, predictions = load_and_get_anomaly_scores(csv_file)

Get anomaly scores

In [4]:
# Re-runs the full CSV load and Isolation Forest fit, keeping only the first
# returned value (the inverted anomaly scores).
# NOTE(review): anomaly_scores was already assigned by the previous cell's call
# with the same csv_file; this second call looks redundant — confirm before
# removing.
anomaly_scores, _, _, _ = load_and_get_anomaly_scores(csv_file)

In [5]:
def load_data_in_batches(file_path, batch_size=1000, max_rows=6000):
    """Load the first array of an ``.npz`` archive as a batched ``tf.data`` dataset.

    The first ``max_rows`` rows of the archive's first array are converted to a
    float32 tensor, sliced into batches of ``batch_size`` rows, shuffled at
    batch level and prefetched.

    Args:
        file_path: Path to the ``.npz`` archive.
        batch_size: Number of rows per batch.
        max_rows: Maximum number of leading rows to load; ``None`` loads all.

    Returns:
        A ``tf.data.Dataset`` yielding float32 batches.
    """
    print(f"Loading data from {file_path}...")

    # NOTE(security): allow_pickle=True can execute arbitrary code while
    # unpickling object arrays; only use it with trusted files.
    # The context manager closes the archive once the rows have been copied
    # into a tensor (the original left the file handle open).
    with np.load(file_path, allow_pickle=True, mmap_mode='r') as data:
        available_keys = list(data.files)

        # Print the available keys to check
        print(f"Available keys in the .npz file: {available_keys}")

        key = available_keys[0]  # Assuming only one key (e.g., 'arr_0')
        full_data = tf.convert_to_tensor(data[key][:max_rows], dtype=tf.float32)

    # Print shape of loaded data
    print(f"Shape of loaded data: {full_data.shape}")

    batched = tf.data.Dataset.from_tensor_slices(full_data).batch(batch_size)

    # Peek at the first batch BEFORE shuffling: iterating the shuffled dataset
    # would first fill the shuffle buffer just to print one shape.
    for batch in batched.take(1):
        print(f"First batch shape: {batch.shape}")

    # NOTE(review): shuffle is applied after batch, so whole batches are
    # reordered while rows inside a batch keep their order — confirm that
    # batch-level shuffling is intended.
    dataset = batched.shuffle(buffer_size=10000).prefetch(tf.data.AUTOTUNE)

    return dataset

Function to load the first valid batch of sequences and derive the LSTM input shape

In [6]:
def load_sequences(data, label, file_path):
    """Return the first usable batch from ``file_path`` and its LSTM input shape.

    Batches are read through ``load_data_in_batches``. A 2-D batch is expanded
    to ``(samples, 1, features)``; the first batch that is 3-D after that step
    is returned and iteration stops.

    Args:
        data: Unused; kept so existing callers keep working.
        label: Unused; kept so existing callers keep working.
        file_path: Path forwarded to ``load_data_in_batches``.

    Returns:
        ``(encoded_sequences, input_shape)`` where ``input_shape`` is
        ``(encoded_sequences.shape[1], encoded_sequences.shape[2])``.

    Raises:
        ValueError: If no batch has 2 or 3 dimensions.
    """
    encoded_sequences = None
    input_shape = None

    for batch_data in load_data_in_batches(file_path):
        temp_sequences = batch_data  # This is the data for the current batch
        print(f"Batch shape: {temp_sequences.shape}")  # Debugging

        if temp_sequences.ndim == 2:  # If 2D, reshape to 3D for LSTM
            temp_sequences = np.expand_dims(temp_sequences, axis=1)  # (samples, 1, features)

        if temp_sequences.ndim == 3:
            encoded_sequences = temp_sequences
            input_shape = (encoded_sequences.shape[1], encoded_sequences.shape[2])
            print(f"reshaped shape {encoded_sequences.shape}")
            break

        print(f"Skipping: Unexpected shape {temp_sequences.shape}")

    if encoded_sequences is None:
        # The original message had no placeholder and a gap where the file
        # name belongs; name the offending file.
        raise ValueError(f"No valid encoded sequences found in file {file_path}.")

    return encoded_sequences, input_shape

Load data

In [None]:
# Build one batched tf.data dataset per class via load_data_in_batches.
# NOTE(review): mutated_file and nonmutated_file are assigned the same path in
# the "Paths and data" cell, so both datasets read the same archive — confirm.
mutated_sequences = load_data_in_batches(mutated_file)
nonmutated_sequences = load_data_in_batches(nonmutated_file)

Loading data from /content/E__datasets_processeddata_MUTATION_DATA_TRAINING_15000_1.npz...
Available keys in the .npz file: ['arr_0']
Shape of loaded data: (6000, 410000)
First batch shape: (1000, 410000)
Loading data from /content/E__datasets_processeddata_MUTATION_DATA_TRAINING_15000_1.npz...
Available keys in the .npz file: ['arr_0']


In [8]:
# Class targets: 1.0 for every mutated sample, 0.0 for every non-mutated one
# (6000 float32 entries each).
mutated_labels = tf.ones(shape=[6000], dtype=tf.float32)
nonmutated_labels = tf.zeros(shape=[6000], dtype=tf.float32)

Step 1: Generate sequential placeholder mutation IDs (`NonMut_0`, `NonMut_1`, …) for non-mutated data and load the allele frequencies of the mutated data

In [9]:
# nonmutated_sequences is a batched tf.data.Dataset, so len() on it counts
# batches, not samples. Use the label vector's length to get one placeholder
# ID per non-mutated sample.
num_nonmutated_samples = int(nonmutated_labels.shape[0])
nonmutated_data_with_ids = tf.convert_to_tensor([f"NonMut_{i}" for i in range(num_nonmutated_samples)], dtype=tf.string)

# First 6000 AF values straight from the CSV.
# NOTE(review): unlike load_and_get_anomaly_scores, this read does not apply
# the intron_variant filter — confirm which rows are intended here.
allele_frequency_mutated = tf.convert_to_tensor(pd.read_csv(csv_file)['AF'].values[:6000], dtype=tf.float32)


Generate random noise (Gaussian noise with mean 0 and std deviation same as mutated data's AF)

In [10]:
# nonmutated_sequences is a batched tf.data.Dataset, so len() on it counts
# batches, not samples; draw one value per non-mutated sample instead.
nonmutated_sample_count = int(nonmutated_labels.shape[0])
af_std = tf.math.reduce_std(allele_frequency_mutated)

# Zero-mean Gaussian noise with the spread of the mutated allele frequencies.
noise = tf.random.normal([nonmutated_sample_count], mean=0.0, stddev=af_std)
# Synthetic allele frequencies for non-mutated samples: Gaussian with the same
# mean and standard deviation as the mutated allele frequencies.
# NOTE(review): no seed is passed, so these draws differ between runs.
nonmutated_allele_frequency = tf.random.normal([nonmutated_sample_count], mean=tf.math.reduce_mean(allele_frequency_mutated), stddev=af_std)


Step 4: Apply Isolation Forest (note: the cell below fits on `array_data`, the allele frequencies loaded from the CSV, not on the non-mutated data)

In [11]:
  # Fit a fresh Isolation Forest on array_data reshaped to one feature column.
  # NOTE(review): array_data comes from load_and_get_anomaly_scores, which
  # already fits a forest with these settings on the same values — confirm
  # whether this cell should use the non-mutated allele frequencies instead.
  clf = IsolationForest(contamination=0.01, random_state=42)
  clf.fit(array_data.numpy().reshape(-1, 1))

Insert the Isolation Forest anomaly scores into one batch of mutated sequences

In [12]:
mutated_sequences_batch = next(iter(mutated_sequences))  # Extract a single batch of data

# load_data_in_batches yields 2-D batches (samples, features); add a length-1
# time axis so the batch has the (samples, timesteps, features) layout this
# cell requires. Without this the dimension check below always fails on a
# fresh run.
if len(mutated_sequences_batch.shape) == 2:
    mutated_sequences_batch = tf.expand_dims(mutated_sequences_batch, axis=1)
print(f"Shape of mutated_sequences_batch: {mutated_sequences_batch.shape}")

# Check that there are exactly 3 dimensions before indexing into the shape
if len(mutated_sequences_batch.shape) != 3:
    raise ValueError("mutated_sequences_batch does not have 3 dimensions as expected!")

# Get batch size and sequence length
mutated_sequences_batch_size = tf.shape(mutated_sequences_batch)[0]
sequence_length = tf.shape(mutated_sequences_batch)[1]  # Number of timesteps (axis 1)
feature_count = int(mutated_sequences_batch.shape[2])  # Size of the last axis

if sequence_length < 1 or feature_count < 1:
    raise ValueError("Sequence length is less than 1, cannot apply anomaly scores.")

# One score is needed per sequence in the batch.
if int(anomaly_scores.shape[0]) < int(mutated_sequences_batch_size):
    raise ValueError(
        f"Need {int(mutated_sequences_batch_size)} anomaly scores but only "
        f"{int(anomaly_scores.shape[0])} are available."
    )

# Ensure the anomaly scores are reshaped properly
# NOTE(review): the dataset is shuffled and only the leading scores are used,
# so score i is not guaranteed to belong to sequence i — confirm the pairing.
anomaly_scores_reshaped = tf.reshape(anomaly_scores[:mutated_sequences_batch_size], [-1, 1, 1])

# Overwrite the LAST feature of timestep 0 in every sequence with its score.
# The original indexed the feature axis with the timestep count minus one,
# which for a (N, 1, F) batch wrote to feature 0 instead of the last element.
row_ids = tf.range(mutated_sequences_batch_size)
scatter_indices = tf.stack(
    [
        row_ids,
        tf.zeros_like(row_ids),
        tf.fill(tf.shape(row_ids), feature_count - 1),
    ],
    axis=1,
)
mutated_sequences_with_anomalies = tf.tensor_scatter_nd_update(
    mutated_sequences_batch,
    scatter_indices,
    tf.reshape(anomaly_scores_reshaped, [-1]),  # Flatten the anomaly scores to match the scatter update
)

# Now mutated_sequences_with_anomalies contains the sequences with applied anomaly scores
print(f"Shape of mutated_sequences_with_anomalies: {mutated_sequences_with_anomalies.shape}")

Shape of mutated_sequences_batch: (1000, 1, 410000)
Shape of mutated_sequences_with_anomalies: (1000, 1, 410000)


Concatenate the mutated and non-mutated sequences (with anomalies added to mutated)

In [None]:
# nonmutated_sequences is a batched tf.data.Dataset and cannot be passed to
# tf.concat; take one batch, mirroring how the mutated batch was extracted.
nonmutated_sequences_batch = next(iter(nonmutated_sequences))
if len(nonmutated_sequences_batch.shape) == 2 and len(mutated_sequences_with_anomalies.shape) == 3:
    # Match the (samples, 1, features) layout of the mutated batch.
    nonmutated_sequences_batch = tf.expand_dims(nonmutated_sequences_batch, axis=1)

num_mutated = int(mutated_sequences_with_anomalies.shape[0])
num_nonmutated = int(nonmutated_sequences_batch.shape[0])

X_with_anomalies = tf.concat([mutated_sequences_with_anomalies, nonmutated_sequences_batch], axis=0)
# Trim the label vectors to the number of samples actually concatenated so
# that features and labels have the same length.
y = tf.concat([mutated_labels[:num_mutated], nonmutated_labels[:num_nonmutated]], axis=0)

features_np = X_with_anomalies.numpy()
labels_np = y.numpy()

# Hold out 10% for validation, then 10% of the remainder for testing; both
# splits are stratified on the class label.
x_train, x_val, y_train, y_val = train_test_split(features_np, labels_np, test_size=0.1, random_state=42, stratify=labels_np)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, random_state=42, stratify=y_train)


RNN Model

In [None]:
def rnn_model(input_shape):
    """Build and compile the single-layer LSTM binary classifier.

    Args:
        input_shape: Shape passed to the LSTM layer as ``input_shape``.

    Returns:
        A compiled ``Sequential`` model: LSTM(32, relu) -> Dropout(0.5) ->
        Dense(1, sigmoid, L2 0.01), using AdamW, binary cross-entropy loss
        and the accuracy metric.
    """
    network = Sequential()
    network.add(LSTM(32, input_shape=input_shape, return_sequences=False, activation="relu"))
    network.add(Dropout(0.5))
    network.add(
        Dense(
            1,
            activation='sigmoid',
            kernel_regularizer=tf.keras.regularizers.l2(0.01),
        )
    )
    network.compile(
        optimizer=tf.keras.optimizers.AdamW(),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
def reset_rnn(input_shape):
    """Return a newly built model from ``rnn_model``, i.e. with fresh weights.

    Args:
        input_shape: Forwarded unchanged to ``rnn_model``.
    """
    return rnn_model(input_shape)

Plotting function

In [None]:
def plot_learning_curve(history, save_path):
    """Plot loss and accuracy curves side by side and save the figure.

    Args:
        history: Object whose ``history`` attribute maps 'loss', 'val_loss',
            'accuracy' and 'val_accuracy' to per-epoch value sequences.
        save_path: Destination handed to ``plt.savefig``.
    """
    recorded = history.history
    # Read all four series up front, in the same order as before, so a missing
    # key fails before any figure is created.
    panels = (
        (1, recorded['loss'], recorded['val_loss'],
         'Training Loss', 'Validation Loss', 'Loss Curve', 'Loss'),
        (2, recorded['accuracy'], recorded['val_accuracy'],
         'Training Accuracy', 'Validation Accuracy', 'Accuracy Curve', 'Accuracy'),
    )
    # Epochs are numbered from 1, one per recorded training-loss value.
    epochs = range(1, len(panels[0][1]) + 1)

    plt.figure(figsize=(12, 6))

    # Left panel: loss; right panel: accuracy.
    for position, train_values, val_values, train_label, val_label, title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_values, label=train_label)
        plt.plot(epochs, val_values, label=val_label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.savefig(save_path)

Reset model

In [None]:
# input_shape was never defined at notebook scope (it only exists as a local
# inside load_sequences), so derive it from the training array: every axis
# after the sample axis.
input_shape = tuple(x_train.shape[1:])
model = reset_rnn(input_shape)

Checkpoints and logging

In [None]:
# Keep only the weights with the lowest validation loss seen so far.
checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor="val_loss",
    save_best_only=True,
    mode="min",
)
# Log per-epoch metrics to CSV; append=True adds to an existing log file.
csv_log = CSVLogger(
    filename="training_log_new_7000_if.csv",
    append=True,
)

Fit the model

In [None]:
# Validate on the dedicated validation split. The original passed the test
# split here, so checkpoint selection saw the test data and x_val / y_val were
# never used; the test split must stay untouched until final evaluation.
rnn_fit = model.fit(x_train, y_train, batch_size=16, epochs=20, verbose=1, validation_data=(x_val, y_val), callbacks=[csv_log, checkpoint])

Evaluate the model

In [None]:
# Score the in-memory model on the test split; evaluate returns the loss
# followed by the compiled metric (accuracy).
# NOTE(review): this uses the model object as it stands after fit, not the
# file written by ModelCheckpoint — confirm which one should be reported.
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

Predictions

In [None]:
# Predicted probabilities for the test split, flattened to one value per sample.
# NOTE(review): this rebinds `predictions`, which earlier held the Isolation
# Forest predictions returned by load_and_get_anomaly_scores.
predictions = model.predict(x_test).flatten()
# The header emoji had been mis-decoded into mojibake; restored to the
# intended blue-diamond characters.
print("\n🔹 First 10 Predictions vs Actual Values 🔹")
# Slice instead of range(10) so a test split with fewer than 10 samples does
# not raise IndexError.
for sample_number, (actual, probability) in enumerate(zip(y_test[:10], predictions[:10]), start=1):
    print(f"Sample {sample_number}: Actual = {actual}, Predicted Probability = {probability:.4f}")

Save plots

In [None]:
# Create the plot output directory; exist_ok=True makes re-running this cell
# safe when the directory already exists.
os.makedirs(save_path, exist_ok=True)

Histogram of predictions

In [None]:
# Histogram of the predicted probabilities in 20 bins, saved under save_path
# and then displayed.
plt.figure(figsize=(14, 10))
plt.hist(predictions, bins=20, edgecolor='black', alpha=0.7)
plt.xlabel("Predicted Probability")
plt.ylabel("Count")
plt.title("Distribution of Predicted Probabilities")
# Plain string concatenation: relies on save_path ending with a path separator.
plt.savefig(save_path + "histogram_predictions.png")
plt.show()

Calculate the ROC curve and AUC

In [None]:
# Area under the ROC curve for the predicted probabilities on the test split.
roc_auc = roc_auc_score(y_test, predictions)
# False-positive rates, true-positive rates and the thresholds that produce
# them, used by the ROC plot.
fpr, tpr, thresholds = roc_curve(y_test, predictions)

Plot ROC curve

In [None]:
# ROC curve with the AUC in the legend, plus a dashed diagonal from (0, 0) to
# (1, 1) for reference; the figure is saved under save_path.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
# Plain string concatenation: relies on save_path ending with a path separator.
plt.savefig(save_path + "7000_new_if_roc_curve.png")

In [None]:
# Report the AUC computed from the test-split predictions, to four decimals.
print(f"Area Under the Curve (AUC): {roc_auc:.4f}")

Classification report

In [None]:
# Threshold the predicted probabilities at 0.5 once and reuse the labels for
# both report formats.
predicted_labels = (predictions > 0.5).astype(int)

# Text report for display. The newline after the heading keeps the report's
# column header on its own line instead of glued to the heading.
report_text = classification_report(y_test, predicted_labels)
print("\nClassification Report:\n" + report_text)

# Dictionary form of the same report, consumed by the DataFrame/CSV cells.
report = classification_report(y_test, predicted_labels, output_dict=True)

Convert the dictionary into a DataFrame and transpose it

In [None]:
# Turn the report dictionary into a table and flip it so each report entry
# becomes a row.
report_df = pd.DataFrame(report).T

Save the report to a CSV file

In [None]:
# Write the report table to CSV in the current working directory (the row
# index is included, since to_csv is called with its defaults).
report_df.to_csv("7000_new_if_classification_report.csv")

Plot learning curves

In [None]:
# Plot and save the loss/accuracy curves from the fit history.
# NOTE(review): unlike the other plots, this path has no file extension and is
# not placed under save_path — confirm the intended file name and location.
plot_learning_curve(rnn_fit, save_path="7000_new_if_learning_curve")