In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import warnings
warnings.filterwarnings('ignore')

# Seed both the NumPy and the TensorFlow random generators with one shared
# value so that repeated runs of this script are comparable.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 1. Load data
# Three pre-split CSV files are read from the current working directory.
train_df = pd.read_csv('tox21_train.csv')
val_df = pd.read_csv('tox21_val.csv')
test_df = pd.read_csv('tox21_test.csv')

print(f"Train set size: {train_df.shape}")
print(f"Validation set size: {val_df.shape}")
print(f"Test set size: {test_df.shape}")

# 2. Get task names
# Every column other than the molecule identifiers is treated as a label
# column. Selecting by name (rather than by position with columns[2:]) keeps
# this correct regardless of where 'smiles' and 'mol_id' sit in the CSV.
# errors='ignore' tolerates a file that has no 'mol_id' column.
tasks = train_df.columns.drop(['smiles', 'mol_id'], errors='ignore')
print(f"Prediction tasks: {list(tasks)}")

# 3. Process SMILES strings
# Create character-level tokenizer.
# lower=False is required: SMILES is case-sensitive (an uppercase atom symbol
# such as 'C' is aliphatic while lowercase 'c' is aromatic), and the
# Tokenizer default lower=True would collapse both onto the same token.
smiles_tokenizer = Tokenizer(char_level=True, lower=False)
smiles_tokenizer.fit_on_texts(train_df['smiles'])
print(f"Vocabulary size: {len(smiles_tokenizer.word_index) + 1}")  # +1 for reserved index 0

# Convert SMILES strings to sequences of integers.
# The vocabulary comes from the training split only.
train_sequences = smiles_tokenizer.texts_to_sequences(train_df['smiles'])
val_sequences = smiles_tokenizer.texts_to_sequences(val_df['smiles'])
test_sequences = smiles_tokenizer.texts_to_sequences(test_df['smiles'])

# Get maximum sequence length for padding (across all three splits, so no
# sequence is truncated by pad_sequences below)
max_length = max(
    len(seq)
    for split in (train_sequences, val_sequences, test_sequences)
    for seq in split
)
print(f"Max SMILES sequence length: {max_length}")

# Pad sequences to uniform length
X_train = pad_sequences(train_sequences, maxlen=max_length)
X_val = pad_sequences(val_sequences, maxlen=max_length)
X_test = pad_sequences(test_sequences, maxlen=max_length)

# 4. Prepare target labels and handle missing values
# For every task, keep only the rows of each split that carry a label for
# that task, and store the resulting feature/label arrays per task.
task_datasets = {}

for task in tasks:
    # Row selectors: True where the task's label is present in that split
    train_mask = train_df[task].notna()
    val_mask = val_df[task].notna()
    test_mask = test_df[task].notna()

    # Padded token sequences restricted to the labelled rows
    task_X_train = X_train[train_mask]
    task_X_val = X_val[val_mask]
    task_X_test = X_test[test_mask]

    # Matching labels as float arrays
    task_Y_train = train_df.loc[train_mask, task].to_numpy().astype(float)
    task_Y_val = val_df.loc[val_mask, task].to_numpy().astype(float)
    task_Y_test = test_df.loc[test_mask, task].to_numpy().astype(float)

    # A task is only usable when every split has at least one labelled row
    if min(len(task_Y_train), len(task_Y_val), len(task_Y_test)) == 0:
        continue

    # Report class balance of the training split
    train_pos = np.sum(task_Y_train)
    train_neg = len(task_Y_train) - train_pos
    print(f"{task}: Positive samples in train = {train_pos}, Negative = {train_neg}, Ratio = {train_pos/len(task_Y_train):.2f}")

    # Keep the filtered arrays for the training stage
    task_datasets[task] = dict(
        X_train=task_X_train, Y_train=task_Y_train,
        X_val=task_X_val, Y_val=task_Y_val,
        X_test=task_X_test, Y_test=task_Y_test,
    )

# 5. Build and train a model for each task
def build_1d_cnn_model(vocab_size, max_length, *, embedding_dim=128,
                       dense_dropout=0.5, learning_rate=0.001):
    """Build and compile a 1D-CNN for binary classification of token sequences.

    The network is an embedding layer followed by three Conv1D +
    BatchNormalization blocks (64/128/256 filters with kernel sizes 3/4/5),
    where the first two blocks are down-sampled with MaxPooling1D and the
    last one is reduced with GlobalMaxPooling1D, then a 128-unit dense layer
    and a single sigmoid output unit.

    Args:
        vocab_size: Number of distinct token ids, including the reserved
            index 0 (passed to the embedding layer as ``input_dim``).
        max_length: Length of the padded input sequences.
        embedding_dim: Size of each token embedding vector.
        dense_dropout: Dropout rate applied after the dense layer.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy loss and
        reporting the metrics ``accuracy`` and ``auc``.
    """
    # (filters, kernel_size) of each convolutional block, in order
    conv_specs = ((64, 3), (128, 4), (256, 5))

    stack = [
        layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
        layers.SpatialDropout1D(0.2),
    ]

    last_block = len(conv_specs) - 1
    for block_idx, (n_filters, kernel) in enumerate(conv_specs):
        stack.append(layers.Conv1D(filters=n_filters, kernel_size=kernel, activation='relu', padding='same'))
        stack.append(layers.BatchNormalization())
        # The final block collapses the sequence axis; earlier ones halve it
        if block_idx == last_block:
            stack.append(layers.GlobalMaxPooling1D())
        else:
            stack.append(layers.MaxPooling1D(pool_size=2))

    stack += [
        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(dense_dropout),
        layers.Dense(1, activation='sigmoid'),
    ]

    model = models.Sequential(stack)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        # The AUC metric is named 'auc' so callbacks can monitor 'val_auc'
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
    )

    return model

# Train and evaluate one independent model per task.
# Results are collected in three dicts keyed by task name.
task_models = {}
task_histories = {}
test_results = {}

vocab_size = len(smiles_tokenizer.word_index) + 1  # +1 for reserved index 0

for task in task_datasets:
    print(f"\nTraining model for {task}...")

    data = task_datasets[task]
    model = build_1d_cnn_model(vocab_size, max_length)

    # Both callbacks track validation AUC (higher is better)
    callbacks_list = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_auc',
            patience=10,
            restore_best_weights=True,
            mode='max'
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_auc',
            factor=0.5,
            patience=5,
            mode='max'
        )
    ]

    # Up-weight the positive class by the negative/positive ratio.
    # Fall back to 1.0 when either class is absent: this avoids a division
    # by zero (no positives) and a zero weight (no negatives).
    n_pos = int(np.sum(data['Y_train'] == 1))
    n_neg = int(np.sum(data['Y_train'] == 0))
    pos_weight = n_neg / n_pos if n_pos > 0 and n_neg > 0 else 1.0

    history = model.fit(
        data['X_train'], data['Y_train'],
        epochs=90,
        batch_size=32,
        validation_data=(data['X_val'], data['Y_val']),
        callbacks=callbacks_list,
        verbose=1,
        class_weight={0: 1.0, 1: pos_weight}
    )

    y_pred = model.predict(data['X_test']).ravel()
    y_pred_binary = (y_pred > 0.5).astype(int)
    accuracy = accuracy_score(data['Y_test'], y_pred_binary)

    if np.unique(data['Y_test']).size < 2:
        # ROC AUC is undefined with a single class and roc_auc_score would
        # raise ValueError, aborting the loop and losing earlier results.
        print(f"Warning: test set for {task} contains a single class; AUC is undefined.")
        auc = float('nan')
        fpr, tpr = np.array([]), np.array([])
    else:
        auc = roc_auc_score(data['Y_test'], y_pred)
        fpr, tpr, _ = roc_curve(data['Y_test'], y_pred)

    test_results[task] = {
        'auc': auc,
        'accuracy': accuracy,
        'roc': (fpr, tpr)
    }

    task_models[task] = model
    task_histories[task] = history

    print(f"{task} Test AUC: {auc:.4f}, Accuracy: {accuracy:.4f}")

# 6. Visualization
# Create results dataframe (one row per task)
results_df = pd.DataFrame({
    'Task': list(test_results.keys()),
    'AUC': [result['auc'] for result in test_results.values()],
    'Accuracy': [result['accuracy'] for result in test_results.values()]
})
mean_auc = results_df['AUC'].mean()

print("\nTest set results:")
print(results_df)
print(f"Average Test AUC: {mean_auc:.4f}")

# Plot training history: one subplot per task on a 4-column grid.
# The grid has at least 3 rows (the original 3x4 layout) and grows when
# there are more than 12 tasks, where a fixed 3x4 grid would raise.
n_cols = 4
n_rows = max(3, -(-len(task_histories) // n_cols))  # ceiling division
plt.figure(figsize=(15, 10 * n_rows / 3))

for i, task in enumerate(task_histories):
    history = task_histories[task].history
    plt.subplot(n_rows, n_cols, i + 1)
    plt.plot(history['auc'], label='Train AUC')
    plt.plot(history['val_auc'], label='Validation AUC')
    plt.title(f'{task} AUC')
    plt.xlabel('Epoch')
    plt.ylabel('AUC')
    plt.legend()

plt.tight_layout()
plt.show()

# Plot bar chart of AUCs with the mean drawn as a horizontal reference line
plt.figure(figsize=(10, 6))
plt.bar(results_df['Task'], results_df['AUC'])
plt.axhline(y=mean_auc, color='r', linestyle='-', label=f'Average AUC: {mean_auc:.4f}')
plt.xlabel('Task')
plt.ylabel('AUC')
plt.title('Test AUC for Each Task')
plt.xticks(rotation=90)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# ROC Curve Plotting
# Overlay the test-set ROC curve of every task on a single figure.
plt.figure(figsize=(10, 8))

for task, result in test_results.items():
    fpr, tpr = result['roc']
    auc = result['auc']
    plt.plot(fpr, tpr, label=f'{task} (AUC = {auc:.3f})')

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('1D-CNN ROC Curves')  # was misspelled '1D-CMM'
plt.legend(loc='lower right')

plt.show()