In [1]:
# import the libraries
import numpy as np
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from itertools import product
import matplotlib.pyplot as plt



In [2]:
# import, shuffle, and see the data
ddf = dd.read_csv('/kaggle/input/test-and-train-punpaired-dataset/train_data_p_unp.csv').iloc[:, 1:]
shfl_ddf = ddf.sample(frac = 1, random_state = 42)
shfl_ddf.head()

Unnamed: 0,sequence_id,sequence,experiment_type,dataset_name,reads,signal_to_noise,SN_filter,reactivity_0001,reactivity_0002,reactivity_0003,...,p_unp_197,p_unp_198,p_unp_199,p_unp_200,p_unp_201,p_unp_202,p_unp_203,p_unp_204,p_unp_205,p_unp_206
5420,ab587c2d24eb,GGGAACGACUCGAGUAGAGUCGAAAAGGCCUAGGGCGGGCGGGAAU...,DMS_MaP,DasLabBigLib_OneMil_OpenKnot_Round_2_train_DMS,0,0.0,0,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
16450,309b8583b119,GGGAACGACUCGAGUAGAGUCGAAAAAAGACCUACAUACAUUGUAU...,2A3_MaP,DasLabBigLib_OneMil_Coronavirus_genomes_SARS_r...,162,0.538,0,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4544,4ed1f0557392,GGGAACGACUCGAGUAGAGUCGAAAAGCUGAUUGCCUGGCGGCUAC...,2A3_MaP,DasLabBigLib_OneMil_OpenKnot_Round_2_train_2A3,702,1.553,1,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
17837,93235436cac4,GGGAACGACUCGAGUAGAGUCGAAAAAUUAUUAGAAGGGGGUAAUG...,2A3_MaP,DasLabBigLib_OneMil_OpenKnot_Round_2_train_2A3,3,0.0,0,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
14752,969913b8537c,GGGAACGACUCGAGUAGAGUCGAAAACAUGGUCACCACUGUUGGCG...,2A3_MaP,DasLabBigLib_OneMil_Coronavirus_genomes_SARS_r...,504,0.91,0,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
dms_ddf = ddf.loc[ddf['experiment_type'] == "DMS_MaP"]
twoa3_ddf = ddf.loc[ddf['experiment_type'] == "2A3_MaP"]
dms_ddf.head()

Unnamed: 0,sequence_id,sequence,experiment_type,dataset_name,reads,signal_to_noise,SN_filter,reactivity_0001,reactivity_0002,reactivity_0003,...,p_unp_197,p_unp_198,p_unp_199,p_unp_200,p_unp_201,p_unp_202,p_unp_203,p_unp_204,p_unp_205,p_unp_206
14,70cdf97f6392,GGGAACGACUCGAGUAGAGUCGAAAACCUGGAGGAGGAUGGAACAC...,DMS_MaP,PK50_AltChemMap_NovaSeq_DMS,103505,35.334,1,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
15,bad76a72215c,GGGAACGACUCGAGUAGAGUCGAAAAUAAAUUCAGCGGUAAUUCCU...,DMS_MaP,PK50_AltChemMap_NovaSeq_DMS,12725,8.874,1,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
16,f037cc8df765,GGGAACGACUCGAGUAGAGUCGAAAAUACCGAGAAAGAUCCUCGGU...,DMS_MaP,PK50_AltChemMap_NovaSeq_DMS,60600,24.826,1,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
17,e63fe5ebb069,GGGAACGACUCGAGUAGAGUCGAAAACGGGCAAACUAGAAAAGCCC...,DMS_MaP,PK50_AltChemMap_NovaSeq_DMS,16034,13.426,1,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
18,a4a9f51c982e,GGGAACGACUCGAGUAGAGUCGAAAAGGGUCCAGCCUGGAAAGGCU...,DMS_MaP,PK50_AltChemMap_NovaSeq_DMS,306828,41.648,1,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# Modified version to account for probability of unpaired bases from secondary structure predictor
bases={'A':0, 'C':1, 'G':2, 'U':3 }

def one_hot(string, p_unpaired):

    res = np.zeros((5, 206), # Now there are 5 rows in the input vector, 457 is maximum length
                   dtype=np.float32)
    res[4, :] = 1

    for j in range(len(string)):
        if string[j] in bases: # bases can be 'N' signifying missing: this corresponds to all 0 in the encoding
            res[ bases[ string[j] ], j ]= 1.
        res[4, j] = p_unpaired[j]

    return res

In [5]:
# For p_unpaired data (changed the yield output)
import torch
import torch.nn as nn
import torch.nn.functional as F

class BedPeaksDataset(torch.utils.data.IterableDataset):

    def __init__(self, seq, p_unpaired, reactivities):
        super(BedPeaksDataset, self).__init__()
        self.seq = seq
        self.reactivities = reactivities
        self.p_unpaired = p_unpaired

    def __iter__(self):
        for i in range(len(self.seq)):
            yield(one_hot(self.seq[i], self.p_unpaired[i]), self.reactivities[i]) # positive example

In [6]:
def run_one_epoch(train_flag, dataloader, cnn_1d, optimizer, device="cuda"):

    torch.set_grad_enabled(train_flag)
    cnn_1d.train() if train_flag else cnn_1d.eval()

    losses = []
    accuracies = []

    size = len(dataloader.dataset.seq)
    for batch, (x,y) in enumerate(dataloader): # collection of tuples with iterator
        x = x.float()
        y = y.float()
        (x, y) = ( x.to(device), y.to(device) ) # transfer data to GPU

        output = cnn_1d(x,y) # forward pass
        output = output.squeeze() # remove spurious channel dimension
        loss = F.mse_loss(output, y).float()

        if train_flag:
            loss.backward() # back propagation
            optimizer.step()
            optimizer.zero_grad()
            
            #current = batch * len(x)
            #print(f"[{current:>5d}/{size:>5d}]")

        losses.append(loss.detach().cpu().numpy())

    return( np.mean(losses))

In [7]:
def train_model(cnn_1d, train_dataloader, valid_dataloader, epochs=100, patience=10, verbose = True, lr = 0.001, weight_decay = 0):
    """
    Train a 1D CNN model and record accuracy metrics.
    """
    # Move the model to the GPU here to make it runs there, and set "device" as above
    # TODO CODE
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    cnn_1d.to(device)

    # 1. Make new BedPeakDataset and DataLoader objects for both training and validation data.
    # TODO CODE
    # train_dataset = BedPeaksDataset(train_data, genome, cnn_1d.seq_len)
    # train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=10, num_workers = 0)
    # validation_dataset = BedPeaksDataset(validation_data, genome, cnn_1d.seq_len)
    # validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=1000)

    # 2. Instantiates an optimizer for the model.
    # TODO CODE
    optimizer = torch.optim.Adam(cnn_1d.parameters(), amsgrad=True, lr = lr, weight_decay = weight_decay)

    # 3. Run the training loop with early stopping.
    # TODO CODE
    train_losses = []
    valid_losses = []
    # patience_counter = patience
    best_valid_loss = np.inf
    check_point_filename = 'cnn_1d_checkpoint.pt' # to save the best model fit to date
    for epoch in range(epochs):
        start_time = timeit.default_timer()
        train_loss = run_one_epoch(True, train_dataloader, cnn_1d, optimizer, device)
        valid_loss = run_one_epoch(False, valid_dataloader, cnn_1d, optimizer, device)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        if valid_loss < best_valid_loss:
            torch.save(cnn_1d.state_dict(), check_point_filename)
            best_valid_loss = valid_loss
            patience_counter = patience
        else:
            patience_counter -= 1
            if patience_counter <= 0:
                cnn_1d.load_state_dict(torch.load(check_point_filename)) # recover the best model so far
                break
        elapsed = float(timeit.default_timer() - start_time)
        print("Epoch {} took {:.2f}s. Train loss: {:.4f}., Valid loss: {:.4f}. Patience: {}".format(epoch+1, elapsed, train_loss, valid_loss, patience_counter))

    # 4. Return the fitted model (not strictly necessary since this happens "in place"), train and validation accuracies.
    # TODO CODE
    return(cnn_1d, train_losses, valid_losses)

In [8]:
! pip install dask_ml

Collecting dask_ml
  Downloading dask_ml-2023.3.24-py3-none-any.whl (148 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.7/148.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting dask-glm>=0.2.0 (from dask_ml)
  Obtaining dependency information for dask-glm>=0.2.0 from https://files.pythonhosted.org/packages/8a/8e/cd1502dd2d00d54fb3e10880d4c8cb6699320a239da7a39c9f55044afdee/dask_glm-0.3.2-py2.py3-none-any.whl.metadata
  Downloading dask_glm-0.3.2-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting sparse>=0.7.0 (from dask-glm>=0.2.0->dask_ml)
  Downloading sparse-0.14.0-py2.py3-none-any.whl (80 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.0/81.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Downloading dask_glm-0.3.2-py2.py3-none-any.whl (13 kB)
Installing collected packages: sparse, dask-glm, dask_ml
Successfully installed dask-glm-0.3.2 dask_ml-2023.3.24 sparse-0.14.0


In [9]:
# apply SN-filter (with p_unpaired calculation)
from dask_ml.model_selection import train_test_split
df_sn = ddf[ddf["SN_filter"]==1]

# split into 2A3 MaP and DMS MaP datasets
df_2A3 = df_sn[df_sn["experiment_type"]=="2A3_MaP"]
df_DMS = df_sn[df_sn["experiment_type"]=="DMS_MaP"]

# split into train and test
X_2A3_seq = df_2A3["sequence"]
X_2A3_p_unpaired = df_2A3.loc[:, "p_unp_1":"p_unp_206"]
y_2A3 = df_2A3.loc[:, df_2A3.columns.str.fullmatch("reactivity_\d\d\d\d")]
X_2A3_train_seq, X_2A3_test_seq, X_2A3_train_p_unpaired, X_2A3_test_p_unpaired, y_2A3_train, y_2A3_test = train_test_split(X_2A3_seq, X_2A3_p_unpaired, y_2A3, test_size=0.2, shuffle=True, blockwise=True, random_state=42)
X_2A3_train_seq, X_2A3_validation_seq, X_2A3_train_p_unpaired, X_2A3_validation_p_unpaired, y_2A3_train, y_2A3_validation = train_test_split(X_2A3_train_seq, X_2A3_train_p_unpaired, y_2A3_train, test_size=0.25, shuffle=True, blockwise=True, random_state=42)

X_DMS = df_DMS["sequence"]
y_DMS = df_DMS.loc[:, df_DMS.columns.str.fullmatch("reactivity_\d\d\d\d")]
X_DMS_train, X_DMS_test, y_DMS_train, y_DMS_test = train_test_split(X_DMS, y_DMS, test_size=0.2, shuffle=True, blockwise=True, random_state=42)
X_DMS_train, X_DMS_validation, y_DMS_train, y_DMS_validation = train_test_split(X_DMS_train, y_DMS_train, test_size=0.25, shuffle=True, blockwise=True, random_state=42)

In [10]:
def df_toArray_train(ddf1A, ddf1B, ddf2): # for sequence, p_unpaired, and reactivity
    with ProgressBar():
        subset_columns = []
        for i in range(206):
            subset_columns.append("reactivity_0"+str(i+1).zfill(3))

        # Compute the subset of the Dask DataFrame and convert it to a Pandas DataFrame
        reactivities = ddf2.compute().to_numpy()
        p_unpaired = ddf1B.compute().to_numpy()

        row_means = np.nanmean(reactivities, axis=1)

        # Iterate over each element and replace NaN with the row mean
        for i, row in enumerate(reactivities):
            mask = np.isnan(row)
            reactivities[i, mask] = row_means[i]

        seqs = ddf1A.compute().tolist()

        return seqs, p_unpaired, reactivities

In [11]:
def df_toArray_test(ddf1A, ddf1B, ddf2): # for sequence, p_unpaired, and reactivity
    with ProgressBar():
        subset_columns = []
        for i in range(206):
            subset_columns.append("reactivity_0"+str(i+1).zfill(3))

        # Compute the subset of the Dask DataFrame and convert it to a Pandas DataFrame
        reactivities = ddf2.compute().to_numpy()
        p_unpaired = ddf1B.compute().to_numpy()

        row_means = np.nanmean(reactivities, axis=1)

        seqs = ddf1A.compute().tolist()

        return seqs, p_unpaired, reactivities

In [12]:
# Using 2A3 Data - with p_unpaired
seqs, p_unpaired, reactivities = df_toArray_train(X_2A3_train_seq, X_2A3_train_p_unpaired, y_2A3_train)
train_dataset = BedPeaksDataset(seqs, p_unpaired, reactivities)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=50, num_workers = 0)

seqs, p_unpaired, reactivities = df_toArray_train(X_2A3_validation_seq, X_2A3_validation_p_unpaired, y_2A3_validation)
validation_dataset = BedPeaksDataset(seqs, p_unpaired, reactivities)
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=50, num_workers = 0)

seqs, p_unpaired, reactivities = df_toArray_test(X_2A3_test_seq, X_2A3_test_p_unpaired, y_2A3_test)
test_dataset = BedPeaksDataset(seqs, p_unpaired, reactivities)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=50, num_workers = 0)

[########################################] | 100% Completed | 100.54 s
[########################################] | 100% Completed | 107.36 s
[########################################] | 100% Completed | 101.76 s
[########################################] | 100% Completed | 103.49 s
[########################################] | 100% Completed | 99.97 s
[########################################] | 100% Completed | 100.03 s
[########################################] | 100% Completed | 98.35 s
[########################################] | 100% Completed | 93.56 s
[########################################] | 100% Completed | 95.23 s


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class TransformerModel(nn.Module):
    def __init__(self, input_dim=5, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, n_output_channels=10, dropout=0.2):
        super(TransformerModel, self).__init__()
        self.input_dim = input_dim
        # Embedding layers that will transform input to match d_model size
        self.embedding = nn.Linear(input_dim, d_model)
        self.target_embedding = nn.Linear(1, d_model)

        # Positional Encoding
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.pos_decoder = PositionalEncoding(d_model, dropout)

        # Transformer Encoder
        encoder_layers = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_encoder_layers)

        # Transformer Decoder
        decoder_layers = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layers, num_layers=num_decoder_layers)

        # Decoder layer to bring the output to the desired n_output_channels
        self.decoder = nn.Linear(d_model, n_output_channels)

    def forward(self, src, tgt):
        # Reshape input to [seq_len, batch_size, input_dim]
        src = src.view(-1, src.size(0), 5)  # Reshaped to [src_seq_len, batch_size, input_dim]
        tgt = tgt.view(-1, tgt.size(0), 1)  # Reshaped to [tgt_seq_len, batch_size, input_dim]
        
        # Pass through the embedding layers
        src = self.embedding(src)
        tgt = self.target_embedding(tgt)

        # Add positional encoding
        src = self.pos_encoder(src)
        tgt = self.pos_decoder(tgt)

        # Pass through the Transformer Encoder
        memory = self.transformer_encoder(src)

        # Pass through the Transformer Decoder
        output = self.transformer_decoder(tgt, memory)

        output = torch.mean(output, dim=0)  # Now the shape should be [batch_size, d_model]
    
        # Now reshape it to match the linear layer's input expectation
        output = output.view(output.size(0), -1)  # Shape should be [batch_size, d_model]

        # Pass through the decoder
        output = self.decoder(output)  # This should work as expected now
        return output

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


In [15]:
# https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html
import timeit
transformer = TransformerModel(input_dim=5, d_model=256, nhead=4, num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=1024, n_output_channels=206)

# train model
transformer, train_losses, valid_losses = train_model(transformer, train_dataloader, validation_dataloader, lr = 0.001, weight_decay = 0)
torch.save(transformer.state_dict(), 'model.pth')

# # load model
# transformer.load_state_dict(torch.load("/kaggle/input/transformer-model-2a3/transformer_model.pth"))
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# transformer.to(device)

Epoch 1 took 339.00s. Train loss: 0.1787., Valid loss: 0.1788. Patience: 10
Epoch 2 took 336.85s. Train loss: 0.1771., Valid loss: 0.1783. Patience: 10
Epoch 3 took 337.53s. Train loss: 0.1769., Valid loss: 0.1782. Patience: 10
Epoch 4 took 336.41s. Train loss: 0.1769., Valid loss: 0.1783. Patience: 9
Epoch 5 took 336.33s. Train loss: 0.1768., Valid loss: 0.1783. Patience: 8
Epoch 6 took 335.95s. Train loss: 0.1768., Valid loss: 0.1784. Patience: 7
Epoch 7 took 335.99s. Train loss: 0.1768., Valid loss: 0.1785. Patience: 6
Epoch 8 took 336.04s. Train loss: 0.1768., Valid loss: 0.1784. Patience: 5
Epoch 9 took 335.91s. Train loss: 0.1768., Valid loss: 0.1785. Patience: 4
Epoch 10 took 335.62s. Train loss: 0.1768., Valid loss: 0.1785. Patience: 3
Epoch 11 took 335.60s. Train loss: 0.1768., Valid loss: 0.1784. Patience: 2
Epoch 12 took 335.71s. Train loss: 0.1768., Valid loss: 0.1785. Patience: 1


In [16]:
device = 'cuda'
outputs = []
expected_padded = []
expected_nonpadded = []
transformer.eval()

for (x, y) in test_dataloader:  # iterate over batches
    
    # add y to expected_nonpadded
    expected_nonpadded.append(y.float().numpy())
    
    # add padding to y
    y_pad = y.clone()

    # add padding to y
    row_means = np.nanmean(y_pad, axis=1)
    for i, row in enumerate(y_pad):
        mask = np.isnan(row)
        y_pad[i, mask] = row_means[i]
        
    # add y_pad to expected_padded
    expected_padded.append(y_pad.float().numpy())
    
    if torch.cuda.is_available():
        y_pad = y_pad.to(device).float()
        x = x.to(device).float()

    output = transformer(x.float(), y_pad.float()).squeeze()  # your awesome model here!

    output_np = output.detach().cpu().numpy()
    outputs.append(output_np)

# Concatenate all the outputs and expected results
output_np = np.concatenate(outputs)
expected_pad_np = np.concatenate(expected_padded)
expected_nonpad_np = np.concatenate(expected_nonpadded)

print(output_np)
print(expected_pad_np)
print(expected_nonpad_np)
print(output_np.shape)
print(expected_pad_np.shape)
print(expected_nonpad_np.shape)

# Clip the values to be between 0 and 1
output_np = output_np.clip(0, 1)
expected_pad_np = expected_pad_np.clip(0, 1)
expected_nonpad_np = expected_nonpad_np.clip(0, 1)

# Calculate the Mean Absolute Error
mae1_pad = np.nanmean(np.abs(output_np - expected_pad_np))
mae1_nonpad = np.nanmean(np.abs(output_np - expected_nonpad_np))
print(mae1_pad)
print(mae1_nonpad)

  y_pad[i, mask] = row_means[i]


[[0.43209782 0.44418663 0.4249778  ... 0.43316352 0.4301526  0.43532902]
 [0.43209726 0.4441871  0.42497742 ... 0.43316287 0.43015245 0.43533015]
 [0.4320985  0.4441861  0.42497802 ... 0.43316257 0.43015298 0.43532974]
 ...
 [0.4320859  0.44418937 0.42497274 ... 0.43316427 0.43015945 0.43533313]
 [0.43209854 0.44418657 0.4249777  ... 0.43316197 0.43015358 0.43532962]
 [0.4320985  0.44418755 0.42497745 ... 0.433163   0.4301523  0.4353308 ]]
[[0.43326   0.43326   0.43326   ... 0.43326   0.43326   0.43326  ]
 [0.39229   0.39229   0.39229   ... 0.39229   0.39229   0.39229  ]
 [0.56442   0.56442   0.56442   ... 0.56442   0.56442   0.56442  ]
 ...
 [0.42745   0.42745   0.42745   ... 0.42745   0.42745   0.42745  ]
 [0.4861262 0.4861262 0.4861262 ... 0.4861262 0.4861262 0.4861262]
 [0.42485   0.42485   0.42485   ... 0.42485   0.42485   0.42485  ]]
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... n

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=fe343e39-d2c0-4296-915d-091d9a42752d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>