In [2]:
#on importe les bibliotheques que l'on aura besoin
import os
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:
def process(data, standardize):
    sequences= data[:,0]
    mfe=data[:,1].astype(float)
    struct=data[:,2]
    hairpins= data[:,3].astype(int)

    # On encode les nucleotides et les secondary struct
    nuc = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    brack = {'(': 0, '.': 1, ')': 2}

    # Determine les longues maximales
    max_len = max(len(seq) for seq in sequences)
    max_struc = max(len(struc) for struc in struct)

    # On initialize les pads pour les differentes variables
    pad_seq = np.zeros((len(sequences), max_len, 4))
    pad_struct = np.zeros((len(struct), max_struc, 3))
    pad_hairpins = np.zeros((len(hairpins), 5))  # Assuming hairpins range from 1 to 5

    # One-hot encode les sequences
    for i, seq in enumerate(sequences):
        for j, ch in enumerate(seq):
            pad_seq[i, j, nuc[ch]] = 1

    # One-hot encode les structures
    for i, struc in enumerate(struct):
        for j, ch in enumerate(struc):
            pad_struct[i, j, brack[ch]] =1

        # One-hot encode les hairpins
    for i, hp in enumerate(hairpins):
        pad_hairpins[i, hp-1] = 1

    # on standardize les donnees
    if standardize:
        mfe = (mfe - np.mean(mfe)) / np.std(mfe)

    return pad_seq, mfe, pad_struct, pad_hairpins



In [3]:
# dans le tableau suivant je definit un tableau avec
#les noms des dossiers que l'on va importer pour l'analyse
files=['test_2p5M_struct.pkl','train_2p5M_struct.pkl','valid_2p5M_struct.pkl']

In [4]:
# on definit nos datasets
test=[]
train=[]
valid=[]
for i in range(len(files)):
    with open(files[i],'rb') as f:
        data= pickle.load(f)
    if i==0:
        test=np.array(data)
    elif i ==1:
        train= np.array(data)
    else:
        valid= np.array(data)

In [5]:
# on definit nos variables pour chaque split
seq_train, mfe_train, struct_train, pins_train= process(train,True)
seq_valid, mfe_valid, struct_valid, pins_valid= process(valid,True)
seq_test, mfe_test, struct_test, pins_test= process(test,True)

In [6]:
# on definit nos tensors
#sequences
seq_train= torch.tensor(seq_train, dtype= torch.float32)
seq_valid= torch.tensor(seq_valid, dtype= torch.float32)
seq_test= torch.tensor(seq_test, dtype= torch.float32)

#MFE
mfe_train= torch.tensor(mfe_train, dtype= torch.float32)
mfe_valid= torch.tensor(mfe_valid, dtype= torch.float32)
mfe_test= torch.tensor(mfe_test, dtype= torch.float32)

#hairpins
pins_train = torch.tensor(pins_train, dtype=torch.int64)
pins_valid = torch.tensor(pins_valid, dtype=torch.int64)
pins_test = torch.tensor(pins_test, dtype=torch.int64)

#Struct
struct_train= torch.tensor(struct_train, dtype= torch.float32)
struct_valid= torch.tensor(struct_valid, dtype= torch.float32)
struct_test= torch.tensor(struct_test, dtype= torch.float32)

In [7]:
# on redefinit nos splits
# train dataset
train_set= TensorDataset(seq_train, struct_train,mfe_train, pins_train)
train_loader= DataLoader(train_set, batch_size=32, shuffle= True)
# validation set
valid_set= TensorDataset(seq_valid, struct_valid,mfe_valid, pins_valid)
valid_loader= DataLoader(valid_set, batch_size=32)
#test set
test_set= TensorDataset(seq_test, struct_test,mfe_test, pins_test)
test_loader= DataLoader(test_set, batch_size=32)

In [3]:
class GRUModel(nn.Module):
  def __init__(self, input_size_seq, input_size_struct, hidden_size, output_size_mfe, input_size_hairpins, hidden_size_hairpins, output_size_hairpins):
    super(GRUModel, self).__init__()
    self.gru = nn.GRU(input_size=input_size_seq, hidden_size=hidden_size, batch_first=True)

    # on ajuste pour mfe
    self.fc_mfe = nn.Linear(1, output_size_mfe)  # Assumons un feature pour mfe

    self.fc_hairpins1 = nn.Linear(input_size_hairpins, hidden_size_hairpins)


    self.fc_hairpins2 = nn.Linear(hidden_size + output_size_mfe + hidden_size_hairpins, output_size_hairpins)
    #self.fc_struct = nn.Linear(hidden_size, 64)
    #self.fc_struct = nn.LogSoftmax(dim=-1)

    print(f"self.fc_hairpins2 ={self.fc_hairpins2}")
    print(f"self.fc_hairpins1= {self.fc_hairpins1}")
    print(f"self.gru={self.gru}")
    print(f"self.fc_mfe= {self.fc_mfe}")

  def forward(self, seq_input, struct_input, mfe_input, hairpins_input):
    gru_output, _ = self.gru(seq_input)
    batch_size = gru_output.size(0)

    # On reshape le data mfe si necessaire
    if len(mfe_input.size()) == 1:  # si le mfe a seulement une dimension
      mfe_input = mfe_input.view(batch_size, 1)  # Reshape(batch_size, 1)

    # on handle le input
    hairpins_processed = self.fc_hairpins1(hairpins_input)
    hairpins_processed = nn.functional.relu(hairpins_processed)  # activation optionnel peut overfit

    # concat avec le dernier timestep de la sortie Gru
    concatenated = torch.cat((gru_output[:, -1, :], mfe_input, hairpins_processed), dim=1)

    # s'assure que Mfe a les bonnes dimensions
    self.fc_mfe = nn.Linear(129, 1)  # on assuime 129 features base surles analyses precedentes

    hairpins_output = self.fc_hairpins2(concatenated)
    mfe_output = self.fc_mfe(concatenated)  # on utilise le nouveau mfe reshaped
    # prediction structure secondaire
    #struct_output = self.fc_struct(gru_output)  # Apply FC to entire sequence output

   # Reshape to match target format
    #struct_output = struct_output.reshape(struct_output.size(0), -1, self.fc_struct.out_features)

    #print(f"mfe_output= {mfe_output.shape}")
    #print(f"hairpins_output= {hairpins_output.shape}")

    return mfe_output, hairpins_output #, struct_output


In [4]:
# on instancie le modele avec les pertes
model =GRUModel(4,3,64,1,5,64,5)
criterion_mfe= nn.MSELoss()
criterion_pins= nn.CrossEntropyLoss()
criterion_struct=nn.CrossEntropyLoss()
optimizer= optim.Adam(model.parameters(),lr=.001)

self.fc_hairpins2 =Linear(in_features=129, out_features=5, bias=True)
self.fc_hairpins1= Linear(in_features=5, out_features=64, bias=True)
self.gru=GRU(4, 64, batch_first=True)
self.fc_mfe= Linear(in_features=1, out_features=1, bias=True)


In [10]:
# boucle d'entrainement
epochs = 20
for epoch in range(epochs):
    model.train()
    running_loss_mfe = 0.0
    running_loss_hairpins = 0.0
    #running_loss_struct=0.0

    for seq_batch, struct_batch, mfe_batch, hairpins_batch in train_loader:
        optimizer.zero_grad()
        #print(f"seq_batch= {seq_batch.shape}")
        #print(f"struct_batch= {struct_batch.shape}")
        #print(f"mfe= {mfe_batch.shape}")
        #print(f"hairpins_batch= {hairpins_batch.shape}")

        hairpins_batch = hairpins_batch.float()
        mfe_pred, hairpins_pred = model(seq_batch, struct_batch, mfe_batch, hairpins_batch)

        loss_mfe = criterion_mfe(mfe_pred, mfe_batch)
        loss_hairpins = criterion_pins(hairpins_pred, hairpins_batch.squeeze().float())
        #print(f"struct_pred: {struct_pred.shape } struct_batch={struct_batch.shape}")
        #loss_struct= criterion_struct(struct_pred, struct_batch)
        loss = loss_mfe + loss_hairpins

        loss.backward()
        optimizer.step()

        running_loss_mfe += loss_mfe.item() * seq_batch.size(0)
        running_loss_hairpins += loss_hairpins.item() * seq_batch.size(0)
        #running_loss_struct += loss_struct.item() * seq_batch.size(0)

    epoch_loss_mfe = running_loss_mfe / len(train_set)
    epoch_loss_hairpins = running_loss_hairpins / len(train_set)
    #epoch_loss_struct = running_loss_struct / len(train_set)

    print(f"Epoch {epoch+1}/{epochs}, Train MFE Loss: {epoch_loss_mfe:.10f}, Train Hairpins Loss: {epoch_loss_hairpins:.10f}, Train struct Loss: epoch_loss_struct:.10f")
    # boucle de validation
    model.eval()
    valid_loss_mfe = 0.0
    valid_loss_hairpins = 0.0
    #valid_loss_struct=0.0
    with torch.no_grad():
        for seq_batch, struct_batch, mfe_batch, hairpins_batch in valid_loader:
            hairpins_batch = hairpins_batch.float()
            mfe_pred, hairpins_pred = model(seq_batch, struct_batch, mfe_batch, hairpins_batch)
            valid_loss_mfe += criterion_mfe(mfe_pred, mfe_batch).item() * seq_batch.size(0)
            valid_loss_hairpins += criterion_pins(hairpins_pred, hairpins_batch.squeeze().float()).item() * seq_batch.size(0)
            #valid_loss_struct += criterion_struct(struct_pred, struct_batch).item() * seq_batch.size(0)

    valid_loss_mfe /= len(valid_set)
    valid_loss_hairpins /= len(valid_set)
    #valid_loss_struct /= len(valid_set)

    print(f"Epoch {epoch+1}/{epochs}, Valid MFE Loss: {valid_loss_mfe:.10f}, Valid Hairpins Loss: {valid_loss_hairpins:.10f} Valid struct Loss: valid_loss_struct:.10f")
    # Evaluation pour le test set
    model.eval()
    test_loss_mfe = 0.0
    test_loss_hairpins = 0.0
    #test_loss_struct=0.0

    with torch.no_grad():
       for seq_batch, struct_batch, mfe_batch, hairpins_batch in test_loader:
           hairpins_batch = hairpins_batch.float()
           mfe_pred, hairpins_pred = model(seq_batch, struct_batch, mfe_batch, hairpins_batch)
           test_loss_mfe += criterion_mfe(mfe_pred, mfe_batch).item() * seq_batch.size(0)
           test_loss_hairpins += criterion_pins(hairpins_pred, hairpins_batch.squeeze().float()).item() * seq_batch.size(0)
          # test_loss_struct += criterion_struct(struct_pred, struct_batch).item() * seq_batch.size(0)

    test_loss_mfe /= len(test_set)
    test_loss_hairpins /= len(test_set)
   # test_loss_struct /= len(test_set)

    print(f"Test MFE Loss: {test_loss_mfe:.10f}, Test Hairpins Loss: {test_loss_hairpins:.10f}, test Hairpins Loss: test_loss_struct:.10f")

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/20, Train MFE Loss: 1.0073478120, Train Hairpins Loss: 0.0020567416, Train struct Loss: epoch_loss_struct:.10f


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/20, Valid MFE Loss: 1.0058134710, Valid Hairpins Loss: 0.0000275761 Valid struct Loss: valid_loss_struct:.10f
Test MFE Loss: 1.0056267150, Test Hairpins Loss: 0.0000179889, test Hairpins Loss: test_loss_struct:.10f
Epoch 2/20, Train MFE Loss: 1.0055711255, Train Hairpins Loss: 0.0000105815, Train struct Loss: epoch_loss_struct:.10f
Epoch 2/20, Valid MFE Loss: 1.0053697120, Valid Hairpins Loss: 0.0000087013 Valid struct Loss: valid_loss_struct:.10f
Test MFE Loss: 1.0050846651, Test Hairpins Loss: 0.0000084690, test Hairpins Loss: test_loss_struct:.10f
Epoch 3/20, Train MFE Loss: 1.0053209328, Train Hairpins Loss: 0.0000038603, Train struct Loss: epoch_loss_struct:.10f
Epoch 3/20, Valid MFE Loss: 1.0056391748, Valid Hairpins Loss: 0.0000014379 Valid struct Loss: valid_loss_struct:.10f
Test MFE Loss: 1.0052092616, Test Hairpins Loss: 0.0000014237, test Hairpins Loss: test_loss_struct:.10f
Epoch 4/20, Train MFE Loss: 1.0053265739, Train Hairpins Loss: 0.0000026407, Train struct Los

KeyboardInterrupt: 

In [5]:
#On sauve le modele
torch.save(model.state_dict(), 'model_state_dict.pth')