In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import torch 
from torch.utils.data import Dataset
import torch.nn as nn
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
from torch.optim import Adam

from scipy.stats import spearmanr


import os
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


1. Load the dataset 

In [2]:
#load training data (will be put in a function later)  
path = os.getcwd()
for i in range(3) :

    path = os.path.dirname(path)

path += '/data/'
train_df = pd.read_csv(path + 'train_v1.csv',index_col="seq_id")
train_df = train_df.drop(columns=['data_source'])
train_df = train_df.dropna()
train_df.head()

Unnamed: 0_level_0,protein_sequence,pH,tm
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,49.5


2. Translate Amino-acids to numbers and create a One-Channel array for each sequence

In [3]:
# Count the frequency of each amino acid in the training set
map_dict = {} 
def count_frequency(s):
    counter = Counter(s)
    for p in s  : 
        if p in map_dict : 
            map_dict[p]+=counter[p]
        else : 
            map_dict[p]= counter[p]
        


In [4]:
# fill the map_dict with the frequency of each amino acid  (TODO:  better code )
_ = train_df.apply(lambda row : count_frequency(row['protein_sequence']),axis=1)
map_dict = dict(sorted(map_dict.items(), key=lambda item: item[1],reverse=True))

# Assign a number to each amino acid based on its frequency in decreasing order
i = 1
for key, value in map_dict.items():
    map_dict[key]= i
    i+=1

In [5]:
#map each amino acid to its number
def chr_to_int(s):
    l = []
    for ch in s :
       l.append(map_dict[ch])
    return l


In [6]:
#create a new column in the dataframe with the  amino acids in numerical form (array of numbers from 1 to 20)
train_df['numerical_sequence'] = train_df.apply(lambda row : chr_to_int(row['protein_sequence']),axis=1)

In [7]:
#add a new column that contains the length of each protein sequence (before padding)
train_df['length'] = train_df['protein_sequence'].str.len()

In [8]:
# Function to encode sequences (one hot encoding)
max_length = max(train_df['length'])
def encode_seq(sequence):
    alphabet = ['A', 'C', 'D', 'E', 'F', 'G','H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] # aa letters
    char_to_int = dict((c, i) for i, c in enumerate(alphabet)) 
    integer_encoded = [char_to_int[char] for char in sequence] #each character becomes int
    onehot_encoded = list()
    for value in integer_encoded:
        letter = [0 for _ in range(len(alphabet))] #0 for all letters
        letter[value] = 1 #modify the column corresponding to the letter to 1
        onehot_encoded.append(letter) #put in the array (1 letter = 1 array of 20 columns)
    
    ar =   np.transpose(np.array(onehot_encoded))
    zeros = np.zeros([len(alphabet),max_length - len(integer_encoded)] )
    onehot_encoded = np.concatenate((ar, zeros), axis = 1) #zero padding


    return onehot_encoded #we have all arrays, corresponding to the whole sequence


# new column with encoded sequence (apply for each sequence)
train_df['encoded_sequence'] = train_df['protein_sequence'].apply(lambda x: encode_seq(x))

In [9]:
train_df['encoded_sequence'].shape

(28695,)

: 

In [9]:
#final dataframe 
train_df.head()

Unnamed: 0_level_0,protein_sequence,pH,tm,numerical_sequence,length,encoded_sequence
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7,"[4, 4, 4, 4, 6, 4, 4, 4, 1, 4, 1, 1, 5, 3, 4, ...",341,"[[1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0,..."
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5,"[4, 4, 4, 8, 5, 3, 9, 1, 17, 14, 3, 3, 3, 10, ...",286,"[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5,"[4, 4, 4, 15, 2, 11, 9, 10, 4, 11, 2, 16, 10, ...",497,"[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,..."
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2,"[4, 4, 4, 2, 5, 1, 10, 11, 4, 12, 9, 4, 13, 9,...",265,"[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,..."
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,49.5,"[4, 4, 4, 11, 6, 2, 5, 9, 10, 10, 13, 2, 13, 5...",1451,"[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


: 

In [10]:
max_length = max(train_df['length'])
max_length

8798

# Adding inverted sequences (DISCRADED) 
**Add it to report as an experiment ?**


inverted_df = train_df.copy()

#inver the protein sequence
inverted_df['protein_sequence'] = inverted_df.apply(lambda row : row['protein_sequence'][::-1],axis=1)
inverted_df['numerical_sequence'] = inverted_df.apply(lambda row : chr_to_int(row['protein_sequence']),axis=1)

# merge the original and inverted dataframes (size of dataset is doubled)
train_df = pd.concat([train_df,inverted_df],ignore_index=True)

## 3. Padding with zeros 

In [12]:
## add 0 to numerical_sequence to make all of them the same length
padded_train_df = train_df.copy()
padded_train_df['encoded_sequence'] = train_df.apply(lambda row : row['encoded_sequence'][:] + [0]*(max_length - row['length']),axis=1)

ValueError: operands could not be broadcast together with shapes (20,341) (8457,) 

In [None]:
#Prepare dataframe to Pass to the dataloader (will be put in a function later)
padded_train_df.drop(columns=['protein_sequence'],inplace=True)
padded_train_df['y'] = padded_train_df['tm']
padded_train_df.drop(columns=['tm'],inplace=True)
padded_train_df['numerical_sequence'] = padded_train_df.apply(lambda row : np.array(row['numerical_sequence']),axis=1)

Split to train and validation sets

In [11]:
#splot padded_train_df into train and validation sets (will be put in a function later)
train_df = padded_train_df.sample(frac=0.8,random_state=200)
val_df = padded_train_df.drop(train_df.index)

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)


NameError: name 'padded_train_df' is not defined

si met la transformation dans le dataframe : le kernel dies
Si met avant, dans le panda, les dimensions sont pas les bonnes (peut être transposer ??)

In [52]:
train_df.iloc[:,5].astype('int')

ValueError: setting an array element with a sequence.

## Create 1d conv net

1. get DataLoader from train_dl

In [38]:
class EnzymesDataset(Dataset):
 
  def __init__(self,df):
    
    # the Amino acid sequences as an int array
    sequence= df.iloc[:,5].values
    # numerical : pH and length
    numerical = df.iloc[:,[1,4]].values

    # y : the target (tm)
    y=df.iloc[:,2].values
  
    #creta tensors from the numpy arrays
    self.x_sequence=torch.tensor(sequence)
    self.y=torch.tensor(y,dtype=torch.float32)
    self.num=torch.tensor(numerical,dtype=torch.float32)
   
 
  def __len__(self):
    return len(self.y)
   
  def __getitem__(self,idx):
    return self.x_sequence[idx],self.y[idx] , self.num[idx]


In [39]:
# hyperparameters
batch_size = 128
learning_rate = 0.001 # Suggested for Adam
num_epochs = 10


a faire : apres avoir fait one hot encoding, trouver comment mettre l'info de plusieurs channels dans le dataframe, sans qu'il mette d'erreur sur la taille. 
Voir comment mettre un tableau = 1 aa puis la longueur de la ligne = longueur totale (juste transposer ?)


class MyLoss(torch.nn.Module):
    def __init__(self, batch_size, classes):
        super(MyLoss, self).__init__()
        # define some attributes
        self.y_true_one_hot = torch.FloatTensor(batch_size, classes, length)

    def forward(self, y_pred, y_true):
        with torch.no_grad():
            self.y_true_one_hot.zero_().scatter_(1, y_true, 1) # permet one hot encoding
        # do some operations
        return loss


Or use cross entropy loss ?

In [40]:
# create pytorch dataframes
train_df = EnzymesDataset(train_df)
val_df = EnzymesDataset(val_df)


# create pytorch dataloaders
train_dl = torch.utils.data.DataLoader(train_df, batch_size=batch_size, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_df, batch_size=batch_size, shuffle=True)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [None]:
encoded = (torch.nn.functional.one_hot(train_df.x_sequence, num_classes=- 1))


In [None]:
aa_encoded = encoded.T

In [None]:
train_df.x_sequence

In [None]:
encoded[0][0]

In [None]:
class Conv1D_OneChannel(nn.Module):

    def __init__(self):
        super().__init__()
        self.protein_sequence = nn.Sequential(
            nn.Conv1d(1, 1,kernel_size=8, stride=1),
            nn.ReLU(),
            nn.Dropout(),


        )

        self.prot_seq_one_pooling = nn.Sequential(

            #With pooling only at the end (seen in paper)

            nn.Conv1d(1, 64,kernel_size=8, stride=1),
            nn.ReLU(),
            nn.Dropout(),
            nn.Conv1d(64, 64, 5, stride=1, padding=2), 
            nn.ReLU(), 
            nn.Dropout(),
            nn.Conv1d(64, 64, 5, stride=1, padding=2), 
            nn.ReLU(), 
            nn.Dropout(),
            nn.Conv1d(64, 64, 5, stride=1, padding=2), 
            nn.ReLU(), 
            nn.Dropout(),
            nn.Conv1d(64, 64, 5, stride=1, padding=2), 
            nn.ReLU(), 
            nn.Dropout(),
            nn.Conv1d(64, 32, 5, stride=1, padding=1), 
            nn.ReLU(), 
            nn.Dropout(),
            nn.AdaptiveAvgPool1d(32), #argument = output size 
            nn.Conv1d(32, 1, 5, stride=1, padding=1), 
            nn.ReLU(), 
            nn.Dropout(),


        )
        self.numerical = nn.Sequential(
            nn.Linear(2, 64),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(64, 64),
            nn.ReLU(),
        )
        self.mlp = nn.Sequential(
            nn.Linear(94, 64),#input devrait être 32 + 64 plutôt non si on utilise MaxPoolId(2)? (était marqué 128 en input avant) Comme on fait le pooling
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(64, 1),
        )

    def forward(self, x,y):
        x = self.prot_seq_one_pooling(x.float())
        y = self.numerical(y)
       
        x = torch.cat((x.squeeze(1), y), 1)
        x = self.mlp(x)
        return x

In [None]:
model = Conv1D_OneChannel()

In [None]:

optimizer = Adam(model.parameters(), lr=learning_rate)
# defining the loss function
criterion = nn.MSELoss()
# checking if GPU is available
if torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()
    optimizer = optimizer.cuda()
    


In [None]:
def train_epoch(model, optimizer, criterion, train_loader, epoch):
    model.train()
    rho = 0 
    for batch_idx, (seq, target,num) in enumerate(train_loader):
        if torch.cuda.is_available():
            seq = seq.cuda()
            target = target.cuda()
            num = num.cuda()
        optimizer.zero_grad()
        output = model(seq.unsqueeze(1),num)
        loss = criterion(output.squeeze(), target)
        loss.backward()
        optimizer.step()
        # calculate Spearman's rank correlation coefficient
        p, _ = spearmanr(target.cpu().detach().numpy(), output.cpu().detach().numpy())
        rho += p
        
        print(
            f"Train Epoch: {epoch}-{batch_idx:03d} "
            f"batch_loss={loss.item():0.2e} "
            
        )

    rho = rho / len(train_loader)
    return loss.item() , rho

In [None]:
def test_epoch(model, criterion, test_loader):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for seq, target,num in test_loader:
            if torch.cuda.is_available():
                seq = seq.cuda()
                target = target.cuda()
                num = num.cuda()
            output = model(seq.unsqueeze(1),num)
            test_loss += criterion(output.squeeze(), target).item()  # sum up batch loss
           # calculate pearson correlation 
            p, _ =  spearmanr(target.cpu().detach().numpy(), output.cpu().detach().numpy())
            rho += p
            

    test_loss /= len(test_loader)
    rho = rho / len(test_loader)
    print(
        f"Test set: Average loss: {test_loss:0.2e} "
    )

    return test_loss ,rho

In [None]:
# train and test the model (save it after each epoch)
train_loss_history = []
test_loss_history = []
train_rho_history = []
test_rho_history = []
for epoch in range(1, num_epochs + 1):
    train_loss , rho_train = train_epoch(
        model, optimizer, criterion, train_dl, epoch
    )
    train_loss_history.append(train_loss)
    train_rho_history.append(rho_train)

    
    
    test_loss , rho_test = test_epoch(model, criterion, val_df)
    test_loss_history.append(test_loss)
    test_rho_history.append(rho_test)
    
    torch.save(model.state_dict(), f"1-Conv1d_OneChannel_model_{epoch}.pth")


In [None]:
#create loss plot

plt.plot(train_loss_history, label='train loss')
plt.plot(test_loss_history, label='test loss')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.title(' train and test MSE Loss')
plt.legend()
plt.savefig('plots/1-conv1d_OneChannel-Loss.png')


In [None]:
plt.plot(train_rho_history, label='train rho')
plt.plot(test_rho_history, label='test rho')
plt.xlabel('Epoch')
plt.ylabel('rho')
plt.title(' Spearman\'s rank correlation coefficient')
plt.legend()
plt.savefig('plots/1-conv1d_OneChannel-rho.png')