In [None]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
from torch.optim import Adam
from torch.utils.data import Dataset

1. Load the dataset 

In [None]:
# Folder holding the input CSV files. Keep the trailing slash: file paths are
# built from this value by plain string concatenation.
data_path = "data/"

In [None]:
# Read the training table indexed by seq_id, discard the data_source column
# and keep only the rows that have no missing value.
train_df = (
    pd.read_csv(data_path + 'train_v1.csv', index_col="seq_id")
    .drop(columns=['data_source'])
    .dropna()
)
train_df.head()

2. Translate amino acids to numbers and create a one-channel array for each sequence

In [None]:
# Count the frequency of each amino acid in the training set
# (running totals, filled in by count_frequency one sequence at a time)
map_dict = {}
def count_frequency(s):
    """Add the amino-acid counts of one sequence to the global ``map_dict``.

    Args:
        s: iterable of amino-acid symbols (typically a string).

    Returns:
        None. ``map_dict`` is updated in place.
    """
    # Walk over the distinct symbols, not over every character: adding
    # counter[p] once per character would count a residue that occurs k
    # times k*k times.
    for residue, occurrences in Counter(s).items():
        map_dict[residue] = map_dict.get(residue, 0) + occurrences
        


In [None]:
# Build the amino-acid -> integer code table.
# Start from an empty table so that re-running this cell never adds new counts
# on top of the codes left in map_dict by a previous run.
map_dict = {}
for sequence in train_df['protein_sequence']:
    count_frequency(sequence)

# Most frequent amino acid first; sorted() is stable, so ties keep the order in
# which the amino acids were first seen.
ranked_amino_acids = sorted(map_dict.items(), key=lambda item: item[1], reverse=True)

# Assign a number to each amino acid based on its frequency in decreasing order.
# Codes start at 1, which leaves 0 free for the zero padding applied later.
map_dict = {
    amino_acid: code
    for code, (amino_acid, _count) in enumerate(ranked_amino_acids, start=1)
}

In [None]:
#map each amino acid to its number
def chr_to_int(s, mapping=None):
    """Translate a protein sequence into its list of integer codes.

    Args:
        s: iterable of amino-acid symbols (typically a string).
        mapping: optional dict from amino-acid symbol to integer code.
            When omitted, the notebook-level ``map_dict`` is used.

    Returns:
        A list with one integer code per symbol of ``s``, in order.

    Raises:
        KeyError: if a symbol of ``s`` is missing from the mapping.
    """
    # Look the symbols up in the code table. Indexing the builtin ``map``
    # (as the earlier version did) raises TypeError on the first symbol.
    codes = map_dict if mapping is None else mapping
    return [codes[ch] for ch in s]


In [None]:
# Encode every protein sequence as a list of integer amino-acid codes
# (numbers from 1 to 20) and store it in a new 'numerical_sequence' column.
train_df['numerical_sequence'] = train_df['protein_sequence'].apply(chr_to_int)

In [None]:
# Number of residues in each protein sequence, measured before any padding
# is applied, stored in a new 'length' column.
sequence_lengths = train_df['protein_sequence'].str.len()
train_df['length'] = sequence_lengths

In [None]:
# Preview of the dataframe after encoding: first five rows
train_df.head(n=5)

In [None]:
# Length of the longest sequence; int() keeps it a plain Python integer
max_length = int(train_df['length'].max())
max_length

# Adding inverted sequences (DISCARDED)
**Add it to the report as an experiment?**

In [None]:

# Data augmentation: every sequence is also added read back-to-front.
# NOTE(review): this runs before the train/validation split, so a sequence and
# its reversed copy can land on opposite sides of the split - confirm that
# this is intended.
inverted_df = train_df.copy()

# reverse each protein sequence, then re-encode the reversed string
inverted_df['protein_sequence'] = inverted_df['protein_sequence'].str[::-1]
inverted_df['numerical_sequence'] = inverted_df['protein_sequence'].apply(chr_to_int)

# stack the original rows on top of the reversed ones (dataset size doubles,
# row labels are renumbered from 0)
train_df = pd.concat([train_df, inverted_df], ignore_index=True)

## 3. Padding with zeros 

In [None]:
## Right-pad every numerical_sequence with zeros up to max_length
def _pad_row(row):
    """Return the row's encoded sequence followed by the zeros it is missing."""
    missing = max_length - row['length']
    return row['numerical_sequence'] + [0] * missing

padded_train_df = train_df.copy()
padded_train_df['numerical_sequence'] = train_df.apply(_pad_row, axis=1)

In [None]:
# Shape the dataframe for the dataloader (will be put in a function later):
# remove the raw strings and move the target 'tm' to a trailing column 'y'.
# The resulting column order matters because the dataset class reads columns
# by position.
padded_train_df = (
    padded_train_df
    .drop(columns=['protein_sequence'])
    .assign(y=lambda frame: frame['tm'])
    .drop(columns=['tm'])
)
# turn each padded list into a numpy array
padded_train_df['numerical_sequence'] = padded_train_df['numerical_sequence'].apply(
    lambda seq: np.array(seq)
)

Split into train and validation sets

In [None]:
# Split padded_train_df into train and validation sets (will be put in a function later):
# a reproducible random 80% of the rows for training, the remaining rows for validation
train_df = padded_train_df.sample(frac=0.8, random_state=200)
in_train = padded_train_df.index.isin(train_df.index)
val_df = padded_train_df.loc[~in_train]


## Create 1d conv net

1. Build the PyTorch Dataset and the DataLoaders from the train and validation DataFrames

In [None]:
class EnzymesDataset(Dataset):
  """Torch dataset that yields ``(sequence, target, numerical)`` per row.

  The dataframe passed in must provide:
    * a 'numerical_sequence' column holding equal-length integer sequences,
    * the two numerical features (pH and length) in positional columns 0 and 2,
    * the target (tm) in positional column 3.
  """

  def __init__(self,df):
    """Convert the dataframe columns into tensors once, up front.

    Args:
        df: dataframe laid out as described in the class docstring.
    """
    # numerical : pH and length
    numerical = df.iloc[:,[0,2]].values

    # y : the target (tm)
    y = df.iloc[:,3].values

    # the amino-acid sequences as one 2-D integer array (rows x sequence length).
    # The rows are stacked with numpy first: giving the pandas Series itself to
    # torch.tensor makes torch probe it with series[0], a label lookup that
    # fails whenever label 0 is not part of this split, and converts element
    # by element (slow) when it does work.
    rows = df['numerical_sequence'].to_numpy()
    if len(rows):
      sequence = np.stack([np.asarray(row) for row in rows])
    else:
      sequence = np.empty((0, 0), dtype=np.int64)

    # create tensors from the numpy arrays
    self.x_sequence = torch.tensor(sequence)
    self.y = torch.tensor(y,dtype=torch.float32)
    self.num = torch.tensor(numerical,dtype=torch.float32)

  def __len__(self):
    """Number of rows in the dataset."""
    return len(self.y)

  def __getitem__(self,idx):
    """Return ``(sequence, target, numerical features)`` for row ``idx``."""
    return self.x_sequence[idx],self.y[idx] , self.num[idx]


In [None]:
# Training configuration
num_epochs = 10        # number of passes over the training loader
batch_size = 128       # samples per batch in both dataloaders
learning_rate = 1e-3   # Suggested for Adam


In [None]:
# Wrap both dataframes in the torch dataset class (the names are re-bound, so
# from here on train_df / val_df are datasets, not dataframes)
train_df, val_df = EnzymesDataset(train_df), EnzymesDataset(val_df)

# Both loaders share the same settings: shuffled mini-batches of batch_size rows
loader_options = dict(batch_size=batch_size, shuffle=True)
train_dl = torch.utils.data.DataLoader(train_df, **loader_options)
val_dl = torch.utils.data.DataLoader(val_df, **loader_options)

In [None]:
class Conv1D_OneChannel(nn.Module):
    """1-D convolutional regressor with a side branch for numerical features.

    One branch runs the encoded sequence through a stack of Conv1d layers, a
    second branch runs two numerical features through a small MLP, and the
    two results are concatenated and reduced to a single output value.
    """

    def __init__(self):
        """Create the sub-networks (none of the sizes are configurable)."""
        super().__init__()
        # Single-convolution variant. forward() does not use it; it is still
        # registered, so its weights are part of the module's state_dict.
        self.protein_sequence = nn.Sequential(
            nn.Conv1d(1, 1,kernel_size=8, stride=1),
            nn.ReLU(),
            nn.Dropout(),


        )

        # Sequence branch actually used by forward().
        self.prot_seq_one_pooling = nn.Sequential(

            #With pooling only at the end (seen in paper)

            nn.Conv1d(1, 64,kernel_size=8, stride=1),
            nn.ReLU(),
            nn.Dropout(),
            nn.Conv1d(64, 64, 5, stride=1, padding=2), 
            nn.ReLU(), 
            nn.Dropout(),
            nn.Conv1d(64, 64, 5, stride=1, padding=2), 
            nn.ReLU(), 
            nn.Dropout(),
            nn.Conv1d(64, 64, 5, stride=1, padding=2), 
            nn.ReLU(), 
            nn.Dropout(),
            nn.Conv1d(64, 64, 5, stride=1, padding=2), 
            nn.ReLU(), 
            nn.Dropout(),
            nn.Conv1d(64, 32, 5, stride=1, padding=1), 
            nn.ReLU(), 
            nn.Dropout(),
            nn.AdaptiveAvgPool1d(32), #argument = output size 
            # kernel 5 with padding 1 shortens the pooled length 32 to 30
            nn.Conv1d(32, 1, 5, stride=1, padding=1), 
            nn.ReLU(), 
            nn.Dropout(),


        )
        # Branch for the two numerical features; produces 64 values per sample.
        self.numerical = nn.Sequential(
            nn.Linear(2, 64),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(64, 64),
            nn.ReLU(),
        )
        # Head applied to the concatenated branches.
        self.mlp = nn.Sequential(
            nn.Linear(94, 64),# 94 = 30 (sequence branch) + 64 (numerical branch). Open question from the authors: shouldn't the input rather be 32 + 64 if MaxPool1d(2) is used, given the pooling? (the input size was previously set to 128)
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(64, 1),
        )

    def forward(self, x,y):
        """Compute the prediction for one batch.

        Args:
            x: encoded sequences; cast to float before the convolutions. The
                first Conv1d takes one input channel, so this presumably has
                shape (batch, 1, length) - confirm against the caller.
            y: numerical features, fed to a Linear layer with 2 inputs.

        Returns:
            The output of the final Linear(64, 1) layer.
        """
        x = self.prot_seq_one_pooling(x.float())
        y = self.numerical(y)
       
        # Drop the size-1 channel axis of the sequence branch, then join the
        # two branches along dimension 1.
        x = torch.cat((x.squeeze(1), y), 1)
        x = self.mlp(x)
        return x

In [None]:
# Instantiate the network that is trained below
model = Conv1D_OneChannel()

In [None]:
# Adam over all model parameters, with mean squared error as the loss function
optimizer = Adam(params=model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

# Use the GPU for the model and the loss module whenever one is available
use_gpu = torch.cuda.is_available()
if use_gpu:
    model, criterion = model.cuda(), criterion.cuda()
    


In [None]:
def train_epoch(model, optimizer, criterion, train_loader, epoch):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: network called as ``model(seq.unsqueeze(1), num)``.
        optimizer: optimizer holding the parameters of ``model``.
        criterion: loss called as ``criterion(output, target)``; assumed to
            return the mean loss of the batch.
        train_loader: iterable of ``(seq, target, num)`` tensor batches.
        epoch: epoch number, only used in the progress message.

    Returns:
        float: mean loss per sample over the whole epoch (``nan`` when the
        loader yields no batch), rather than the loss of the last batch alone.
    """
    model.train()

    # Batches are produced on the CPU; send them to wherever the model lives
    # so that training also works once the model has been moved to a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    loss_sum = 0.0
    samples_seen = 0
    for batch_idx, (seq, target, num) in enumerate(train_loader):
        if device is not None:
            seq, target, num = seq.to(device), target.to(device), num.to(device)

        optimizer.zero_grad()
        output = model(seq.unsqueeze(1), num)
        # Give the target the same shape as the output. Comparing a (batch, 1)
        # output with a (batch,) target would broadcast to (batch, batch) and
        # average the wrong pairs.
        loss = criterion(output, target.view_as(output))
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        batch_len = target.size(0)
        loss_sum += batch_loss * batch_len
        samples_seen += batch_len

        print(
            f"Train Epoch: {epoch}-{batch_idx:03d} "
            f"batch_loss={batch_loss:0.2e} "
        )

    return loss_sum / samples_seen if samples_seen else float("nan")

In [None]:
def test_epoch(model, criterion, test_loader):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for seq, target,num in test_loader:
            output = model(seq.unsqueeze(1),num)
            test_loss += criterion(output, target).item()  # sum up batch loss
           
            

    test_loss /= len(test_loader.dataset)

    print(
        f"Test set: Average loss: {test_loss:0.2e} "
    )

    return test_loss

In [None]:
# train and test the model (save it after each epoch)
train_loss_history = []
test_loss_history = []
for epoch in range(1, num_epochs + 1):
    train_loss = train_epoch(model, optimizer, criterion, train_dl, epoch)
    train_loss_history.append(train_loss)

    # Evaluate on the validation DataLoader (val_dl), which yields batches.
    # The dataset object val_df yields single unbatched rows and has no
    # `.dataset` attribute, so it cannot be used here.
    test_loss = test_epoch(model, criterion, val_dl)
    test_loss_history.append(test_loss)

    # one checkpoint per epoch, numbered like the epochs (starting at 1)
    torch.save(model.state_dict(), f"1-Conv1d_OneChannel_model_{epoch}.pth")


In [None]:
#create loss plot
fig, ax = plt.subplots()

# Epochs are numbered from 1 in the training loop; use the same numbering on
# the x axis instead of the default 0-based list positions.
ax.plot(range(1, len(train_loss_history) + 1), train_loss_history, label='train loss')
ax.plot(range(1, len(test_loss_history) + 1), test_loss_history, label='test loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('MSE')
ax.set_title(' train and test MSE Loss')
ax.legend()

# savefig does not create missing folders, so make sure the target exists
os.makedirs('plots', exist_ok=True)
fig.savefig('plots/1-conv1d_OneChannel.png')
