In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import torch 
from torch.utils.data import Dataset
import torch.nn as nn
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
from torch.optim import Adam

from scipy.stats import spearmanr

from sklearn.metrics import mean_squared_error
from sklearn import metrics

import os
from collections import Counter

In [51]:
import blosum as bl
# BLOSUM62 substitution scores. blosum_apply (defined further down) indexes
# this object with the concatenated change1 + change2 residue letters.
matrix = bl.BLOSUM(62)

TODO: optimise if time permits.

1. Load the dataset 

In [52]:
# Load the cleaned training data (to be moved into a helper function later).
# TODO: replace this cwd-relative lookup with a configurable data directory.
path = os.getcwd()
for _level in range(3):
    # climb one directory per iteration: the data folder is looked up three
    # levels above the current working directory
    path = os.path.dirname(path)
path = path + '/data/'

# Columns present in the CSV that are not used by the feature engineering below.
unused_train_columns = [
    'data_source1', 'data_source2',
    'protSeq2',
    'tm1', 'tm2',
    'group1', 'group2',
]
train_df = (
    pd.read_csv(path + 'clean_train_data.csv')
    .drop(columns=unused_train_columns)
    .dropna()
)

# Original author note: dT = 'target' (presumably the label column — confirm).
train_df.head()

Unnamed: 0,protSeq1,operation,position1,position2,change1,change2,pH1,pH2,target
0,MNAFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,replace,2,2,A,D,6.5,6.5,-6.7
1,MNAFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,replace,2,2,A,E,6.5,6.5,-3.9
2,MNAFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,replace,2,2,A,F,6.5,6.5,-1.2
3,MNAFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,replace,2,2,A,G,6.5,6.5,-4.0
4,MNAFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,replace,2,2,A,L,6.5,6.5,2.7


In [53]:
# Load the test mutations indexed by seq_id and discard the columns that are
# not used by the feature engineering below.
unused_test_columns = [
    'data_source',
    'b_factor', 'bFactorAdj',
    'score',
    'position1', 'position2',
]
test_df = (
    pd.read_csv(path + 'test_mutations.csv', index_col="seq_id")
    .drop(columns=unused_test_columns)
)

# Keep only the rows whose operation is 'replace' and whose pH equals 8.
is_replacement = test_df['operation'] == 'replace'
is_ph8 = test_df['pH'] == 8
test_df = test_df[is_replacement & is_ph8]

test_df.head()

Unnamed: 0_level_0,protein_sequence,pH,modif,score_adj,operation,change1,change2
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,16,0.880797,replace,E,L
31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,16,0.880797,replace,K,L
31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,17,0.952574,replace,C,K
31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,17,0.952574,replace,F,K
31395,VPVNPEPDATSVENVALGTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,17,0.880797,replace,G,K


In [54]:
# Keep the sequence length as a feature and discard the raw sequence string.
test_sequence_lengths = test_df['protein_sequence'].str.len()
test_df = (
    test_df
    .drop(columns=['protein_sequence'])
    .assign(length=test_sequence_lengths)
)

In [55]:
# Keep only the rows whose operation is 'replace', mirroring the filter
# applied to the test set when it was loaded.
replace_mask = train_df['operation'].eq('replace')
train_df = train_df.loc[replace_mask]

2. Translate Amino-acids to numbers and create a One-Channel array for each sequence

In [56]:
# Add a column holding the length of each protein sequence (before any
# padding), then discard the raw sequence string.
train_sequence_lengths = train_df['protSeq1'].str.len()
train_df = (
    train_df
    .drop(columns=['protSeq1'])
    .assign(length=train_sequence_lengths)
)



In [57]:
# Encode each mutation by the BLOSUM score of its (change1, change2) residue pair.
# Hypothesis: this substitution score influences the delta Tm.
# A single number then captures which amino acid is changed into which.

def blosum_apply(row):
    """Look up the substitution score for one mutation row.

    Args:
        row: Mapping-like row (e.g. a DataFrame row passed by ``apply`` with
            ``axis=1``) exposing the fields 'change1' and 'change2'; they are
            joined with ``+``, so both are expected to be strings.

    Returns:
        The entry of the module-level ``matrix`` stored under the key formed
        by concatenating ``row['change1']`` and ``row['change2']``.
    """
    residue_pair = row['change1'] + row['change2']
    return matrix[residue_pair]

# Store the row-wise BLOSUM score as the 'dist_mutation' feature on both frames.
for frame in (train_df, test_df):
    frame['dist_mutation'] = frame.apply(blosum_apply, axis=1)


    

In [58]:
# The residue letters are now summarised by dist_mutation, so drop them.
test_df = test_df.drop(columns=['change1', 'change2'])


In [59]:
# The residue letters are now summarised by dist_mutation, so drop them.
train_df = train_df.drop(columns=['change1', 'change2'])

In [60]:
# Preview train_df now that change1/change2 have been replaced by dist_mutation.
train_df.head()

Unnamed: 0,operation,position1,position2,pH1,pH2,target,length,dist_mutation
0,replace,2,2,6.5,6.5,-6.7,164,-2.0
1,replace,2,2,6.5,6.5,-3.9,164,-1.0
2,replace,2,2,6.5,6.5,-1.2,164,-2.0
3,replace,2,2,6.5,6.5,-4.0,164,0.0
4,replace,2,2,6.5,6.5,2.7,164,-1.0


In [61]:
# Preview test_df now that change1/change2 have been replaced by dist_mutation.
test_df.head()

Unnamed: 0_level_0,pH,modif,score_adj,operation,length,dist_mutation
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
31390,8,16,0.880797,replace,221,-3.0
31391,8,16,0.880797,replace,221,-2.0
31393,8,17,0.952574,replace,221,-3.0
31394,8,17,0.952574,replace,221,-3.0
31395,8,17,0.880797,replace,221,-2.0


In [62]:
# Relative position of the mutation: position1 divided by the sequence length.
# Both raw position columns are dropped afterwards.
train_relative_position = train_df['position1'] / train_df['length']
train_df = (
    train_df
    .assign(relative_position=train_relative_position)
    .drop(columns=['position1', 'position2'])
)





In [63]:
# Same feature for the test set, where the mutation position is read from 'modif'.
test_relative_position = test_df['modif'] / test_df['length']
test_df = (
    test_df
    .assign(relative_position=test_relative_position)
    .drop(columns=['modif'])
)

In [64]:
# Preview train_df with the new relative_position column.
train_df.head()

Unnamed: 0,operation,pH1,pH2,target,length,dist_mutation,relative_position
0,replace,6.5,6.5,-6.7,164,-2.0,0.012195
1,replace,6.5,6.5,-3.9,164,-1.0,0.012195
2,replace,6.5,6.5,-1.2,164,-2.0,0.012195
3,replace,6.5,6.5,-4.0,164,0.0,0.012195
4,replace,6.5,6.5,2.7,164,-1.0,0.012195


In [65]:
# Preview test_df with the new relative_position column.
test_df.head()


Unnamed: 0_level_0,pH,score_adj,operation,length,dist_mutation,relative_position
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
31390,8,0.880797,replace,221,-3.0,0.072398
31391,8,0.880797,replace,221,-2.0,0.072398
31393,8,0.952574,replace,221,-3.0,0.076923
31394,8,0.952574,replace,221,-3.0,0.076923
31395,8,0.880797,replace,221,-2.0,0.076923


In [66]:
#Compute difference of pH
train_df['dPH'] = train_df['pH1']-train_df['pH2']
train_df = train_df.drop(columns=['pH1'])
train_df = train_df.drop(columns=['pH2'])

In [67]:
# Preview train_df with the new dPH column.
train_df.head()

Unnamed: 0,operation,target,length,dist_mutation,relative_position,dPH
0,replace,-6.7,164,-2.0,0.012195,0.0
1,replace,-3.9,164,-1.0,0.012195,0.0
2,replace,-1.2,164,-2.0,0.012195,0.0
3,replace,-4.0,164,0.0,0.012195,0.0
4,replace,2.7,164,-1.0,0.012195,0.0


In [68]:
# The test frame has a single pH column, so dPH is taken relative to 8 — the
# value every remaining row has after the pH == 8 filter applied at load time.
test_ph_offset = test_df['pH'] - 8
test_df = (
    test_df
    .assign(dPH=test_ph_offset)
    .drop(columns=['pH'])
)




In [69]:
# score_adj is not used as a feature.
test_df = test_df.drop(columns='score_adj')

In [70]:
# 'operation' holds the same value on every remaining test row, so drop it.
test_df = test_df.drop(columns=['operation'])
# train_df deliberately keeps 'operation' at this point: the modelling cell
# builds X with train_df.drop(columns=['target', 'operation']), which raises
# KeyError if the column is already gone. (A stray, misspelled `trian_df`
# assignment that used to live here had no effect on train_df and was removed.)

In [71]:
# Preview test_df after score_adj and operation were dropped.
test_df.head()

Unnamed: 0_level_0,length,dist_mutation,relative_position,dPH
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
31390,221,-3.0,0.072398,0
31391,221,-2.0,0.072398,0
31393,221,-3.0,0.076923,0
31394,221,-3.0,0.076923,0
31395,221,-2.0,0.076923,0


In [72]:
# Preview train_df; note that it still carries the 'operation' column at this point.
train_df.head()

Unnamed: 0,operation,target,length,dist_mutation,relative_position,dPH
0,replace,-6.7,164,-2.0,0.012195,0.0
1,replace,-3.9,164,-1.0,0.012195,0.0
2,replace,-1.2,164,-2.0,0.012195,0.0
3,replace,-4.0,164,0.0,0.012195,0.0
4,replace,2.7,164,-1.0,0.012195,0.0


In [73]:
# Work on a copy: the name train_df is re-bound to the 80% sample in the split cell below.
df = train_df.copy()



In [74]:
# Renumber the rows 0..n-1 (the old index is discarded, not kept as a column).
df = df.reset_index(drop=True)


In [75]:
# Renumber the test rows 0..n-1.
# NOTE(review): drop=True discards the seq_id index set when the CSV was read —
# confirm it is not needed later (e.g. to write a submission file).
test_df = test_df.reset_index(drop=True)

Split into train and validation sets

In [76]:
# Split df into train and validation sets (to be moved into a function later):
# a seeded random 80% sample for training, the remaining rows for validation.
train_part = df.sample(frac=0.8, random_state=24)
val_part = df.drop(index=train_part.index)

# Renumber both parts from 0 so they can be indexed positionally.
train_df, val_df = (
    part.reset_index(drop=True) for part in (train_part, val_part)
)


In [77]:
# Row counts of the train and validation parts.
print(f'{len(train_df)} {len(val_df)}')

1837 459


If the transformation is applied inside the dataframe, the kernel dies.
If it is applied beforehand, in pandas, the dimensions are wrong (maybe transpose?).

In [78]:
# Display df, the full frame before the train/validation split.
df

Unnamed: 0,operation,target,length,dist_mutation,relative_position,dPH
0,replace,-6.7,164,-2.0,0.012195,0.0
1,replace,-3.9,164,-1.0,0.012195,0.0
2,replace,-1.2,164,-2.0,0.012195,0.0
3,replace,-4.0,164,0.0,0.012195,0.0
4,replace,2.7,164,-1.0,0.012195,0.0
...,...,...,...,...,...,...
2291,replace,12.6,537,-2.0,0.160149,0.0
2292,replace,25.9,537,-2.0,0.160149,0.0
2293,replace,-7.6,537,0.0,0.160149,0.0
2294,replace,-13.3,537,1.0,0.160149,0.0


## Create 1d conv net

1. get DataLoader from train_dl

TODO: after one-hot encoding, find a way to store the multi-channel information in the dataframe without triggering a size error.
See how to store one array per amino acid, with the row length equal to the total sequence length (just transpose?).


class MyLoss(torch.nn.Module):
    """Unfinished sketch of a custom loss that one-hot encodes the targets.

    NOTE(review): not runnable as written — ``length`` (used in ``__init__``)
    and ``loss`` (returned by ``forward``) are never defined inside this class.
    """
    def __init__(self, batch_size, classes):
        """Allocate the buffer that will hold the one-hot encoded targets.

        Args:
            batch_size: Size of the buffer's first dimension.
            classes: Size of the buffer's second dimension, the one that
                ``forward`` scatters into.
        """
        super(MyLoss, self).__init__()
        # define some attributes
        # NOTE(review): `length` is neither a parameter nor a local; unless a
        # global of that name exists this line raises NameError.
        self.y_true_one_hot = torch.FloatTensor(batch_size, classes, length)

    def forward(self, y_pred, y_true):
        """One-hot encode ``y_true`` into the preallocated buffer.

        Args:
            y_pred: Model predictions; not used by the code written so far.
            y_true: Index tensor passed to ``scatter_`` along dimension 1.

        Returns:
            ``loss`` — NOTE(review): never assigned in this method.
        """
        with torch.no_grad():
            self.y_true_one_hot.zero_().scatter_(1, y_true, 1) # builds the one-hot encoding
        # do some operations
        # NOTE(review): the actual loss computation is still a placeholder.
        return loss


Or use cross-entropy loss?

In [79]:
# TODO: consider changing the activation function
# TODO: find out why there is a dimension problem

In [88]:
import xgboost as xgb

# Gradient-boosted tree regressor: 1000 estimators with a maximum tree depth of 6.
model = xgb.XGBRegressor(
    max_depth=6,
    n_estimators=1000,
)


    


In [89]:
# Display the training split that X and y are built from in the next cell.
train_df

Unnamed: 0,operation,target,length,dist_mutation,relative_position,dPH
0,replace,3.6,164,1.0,0.353659,0.0
1,replace,1.6,537,-1.0,0.197393,0.0
2,replace,-1.3,231,-1.0,0.619048,0.0
3,replace,-5.5,455,-3.0,0.336264,0.0
4,replace,-6.9,231,-1.0,0.619048,0.0
...,...,...,...,...,...,...
1832,replace,1.5,455,-1.0,0.081319,0.0
1833,replace,9.7,231,-1.0,0.619048,0.0
1834,replace,1.3,231,-1.0,0.445887,0.0
1835,replace,-6.9,537,0.0,0.255121,0.0


In [90]:
# Feature matrix: every column except the label and the 'operation' tag.
non_feature_columns = ['target', 'operation']
X = train_df.drop(columns=non_feature_columns)
# Regression label.
y = train_df['target']


In [91]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from scipy.stats import spearmanr

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

In [92]:
# Fit on the training portion, then predict the held-out rows
# (fit returns the fitted estimator, so the two calls can be chained).
predictions1 = model.fit(X_train, y_train).predict(X_test)


In [93]:
# Report both error metrics on the held-out rows.
# NOTE(review): the recorded MAPE (~1.7e14) suggests y_test contains values at
# or very near zero, where a percentage error is unbounded — read it with caution.
for metric_label, metric_fn in (
    ('Mean Absolute Error =', mean_absolute_error),
    ('Mean Absolute Percentage Error = ', mean_absolute_percentage_error),
):
    print(metric_label, metric_fn(y_test, predictions1))

Mean Absolute Error = 6.187791511647249
Mean Absolute Percentage Error =  166390228793968.66


In [94]:
# Rank correlation between the true and predicted targets, shown to 3 decimals.
rho, p = spearmanr(y_test, predictions1)
rho_rounded = np.round(rho, 3)
print('Spearman Correlation Coefficient =', rho_rounded)

Spearman Correlation Coefficient = -0.229



# K-fold cross-validation of the 1D conv net: each fold trains a fresh model
# for num_epochs epochs, then evaluates it once on the held-out fold.
# NOTE(review): KFold, EnzymesDataset, Conv1D_OneChannel, num_epochs,
# train_epoch and test_epoch are not defined in the visible notebook.
k_folds = 5
learning_rate = 1e-4
kfold = KFold(n_splits=k_folds, shuffle=True)
dataset = EnzymesDataset(df.reset_index(drop=True))

# One entry per fold: the values from the last training epoch and from the
# fold's single evaluation pass.
train_loss_history, test_loss_history = [], []
train_rho_history, test_rho_history = [], []

for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
    print(f'FOLD {fold}')
    print('--------------------------------')

    # Draw each split's indices in random order, without replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)

    # Both loaders read the same dataset; the samplers restrict them to their split.
    train_dl, val_dl = (
        torch.utils.data.DataLoader(dataset, batch_size=32, sampler=fold_sampler)
        for fold_sampler in (train_subsampler, test_subsampler)
    )

    # Fresh model, optimizer and MSE criterion for every fold.
    model = Conv1D_OneChannel()
    optimizer = Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    if torch.cuda.is_available():
        # move model and criterion to the GPU when one is present
        model, criterion = model.cuda(), criterion.cuda()

    # Only the values returned by the final epoch survive this loop.
    for epoch in range(1, num_epochs + 1):
        train_loss, rho_train = train_epoch(model, optimizer, criterion, train_dl, epoch)

    test_loss, rho_test = test_epoch(model, criterion, val_dl)

    for history, value in (
        (train_loss_history, train_loss),
        (train_rho_history, rho_train),
        (test_loss_history, test_loss),
        (test_rho_history, rho_test),
    ):
        history.append(value)

    print(f'for fold {fold} : \n train_loss :  {train_loss}     test_loss : {test_loss} \n \n')
    
    
    
 


# Train and evaluate the model epoch by epoch, recording the loss and rho of
# each pass (the per-epoch checkpoint save is currently disabled).
train_loss_history, train_rho_history = [], []
test_loss_history, test_rho_history = [], []

for epoch in range(1, num_epochs + 1):
    # one training pass over train_dl
    train_loss, rho_train = train_epoch(model, optimizer, criterion, train_dl, epoch)
    train_loss_history += [train_loss]
    train_rho_history += [rho_train]

    # one evaluation pass over val_dl
    test_loss, rho_test = test_epoch(model, criterion, val_dl)
    test_loss_history += [test_loss]
    test_rho_history += [rho_test]

    #torch.save(model.state_dict(), f"2-Conv1d_OneHot_model_{epoch}.pth")


In [95]:
#test_df = pd.read_csv(path+ 'test.csv',index_col='seq_id')
#test_df['tm']=submission_df['tm'].values
#test_df = test_df.drop(columns=['dPH','length', 'relative_position', 'dist_mutation'])
#test_df.to_csv('Effect_mutation.csv', index=True)