This notebook studies the same features as the notebook "Effect_mutations_alone.ipynb", but with a different method (XGBoost).

In [38]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import torch 
from torch.utils.data import Dataset
import torch.nn as nn
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
from torch.optim import Adam

from scipy.stats import spearmanr

from sklearn.metrics import mean_squared_error
from sklearn import metrics

import os
from collections import Counter
from helpers import*

In [39]:
import blosum as bl
# BLOSUM 62 substitution matrix; blosum_apply looks residue pairs up in it.
matrix = bl.BLOSUM(62)

1. Load the dataset 

In [40]:
# Load the training data (to be moved into a function later).
# TODO: change this path construction — the data folder is taken to be
# three directory levels above the current working directory.
path = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd()))) + '/data/'

# Read the cleaned training file, remove the columns that are not used in this
# notebook, then discard every row that still contains a missing value.
train_df = (
    pd.read_csv(path + 'clean_train_data.csv')
    .drop(columns=[
        'data_source1', 'data_source2',
        'protSeq2', 'tm1', 'tm2', 'group1', 'group2',
    ])
    .dropna()
)

# dT = 'target' (presumably: the 'target' column is the dT value — TODO confirm)
train_df.head()

Unnamed: 0,protSeq1,operation,position1,position2,change1,change2,pH1,pH2,target
0,MNAFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,replace,2,2,A,D,6.5,6.5,-6.7
1,MNAFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,replace,2,2,A,E,6.5,6.5,-3.9
2,MNAFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,replace,2,2,A,F,6.5,6.5,-1.2
3,MNAFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,replace,2,2,A,G,6.5,6.5,-4.0
4,MNAFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,replace,2,2,A,L,6.5,6.5,2.7


In [41]:
# Load the test mutations indexed by seq_id and remove, in one call, the
# columns that are not used below.
test_df = pd.read_csv(path + 'test_mutations.csv', index_col="seq_id").drop(
    columns=['data_source', 'b_factor', 'bFactorAdj', 'score', 'position1', 'position2']
)

test_df.head()

Unnamed: 0_level_0,protein_sequence,pH,modif,score_adj,operation,change1,change2
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,16,0.880797,replace,E,L
31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,16,0.880797,replace,K,L
31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,16,0.999955,delete,K,
31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,17,0.952574,replace,C,K
31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,17,0.952574,replace,F,K


2. Keep the features we need

In [42]:
# Replace the raw protein sequence by its length (number of characters).
test_df = (
    test_df
    .assign(length=test_df['protein_sequence'].str.len())
    .drop(columns=['protein_sequence'])
)

In [43]:
# Keep only the training rows whose operation is 'replace'.
train_df = train_df[train_df['operation']=='replace']

In [44]:
# Replace the raw protein sequence by its length (number of characters).
# `assign` builds a new frame instead of writing a column into the
# row-filtered `train_df`, which is the pattern that triggers pandas'
# SettingWithCopyWarning.
train_df = (
    train_df
    .assign(length=train_df['protSeq1'].str.len())
    .drop(columns=['protSeq1'])
)



In [45]:
def blosum_apply(row):
    """Score one mutation row with the module-level BLOSUM ``matrix``.

    Working hypothesis: the substitution score between the two amino acids
    influences the delta Tm. The score also encodes the mutation itself,
    i.e. which amino acid is turned into which.

    Parameters
    ----------
    row : mapping with 'operation', 'change1' and 'change2' entries
        Typically a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    ``matrix[change1 + change2]`` when ``row['operation']`` is 'replace',
    otherwise the fixed value -10.
    """
    if row['operation'] != 'replace':
        # No residue pair to look up for anything that is not a substitution.
        return -10
    residue_pair = row['change1'] + row['change2']
    return matrix[residue_pair]

# Score every row of both frames with blosum_apply (axis=1: one call per row).
train_df['dist_mutation'] = train_df.apply(blosum_apply, axis=1)
test_df['dist_mutation'] = test_df.apply(blosum_apply, axis=1)


    

In [46]:
# The residue letters are now summarised by dist_mutation; remove both at once.
test_df = test_df.drop(columns=['change1', 'change2'])


In [47]:
# The residue letters are now summarised by dist_mutation; remove both at once.
train_df = train_df.drop(columns=['change1', 'change2'])

In [48]:
# Preview train_df now that dist_mutation has replaced change1/change2.
train_df.head()

Unnamed: 0,operation,position1,position2,pH1,pH2,target,length,dist_mutation
0,replace,2,2,6.5,6.5,-6.7,164,-2.0
1,replace,2,2,6.5,6.5,-3.9,164,-1.0
2,replace,2,2,6.5,6.5,-1.2,164,-2.0
3,replace,2,2,6.5,6.5,-4.0,164,0.0
4,replace,2,2,6.5,6.5,2.7,164,-1.0


In [49]:
# Preview test_df now that dist_mutation has replaced change1/change2.
test_df.head()

Unnamed: 0_level_0,pH,modif,score_adj,operation,length,dist_mutation
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
31390,8,16,0.880797,replace,221,-3.0
31391,8,16,0.880797,replace,221,-2.0
31392,8,16,0.999955,delete,220,-10.0
31393,8,17,0.952574,replace,221,-3.0
31394,8,17,0.952574,replace,221,-3.0


In [50]:
# Relative position of the mutation: position1 divided by the sequence length.
# Both raw position columns are removed afterwards.
train_df = (
    train_df
    .assign(relative_position=train_df['position1'] / train_df['length'])
    .drop(columns=['position1', 'position2'])
)

In [51]:
# Relative position of the mutation in the test set: 'modif' divided by the
# sequence length; the raw 'modif' column is removed afterwards.
test_df = (
    test_df
    .assign(relative_position=test_df['modif'] / test_df['length'])
    .drop(columns=['modif'])
)

In [52]:
# Preview train_df with relative_position in place of position1/position2.
train_df.head()

Unnamed: 0,operation,pH1,pH2,target,length,dist_mutation,relative_position
0,replace,6.5,6.5,-6.7,164,-2.0,0.012195
1,replace,6.5,6.5,-3.9,164,-1.0,0.012195
2,replace,6.5,6.5,-1.2,164,-2.0,0.012195
3,replace,6.5,6.5,-4.0,164,0.0,0.012195
4,replace,6.5,6.5,2.7,164,-1.0,0.012195


In [53]:
# Preview test_df with relative_position in place of modif.
test_df.head()


Unnamed: 0_level_0,pH,score_adj,operation,length,dist_mutation,relative_position
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
31390,8,0.880797,replace,221,-3.0,0.072398
31391,8,0.880797,replace,221,-2.0,0.072398
31392,8,0.999955,delete,220,-10.0,0.072727
31393,8,0.952574,replace,221,-3.0,0.076923
31394,8,0.952574,replace,221,-3.0,0.076923


In [54]:
# pH difference between the two measurements (pH1 - pH2); the two raw pH
# columns are removed afterwards.
train_df = (
    train_df
    .assign(dPH=train_df['pH1'] - train_df['pH2'])
    .drop(columns=['pH1', 'pH2'])
)

In [55]:
# Preview train_df with dPH in place of pH1/pH2.
train_df.head()

Unnamed: 0,operation,target,length,dist_mutation,relative_position,dPH
0,replace,-6.7,164,-2.0,0.012195,0.0
1,replace,-3.9,164,-1.0,0.012195,0.0
2,replace,-1.2,164,-2.0,0.012195,0.0
3,replace,-4.0,164,0.0,0.012195,0.0
4,replace,2.7,164,-1.0,0.012195,0.0


In [56]:
# pH difference for the test set, measured against a fixed pH of 8
# (presumably the pH of the reference sequence — TODO confirm).
reference_ph = 8
test_df = (
    test_df
    .assign(dPH=test_df['pH'] - reference_ph)
    .drop(columns=['pH'])
)




In [57]:
# score_adj is not used as a model feature; remove it from the test frame.
test_df = test_df.drop(columns=['score_adj'])

In [58]:
test_df = test_df.drop(columns=['operation'])
# NOTE(review): the result is bound to `trian_df` (likely a typo for `train_df`),
# so `train_df` itself still contains the 'operation' column after this cell.
# That column is excluded later, when the feature matrix X is built.
trian_df = train_df.drop(columns=['operation'])

In [59]:
# Preview the test frame: only the model's feature columns remain.
test_df.head()

Unnamed: 0_level_0,length,dist_mutation,relative_position,dPH
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
31390,221,-3.0,0.072398,0
31391,221,-2.0,0.072398,0
31392,220,-10.0,0.072727,0
31393,221,-3.0,0.076923,0
31394,221,-3.0,0.076923,0


In [60]:
# Preview train_df before it is copied and split.
train_df.head()

Unnamed: 0,operation,target,length,dist_mutation,relative_position,dPH
0,replace,-6.7,164,-2.0,0.012195,0.0
1,replace,-3.9,164,-1.0,0.012195,0.0
2,replace,-1.2,164,-2.0,0.012195,0.0
3,replace,-4.0,164,0.0,0.012195,0.0
4,replace,2.7,164,-1.0,0.012195,0.0


In [61]:
# Work on a copy; the name train_df is reused below for the training split.
df = train_df.copy()



In [62]:
# Renumber the rows from 0 and discard the previous index.
df = df.reset_index(drop=True)


In [63]:
# Discard the seq_id index; from here on test rows are identified by position only.
test_df = test_df.reset_index(drop=True)

Split into training and validation sets

In [64]:
# Split df into train and validation frames with split_train_test, frac=0.8
# (helper presumably provided by `from helpers import *`; to be moved into a function later).
# NOTE(review): val_df is not used by any later cell shown in this notebook.
train_df , val_df = split_train_test(df,frac=0.8)


train_df has shape : (1837, 6) 
 test_df has shape :  (459, 6)


In [65]:
# Create the XGBoost regressor (1000 trees, maximum depth 6); it is fitted in a later cell.
import xgboost as xgb
model = xgb.XGBRegressor(n_estimators = 1000, max_depth = 6)

In [66]:
# Display the training split (pandas abbreviates the rendering of long frames).
train_df

Unnamed: 0,operation,target,length,dist_mutation,relative_position,dPH
0,replace,3.6,164,1.0,0.353659,0.0
1,replace,1.6,537,-1.0,0.197393,0.0
2,replace,-1.3,231,-1.0,0.619048,0.0
3,replace,-5.5,455,-3.0,0.336264,0.0
4,replace,-6.9,231,-1.0,0.619048,0.0
...,...,...,...,...,...,...
1832,replace,1.5,455,-1.0,0.081319,0.0
1833,replace,9.7,231,-1.0,0.619048,0.0
1834,replace,1.3,231,-1.0,0.445887,0.0
1835,replace,-6.9,537,0.0,0.255121,0.0


In [67]:
# Label: the 'target' column. Features: every remaining column except the
# 'operation' string column.
label_column = 'target'
y = train_df[label_column]
X = train_df.drop(columns=[label_column, 'operation'])


In [68]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from scipy.stats import spearmanr
# Hold out 20% of the training split for evaluation; random_state is fixed so the
# split is reproducible. Note that train_df was already split once above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 11)

In [69]:
#Fit the model and predict

# Fit on the training part of the split, then predict the held-out part.
model.fit(X_train, y_train)
predictions1 = model.predict(X_test)

In [70]:
#Print performance of the model
# NOTE: MAPE divides each error by |y_true|, so targets equal or close to 0 make
# it explode; rely on the MAE and the Spearman coefficient when judging this model.
print('Mean Absolute Error =', mean_absolute_error(y_test, predictions1))
print('Mean Absolute Percentage Error = ', mean_absolute_percentage_error(y_test, predictions1))

Mean Absolute Error = 6.187791511647249
Mean Absolute Percentage Error =  166390228793968.66


In [71]:
# Spearman rank correlation between held-out targets and predictions
# (the p-value `p` is computed but not reported).
rho, p = spearmanr(y_test, predictions1)
print('Spearman Correlation Coefficient =', rho.round(3))

Spearman Correlation Coefficient = -0.229


Make submission

In [72]:
# Predict on the test features, selecting them by name in the training column
# order (X.columns) so the result never depends on the incidental column order
# of test_df; a missing feature now fails with an explicit KeyError.
submission = model.predict(test_df[X.columns])

In [73]:
# Reload the original test file (indexed by seq_id) and attach the predictions to it.
# NOTE(review): the assignment is positional — it assumes test.csv has the same
# rows, in the same order, as test_mutations.csv; TODO confirm.
# The 'tm' column receives the model's predictions of the training 'target'.
test_df = pd.read_csv(path+ 'test.csv',index_col='seq_id')
test_df['tm']=submission

In [75]:
# Remove the input columns so they are not written to the submission file.
test_df = test_df.drop(columns=['protein_sequence','pH', 'data_source'])

In [77]:
# Write the submission CSV; index=True keeps the seq_id index as a column.
test_df.to_csv('Effect_mutation_2nd_method.csv', index=True)