In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import os
import json

import ray
import tmap as tm
from faerun import Faerun
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from MDAnalysis import Universe
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from MDAnalysis.lib.util import convert_aa_code
from scipy import stats
from matplotlib.colors import ListedColormap

from mobius import Mobius
from mobius import SequenceGA, ScaffoldGA, GA, RandomGA
from mobius import Map4Fingerprint, SequenceDescriptors
from mobius import expected_improvement, greedy, probability_of_improvement, TanimotoSimilarityKernel, random_improvement
from mobius import HELMGeneticOperators
from mobius import Oracle
from mobius import affinity_binding_to_energy, energy_to_affinity_binding, compute_probability_matrix, plot_results
from mobius.helm import build_helm_string, parse_helm
from mobius.surrogate_model import GPModel
from mobius.acquisition_functions import AcqScoring
from mobius.generators import monomers_scanning, alanine_scanning, random_monomers_scanning, properties_scanning, scrumbled_scanning

In [4]:
# Load the raw MHC class I binding table.
mhci = pd.read_csv('binding_data_2013/bdata.20130222.mhci.csv')

# Report how many distinct alleles have "HLA" in their name.
hla_alleles = mhci.loc[mhci['mhc_allele'].str.contains("HLA"), 'mhc_allele']
print(hla_alleles.unique().shape)

# Affinity values shared by suspiciously many peptides; they look like
# default/placeholder values, so the corresponding rows are discarded.
dirty_values = [1, 2, 3, 5000, 10000, 20000, 43424, 50000, 69444.44444, 78125]

# Keep only HLA-A*02:01 peptides of length 9 to 11 (inclusive) whose affinity
# is not one of the placeholder values. Note: this is a filter, not a
# train/test split.
is_a0201 = mhci['mhc_allele'] == 'HLA-A*02:01'
has_supported_length = mhci['length'].between(9, 11)
has_clean_affinity = ~mhci['affinity_binding'].isin(dirty_values)
mhci = mhci[is_a0201 & has_supported_length & has_clean_affinity]

print(mhci.shape)

(119,)
(8327, 7)


In [5]:
# One SMMPMBEC matrix file per peptide length (8 to 11) for HLA-A*02:01.
matrix_dir = 'IEDB_MHC_I-2.9_matx_smm_smmpmbec/smmpmbec_matrix'
pssm_files = [f'{matrix_dir}/HLA-A-02:01-{length}.txt' for length in (8, 9, 10, 11)]

# Build the oracle from the matrices and the filtered experimental data
# (peptide sequences and their energies).
mps = Oracle(pssm_files, mhci['sequence'], mhci['energy'])

----- Peptide global -----
N peptide: 8327
R2: 0.616
RMSD : 1.180 kcal/mol



In [8]:
# Load the HELM monomer library (a JSON list of monomer records).
with open('HELMCoreLibrary.json') as f:
    helm_core_library = json.load(f)

monomer_names = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]

# Index the SMILES by monomer symbol so they can be emitted in the same order
# as `monomer_names`. Iterating the library directly would yield the SMILES in
# *file* order, which silently misaligns the probability matrix with the
# names handed to HELMGeneticOperators below.
smiles_by_symbol = {}
for monomer in helm_core_library:
    if monomer['symbol'] not in monomer_names or monomer['monomerType'] != 'Backbone':
        continue
    # NOTE(review): HELM libraries may also contain non-peptide backbone
    # monomers that reuse one-letter symbols (e.g. 'R', 'P'). Records that
    # declare a polymerType other than PEPTIDE are skipped; records without
    # that key are kept, as before. Confirm against the library in use.
    if monomer.get('polymerType', 'PEPTIDE') != 'PEPTIDE':
        continue
    if monomer['symbol'] in smiles_by_symbol:
        raise ValueError("Duplicate backbone monomer for symbol %r in HELMCoreLibrary.json" % monomer['symbol'])
    smiles_by_symbol[monomer['symbol']] = monomer['smiles']

missing_monomers = [name for name in monomer_names if name not in smiles_by_symbol]
if missing_monomers:
    raise KeyError("Monomers not found in HELMCoreLibrary.json: %s" % ', '.join(missing_monomers))

# One SMILES per monomer, aligned index-by-index with `monomer_names`.
smiles = [smiles_by_symbol[name] for name in monomer_names]

# Presumably the matrix rows/columns follow the order of `smiles` -- TODO confirm.
probability_matrix = compute_probability_matrix(smiles)
helmgo = HELMGeneticOperators(monomer_names, probability_matrix)

In [9]:
# MAP4 fingerprint generator taking HELM strings as input; reused later as
# the data transformer of the surrogate model.
map4 = Map4Fingerprint(
    input_type='helm',
    radius=1,
    dimensions=4096,
)

In [11]:
# Draw a single random 9-mer to serve as the starting peptide.
n_peptides = 1
peptide_lengths = [9]

# Energy window corresponding to binding affinities of 10 uM and 20 uM.
energy_at_10_uM = affinity_binding_to_energy(10, 'uM')
energy_at_20_uM = affinity_binding_to_energy(20, 'uM')
energy_bounds = [energy_at_10_uM, energy_at_20_uM]

# Generate random peptides whose oracle score falls inside the window
random_peptides, random_peptide_scores = mps.generate_random_peptides(
    n_peptides, peptide_lengths, energy_bounds
)

# Show the HELM strings, their energies and the equivalent affinities in uM.
print(random_peptides)
print(random_peptide_scores)
print(energy_to_affinity_binding(random_peptide_scores, 'uM'))

['PEPTIDE1{Y.N.K.N.N.A.P.Q.L}$$$$V2.0']
[-6.48083468]
[18.98123845]


In [12]:
# Convert each HELM string (e.g. 'PEPTIDE1{Y.N.K}$$$$V2.0') into a plain
# sequence: keep what is before the first '$', take the content between the
# braces and drop the '.' separators.
peptides = [''.join(c.split('$')[0].split('{')[1].split('}')[0].split('.')) for c in random_peptides]
print(peptides)

# Upper limit on the number of peptides in the batch.
max_peptides = 96

# Collect the scanning variants of the first peptide in a set so that the
# limit is checked against *unique* sequences. The scan can yield sequences
# already collected; counting those duplicates and removing them only
# afterwards could leave the final batch smaller than the allowed maximum.
unique_peptides = set(peptides)
for seq in monomers_scanning(peptides[0], 'HREWVSNCPY'):
    unique_peptides.add(seq)

    if len(unique_peptides) >= max_peptides:
        print('Reach max. number of peptides allowed.')
        break

# np.unique returns a sorted array, which keeps the same ordering and type
# that downstream cells index into.
peptides = np.unique(list(unique_peptides))
peptides_helm = [build_helm_string({'PEPTIDE1': f}) for f in peptides]

['YNKNNAPQL']


In [13]:
# Score every peptide of the batch with the oracle.
scores = mps.score(peptides)

# List the peptides by increasing score (most negative first).
ranking = np.argsort(scores)
for position in ranking:
    print(peptides[position], scores[position])

YNWNNAPQL -8.236926533004281
YVKNNAPQL -8.005408111715456
YNYNNAPQL -7.839158181534652
YNSNNAPQL -7.389667629564326
YNVNNAPQL -7.371195415099793
YNNNNAPQL -7.288686190491541
YNCNNAPQL -7.267751014098403
YWKNNAPQL -7.243121394812357
YNPNNAPQL -7.181547346597244
YNKNWAPQL -7.172926979847128
YNKNYAPQL -7.153223284418292
YNKNNAPYL -7.1495288415253855
YNHNNAPQL -7.095343679096086
YSKNNAPQL -7.092880717167482
YYKNNAPQL -7.042389997631089
YNKNNAPPL -7.01037149255923
YNKNNAPQV -7.01037149255923
YNKNNAWQL -6.964806696880046
YNKENAPQL -6.93401967277249
YNKNCAPQL -6.894612281914818
YNKNNAPWL -6.878603029378889
YNKNNVPQL -6.874908586485982
YNKNNAPEL -6.862593776842959
YNKNNAYQL -6.8601308149143545
YNKNVAPQL -6.776390109341801
YNKWNAPQL -6.748066047162849
YNKNNAPSL -6.724667908841106
YNENNAPQL -6.6852605179834335
YNKNNCPQL -6.660630898697389
YNKCNAPQL -6.640927203268553
YNKNHAPQL -6.6323068365184366
YNKNNAPHL -6.629843874589832
YCKNNAPQL -6.6224549888040185
YNRNNAPQL -6.60151981241088
YEKNNAPQL -6.

In [19]:
# Surrogate model: Gaussian process with a Tanimoto similarity kernel on
# MAP4 fingerprints of the HELM strings.
kernel = TanimotoSimilarityKernel()
gpmodel = GPModel(kernel=kernel, data_transformer=map4)
gpmodel.fit(peptides_helm, scores)

# Predict back on the very peptides used for fitting (no held-out data here).
# mu / sigma are presumably the predictive mean and uncertainty -- TODO confirm.
mu, sigma = gpmodel.transform(peptides_helm)
print(mu, sigma, sep='\n')

[-5.386614  -4.4953766 -5.3554153 -5.2457647 -4.4029303 -5.6145782
 -5.6582413 -5.6993294 -5.7727065 -6.628053  -6.599327  -6.3059025
 -7.2660284 -6.6895185 -7.0954843 -6.6371713 -6.923922  -6.426993
 -6.8898754 -6.3529716 -6.6313286 -6.2764497 -5.9856186 -6.4365644
 -6.0286813 -6.5853214 -6.8557825 -6.627841  -6.490675  -7.004674
 -5.0282946 -4.068606  -4.1777353 -6.4928346 -4.43694   -4.7194815
 -4.5859485 -4.9353337 -6.986498  -4.977594  -4.494616  -6.483306
 -6.7231846 -6.5296607 -6.8795094 -7.145078  -5.66824   -6.074631
 -6.3251047 -6.9638276 -6.856497  -6.6613803 -6.2656493 -6.215912
 -6.542605  -6.5900884 -5.9287195 -6.5688167 -6.8706484 -6.5659213
 -6.491452  -6.174464  -6.3370504 -6.5193777 -6.7753754 -7.1715016
 -7.1497107 -6.591318  -6.164649  -6.5560036 -6.2888    -6.7490215
 -6.475518  -7.2868176 -7.1778665 -6.6030183 -7.385359  -7.3765564
 -8.2305155 -7.833273  -6.526773  -6.2253723 -7.0914955 -7.9856296
 -7.240745  -7.04202  ]
[0.06976247 0.06975307 0.06980091 0.0696803

