In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import json
from multiprocessing import Pool
from datetime import datetime
from collections import defaultdict

import torch
import gpytorch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import umap.umap_ as umap
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy.stats import norm
from rdkit import Chem
from botorch.models.gpytorch import GPyTorchModel
from botorch.fit import fit_gpytorch_model
from MDAnalysis.analysis.rms import rmsd
from scipy.stats import pearsonr
from map4 import MAP4Calculator

from ga import LocalGA
from baye import get_fitted_model, map4_fingerprint, TanimotoSimilarityKernel
from baye import AcqScoring, expected_improvement, probability_of_improvement
from helm import build_helm_string, parse_helm, HELMGeneticOperators
from mhc import read_pssm_file, MHCIPeptideScorer

## Read dataset

In [3]:
# Load the MHC class I binding dataset.
mhci = pd.read_csv('../mhc/binding_data_2013/bdata.20130222.mhci.csv')

# Report how many distinct alleles have "HLA" in their name.
alleles = mhci['mhc_allele']
hla_alleles = alleles[alleles.str.contains("HLA")].unique()
print(hla_alleles.shape)

(119,)


In [4]:
# Binding affinity values to discard: a lot of peptides carry exactly these
# values, which suggests they are default/placeholder entries rather than
# real measurements.
dirty_values = [1, 2, 3, 5000, 10000, 20000, 43424, 50000, 69444.44444, 78125]

# Keep only the HLA-A*02:01 entries whose length is between 8 and 11
# (inclusive) and whose affinity is not one of the suspicious values above.
is_a0201 = mhci['mhc_allele'] == 'HLA-A*02:01'
has_valid_length = mhci['length'].between(8, 11)
has_clean_affinity = ~mhci['affinity_binding'].isin(dirty_values)
mhci = mhci[is_a0201 & has_valid_length & has_clean_affinity]

## Genetic operators on HELM strings

In [5]:
# Load the HELM monomer library and keep the peptide monomers whose symbol is
# one of the 20 one-letter amino acid codes.
with open('HELMCoreLibrary.json') as library_file:
    monomer_lib = json.load(library_file)

AA1 = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]
monomer_peptide_lib = [monomer for monomer in monomer_lib if monomer['polymerType'] == 'PEPTIDE']
monomer_aa1_lib = [monomer for monomer in monomer_peptide_lib if monomer['symbol'] in AA1]

# Pairwise Tanimoto similarity between the MAP4 fingerprints of the monomers.
smiles = [monomer['smiles'] for monomer in monomer_aa1_lib]
fps = map4_fingerprint(smiles, input_type='smiles', radius=2)
t = TanimotoSimilarityKernel()
similarity_matrix = t.forward(fps, fps).numpy()

# Turn every row of similarities into weights that sum to one.
normalised_rows = []
for similarities in similarity_matrix:
    weights = similarities.copy()
    # Entries exactly equal to 1.0 (presumably the self-similarity on the
    # diagonal -- TODO confirm) get a zero weight.
    weights[weights == 1.0] = 0
    normalised_rows.append(weights / np.sum(weights))

probability_matrix = np.array(normalised_rows)

In [6]:
def affinity_binding_to_energy(value, input_unit='nM', temperature=300.):
    """Convert a binding affinity (a concentration) into an energy.

    The result is ``R * temperature * ln(value in mol/L)`` with
    ``R = 0.001987`` (the gas constant in kcal/(mol*K)).

    Args:
        value: affinity, scalar or numpy array, expressed in `input_unit`.
        input_unit: one of 'nM', 'uM', 'mM' or 'M'.
        temperature: temperature that multiplies the gas constant.

    Returns:
        The energy, with the same shape as `value`.

    Raises:
        KeyError: if `input_unit` is not one of the supported units.
    """
    to_molar = {'nM': 1e-9, 'uM': 1e-6, 'mM': 1e-3, 'M': 1}
    thermal_energy = 0.001987 * temperature
    molar_value = value * to_molar[input_unit]
    return thermal_energy * np.log(molar_value)

def energy_to_affinity_binding(value, output_unit='nM', temperature=300.):
    """Convert an energy back into a binding affinity (a concentration).

    The result is ``exp(value / (R * temperature))`` scaled from mol/L to
    `output_unit`, with ``R = 0.001987`` (the gas constant in kcal/(mol*K)).

    Args:
        value: energy, scalar or numpy array.
        output_unit: one of 'nM', 'uM', 'mM' or 'M'.
        temperature: temperature that multiplies the gas constant.

    Returns:
        The affinity expressed in `output_unit`, with the same shape as `value`.

    Raises:
        KeyError: if `output_unit` is not one of the supported units.
    """
    from_molar = {'nM': 1e9, 'uM': 1e6, 'mM': 1e3, 'M': 1}
    thermal_energy = 0.001987 * temperature
    molar_affinity = np.exp(value / thermal_energy)
    return molar_affinity * from_molar[output_unit]

## Generate random peptides

In [7]:
# Sampling settings: n_peptides and peptide_length are paired element-wise,
# one (number of peptides, peptide length) pair per batch.
n_peptides = [150]
peptide_length = [9]
energy_bounds = [-8.235, -4.944] # about between 1 uM and 250 uM
#energy_bounds = [-4.944, -4.531] # about between 250 uM and 500 uM
#energy_bounds = [-4.531, -4.118] # about between 500 uM and 1 mM
#energy_bounds = [-8.649, -8.235] # about between 500 nM and 1uM
energy_cutoff = -4.11 # 1 mM
#energy_cutoff = -4.944 # 250 uM
#energy_cutoff = -8.235 # 1 uM

# One matrix file per peptide length (8, 9, 10 and 11) for HLA-A-02:01.
pssm_files = [
    '../mhc/IEDB_MHC_I-2.9_matx_smm_smmpmbec/smmpmbec_matrix/HLA-A-02:01-8.txt',
    '../mhc/IEDB_MHC_I-2.9_matx_smm_smmpmbec/smmpmbec_matrix/HLA-A-02:01-9.txt',
    '../mhc/IEDB_MHC_I-2.9_matx_smm_smmpmbec/smmpmbec_matrix/HLA-A-02:01-10.txt',
    '../mhc/IEDB_MHC_I-2.9_matx_smm_smmpmbec/smmpmbec_matrix/HLA-A-02:01-11.txt',
]
mps = MHCIPeptideScorer(pssm_files, mhci, energy_cutoff=energy_cutoff)

# Generate one batch of random peptides (and their scores) per setting, then
# flatten the batches into a single array of peptides and one of scores.
batches = [mps.generate_random_peptides(count, energy_bounds, length)
           for count, length in zip(n_peptides, peptide_length)]
random_peptides = np.concatenate([batch_peptides for batch_peptides, _batch_scores in batches])
random_peptide_scores = np.concatenate([batch_scores for _batch_peptides, batch_scores in batches])

# Group the peptide indices by the number of '.' characters in their HELM
# string. The strings shown in the output end in 'V2.0', so that trailing dot
# is counted together with the separators between monomers.
clusters = defaultdict(list)
for index, helm_string in enumerate(random_peptides):
    clusters[helm_string.count('.')].append(index)
distribution = ['%d: %d' % (n_dots, len(members)) for n_dots, members in sorted(clusters.items())]
print('Distribution:', distribution)
print('')

print(len(random_peptides))
print(random_peptides)
print(random_peptide_scores)

----- Peptide global -----
N peptide: 8471
R2: 0.620
RMSD : 1.177 kcal/mol

Distribution: ['9: 150']

150
['PEPTIDE1{M.I.S.F.H.H.C.W.M}$$$$V2.0'
 'PEPTIDE1{F.S.M.W.P.M.S.Q.M}$$$$V2.0'
 'PEPTIDE1{Y.L.S.W.M.N.D.R.R}$$$$V2.0'
 'PEPTIDE1{H.Y.I.D.L.A.K.F.V}$$$$V2.0'
 'PEPTIDE1{M.I.I.N.S.T.M.T.W}$$$$V2.0'
 'PEPTIDE1{W.Q.I.A.F.I.F.Q.D}$$$$V2.0'
 'PEPTIDE1{A.M.H.C.Y.W.R.L.C}$$$$V2.0'
 'PEPTIDE1{Y.S.W.G.R.V.V.F.G}$$$$V2.0'
 'PEPTIDE1{F.A.E.K.W.A.Y.T.T}$$$$V2.0'
 'PEPTIDE1{W.I.K.F.S.D.A.F.L}$$$$V2.0'
 'PEPTIDE1{N.S.L.F.P.I.V.P.L}$$$$V2.0'
 'PEPTIDE1{D.T.D.A.Y.V.E.W.I}$$$$V2.0'
 'PEPTIDE1{N.Y.L.N.R.C.F.R.L}$$$$V2.0'
 'PEPTIDE1{N.I.N.R.Y.D.P.I.A}$$$$V2.0'
 'PEPTIDE1{N.C.Y.D.T.A.L.K.V}$$$$V2.0'
 'PEPTIDE1{L.Y.A.Y.E.S.Q.M.V}$$$$V2.0'
 'PEPTIDE1{V.K.N.I.I.T.T.S.V}$$$$V2.0'
 'PEPTIDE1{W.V.N.Y.W.Q.Y.P.D}$$$$V2.0'
 'PEPTIDE1{W.I.D.I.H.V.H.N.C}$$$$V2.0'
 'PEPTIDE1{A.N.N.W.A.S.W.E.A}$$$$V2.0'
 'PEPTIDE1{M.Y.I.E.A.P.Y.V.S}$$$$V2.0'
 'PEPTIDE1{L.W.E.T.Y.E.E.T.V}$$$$V2.0'
 'PEPTIDE1{K.Q.M.T.M.P.E.Y.Q}$$$$V2.

In [8]:
# The monomer library, MAP4 fingerprints, similarity matrix and probability
# matrix are already computed by the identical cell In [5] under "Genetic
# operators on HELM strings". Repeating that computation verbatim here only
# adds run time and a second copy to keep in sync, so this cell validates the
# existing result instead of rebuilding it.
n_monomers = len(monomer_aa1_lib)

# One row and one column per selected amino acid monomer.
assert probability_matrix.shape == (n_monomers, n_monomers), probability_matrix.shape

# Every row was normalised by its own sum, so each row must add up to one.
assert np.allclose(probability_matrix.sum(axis=1), 1.0, atol=1e-4), 'each row must sum to 1'

In [9]:
# Build the HELM genetic operators from the monomers restricted to the AA1
# symbols and their probability matrix (presumably used as per-monomer
# substitution probabilities -- TODO confirm against helm.HELMGeneticOperators).
helmgo = HELMGeneticOperators(monomer_aa1_lib, probability_matrix)

In [12]:
# Set up the local genetic algorithm around the HELM genetic operators. Only
# `helmgo` is passed, so any other setting keeps its default value.
lgao = LocalGA(helmgo)

In [13]:
# NOTE(review): exploratory introspection of the object returned by LocalGA;
# this looks like leftover debugging and can probably be dropped from the
# final notebook.
dir(lgao)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_elitism',
 '_generate_new_population',
 '_greater_is_better',
 '_helmgo',
 '_n_children',
 '_n_gen',
 '_sigma',
 'candidate_scores',
 'candidates',
 'run']

In [14]:
# NOTE(review): as written this call fails -- the recorded output is a
# TypeError because run() requires two positional arguments,
# `scoring_function` and `sequences`. TODO: pass both (the random peptides
# generated earlier are presumably the intended `sequences` -- confirm).
lgao.run()

TypeError: run() missing 2 required positional arguments: 'scoring_function' and 'sequences'

In [13]:
# NOTE(review): leftover debugging -- the recorded output shows that LocalGA is
# a `function`, not a class. This cell's execution count (In [13]) is also out
# of order: it follows In [14] and repeats the count of an earlier cell.
type(LocalGA)

function