In [1]:
# Widen the notebook container to 80% of the browser window.
# NOTE(review): recent IPython releases expose `display` and `HTML` from
# `IPython.display`; importing them from `IPython.core.display` may emit a
# deprecation warning — confirm against the installed IPython version.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Render matplotlib figures directly in the cell outputs.
%matplotlib inline
# Reload imported modules automatically before each cell execution, so edits
# to the local modules imported by this notebook are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import copy
import json
import time
import pickle
from multiprocessing import Pool
from datetime import datetime
from collections import defaultdict
from abc import ABC, abstractmethod


import torch
import gpytorch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from scipy.stats import norm
from rdkit import Chem
from botorch.models.gpytorch import GPyTorchModel
from botorch.fit import fit_gpytorch_model
from map4 import MAP4Calculator

from ga import SequenceGA, ScaffoldGA, GA
from baye import get_fitted_model, map4_fingerprint, TanimotoSimilarityKernel
from baye import AcqScoring, expected_improvement, probability_of_improvement
from helm import build_helm_string, parse_helm
from helm_genetic_operators import HELMGeneticOperators
from mhc import read_pssm_file, MHCIPeptideScorer

## Read dataset

In [3]:
# Load the MHC class I binding measurements and report how many distinct
# alleles have "HLA" in their name.
mhci = pd.read_csv('../mhc/binding_data_2013/bdata.20130222.mhci.csv')
hla_rows = mhci['mhc_allele'].str.contains("HLA")
print(mhci.loc[hla_rows, 'mhc_allele'].unique().shape)

(119,)


In [4]:
# Binding affinity values excluded from the dataset: many peptides carry
# exactly these values, which look like assigned defaults rather than
# real measurements.
dirty_values = [1, 2, 3, 5000, 10000, 20000, 43424, 50000, 69444.44444, 78125]

# Keep only the HLA-A*02:01 entries for peptides of length 8 to 11 (inclusive)
# whose affinity is not one of the suspicious values above.
is_target_allele = mhci['mhc_allele'] == 'HLA-A*02:01'
has_supported_length = (mhci['length'] >= 8) & (mhci['length'] <= 11)
has_clean_affinity = ~mhci['affinity_binding'].isin(dirty_values)
mhci = mhci[is_target_allele & has_supported_length & has_clean_affinity]

## Genetic operators on HELM strings

In [5]:
# Load the HELM monomer library and keep the peptide monomers whose symbol
# is one of the 20 one-letter amino acid codes.
with open('HELMCoreLibrary.json') as f:
    monomer_lib = json.load(f)

AA1 = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]
monomer_peptide_lib = [entry for entry in monomer_lib if entry['polymerType'] == 'PEPTIDE']
monomer_aa1_lib = [entry for entry in monomer_peptide_lib if entry['symbol'] in AA1]

# Pairwise similarity between the selected monomers, computed from their
# MAP4 fingerprints with the Tanimoto kernel.
smiles = [entry['smiles'] for entry in monomer_aa1_lib]
fps = map4_fingerprint(smiles, input_type='smiles', radius=2)
t = TanimotoSimilarityKernel()
similarity_matrix = t.forward(fps, fps).numpy()


def _similarities_to_probabilities(similarities):
    """Turn one row of similarities into weights that sum to 1.

    Entries exactly equal to 1.0 are zeroed before normalisation
    (presumably to drop the self-similarity of a monomer with itself).
    NOTE(review): the exact-equality test would also zero any pair of
    distinct monomers with identical fingerprints — confirm this is intended.
    """
    weights = similarities.copy()
    weights[weights == 1.0] = 0
    return weights / np.sum(weights)


# One probability row per monomer, in the same order as monomer_aa1_lib.
probability_matrix = np.array([_similarities_to_probabilities(row) for row in similarity_matrix])

helmgo = HELMGeneticOperators(monomer_aa1_lib, probability_matrix)

In [6]:
def affinity_binding_to_energy(value, input_unit='nM', temperature=300.):
    """Convert a binding affinity into an energy using RT * ln(affinity).

    The affinity is first rescaled to molar using the factor associated
    with `input_unit`, then the natural logarithm is multiplied by
    RT = 0.001987 * temperature (presumably kcal/mol, with the temperature
    in Kelvin — TODO confirm).

    Parameters
    ----------
    value : float or array-like
        Binding affinity expressed in `input_unit`.
    input_unit : str
        One of 'nM', 'uM', 'mM' or 'M'.
    temperature : float
        Temperature used to compute RT.

    Returns
    -------
    float or ndarray
        RT * ln(value converted to molar).

    Raises
    ------
    KeyError
        If `input_unit` is not one of the supported units.
    """
    to_molar = dict(nM=1e-9, uM=1e-6, mM=1e-3, M=1)
    gas_constant = 0.001987
    RT = gas_constant * temperature
    molar_value = value * to_molar[input_unit]
    return RT * np.log(molar_value)

def energy_to_affinity_binding(value, output_unit='nM', temperature=300.):
    """Convert an energy back into a binding affinity using exp(energy / RT).

    RT is computed as 0.001987 * temperature (presumably kcal/mol, with the
    temperature in Kelvin — TODO confirm). The exponential gives a molar
    value, which is then multiplied by the factor associated with
    `output_unit`.

    Parameters
    ----------
    value : float or array-like
        Energy to convert.
    output_unit : str
        One of 'nM', 'uM', 'mM' or 'M'.
    temperature : float
        Temperature used to compute RT.

    Returns
    -------
    float or ndarray
        Binding affinity expressed in `output_unit`.

    Raises
    ------
    KeyError
        If `output_unit` is not one of the supported units.
    """
    from_molar = dict(nM=1e9, uM=1e6, mM=1e3, M=1)
    gas_constant = 0.001987
    RT = gas_constant * temperature
    molar_value = np.exp(value / RT)
    return molar_value * from_molar[output_unit]

## Generate random peptides

In [7]:
# Sampling plan: number of random peptides to draw for each peptide length
# (38 + 37 + 37 + 38 = 150 peptides in total).
peptide_length = [8, 9, 10, 11]
n_peptides = [38, 37, 37, 38]

# Energy window passed to the random peptide generator.
energy_bounds = [-8.235, -4.944] # about between 1 uM and 250 uM
#energy_bounds = [-4.944, -4.531] # about between 250 uM and 500 uM
#energy_bounds = [-4.531, -4.118] # about between 500 uM and 1 mM
#energy_bounds = [-8.649, -8.235] # about between 500 nM and 1uM

# Energy cutoff given to the scorer.
energy_cutoff = -4.11 # 1 mM
#energy_cutoff = -4.944 # 250 uM
#energy_cutoff = -8.235 # 1 uM

# One PSSM file per peptide length (8 to 11) for HLA-A*02:01.
pssm_files = ['../mhc/IEDB_MHC_I-2.9_matx_smm_smmpmbec/smmpmbec_matrix/HLA-A-02:01-8.txt',
              '../mhc/IEDB_MHC_I-2.9_matx_smm_smmpmbec/smmpmbec_matrix/HLA-A-02:01-9.txt',
              '../mhc/IEDB_MHC_I-2.9_matx_smm_smmpmbec/smmpmbec_matrix/HLA-A-02:01-10.txt',
              '../mhc/IEDB_MHC_I-2.9_matx_smm_smmpmbec/smmpmbec_matrix/HLA-A-02:01-11.txt']
mps = MHCIPeptideScorer(pssm_files, mhci, energy_cutoff=energy_cutoff)

# Draw the random peptides length by length, then merge the batches into
# flat arrays of sequences and scores.
peptide_batches = []
score_batches = []
for n, size in zip(n_peptides, peptide_length):
    batch_peptides, batch_scores = mps.generate_random_peptides(n, energy_bounds, size)
    peptide_batches.append(batch_peptides)
    score_batches.append(batch_scores)

random_peptides = np.concatenate(peptide_batches)
random_peptide_scores = np.concatenate(score_batches)

# Group the peptide indices by the number of '.' in the sequence string.
# For the HELM strings shown in the output ('PEPTIDE1{...}$$$$V2.0') this
# count equals the peptide length: n - 1 separators plus the dot in 'V2.0'.
clusters = defaultdict(list)
for i, sequence in enumerate(random_peptides):
    n_dots = sequence.count('.')
    clusters[n_dots].append(i)

distribution = ['%d: %d' % (key, len(clusters[key])) for key in sorted(clusters)]
print('Distribution:', distribution)
print('')

print(len(random_peptides))
print(random_peptides)
print(random_peptide_scores)

----- Peptide global -----
N peptide: 8471
R2: 0.620
RMSD : 1.177 kcal/mol

Distribution: ['8: 38', '9: 37', '10: 37', '11: 38']

150
['PEPTIDE1{R.L.F.T.Y.G.P.Q}$$$$V2.0' 'PEPTIDE1{F.I.P.P.Y.F.C.C}$$$$V2.0'
 'PEPTIDE1{A.L.N.K.V.K.W.D}$$$$V2.0' 'PEPTIDE1{I.Y.Y.L.M.F.C.I}$$$$V2.0'
 'PEPTIDE1{W.H.G.S.T.C.D.Q}$$$$V2.0' 'PEPTIDE1{Q.S.G.T.I.Y.F.E}$$$$V2.0'
 'PEPTIDE1{M.A.C.Q.C.F.I.K}$$$$V2.0' 'PEPTIDE1{Y.P.Y.W.M.P.P.H}$$$$V2.0'
 'PEPTIDE1{L.D.A.Y.E.C.K.I}$$$$V2.0' 'PEPTIDE1{E.L.R.N.I.Q.Q.N}$$$$V2.0'
 'PEPTIDE1{Y.I.M.R.H.P.F.I}$$$$V2.0' 'PEPTIDE1{M.A.S.C.K.H.P.I}$$$$V2.0'
 'PEPTIDE1{H.I.Q.V.W.C.F.Q}$$$$V2.0' 'PEPTIDE1{R.F.T.G.D.W.V.D}$$$$V2.0'
 'PEPTIDE1{Y.Y.V.R.F.I.S.H}$$$$V2.0' 'PEPTIDE1{E.V.Y.P.P.Q.A.Y}$$$$V2.0'
 'PEPTIDE1{F.A.M.V.P.D.L.M}$$$$V2.0' 'PEPTIDE1{S.D.A.Q.T.C.E.C}$$$$V2.0'
 'PEPTIDE1{W.E.V.T.N.N.F.T}$$$$V2.0' 'PEPTIDE1{H.G.L.P.S.N.W.T}$$$$V2.0'
 'PEPTIDE1{C.F.C.R.Y.S.Y.V}$$$$V2.0' 'PEPTIDE1{K.Q.E.I.M.A.F.Q}$$$$V2.0'
 'PEPTIDE1{G.I.D.F.Y.N.T.D}$$$$V2.0' 'PEPTIDE1{E.Y.V.V.A.V.E.T}

In [8]:
class DMTExperience:
    """Simulated optimization loop: fit a GP, propose peptides with a GA, score them.

    Starting from an initial population of HELM sequences and their scores,
    each generation fits a GP model on the known data, uses a GA driven by
    the expected-improvement acquisition to propose new sequences, scores the
    best ones with the notebook-level scorer and adds them to the dataset.

    Notes
    -----
    `run` relies on names defined elsewhere in the notebook: `GA`,
    `map4_fingerprint`, `get_fitted_model`, `TanimotoSimilarityKernel`,
    `AcqScoring`, `expected_improvement` and the global scorer `mps`
    (used through `mps.predict_energy`).
    """

    # Energy thresholds used in the printed dataset summaries.
    _ENERGY_THRESHOLDS = (-13, -12, -11, -10, -9, -8)

    def __init__(self, init_population, init_scores):
        """Store the initial sequences and their scores.

        Parameters
        ----------
        init_population : array-like of str
            Initial HELM sequences; must provide a `.copy()` method.
        init_scores : numpy.ndarray
            Scores of the initial sequences, in the same order.
        """
        self._init_pop = init_population
        self._init_scores = init_scores

    @staticmethod
    def _helm_to_fasta(helm_string):
        """Return the monomers between '{' and '}' of a HELM string, joined without separators."""
        return ''.join(helm_string.split('$')[0].split('{')[1].split('}')[0].split('.'))

    @staticmethod
    def _extend_targets(y_exp, y_new):
        """Append the numpy scores `y_new` to the tensor `y_exp`, keeping the dtype of `y_exp`.

        Without the explicit cast, concatenating the float32 targets with a
        float64 numpy array either promotes the result to float64 or raises,
        depending on the torch version, so the dtype of the training targets
        would change after the first generation.
        """
        return torch.cat([y_exp, torch.from_numpy(y_new).to(y_exp.dtype)])

    @classmethod
    def _print_dataset_summary(cls, X_fps_exp, y_exp):
        """Print the dataset size, the best score and the counts below each energy threshold."""
        print('N pep: ', X_fps_exp.shape[0])
        print('Best peptide: %.3f' % y_exp.min())
        for n in cls._ENERGY_THRESHOLDS:
            print('N pep under %d kcal/mol: %03d' % (n, y_exp[y_exp < n].shape[0]))
        print('Non binding pep        : %03d' % (y_exp[y_exp == 0.].shape[0]))

    def run(self, n_step=3, n_sample=10, **config):
        """Run the optimization loop and collect every evaluated sequence.

        Parameters
        ----------
        n_step : int
            Number of generations (GP fit + GA search + scoring) per sample.
        n_sample : int
            Number of independent repetitions; each one restarts from a copy
            of the initial population.
        **config
            Must contain 'NCAN' (number of GA candidates kept per generation)
            and 'helmgo' (passed as first argument to `GA`). All the other
            entries, including 'NCAN', are forwarded to `GA` as keyword
            arguments.

        Returns
        -------
        pandas.DataFrame
            One row per sequence with the columns 'sample', 'gen',
            'exp_score', 'acq_score', 'length' and 'sequence'. The initial
            population is stored once with sample 0, generation 0 and a NaN
            acquisition score. 'length' is the number of '.' in the sequence.

        Raises
        ------
        KeyError
            If 'NCAN' or 'helmgo' is missing from `config`.
        """
        data = []
        NCAN = config['NCAN']

        # Defined local and global GA optimization
        helmgo = config.pop('helmgo')
        gao = GA(helmgo, **config)

        # Add initial data
        for seq, exp_score in zip(self._init_pop, self._init_scores):
            data.append((0, 0, exp_score, np.nan, seq.count('.'), seq))

        for i in range(n_sample):
            print('Run: %s' % (i + 1))

            # We want to keep a copy of the random peptides generated
            candidates = self._init_pop.copy()
            candidate_scores = self._init_scores.copy()

            # Compute the MAP4 fingerprint for all the peptides
            X_fps_exp = map4_fingerprint(candidates, input_type='helm')
            y_exp = torch.from_numpy(candidate_scores).float()
            print('Exp dataset size: (%d, %d)' % (X_fps_exp.shape[0], X_fps_exp.shape[1]))

            print('\n')
            print('Init.')
            self._print_dataset_summary(X_fps_exp, y_exp)
            print('\n')

            for j in range(n_step):
                print('Generation: %d' % (j + 1))

                # Fit GP model
                model = get_fitted_model(X_fps_exp, y_exp, kernel=TanimotoSimilarityKernel)

                # Initialize acquisition function
                scoring_function = AcqScoring(model, expected_improvement, y_exp,
                                              sequence_type='helm', greater_is_better=False)

                # Find new candidates using GA optimization
                gao.run(scoring_function, candidates, candidate_scores)

                # Take NCAN (96) best candidates found
                candidate_sequences = gao.sequences[:NCAN]
                candidates_acq = gao.scores[:NCAN]

                clusters = defaultdict(list)
                for i_seq, sequence in enumerate(candidate_sequences):
                    clusters[sequence.count('.')].append(i_seq)
                print('Final selection:', ['%d: %d' % (k, len(v)) for k, v in clusters.items()])

                # Get affinity binding values (MAKE TEST)
                # NOTE: `mps` is the scorer defined at notebook level.
                candidate_sequences_fasta = [self._helm_to_fasta(c) for c in candidate_sequences]
                y_candidates = mps.predict_energy(candidate_sequences_fasta)

                # Add candidates to the training set
                candidates = np.append(candidates, candidate_sequences)
                candidate_scores = np.append(candidate_scores, y_candidates)
                candidate_fps = map4_fingerprint(candidate_sequences, input_type='helm')

                X_fps_exp = torch.cat([X_fps_exp, candidate_fps])
                # Keep the targets in the dtype used for the initial dataset
                y_exp = self._extend_targets(y_exp, y_candidates)

                print('')
                self._print_dataset_summary(X_fps_exp, y_exp)
                print('')

                # Store data
                for seq, acq_score, exp_score in zip(candidate_sequences, candidates_acq, y_candidates):
                    data.append((i + 1, j + 1, exp_score, acq_score, seq.count('.'), seq))

        columns = ['sample', 'gen', 'exp_score', 'acq_score', 'length', 'sequence']
        df = pd.DataFrame(data=data, columns=columns)

        return df

In [None]:
# Settings for the optimization loop. 'NCAN' and 'helmgo' are read by
# DMTExperience.run; every entry except 'helmgo' is also forwarded to GA
# as a keyword argument.
parameters = {
    # Number of GA candidates kept (and scored) per generation
    'NCAN': 96,
    'n_gen': 1,
    'helmgo': helmgo,
    # Settings prefixed with 'sequence_'
    'sequence_n_gen': 10,
    'sequence_n_children': 500,
    'sequence_sigma': 0.1,
    'sequence_elitism': True,
    'sequence_n_process': 4,
    # Settings prefixed with 'scaffold_'
    'scaffold_n_gen': 1,
    'scaffold_n_children': 1000,
    'scaffold_sigma': 0.5,
    'scaffold_elitism': True,
    'scaffold_probability': 0.25,
    'scaffold_only_terminus': True,
    'scaffold_minimum_size': 8,
    'scaffold_maximum_size': 11,
}

# 10 independent samples of 4 generations each, all starting from the same
# random peptides.
dmt = DMTExperience(random_peptides, random_peptide_scores)
df = dmt.run(n_step=4, n_sample=10, **parameters)

Run: 1
Exp dataset size: (150, 4096)


Init.
N pep:  150
Best peptide: -8.158
N pep under -13 kcal/mol: 000
N pep under -12 kcal/mol: 000
N pep under -11 kcal/mol: 000
N pep under -10 kcal/mol: 000
N pep under -9 kcal/mol: 000
N pep under -8 kcal/mol: 002
Non binding pep        : 000


Generation: 1
End scaffold opt - Score: -0.011 - Seq: 9 - PEPTIDE1{F.W.Q.I.F.N.L.N.M}$$$$V2.0
End sequence opt - Score: -0.014 - Seq: 9 - PEPTIDE1{F.W.M.C.F.N.L.N.M}$$$$V2.0
End sequence opt - Score: -0.011 - Seq: 8 - PEPTIDE1{W.F.C.F.N.M.C.I}$$$$V2.0
End sequence opt - Score: -0.015 - Seq: 10 - PEPTIDE1{W.F.L.C.N.M.V.I.L.V}$$$$V2.0
End sequence opt - Score: -0.018 - Seq: 11 - PEPTIDE1{F.F.F.L.M.F.I.I.L.L.V}$$$$V2.0
End GA opt - Score: -0.018 - Seq: 11 - PEPTIDE1{F.F.F.L.M.F.I.I.L.L.V}$$$$V2.0
57.73698592185974
Final selection: ['11: 94', '10: 2']

N pep:  246
Best peptide: -9.647
N pep under -13 kcal/mol: 000
N pep under -12 kcal/mol: 000
N pep under -11 kcal/mol: 000
N pep under -10 kcal/mol: 000
N pep

End sequence opt - Score: -0.000 - Seq: 9 - PEPTIDE1{F.L.L.M.F.V.I.F.W}$$$$V2.0
End sequence opt - Score: -0.041 - Seq: 10 - PEPTIDE1{F.L.L.M.F.V.I.F.W.V}$$$$V2.0
End sequence opt - Score: -0.000 - Seq: 11 - PEPTIDE1{A.L.L.F.M.F.V.I.F.W.V}$$$$V2.0
End GA opt - Score: -0.041 - Seq: 10 - PEPTIDE1{F.L.L.M.F.V.I.F.W.V}$$$$V2.0
48.303138256073
Final selection: ['10: 96']

N pep:  342
Best peptide: -12.361
N pep under -13 kcal/mol: 000
N pep under -12 kcal/mol: 005
N pep under -11 kcal/mol: 094
N pep under -10 kcal/mol: 128
N pep under -9 kcal/mol: 133
N pep under -8 kcal/mol: 192
Non binding pep        : 000

Generation: 3
[ 7.58420545  6.83806778  6.7989902   6.74892204  6.66832452  6.60604461
  6.59749639  6.54132236  6.46805189  6.46683071  6.38745436  6.32395329
  6.29342392  6.23724989  6.23114402  6.08826659  6.06140075  6.05651606
  6.04552548  5.97836088  5.94416799  5.94294682  5.9295139   5.91119628
  5.89409984  5.85868577  5.75488594  5.7084813   5.6559708   5.62422026
  5.55461

End scaffold opt - Score: -0.014 - Seq: 10 - PEPTIDE1{F.L.I.M.F.V.I.F.G.V}$$$$V2.0
End sequence opt - Score: -0.000 - Seq: 8 - PEPTIDE1{M.M.L.F.I.Y.W.V}$$$$V2.0
End sequence opt - Score: -0.001 - Seq: 9 - PEPTIDE1{L.M.L.F.F.I.F.W.V}$$$$V2.0
End sequence opt - Score: -0.047 - Seq: 10 - PEPTIDE1{F.L.M.M.F.V.I.F.F.V}$$$$V2.0
End sequence opt - Score: -0.000 - Seq: 11 - PEPTIDE1{F.L.M.M.F.V.I.F.W.V.T}$$$$V2.0
End GA opt - Score: -0.047 - Seq: 10 - PEPTIDE1{F.L.M.M.F.V.I.F.F.V}$$$$V2.0
49.845787048339844
Final selection: ['10: 96']

N pep:  438
Best peptide: -13.169
N pep under -13 kcal/mol: 001
N pep under -12 kcal/mol: 075
N pep under -11 kcal/mol: 190
N pep under -10 kcal/mol: 224
N pep under -9 kcal/mol: 229
N pep under -8 kcal/mol: 288
Non binding pep        : 000

Generation: 4
