In [1]:
%matplotlib inline

import Bio.PDB
import Bio.SeqIO
from Bio import pairwise2
from Bio.SubsMat import MatrixInfo as matlist
from Bio.SeqUtils import seq3

import re
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import subprocess

# paths (all relative; resolved against the notebook's working directory)

# source FASTA files: the HLA directory plus every directory one level below mhc/
dbfiles = glob.glob("../data/mhc_alleles/hla/*prot.fasta") + glob.glob("../data/mhc_alleles/mhc/*/*prot.fasta")
# HLA protein FASTA, parsed into `alleledict` further down
hla_fasta = '../data/mhc_alleles/hla/hla_prot.fasta'
# combined database written from `dbfiles`; used as the blastp subject
mhc_db_path = '../sandbox/db.fasta'
# epitope database table read by make_epitope_table()
epitope_db_path = '../data/epitopes/epitope_table.csv'
# blastp output written by blast_pdb() and read by make_mhc_table()
blast_output_path = '../sandbox/blast_output'
# tab-separated tables written by make_epitope_table() / make_mhc_table()
epitope_annot = '../sandbox/epitope_annot'
mhc_annot = '../sandbox/mhc_annot'
# FASTA with one record per chain of every structure (written by blast_pdb())
query_path = '../sandbox/query.fasta'
# directory holding the *.pdb structure files
pdb_path = '../data/structures'
# FASTA with the short chains selected by make_epitope_table()
epitope_query_path = '../sandbox/epitope_query.fasta'

In [2]:
# Start PyMOL from within the kernel. The argv list is set on __main__ before
# `pymol` is imported and launched (presumably because PyMOL reads it at launch
# -- TODO confirm against the installed PyMOL version).
import __main__
__main__.pymol_argv = ['pymol','-Qc'] # Pymol: quiet and no GUI
# `sleep` is only referenced by a commented-out line in mutate()
from time import sleep
import pymol
from pymol import cmd

# launch PyMOL before any `cmd.*` call is made
pymol.finish_launching()

In [3]:
# Replace ' ' in identifiers with ';' and concatenate every source FASTA into
# one database file. FASTA parsers and BLAST cut a record id at the first
# space, so joining the header fields with ';' keeps the whole header
# (including the allele name) inside the id.
# Context managers make sure both handles are closed even if a read fails.
with open(mhc_db_path, 'w') as db:
    for dbfile in dbfiles:
        with open(dbfile, 'r') as f:
            db.write(f.read().replace(' ', ';'))

In [4]:
# fetch all chains from pdbs, combine in one query file and blast against hla db

def blast_pdb():
    """Write every chain of every structure to one FASTA file and BLAST it.

    For each ``*.pdb`` file in ``pdb_path`` the first model is read and the
    sequence of every chain (built with ``CaPPBuilder``) is written to
    ``query_path`` as ``>{structure id};{chain id}``. The structure id is the
    four characters preceding the ``.pdb`` extension. The query file is then
    searched with ``blastp`` against ``mhc_db_path``; the tabular result
    (``-outfmt 6``, one alignment per query) goes to ``blast_output_path``.

    Raises:
        RuntimeError: if ``blastp`` exits with a non-zero status.
    """
    parser = Bio.PDB.PDBParser()
    ppb = Bio.PDB.CaPPBuilder()

    # `with` guarantees the query file is flushed and closed before blastp
    # reads it, even if parsing one of the structures raises.
    with open(query_path, 'w') as query_file:
        for pdb in glob.glob(pdb_path + "/*.pdb"):
            structure = parser.get_structure(id=pdb[-8:-4], file=pdb)
            for chain in structure[0]:
                try:
                    sequence = ''.join([str(s.get_sequence()) for s in ppb.build_peptides(chain)])
                except Exception:
                    # best effort: skip chains whose sequence cannot be built
                    continue
                query_file.write('>%s;%s\n%s\n' % (structure.id, chain.id, sequence))

    # submit query file for blast search
    command = ["blastp", "-query", query_path,
               "-subject", mhc_db_path, "-out",
               blast_output_path, "-outfmt", str(6), "-num_alignments", str(1)]
    if subprocess.call(command) != 0:
        # raise instead of exit(): exit() would shut down the notebook kernel
        raise RuntimeError('Failed to blast')
    
#blast_pdb()

In [5]:
# convert blast output to mhc table

def host_n_allele(s):
    """Split a database identifier into host and allele parts.

    If ``s`` contains ';', only its second ';'-separated field is considered,
    otherwise the whole string is. When that label contains '-', it is split
    on every '-' and the resulting list is returned (host prefix first).
    Labels without '-' are returned as ``['Human', label]``.
    """
    fields = s.split(';')
    label = fields[1] if len(fields) > 1 else s
    parts = label.split('-')
    return parts if len(parts) > 1 else ['Human', label]

def make_mhc_table():
    """Turn the tabular BLAST output into a per-chain MHC annotation table.

    Reads ``blast_output_path`` (tab separated, no header). Column 0 is the
    query id ``{pdb};{chain}``, column 1 the subject id, column 2 and column
    11 are compared against fixed cut-offs below (in the default BLAST
    tabular layout these are percent identity and bit score -- see the BLAST+
    manual). For every structure with at least two hits, hits whose column 11
    is above the structure's mean are kept; of those, hits with column 2
    below 90 are dropped and hits with column 2 below 95 are flagged as
    unsure. The result is written to ``mhc_annot`` as a tab-separated file.
    """
    hits = pd.read_csv(blast_output_path, sep='\t', header=None)
    structure_ids = set([query.split(';')[0] for query in hits[0]])

    rows = []
    for pdb_id in structure_ids:
        pdb_hits = hits[hits[0].str.startswith(pdb_id)]
        if pdb_hits.shape[0] < 2:
            continue

        mean_score = np.mean(pdb_hits[11])
        above_mean = pdb_hits[pdb_hits[11] > mean_score]

        for _, hit in above_mean.iterrows():
            identity = hit[2]
            if identity < 90:
                continue
            hna = host_n_allele(hit[1])
            # the score comparison can never be true after the filter above;
            # it is kept so the flag is computed exactly as before
            unsure = 1 if (hit[11] < mean_score or identity < 95) else 0
            query = hit[0].split(';')
            # 'class' is a literal placeholder value for the mhc_class column
            rows.append([query[0], query[1], 'class', hna[0], hna[1], unsure])

    mhc_table = pd.DataFrame(rows, columns=['pdb', 'chain', 'mhc_class', 'mhc_host', 'mhc_allele', 'unsure'])
    mhc_table.to_csv(mhc_annot, sep='\t', index=False)
    
#make_mhc_table()

In [6]:
# make epitope table using epitope database

def make_epitope_table():
    """Annotate short chains (candidate peptides) using the epitope database.

    Reads the chain FASTA at ``query_path``. Every record whose sequence is
    shorter than 30 residues is copied to ``epitope_query_path`` and looked up
    in the epitope database (``epitope_db_path``) by exact match on column 2.
    For a match, columns 7, 9 and 11 of the first matching row are taken as
    name, organism and comments; otherwise the string ``'NaN'`` is stored.
    The table is written to ``epitope_annot`` (tab separated).
    """
    epitope_db = pd.read_csv(epitope_db_path, header=None, skiprows=1, low_memory=False)

    with open(query_path, 'r') as query_file:
        tokens = query_file.read().split()

    epitope_table = []
    with open(epitope_query_path, 'w') as epitope_query_file:
        for i, header in enumerate(tokens):
            if not header.startswith('>'):
                continue
            # A chain with an empty sequence leaves its header directly
            # followed by the next header (or by the end of the file).
            # Skip it instead of mistaking the next header for a peptide.
            if i + 1 >= len(tokens) or tokens[i + 1].startswith('>'):
                continue
            seq = tokens[i + 1]
            if len(seq) >= 30:
                continue

            name = header.split(';')
            epitope_query_file.write('%s\n%s\n' % (header, seq))

            # Exact match. The previous "same length AND contains" filter
            # selects the same rows, since a string of equal length that
            # contains `seq` is `seq` itself.
            matches = epitope_db[epitope_db[2] == seq]
            if matches.shape[0] > 0:
                row = [name[0][1:], name[1], seq, len(seq), matches.iloc[0, 7],
                       matches.iloc[0, 9], matches.iloc[0, 11]]
            else:
                row = [name[0][1:], name[1], seq, len(seq), 'NaN', 'NaN', 'NaN']
            epitope_table.append(row)

    epitope_table = pd.DataFrame(epitope_table, columns=['pdb', 'chain', 'sequence', 'length', 'name', 'organism', 'comments'])
    epitope_table.to_csv(epitope_annot, sep='\t', index=False)

#make_epitope_table()

In [7]:
# Load the tables cached on disk by make_epitope_table() / make_mhc_table()
# (their calls are commented out above, so the files must already exist).
epitope_table = pd.read_csv(epitope_annot, sep='\t')
mhc_table = pd.read_csv(mhc_annot, sep='\t')

# keep only chains whose best hit was assigned to the 'Human' host
mhc_table = mhc_table[mhc_table.mhc_host == 'Human']

In [8]:
# make detailed mhc table

# check if chains are less than 8 A separated
def chain_close(a, b, cutoff=8):
    """Return True if any atom of ``a`` is closer than ``cutoff`` to an atom of ``b``.

    ``a`` and ``b`` must provide ``get_atoms()``; atoms must support ``-``
    returning their distance (as Bio.PDB atoms do). The default cutoff of 8
    keeps the previous threshold.

    The earlier version tested the running minimum only at the start of each
    outer iteration, so a contact found while scanning the last atom of ``a``
    (or the only atom of a one-atom chain) was never reported.
    """
    for a1 in a.get_atoms():
        for a2 in b.get_atoms():
            if a1 - a2 < cutoff:
                # one contact is enough; no need to scan the rest
                return True
    return False

# mean distance
def chain_dist(a, b):
    """Average, over the atoms of ``a``, of the distance to the nearest atom of ``b``.

    Each per-atom nearest distance is capped at 100 (the value also used when
    ``b`` has no atoms). Raises ZeroDivisionError when ``a`` has no atoms.
    """
    nearest = [
        min([100] + [atom - other for other in b.get_atoms()])
        for atom in a.get_atoms()
    ]
    return sum(nearest)/len(nearest)

# pair MHC chains based on allele names and interchain distances
def pair_chains(mhc, epi, alleles, pdb):
    """Assign MHC chain(s) to every epitope chain of one structure.

    Args:
        mhc: chain ids of the MHC chains in the structure.
        epi: chain ids of the epitope (peptide) chains.
        alleles: allele names, parallel to ``mhc``.
        pdb: path to the PDB file; the four characters before the extension
            are used as the structure id.

    Returns:
        A list of ``[[mhc chain id(s)], epitope chain id]`` entries. When the
        gene name of the nearest chain (the text before '*') is at most two
        characters long, a single chain is reported. Otherwise the nearest
        and second nearest chains are reported as a pair, but only if both
        lie within a mean distance of 8 of the epitope and are in contact
        with each other; the chain whose gene name has 'A' at index 2 goes
        first when the nearest one has 'B' there.
    """
    # Build the parser locally: this function previously relied on a
    # notebook-global `parser` that is only created in a later cell.
    model = Bio.PDB.PDBParser(QUIET=True).get_structure(id=pdb[-8:-4], file=pdb)[0]
    out = []

    for epi_ch in epi:
        firmindist = 100
        first_id = first_al = ''
        for mhc_ch, allele in zip(mhc, alleles):
            dist = chain_dist(model[epi_ch], model[mhc_ch])
            if dist < firmindist:
                firmindist = dist
                first_id = mhc_ch
                first_al = allele

        if not first_id:
            # no MHC chain at all (or none within the distance cap): nothing to pair
            continue

        # Decide on the allele of the nearest chain. The loop variable
        # `allele` used before merely held the last entry of `alleles`.
        if len(first_al.split('*')[0]) > 2:
            second_id = second_al = ''
            secmindist = 100
            for mhc_ch, allele in zip(mhc, alleles):
                if first_id == mhc_ch:
                    continue
                dist = chain_dist(model[epi_ch], model[mhc_ch])
                if dist < secmindist:
                    secmindist = dist
                    second_id = mhc_ch
                    second_al = allele
            if (firmindist < 8 and secmindist < 8 and chain_close(model[first_id], model[second_id])):
                # slices instead of [2] so a short gene name on the partner
                # chain cannot raise IndexError
                if (first_al.split('*')[0][2:3] == 'B' and second_al.split('*')[0][2:3] == 'A'):
                    out.append([[second_id, first_id], epi_ch])
                else:
                    out.append([[first_id, second_id], epi_ch])
        else:
            out.append([[first_id], epi_ch])
    return out

# verify that no MHC chain is assigned to more than one pair
def correctness(pairs):
    """Assert that no chain id is shared between the chain lists of two pairs.

    ``pairs`` is a list whose items hold a list of chain ids at index 0.
    Raises AssertionError on the first chain id found in two different items;
    repeated ids inside a single item are not checked.
    """
    for position, earlier in enumerate(pairs[:-1]):
        for later in pairs[position + 1:]:
            for left in earlier[0]:
                for right in later[0]:
                    assert left != right


def make_annotation_table(start=0):
    """Build the detailed MHC/peptide table and save it to '../sandbox/detailed_mhc'.

    For every structure in ``mhc_table`` the MHC chains are paired with the
    epitope chains (``pair_chains``) and one row per pair is produced:
    class I rows carry a single MHC chain and 'B2M' as the second allele,
    class II rows carry the A and B chains. Epitope columns are appended from
    ``epitope_table``.

    Args:
        start: index into the sorted list of structure ids at which to begin
            (default 0 = all structures). This replaces a hard-coded
            ``[277:]`` slice over an unordered set.

    Fixes relative to the previous version:
      * each pair's row is appended inside the pair loop (previously only the
        last pair of a structure was kept, and a structure without pairs
        reused the row of the previous structure);
      * the DataFrame is built and written once, after all structures
        (previously inside the structure loop, which replaced the row list by
        a DataFrame after the first iteration).
    """
    cols = ['pdb', 'mhc_class', 'host', 'a_chain_id', 'a_chain_allele', 'b_chain_id', 'b_chain_allele',
            'antigen_id', 'antigen_seq', 'antigen_len', 'antigen_name', 'antigen_organism', 'comments']
    annotation = []

    # sorted() makes the processing order (and `start`) reproducible
    for pdb in sorted(set(mhc_table.pdb))[start:]:
        mhc_chunk = mhc_table[mhc_table.pdb == pdb]
        epi_chunk = epitope_table[epitope_table.pdb == pdb]
        pairs = pair_chains(list(mhc_chunk.chain), list(epi_chunk.chain), list(mhc_chunk.mhc_allele),
                            "%s/%s.pdb" % (pdb_path, pdb))
        correctness(pairs)

        for pair in pairs:
            if len(pair[0]) == 1:
                mhc_local = mhc_chunk[mhc_chunk.chain == pair[0][0]]
                # single-letter gene name directly followed by '*'
                assert(mhc_local.iloc[0, 4][1] == '*')
                #               pdb          class         host              1 chain
                line = [mhc_local.iloc[0, 0], 'I', mhc_local.iloc[0, 3],  mhc_local.iloc[0, 1],
                #         1 chain allele       2 chain   2 chain allele
                        mhc_local.iloc[0, 4],   '',         'B2M']
            elif len(pair[0]) == 2:
                mhc_a = mhc_chunk[mhc_chunk.chain == pair[0][0]]
                mhc_b = mhc_chunk[mhc_chunk.chain == pair[0][1]]

                # both chains share the two-letter prefix; third letter is A / B
                assert(mhc_a.iloc[0, 4][:2] == mhc_b.iloc[0, 4][:2])
                assert(mhc_a.iloc[0, 4][2] == 'A')
                assert(mhc_b.iloc[0, 4][2] == 'B')
                line = [mhc_chunk.iloc[0, 0], 'II', mhc_chunk.iloc[0, 3], mhc_a.iloc[0, 1],
                        mhc_a.iloc[0, 4], mhc_b.iloc[0, 1], mhc_b.iloc[0, 4]]
            else:
                # pair_chains only yields one or two MHC chains per pair
                continue

            # append every epitope column except 'pdb'
            line = line + list(epi_chunk[epi_chunk.chain == pair[1]].iloc[0, 1:])
            annotation.append(line)

    annotation = pd.DataFrame(annotation, columns=cols).drop_duplicates()
    annotation.to_csv('../sandbox/detailed_mhc', index=None, sep='\t')
        
#make_annotation_table()

# load the cached detailed table (the generating call above is commented out)
annotation = pd.read_csv('../sandbox/detailed_mhc', sep='\t')

In [292]:
# identify the beta-2 microglobulin (B2M) chain for every class I entry
def b2m_define(table):
    """Fill ``b_chain_id`` of every class I row with the id of the B2M chain.

    For each row with ``mhc_class == 'I'`` the structure is loaded and the
    chains other than the MHC chain and the peptide chain are examined. In a
    three-chain structure the remaining chain is taken; otherwise the chain
    in contact with the MHC chain (``chain_close``) that has the smallest
    mean distance to it (``chain_dist``) is chosen. ``table`` is modified in
    place and written to '../sandbox/detailed_mhc'.

    Fixes relative to the previous version: the structure id used an
    undefined local ``name`` (now the row's pdb id), the global ``annotation``
    was saved instead of ``table``, and a structure without a suitable chain
    raised AttributeError on ``None`` (now reported and skipped).
    """
    chunk = table[table.mhc_class == 'I']
    parser = Bio.PDB.PDBParser(QUIET=True)
    for i in chunk.index:
        pdb = chunk.loc[i, 'pdb']
        print(pdb)
        structure = parser.get_structure(id=pdb, file=pdb_path+'/'+pdb+'.pdb')[0]
        mhc = chunk.loc[i, 'a_chain_id']
        pep = chunk.loc[i, 'antigen_id']

        chains = list(structure.get_chains())
        only_three = len(chains) == 3  # hoisted: does not change per chain
        mindist = 1000
        b2m = None
        for chain in chains:
            if chain.get_id() in (mhc, pep):
                continue
            if only_three:
                b2m = chain
            elif chain_close(chain, structure[mhc]):
                dist = chain_dist(chain, structure[mhc])
                if dist <= mindist:
                    mindist = dist
                    b2m = chain

        if b2m is None:
            print('no b2m chain found for %s' % pdb)
            continue
        table.loc[i, 'b_chain_id'] = b2m.get_id()
        print('bsm chain = %s' % b2m.get_id())

    table.to_csv('../sandbox/detailed_mhc', index=None, sep='\t')
        
#b2m_define(annotation)

# reload the table from disk (b2m_define() rewrites this file when it is run)
annotation = pd.read_csv('../sandbox/detailed_mhc', sep='\t')

In [11]:
# Class I entries whose allele name contains A*, B* or C*, with the allele
# name truncated to its first two ':'-separated fields (e.g. A*02:01).
iclass = annotation[annotation.mhc_class == 'I']
# raw string: '\*' in a normal string literal is an invalid escape sequence;
# na=False keeps rows with a missing allele from breaking the boolean mask;
# .copy() so the column assignment below does not write into a view
iclass = iclass[iclass.a_chain_allele.str.contains(r'[ABC]\*', regex=True, na=False)].copy()
iclass['a_chain_allele'] = [':'.join(x.split(':')[:2]) for x in iclass.a_chain_allele]
iclass = iclass.reset_index(drop=True)

In [254]:
# Export the peptides of all A*02:01 structures as FASTA records
# (column 0 = pdb id as header, column 8 = peptide sequence).
tmp = iclass[iclass.a_chain_allele.str.startswith('A*02:01')]
with open('../sandbox/peptides_cluster', 'w') as file:
    for pdb_id, peptide in zip(tmp.iloc[:, 0], tmp.iloc[:, 8]):
        file.write('>%s\n%s\n' % (pdb_id, peptide))


# hand-curated groups of pdb ids
cluster1 = ['2x4r', '3gsv', '3gsq', '3mrd', '3mrb', '3mrc', '3gsx', '3gsw', '3gsu', '3gsr', '3mr9']
cluster2 = ['4l3c', '1s9w', '3kla', '3gjf', '1s9y', '1s9x']

In [21]:
# Single combined mask: indexing an already filtered frame with a mask built
# from the full frame forces pandas to reindex the mask (with a warning).
iclass[(iclass.antigen_len == 9) &
       (iclass.a_chain_allele.str.startswith('A')) &
       (iclass.pdb.isin(['4i48', '3i6l', '1q94', '1ao7']))]

Unnamed: 0,pdb,mhc_class,host,a_chain_id,a_chain_allele,b_chain_id,b_chain_allele,antigen_id,antigen_seq,antigen_len,antigen_name,antigen_organism,comments
0,3i6l,I,Human,D,A*24:02,E,B2M,F,QFKDNVILL,9,Nucleoprotein,SARS coronavirus Tor2,
1,5swq,I,Human,A,A*02:01,B,B2M,C,CVNGSCFTV,9,neuraminidase,Influenza A virus,
5,3mri,I,Human,A,A*02:01,B,B2M,P,CINMWCWTV,9,,,
7,2uwe,I,Human,H,A*02:01,I,B2M,J,ALWGFFPVL,9,chromosome 15 open reading frame 24,Homo sapiens,
8,3gsw,I,Human,A,A*02:01,B,B2M,P,NLVPMVAAV,9,,,
12,3hpj,I,Human,D,A*02:01,E,B2M,F,RMFPNAPYL,9,Wilms tumor protein,Homo sapiens,
13,2git,I,Human,D,A*02:01,E,B2M,F,LLFGKPVYV,9,,,
21,3qeq,I,Human,A,A*02:01,B,B2M,C,AAGIGILTV,9,Melanoma antigen recognized by T-cells 1,Homo sapiens,
22,4hx1,I,Human,A,A*68:02,B,B2M,C,SVYDFFVWL,9,tyrosinase-related protein-2,Homo sapiens,
23,1b0g,I,Human,D,A*02:01,E,B2M,F,ALWGFFPVL,9,chromosome 15 open reading frame 24,Homo sapiens,


In [26]:
def align_pdb(table):
    """Superimpose the structures listed in ``table`` onto the first one.

    The first row is the reference. For every other row the C-alpha atoms of
    its MHC chain are superimposed onto the reference C-alpha atoms; both
    lists are truncated to the shorter length and paired by position (no
    sequence alignment is done). For every row, including the reference, the
    MHC, peptide and B2M chains are written to
    '../sandbox/aligned_pdb/<pdb>_cut.pdb'.

    Column positions used: 0 = pdb id, 3 = MHC chain, 5 = B2M chain,
    7 = peptide chain.
    """

    class SelectChains(Bio.PDB.Select):
        """PDBIO selector keeping only chains whose id is in ``chain_letters``."""
        def __init__(self, chain_letters):
            self.chain_letters = chain_letters
        def accept_chain(self, chain):
            return (chain.get_id() in self.chain_letters)

    parser = Bio.PDB.PDBParser(QUIET=True)
    io = Bio.PDB.PDBIO()

    def load_model(pdb_id):
        # first model of the structure file named after the pdb id
        return parser.get_structure(id=pdb_id, file=pdb_path+'/'+pdb_id+'.pdb')[0]

    def ca_atoms(chain):
        # C-alpha atoms in the order the chain yields them
        return [atom for atom in chain.get_atoms() if atom.get_name() == 'CA']

    def save_cut(model, pdb_id, chain_ids):
        # write only the selected chains
        io.set_structure(model)
        io.save('../sandbox/aligned_pdb/%s_cut.pdb' % pdb_id, SelectChains(chain_ids))

    name, amhc, pept, b2m = [table.iloc[0, col] for col in (0, 3, 7, 5)]
    ref = load_model(name)
    ref_atoms = ca_atoms(ref[amhc])
    save_cut(ref, name, [amhc, pept, b2m])

    super_imposer = Bio.PDB.Superimposer()

    for row in range(1, table.shape[0]):
        name, amhc, b2m, pept = [table.iloc[row, col] for col in (0, 3, 5, 7)]
        sample = load_model(name)
        sample_atoms = ca_atoms(sample[amhc])

        length = min(len(ref_atoms), len(sample_atoms))
        super_imposer.set_atoms(ref_atoms[:length], sample_atoms[:length])
        # move every atom of the sample model, not just the fitted ones
        super_imposer.apply(sample.get_atoms())

        save_cut(sample, name, [amhc, pept, b2m])
        
# Single combined mask: indexing an already filtered frame with a mask built
# from the full frame forces pandas to reindex the mask (with a warning).
align_pdb(iclass[(iclass.antigen_len == 9) &
                 (iclass.a_chain_allele.str.startswith('A')) &
                 (iclass.pdb.isin(['4i48', '3i6l', '1q94', '1ao7']))])

In [25]:
# if residue in mhc is close to antigen then return 1.
# if further return 0.5
# if far return 0.01
def get_points(res, chain):
    """Weight a residue by its closest approach to ``chain``.

    ``res`` is iterated for atoms; ``chain`` must provide ``get_atoms()``.
    Returns 1.0 as soon as an atom pair closer than 5 is found, 0.5 if the
    closest pair is closer than 10, and 0.01 otherwise (also when either side
    has no atoms).
    """
    closest = 1000
    for atom in res:
        for other in chain.get_atoms():
            closest = min(closest, atom - other)
            if closest < 5:
                return 1.0
    return 0.5 if closest < 10 else 0.01

# return sorted list of best alleles
def score_alignments(scores, blasted, points, alleles, line_id, size):
    """Fill one row of ``scores`` from a query-anchored BLAST report.

    Args:
        scores: 2-D array, modified in place; row ``line_id`` is written.
        blasted: lines of the BLAST output. A line starting with 'Query_1'
            gives the query start position and the column span of the
            aligned block; each following line starting with 'Subject_<k>'
            is one aligned database sequence.
        points: per-residue weights, indexed by 0-based query position.
        alleles: unused; kept for interface compatibility.
        line_id: row of ``scores`` to fill.
        size: maximum number of subject lines scored per query block.

    Returns:
        ``scores`` (the same object). ``scores[line_id, k-1]`` is the sum of
        the weights of all positions where subject ``k`` shows an upper-case
        letter inside the query's column span.
    """
    # raw strings: '\w', '\s' in normal literals are invalid escape sequences
    query_row = re.compile(r'\w+\s+([0-9]+)\s+(\w+)\s+([0-9]+)\n')
    subject_row = re.compile(r'Subject_([0-9]+)')
    letter = re.compile(r'[A-Z]')

    res_start = start = end = counter = 0

    for line in blasted:
        if line.startswith('Query_1'):
            m = query_row.match(line)
            res_start = int(m.group(1))-1
            start = m.start(2)
            end = m.end(2)
            counter = 0

        if line.startswith('Subject_'):
            if counter >= size:
                break
            allele_id = int(subject_row.match(line).group(1))-1
            # exactly the query's column span; the former `end+1` also took
            # the character after the aligned block
            chunk = line[start:end]

            total = 0.0
            for hit in letter.finditer(chunk):
                total += points[res_start + hit.start()]
            scores[line_id, allele_id] = total
            counter += 1

    return scores

# blast crystal hla I seqs against hla data base
# and create allele-pdb compatibility table
def make_allele2pdb_table():
    """BLAST every class I MHC chain against the HLA class I set and score alleles.

    Allele names are read from the headers of
    '../data/mhc_alleles/hla/hla1class.fasta' in file order, so that index k
    corresponds to 'Subject_<k+1>' in the BLAST report. For each row of
    ``iclass`` the MHC chain sequence is written to '../sandbox/query.fasta',
    searched with ungapped blastp (``-outfmt 3``), and the report is scored
    with ``score_alignments`` using per-residue weights from ``get_points``
    (proximity to the peptide chain). The resulting allele x pdb table is
    written to '../sandbox/alleles_vs_pdb'; entries never scored are NaN.

    NOTE(review): '../sandbox/query.fasta' is the same file as ``query_path``
    written by blast_pdb(); running this function overwrites it.

    Raises:
        ValueError: if a FASTA header contains no recognisable allele name.
        RuntimeError: if blastp exits with a non-zero status.
    """
    # raw string: '\*' in a normal literal is an invalid escape sequence
    p4 = re.compile(r'.+([ABC]\*[0-9]+:[0-9A-Z]+)')
    alleles = []
    with open("../data/mhc_alleles/hla/hla1class.fasta", 'r') as file:
        for line in file:
            if line.startswith('>'):
                found = p4.search(line)
                if found is None:
                    # skipping would shift the Subject_<k> <-> allele mapping
                    raise ValueError('No allele name in header: %r' % line)
                alleles.append(found.group(1))

    parser = Bio.PDB.PDBParser(QUIET=True)
    ppb = Bio.PDB.CaPPBuilder()

    size = len(alleles)
    num_alignments = size

    # inf marks "not scored"; converted to NaN before saving
    scores = np.zeros((iclass.shape[0], size))
    scores.fill(np.inf)

    for i in range(iclass.shape[0]):
        name = iclass.iloc[i, 0]
        print(name)
        chain = iclass.iloc[i, 3]
        antichain = iclass.iloc[i, 7]
        struct = parser.get_structure(id=name, file=pdb_path+'/'+name+'.pdb')[0]
        seq = ''.join([str(x.get_sequence()) for x in ppb.build_peptides(struct[chain])])
        with open('../sandbox/query.fasta', 'w') as query:
            query.write('>%s\n%s\n' % (name, seq))

        command = ["blastp", "-query", "../sandbox/query.fasta",
                   "-subject", "../data/mhc_alleles/hla/hla1class.fasta", "-out",
                   "../sandbox/out", "-outfmt", str(3), "-num_alignments", str(num_alignments),
                   "-line_length", str(1000), "-ungapped", "-comp_based_stats", "F"]
        if subprocess.call(command) != 0:
            # raise instead of exit(): exit() would shut down the notebook kernel
            raise RuntimeError('Failed to blast')

        # one weight per residue object of the MHC chain, in chain order
        points = np.array([get_points(residue, struct[antichain]) for residue in struct[chain]])

        with open("../sandbox/out", 'r') as file:
            blasted = file.readlines()

        scores = score_alignments(scores, blasted, points, alleles, i, num_alignments)

    # first row = allele names, then one row per structure; transposed so
    # that alleles become rows, duplicate allele names dropped
    alleles_vs_pdb = pd.DataFrame([alleles]+[list(x) for x in list(scores)]).T.drop_duplicates(subset=0)
    alleles_vs_pdb.index = alleles_vs_pdb[0]
    alleles_vs_pdb = alleles_vs_pdb.iloc[:, 1:]
    alleles_vs_pdb.columns = list(iclass.pdb)
    alleles_vs_pdb = alleles_vs_pdb.replace(np.inf, np.nan)
    alleles_vs_pdb.to_csv('../sandbox/alleles_vs_pdb', sep='\t', float_format='%.2f')
    
#make_allele2pdb_table()

# load the cached allele x pdb score table (first column = allele name index)
alleles_vs_pdb = pd.read_csv('../sandbox/alleles_vs_pdb', sep='\t', index_col=0, low_memory=False)

In [14]:
# Class I ligand assays: peptide, allele and measurement columns for
# peptides of length 8-11 bound to HLA-A/B/C alleles.
ligand_assays = pd.read_csv('../data/epitopes/mhc_ligand_full.csv', skiprows=1, low_memory=False)
ligand_assays = ligand_assays[ligand_assays['MHC allele class'] == 'I']

# .copy() so the new column is not written into a view of `ligand_assays`;
# rows without a peptide are dropped first (len() would fail on NaN)
affinity = ligand_assays[['Description', 'Allele Name', 'Qualitative Measure', 'Quantitative measurement']]
affinity = affinity.dropna(subset=['Description']).copy()
affinity['Length'] = [len(x) for x in affinity.Description]
affinity = affinity[(affinity.Length < 12) & (affinity.Length > 7)]
# raw string: '\*' and '\S' in a normal literal are invalid escape sequences;
# na=False keeps rows with a missing allele name from breaking the mask
affinity = affinity[affinity['Allele Name'].str.contains(r'HLA-[ABC]\*\S+$', regex=True, na=False)].copy()
# strip the leading 'HLA-'
affinity['Allele Name'] = [x[4:] for x in affinity['Allele Name']]

In [15]:
# compare peptides with bias to terminal acids
def cmp_peptides(a, b):
    """Position-weighted mismatch count between two equal-length peptides.

    A mismatch at the first, second or last position adds 1.0, any other
    mismatch adds 0.5. Raises AssertionError if the lengths differ.
    """
    assert len(a) == len(b)
    last = len(a) - 1
    score = 0.0
    for pos, (x, y) in enumerate(zip(a, b)):
        if x != y:
            score += 1.0 if pos in (0, 1, last) else 0.5
    return score

# create peptide-pdb compatibility table
def make_peptide2pdb_table():
    """Score every assay peptide against every crystal peptide of equal length.

    Rows are the distinct peptides in ``affinity['Description']``, columns the
    structures in ``iclass``. Each cell holds ``cmp_peptides(crystal, peptide)``
    when both have the same length and NaN otherwise. The table is written to
    '../sandbox/peptides_vs_pdb'. Progress is printed every 1000 peptides.
    """
    antigen_list = list(set(affinity['Description']))

    # inf marks "lengths differ"; converted to NaN before saving
    peptide_scores = np.zeros((len(antigen_list), iclass.shape[0]))
    peptide_scores.fill(np.inf)

    for counter, seq in enumerate(antigen_list):
        if counter % 1000 == 0:
            print(counter)

        same_length = iclass[iclass.antigen_len == len(seq)]
        # index labels of `iclass` are used directly as column positions
        for i, crystal_seq in zip(same_length.index, same_length['antigen_seq']):
            peptide_scores[counter, i] = cmp_peptides(crystal_seq, seq)

    peptides_vs_pdb = pd.DataFrame(peptide_scores, index=antigen_list, columns=list(iclass.pdb))
    peptides_vs_pdb = peptides_vs_pdb.replace(np.inf, np.nan)
    peptides_vs_pdb.to_csv('../sandbox/peptides_vs_pdb', sep='\t')

#make_peptide2pdb_table()

# load the cached peptide x pdb score table (first column = peptide index)
peptides_vs_pdb = pd.read_csv('../sandbox/peptides_vs_pdb', sep='\t', index_col=0, low_memory=False)

In [16]:
# s1 - sequence to be mutated (the current sequence of the structure)
# s2 - target sequence (what s1 should become)
def align(s1, s2):
    """Globally align ``s1`` to ``s2`` and list the substitutions between them.

    Uses BLOSUM62 with gap open -100 and gap extend -0.5 and takes the first
    alignment returned. Both aligned strings are printed. Gaps are only
    tolerated at the ends (AssertionError otherwise).

    Returns:
        A list of ``(position, residue in s1, residue in s2)`` tuples for
        every aligned column where the residues differ. ``position`` is a
        string, numbered from 1 at the first residue of ``s1`` (columns in
        front of it, where ``s1`` has leading gaps, get values <= 0).
    """
    gap_open, gap_extend = -100, -0.5
    top_aln = pairwise2.align.globalds(s1, s2, matlist.blosum62, gap_open, gap_extend)[0]
    aligned_s1, aligned_s2 = top_aln[0], top_aln[1]

    print(aligned_s1)
    print(aligned_s2)

    # check that there is no gaps inside the alignment
    interior_gap = re.compile('[A-Z]-+[A-Z]')
    assert interior_gap.search(aligned_s1) is None
    assert interior_gap.search(aligned_s2) is None

    # number of leading gap columns in s1
    leading = re.match('(-+)', aligned_s1)
    shift = len(leading.group(1)) if leading else 0

    mutations = []
    for pos, (a1, a2) in enumerate(zip(aligned_s1, aligned_s2), 1 - shift):
        if a1 != '-' and a2 != '-' and a1 != a2:
            mutations.append((str(pos), a1, a2))
    return mutations

# mutate pdb structure in accordance to input sequences
def mutate(pdb, newseq, newpep, table):
    """Mutate a crystal structure towards a new MHC sequence and peptide.

    Args:
        pdb: pdb id of the template; its row is looked up in ``table``.
        newseq: target MHC heavy-chain sequence.
        newpep: target peptide sequence; must have the template peptide's length.
        table: annotation table; column positions 3, 5, 7 and 8 of the
            matching row are read as MHC chain id, B2M chain id, peptide chain
            id and peptide sequence.

    Side effects:
        Re-initialises the PyMOL session and writes
        '../sandbox/mutated/original.pdb' and '../sandbox/mutated/mutated.pdb'
        (fixed names, overwritten on every call). Prints the alignments and
        the applied mutations.

    Raises:
        AssertionError: if the peptide lengths differ, or from align() when
            an alignment has internal gaps.
    """
    parser = Bio.PDB.PDBParser(QUIET=True)
    ppb=Bio.PDB.PPBuilder()
    
    name = pdb; 
    amhc = table[table.pdb == pdb].iloc[0, 3]
    pept = table[table.pdb == pdb].iloc[0, 7]
    b2m = table[table.pdb == pdb].iloc[0, 5]
    
    # current heavy-chain sequence: all polypeptide fragments concatenated
    struct = parser.get_structure(id=name, file=pdb_path+'/'+name+'.pdb')[0]
    oldseq = ''.join([str(x.get_sequence()) for x in ppb.build_peptides(struct[amhc])])
    oldpep = table[table.pdb == pdb].iloc[0, 8]
    
    assert(len(oldpep) == len(newpep))
    # lists of (position, old residue, new residue)
    mhc_mutations = align(oldseq, newseq)
    pep_mutations = align(oldpep, newpep)

    cmd.reinitialize()
    cmd.load(pdb_path+'/'+name+'.pdb', 'obj')
    
    # remove alternative atom locations
    cmd.remove("not alt ''+A")
    cmd.alter('all', "alt=''")
    # save only the MHC, peptide and B2M chains of the unmodified template
    cmd.save('../sandbox/mutated/original.pdb', 'obj//%s// obj//%s// obj//%s//' % (amhc, pept, b2m))
    
    # mutate
    cmd.wizard('mutagenesis')
    cmd.refresh_wizard()

    # NOTE(review): `i` is the position produced by align(), used here as the
    # residue number in the selection -- this assumes residue numbering in
    # the PDB chain starts at 1 and has no gaps; TODO confirm.
    for i, old, new in mhc_mutations:
        print(i, old, new)
        cmd.get_wizard().do_select(amhc+'/'+str(i)+'/')
        # the wizard mode is the upper-case three-letter code of the new residue
        cmd.get_wizard().set_mode(seq3(new).upper())
        cmd.get_wizard().apply()
    print("\n%s: %i MHC mutations\n" % (name, len(mhc_mutations)))
        
    for i, old, new in pep_mutations:
        print(i, old, new)
        cmd.get_wizard().do_select(pept+'/'+str(i)+'/')
        cmd.get_wizard().set_mode(seq3(new).upper())
        cmd.get_wizard().apply()
    print("\n%s: %i peptide mutations\n" % (name, len(pep_mutations)))
        
    # close the wizard before saving the mutated chains
    cmd.set_wizard()
    cmd.save('../sandbox/mutated/mutated.pdb', 'obj//%s// obj//%s// obj//%s//' % (amhc, pept, b2m))
    #sleep(0.5)

# wrapper for mutate(..)
def find_and_mutate(pepseq, allele, alleledict, pep2pdb, mhc2pdb, pdb_table):
    """Pick the best template structure for a peptide/allele pair and mutate it.

    Args:
        pepseq: peptide sequence (a row label of ``pep2pdb``).
        allele: allele name (a row label of ``mhc2pdb``, a key of ``alleledict``).
        alleledict: allele name -> sequence record (``.seq`` is used).
        pep2pdb: peptide x pdb score table.
        mhc2pdb: allele x pdb score table.
        pdb_table: annotation table passed on to mutate().

    Templates are tried in order of increasing ``allele score + peptide
    score / 10``. A template for which mutate() raises AssertionError is
    skipped; the function returns after the first one that succeeds.

    Fixes relative to the previous version: the score tables passed as
    arguments are used (the globals ``alleles_vs_pdb`` / ``peptides_vs_pdb``
    were read instead), ``xrange`` is replaced by plain iteration, and scores
    are read by position so repeated pdb ids in the tables cannot turn the
    score into a Series.
    """
    best_pdbs = (mhc2pdb.loc[allele, :] + pep2pdb.loc[pepseq, :]/10).sort_values()
    alleleseq = str(alleledict[allele].seq)
    for best_pdb, score in zip(best_pdbs.index, best_pdbs.values):
        try:
            print('Compability score: %f' % score)
            mutate(best_pdb, alleleseq, pepseq, pdb_table)
        except AssertionError:
            # template incompatible (length mismatch / gapped alignment): try the next one
            continue
        else:
            break
        
        
    
# testing
# Index the HLA records by two-field allele name: second ';'-separated field
# of the record id, truncated to its first two ':'-separated fields.
# A new dict is built instead of popping and inserting while iterating over
# keys(), which raises "dictionary keys changed during iteration" on Python 3.
# Records collapsing to the same short name overwrite each other (last wins).
alleledict = Bio.SeqIO.to_dict(Bio.SeqIO.parse(hla_fasta, 'fasta'))
alleledict = dict((':'.join(key.split(';')[1].split(':')[:2]), record)
                  for key, record in list(alleledict.items()))

# try the first 100 assay rows (column 0 = peptide, column 1 = allele name);
# range() instead of xrange(), which does not exist on Python 3
for idx in range(100):
    pepseq = affinity.iloc[idx, 0]
    allele = affinity.iloc[idx, 1]
    find_and_mutate(pepseq, allele, alleledict, peptides_vs_pdb, alleles_vs_pdb, iclass)

In [272]:
# Scratch cell: load structure 4hx1 and extract its MHC chain sequence and peptide.
# NOTE(review): this cell binds module-level names (table, pdb, parser, ppb,
# name, amhc, pept, struct, ...). Functions that reference such names without
# defining them locally only work after this cell has been run.
table=iclass
pdb ='4hx1'
parser = Bio.PDB.PDBParser(QUIET=True)
ppb=Bio.PDB.PPBuilder()
name = pdb; amhc = table[table.pdb == pdb].iloc[0, 3]; pept = table[table.pdb == pdb].iloc[0, 7]
struct = parser.get_structure(id=name, file=pdb_path+'/'+name+'.pdb')[0]
# only the first polypeptide fragment is used here
oldseq = str(ppb.build_peptides(struct[amhc])[0].get_sequence())
oldpep = table[table.pdb == pdb].iloc[0, 8]

In [234]:
# inspect the polypeptide fragments built for the MHC chain
ppb.build_peptides(struct[amhc])

[<Polypeptide start=1 end=274>]

In [261]:
# (rows, columns) of the filtered assay table
affinity.shape

(256535, 5)

In [18]:
# class I entries with a 9-residue peptide whose allele name starts with 'A'
iclass[(iclass.antigen_len == 9) & (iclass.a_chain_allele.str.startswith('A'))]

Unnamed: 0,pdb,mhc_class,host,a_chain_id,a_chain_allele,b_chain_id,b_chain_allele,antigen_id,antigen_seq,antigen_len,antigen_name,antigen_organism,comments
0,3i6l,I,Human,D,A*24:02,E,B2M,F,QFKDNVILL,9,Nucleoprotein,SARS coronavirus Tor2,
1,5swq,I,Human,A,A*02:01,B,B2M,C,CVNGSCFTV,9,neuraminidase,Influenza A virus,
2,3x14,I,Human,A,B*08:02,B,B2M,C,FLRGRAYGL,9,nuclear antigen EBNA-3,Human herpesvirus 4,
5,3mri,I,Human,A,A*02:01,B,B2M,P,CINMWCWTV,9,,,
6,1w0v,I,Human,A,B*27:05,B,B2M,C,RRLPIFSRL,9,ERF-2,Homo sapiens,
7,2uwe,I,Human,H,A*02:01,I,B2M,J,ALWGFFPVL,9,chromosome 15 open reading frame 24,Homo sapiens,
8,3gsw,I,Human,A,A*02:01,B,B2M,P,NLVPMVAAV,9,,,
11,4qru,I,Human,A,B*08:01,B,B2M,C,ELRRKMMYM,9,55 kDa immediate-early protein 1,Human herpesvirus 5 strain AD169,
12,3hpj,I,Human,D,A*02:01,E,B2M,F,RMFPNAPYL,9,Wilms tumor protein,Homo sapiens,
13,2git,I,Human,D,A*02:01,E,B2M,F,LLFGKPVYV,9,,,
