# Retrieve the human-mouse orthologous genes and canonical transcripts  
This notebook:
- Retrieves human mutation-derived immunogenic peptides and maps them to the mouse genome (H2M);
- Retrieves mouse tumor-mutation-derived peptides and maps them to the human genome (M2H).


**Author**: Kexin Dong  
**Date**: May 19, 2024  

## H2M  

In [1]:
import bioh2m as h2m
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import requests
import time
import ast
from adjustText import adjust_text
import matplotlib.patheffects as PathEffects
import seaborn as sns
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Arial']



### Load and pre-process data (N = 1,835)

In [80]:
# Read the tab-separated validated-neoantigen download and keep one row per
# unique combination of the five columns used downstream.
validated_path = '/Users/kexindong/Documents/GitHub/Output/neoantigen/validated_tsnadb2_download.txt'
validated_columns = ['Mutation Type','Gene','Mutation','Mutant Peptide','HLA']
df_h_validated = pd.read_csv(validated_path, header=0, sep='\t')
df_h_validated = df_h_validated[validated_columns].drop_duplicates()

In [81]:
df_h_validated

Unnamed: 0,Mutation Type,Gene,Mutation,Mutant Peptide,HLA
0,SNV,SEC22C,SEC22C_H248Y,AEHSLQVAY,B44:03
1,SNV,TKT,TKT_R438W,AMFWSVPTV,A02:01
2,SNV,KIF6B,KIF6B_L1020P,APARLERRHSAL,B07:02
3,SNV,ACSS3,ACSS3_H194Y,ARIGAIYSL,C07:02
4,SNV,AP2S1,AP2S1_N86K,AYLEAIHKF,A24:02
...,...,...,...,...,...
1851,INDEL,C2CD3,C2CD3_R1186del,RSPTAEGVL,B07:02
1852,INDEL,SPEN,KLLE1864_1867del,SPRGEAQLK,B07:02
1853,INDEL,DDX3X,TR213_214del,TRYPTPVQK,A03:01
1854,INDEL,PDE12,PDE12_S50del,VPSEPKLLS,B07:02


### SNV-derived (N = 1,627)

In [82]:
# Keep SNV-derived peptides only and reduce 'GENE_H248Y'-style labels to the
# bare protein change ('H248Y').
df_h_validated = df_h_validated.dropna(subset=['Mutation'])
df_h_validated = df_h_validated[df_h_validated['Mutation Type']!='Fusion']


def _strip_gene_prefix(label):
    """Return the second '_'-separated field, or the label itself if it has no '_'.

    The fallback makes this cell safe to re-run: once the prefix has been
    removed, a plain split('_')[1] would raise IndexError.
    """
    fields = str(label).split('_')
    return fields[1] if len(fields) > 1 else fields[0]


df_h_validated['Mutation'] = [_strip_gene_prefix(x) for x in df_h_validated['Mutation']]
df_h_validated = df_h_validated[df_h_validated['Mutation Type']=='SNV'].reset_index(drop=True)
df_h_validated

Unnamed: 0,Mutation Type,Gene,Mutation,Mutant Peptide,HLA
0,SNV,SEC22C,H248Y,AEHSLQVAY,B44:03
1,SNV,TKT,R438W,AMFWSVPTV,A02:01
2,SNV,KIF6B,L1020P,APARLERRHSAL,B07:02
3,SNV,ACSS3,H194Y,ARIGAIYSL,C07:02
4,SNV,AP2S1,N86K,AYLEAIHKF,A24:02
...,...,...,...,...,...
1622,SNV,TNFAIP1,R48L,YTTVRALTL,B08:01
1623,SNV,CEL,T471L,YVFGKPFATPLGY,A29:02
1624,SNV,LRPPRC,T1335I,YVSEKDVISAK,A68:01
1625,SNV,TSR1,H561Y,YVSEVPVSV,C17:01


### Select MHC-I binding (N = 1,211)

In [83]:
def split_mutation(mutation):
    """Split a substitution label such as 'H248Y' into its three parts.

    Args:
        mutation (str): Text searched for the pattern
            <capital letter><digits><capital letter>.

    Returns:
        tuple: (first letter, digits as a string, second letter) of the first
        match, or ('', '', '') when the pattern does not occur.
    """
    first_hit = re.search(r'([A-Z])(\d+)([A-Z])', mutation)
    return first_hit.groups() if first_hit else ('', '', '')

# Parse every protein change into reference residue / position / alternate residue.
mutation_parts = df_h_validated['Mutation'].apply(lambda x: pd.Series(split_mutation(x)))
df_h_validated[['ref_seq_h', 'start_h', 'alt_seq_h']] = mutation_parts
# A single-residue substitution starts and ends at the same position.
df_h_validated['end_h'] = df_h_validated['start_h']
# Keep rows that have an HLA label whose first character is A, B or C.
df_h_validated = df_h_validated.dropna(subset = 'HLA').reset_index(drop=True)
starts_with_abc = [allele[0] in ('A','B','C') for allele in df_h_validated['HLA']]
df_h_validated = df_h_validated[starts_with_abc].reset_index(drop=True)
df_h_validated

Unnamed: 0,Mutation Type,Gene,Mutation,Mutant Peptide,HLA,ref_seq_h,start_h,alt_seq_h,end_h
0,SNV,SEC22C,H248Y,AEHSLQVAY,B44:03,H,248,Y,248
1,SNV,TKT,R438W,AMFWSVPTV,A02:01,R,438,W,438
2,SNV,KIF6B,L1020P,APARLERRHSAL,B07:02,L,1020,P,1020
3,SNV,ACSS3,H194Y,ARIGAIYSL,C07:02,H,194,Y,194
4,SNV,AP2S1,N86K,AYLEAIHKF,A24:02,N,86,K,86
...,...,...,...,...,...,...,...,...,...
1206,SNV,TNFAIP1,R48L,YTTVRALTL,B08:01,R,48,L,48
1207,SNV,CEL,T471L,YVFGKPFATPLGY,A29:02,T,471,L,471
1208,SNV,LRPPRC,T1335I,YVSEKDVISAK,A68:01,T,1335,I,1335
1209,SNV,TSR1,H561Y,YVSEVPVSV,C17:01,H,561,Y,561


### Select 9-aa-long peptides (N = 642)

In [84]:
# Keep peptides that are exactly nine residues long, then give each surviving
# row a running identifier.
is_nonamer = df_h_validated['Mutant Peptide'].map(len) == 9
df_h_validated = df_h_validated[is_nonamer].reset_index(drop=True)
df_h_validated['peptide_index'] = range(len(df_h_validated))
df_h_validated

Unnamed: 0,Mutation Type,Gene,Mutation,Mutant Peptide,HLA,ref_seq_h,start_h,alt_seq_h,end_h,peptide_index
0,SNV,SEC22C,H248Y,AEHSLQVAY,B44:03,H,248,Y,248,0
1,SNV,TKT,R438W,AMFWSVPTV,A02:01,R,438,W,438,1
2,SNV,ACSS3,H194Y,ARIGAIYSL,C07:02,H,194,Y,194,2
3,SNV,AP2S1,N86K,AYLEAIHKF,A24:02,N,86,K,86,3
4,SNV,TMEM48,F169L,CLNEYHLFL,A02:01,F,169,L,169,4
...,...,...,...,...,...,...,...,...,...,...
637,SNV,TOX2,S382Y,YPAPQPPVL,B08:01,S,382,Y,382,637
638,SNV,NR1D1,G39D,YSDNSNDSF,A01:01,G,39,D,39,638
639,SNV,TNFAIP1,R48L,YTTVRALTL,B08:01,R,48,L,48,639
640,SNV,TSR1,H561Y,YVSEVPVSV,C17:01,H,561,Y,561,640


### Check and rename genes  (N = 586)

In [85]:
# Translate legacy gene symbols in the 'Gene' column to their replacement
# symbols; symbols that are not listed are passed through unchanged.
df_h_validated['gene_name_h'] = df_h_validated['Gene']
dict_of_aacr_symbol = {
    'H3F3A':'H3-3A',
    'H3F3B':'H3-3B',
    'WHSC1':'NSD2',      # was 'NSD1', which is a different gene; WHSC1 was renamed NSD2
    'GPR124':'ADGRA2',
    'MKL1':'MRTFA',
    'PARK2':'PRKN',
    'MRE11A':'MRE11',
    'MLLT4':'AFDN',
    'CASC5': 'KNL1',
    'MEF2BNB-MEF2B':'BORCS8-MEF2B',
    'PAK7':'PAK5',
    # NOTE(review): the histone cluster entries below (e.g. HIST1H3B, HIST1H3D,
    # HIST1H2AC, HIST1H2BK, HIST1H2BJ) should be double-checked against the
    # current HGNC numbering; they are left exactly as they were.
    'HIST1H1C':'H1-2',
    'HIST1H3B': 'H3C1',
    'HIST1H1E': 'H1-4',
    'HIST1H2BD': 'H2BC5',
    'HIST3H3': 'H3-4',
    'HIST1H3D': 'H3C2',
    'HIST1H3E': 'H3C3',
    'HIST1H3J': 'H3C12',
    'HIST1H2BK': 'H2BC8',
    'HIST1H3F': 'H3C4',
    'HIST1H1B': 'H1-5',
    'HIST1H1D': 'H1-3',
    'HIST1H2AC': 'H2AC1',
    'HIST1H2BJ': 'H2BC13',
    'HIST1H4E': 'H4C5',
    'FAM46C':'TENT5C',
    'WHSC1L1':'NSD3',
    'SETD8':'KMT5A',
    'LPHN3':'ADGRL3',
    'BAI3':'ADGRB3',
    'SEPT9':'SEPTIN9',   # was 'NAPB', an unrelated gene; SEPT9 was renamed SEPTIN9
    'BRE':'BABAM2',
    'RFWD2':'COP1',
    'TCEB1':'ELOC',
    'GBA':'GBA1',
    'PVRL4':'NECTIN4',
    'ICK':'CILK1',       # was 'GCKR', an unrelated gene; ICK was renamed CILK1
    'GNB2L1':'RACK1',
    'MGEA5':'OGA',
    'DIRC2':'SLC49A4',
    'LARGE':'LARGE1',
    'TMEM173B':'STING1',
    'SEPT5':'SEPTIN5'
    }
df_h_validated['gene_name_h'] = [dict_of_aacr_symbol.get(x,x) for x in df_h_validated['gene_name_h']]

### H2M modeling  (N = 402/489 for mouse)

In [86]:
# Three h2m batch steps; each returns an indexable result whose first element
# is kept as the working frame. The frame displayed in the next cell carries
# tx_id_h / ref_genome_h (GRCh37) / gene_name_m / tx_id_m columns.
# NOTE(review): presumably element [1] holds the rows reported as
# "could not be processed"; it is discarded here - confirm against h2m docs.
df_h_validated = h2m.get_tx_batch(df_h_validated, 'h', ver=37)[0]
df_h_validated = h2m.query_batch(df_h_validated)[0]
df_h_validated = h2m.get_tx_batch(df_h_validated, 'm', ver=37)[0]

There were rows that could not be processed.
There were rows that could not be processed.
No error occurs.


In [240]:
df_h_validated.drop_duplicates(subset='peptide_index')

Unnamed: 0,Mutation Type,Gene,Mutation,Mutant Peptide,HLA,ref_seq_h,start_h,alt_seq_h,end_h,peptide_index,gene_name_h,tx_id_h,ref_genome_h,gene_name_m,tx_id_m,type_h,index
0,SNV,SEC22C,H248Y,AEHSLQVAY,B44:03,H,248,Y,248,0,SEC22C,ENST00000264454.3,GRCh37,Sec22c,ENSMUST00000078547.12,SNP,0
1,SNV,TKT,R438W,AMFWSVPTV,A02:01,R,438,W,438,1,TKT,ENST00000423516.1,GRCh37,Tkt,ENSMUST00000022529.8,SNP,1
2,SNV,ACSS3,H194Y,ARIGAIYSL,C07:02,H,194,Y,194,2,ACSS3,ENST00000548058.1,GRCh37,Acss3,ENSMUST00000165067.9,SNP,2
3,SNV,AP2S1,N86K,AYLEAIHKF,A24:02,N,86,K,86,3,AP2S1,ENST00000263270.6,GRCh37,Ap2s1,ENSMUST00000086112.8,SNP,3
4,SNV,MED15,P677S,DANSFLQSV,B51:01,P,677,S,677,5,MED15,ENST00000263205.7,GRCh37,Med15,ENSMUST00000012259.9,SNP,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,SNV,TOX2,S382Y,YPAPQPPVL,B08:01,S,382,Y,382,637,TOX2,ENST00000341197.4,GRCh37,Tox2,ENSMUST00000099110.10,SNP,882
883,SNV,NR1D1,G39D,YSDNSNDSF,A01:01,G,39,D,39,638,NR1D1,ENST00000246672.3,GRCh37,Nr1d1,ENSMUST00000064941.7,SNP,883
884,SNV,TNFAIP1,R48L,YTTVRALTL,B08:01,R,48,L,48,639,TNFAIP1,ENST00000226225.2,GRCh37,Tnfaip1,ENSMUST00000108277.3,SNP,884
885,SNV,TSR1,H561Y,YVSEVPVSV,C17:01,H,561,Y,561,640,TSR1,ENST00000301364.5,GRCh37,Tsr1,ENSMUST00000045807.14,SNP,885


In [27]:
# remember to replace the paths with yours
path_h_ref = '/Users/kexindong/Documents/GitHub/Database/RefGenome/ncbi-2023-09-12/GCF_000001405.25_GRCh37.p13_genomic.fna.gz'
path_m_ref = '/Users/kexindong/Documents/GitHub/Database/RefGenome/mouse-2023-09-13/GCF_000001635.27_GRCm39_genomic.fna.gz'
# Reference genomes: human first, then mouse.
records_h, index_list_h = h2m.genome_loader(path_h_ref)
records_m, index_list_m = h2m.genome_loader(path_m_ref)
# Annotation databases for the same two assemblies.
path_h_anno = '/Users/kexindong/Documents/GitHub/Database/Genecode/gencode_v19_GRCh37.db'
path_m_anno = '/Users/kexindong/Documents/GitHub/Database/Genecode/gencode_vm33_GRCm39.db'
db_h = h2m.anno_loader(path_h_anno)
db_m = h2m.anno_loader(path_m_anno)

In [91]:
# Tag every row as a SNP and give it a running 'index' used later as the join
# key, then run the batch model on amino-acid coordinates.
df_h_validated = df_h_validated.assign(
    type_h='SNP',
    index=range(len(df_h_validated)),
)
df_result_h2m, df_fail = h2m.model_batch(
    df_h_validated,
    records_h, index_list_h,
    records_m, index_list_m,
    db_h, db_m,
    37,
    coor='aa',
    show_sequence=True,
)

There were rows that could not be processed.


In [111]:
# Persist the raw model_batch result under its own file name. The original
# path ('h2m_neoantigen_result_v3.csv') is also the target of the final
# l_final.to_csv(...) call further down, which silently overwrote this file.
df_result_h2m.to_csv('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/h2m_neoantigen_model_result_v3.csv',index=False)

In [93]:
df_result_h2m.drop_duplicates(subset=['HGVSp_h','gene_name_h'])['class'].value_counts()

1    402
4     57
Name: class, dtype: int64

In [94]:
df_result_h2m.drop_duplicates(subset=['HGVSp_h','gene_name_h','tx_id_m'])['class'].value_counts()

1    489
4    164
Name: class, dtype: int64

In [106]:
df_h_validated

Unnamed: 0,Mutation Type,Gene,Mutation,Mutant Peptide,HLA,ref_seq_h,start_h,alt_seq_h,end_h,peptide_index,gene_name_h,tx_id_h,ref_genome_h,gene_name_m,tx_id_m,type_h,index
0,SNV,SEC22C,H248Y,AEHSLQVAY,B44:03,H,248,Y,248,0,SEC22C,ENST00000264454.3,GRCh37,Sec22c,ENSMUST00000078547.12,SNP,0
1,SNV,TKT,R438W,AMFWSVPTV,A02:01,R,438,W,438,1,TKT,ENST00000423516.1,GRCh37,Tkt,ENSMUST00000022529.8,SNP,1
2,SNV,ACSS3,H194Y,ARIGAIYSL,C07:02,H,194,Y,194,2,ACSS3,ENST00000548058.1,GRCh37,Acss3,ENSMUST00000165067.9,SNP,2
3,SNV,AP2S1,N86K,AYLEAIHKF,A24:02,N,86,K,86,3,AP2S1,ENST00000263270.6,GRCh37,Ap2s1,ENSMUST00000086112.8,SNP,3
4,SNV,MED15,P677S,DANSFLQSV,B51:01,P,677,S,677,5,MED15,ENST00000263205.7,GRCh37,Med15,ENSMUST00000012259.9,SNP,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,SNV,TOX2,S382Y,YPAPQPPVL,B08:01,S,382,Y,382,637,TOX2,ENST00000341197.4,GRCh37,Tox2,ENSMUST00000099110.10,SNP,882
883,SNV,NR1D1,G39D,YSDNSNDSF,A01:01,G,39,D,39,638,NR1D1,ENST00000246672.3,GRCh37,Nr1d1,ENSMUST00000064941.7,SNP,883
884,SNV,TNFAIP1,R48L,YTTVRALTL,B08:01,R,48,L,48,639,TNFAIP1,ENST00000226225.2,GRCh37,Tnfaip1,ENSMUST00000108277.3,SNP,884
885,SNV,TSR1,H561Y,YVSEVPVSV,C17:01,H,561,Y,561,640,TSR1,ENST00000301364.5,GRCh37,Tsr1,ENSMUST00000045807.14,SNP,885


In [95]:
# One modelling row per (human variant, human gene, mouse transcript).
model_unique = df_result_h2m.drop_duplicates(subset=['HGVSp_h','gene_name_h','tx_id_m']).reset_index(drop=True)
# Re-attach the validated peptide and its HLA label through the shared 'index' key.
peptide_info = df_h_validated[['Mutant Peptide',  'HLA', 'index']]
df_process = pd.merge(peptide_info, model_unique, on = 'index', how = 'left')
# Keep rows with status == True only.
df_final_success = df_process[df_process['status']==True].reset_index(drop=True)
# Flank sizes are strings with a two-character suffix (e.g. '0aa'); a missing
# left flank is treated as '0aa'. The total is the sum of both sides.
df_final_success['flank_size_left'] = df_final_success['flank_size_left'].fillna('0aa')
df_final_success['flank_size'] = [
    int(left[:-2]) + int(right[:-2])
    for left, right in zip(df_final_success['flank_size_left'], df_final_success['flank_size_right'])
]

In [96]:
df_final_success = df_final_success.reset_index(drop=True)
df_final_success

Unnamed: 0,Mutant Peptide,HLA,index,gene_name_h,gene_id_h,tx_id_h,chr_h,exon_num_h,strand_h,match,...,mouse_tx_idx_ori,mouse_p_idx_ori,mouse_new_p_idx_ori,new_seq_m,mouse_tx_idx,mouse_p_idx,mouse_new_p_idx,dist_h,dist_m,flank_size
0,AEHSLQVAY,B44:03,0,SEC22C,ENSG00000093183.9,ENST00000264454.3,chr3,6.0,-,False,...,"[741, 742, 743]",[247],[247],ATGTCCATGATCCTTTTTGCCAGCATCGTACGGGTGAGGGATGGAC...,"[741, 742, 743]","[247, 247, 247]",[247],,,12
1,AMFWSVPTV,A02:01,1,TKT,ENSG00000163931.11,ENST00000423516.1,chr3,15.0,-,False,...,"[1287, 1288, 1289]",[429],[429],ATGGAAGGTTACCATAAGCCAGATCAGCAGAAGCTCCAGGCCCTGA...,"[1287, 1288, 1289]","[429, 429, 429]",[429],,,60
2,ARIGAIYSL,C07:02,2,ACSS3,ENSG00000111058.3,ENST00000548058.1,chr12,16.0,+,False,...,"[564, 565, 566]",[188],[188],ATGAAGCCATCCTGGTTGCAATGTCGCAAAGTAACAGGCGCCGGGA...,"[564, 565]","[188, 188]",[188],,,24
3,AYLEAIHKF,A24:02,3,AP2S1,ENSG00000042753.7,ENST00000263270.6,chr19,5.0,-,True,...,"[255, 256, 257]",[85],[85],ATGATCCGATTCATCCTTATCCAGAACCGGGCAGGCAAGACGCGCC...,[257],[85],[85],,,142
4,DANSFLQSV,B51:01,4,MED15,ENSG00000099917.13,ENST00000263205.7,chr22,18.0,+,False,...,"[2031, 2032, 2033]",[677],[677],ATGGACGTTTCGGGGCAGGAGACCGACTGGCGTAGCGCCGCCTTTC...,"[2031, 2032, 2033]","[677, 677, 677]",[677],,,57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484,YPAPQPPVL,B08:01,882,TOX2,ENSG00000124191.13,ENST00000341197.4,chr20,9.0,+,False,...,"[1269, 1270, 1271]",[423],[423],ATGGGGGGCGGGGGTGCCGGGCCTCCAGCCAATAGAGGACCAAGAG...,"[1269, 1270, 1271]","[423, 423, 423]",[423],,,5
485,YSDNSNDSF,A01:01,883,NR1D1,ENSG00000126368.5,ENST00000246672.3,chr17,8.0,-,True,...,"[114, 115, 116]",[38],[38],ATGACGACCCTGGACTCCAATAACAACACAGGTGGTGTTATCACCT...,[115],[38],[38],,,32
486,YTTVRALTL,B08:01,884,TNFAIP1,ENSG00000109079.5,ENST00000226225.2,chr17,6.0,+,True,...,"[141, 142, 143]",[47],[47],ATGTCAGGGGACACCTGTCTGTGTCCAGCCTCGGGGGCCAAGCCCA...,[142],[47],[47],,,78
487,YVSEVPVSV,C17:01,885,TSR1,ENSG00000167721.6,ENST00000301364.5,chr17,15.0,-,True,...,"[1677, 1678, 1679]",[559],[559],ATGGCGGCTCACCGCTCCGGGCCGCTGAAGCAGCAGAATAAAGCTC...,"[1677, 1678, 1679]","[559, 559, 559]",[559],,,13


### 9-mer slicing (N = 300/383 for mouse)

In [98]:
def find_9mers_containing_position(sequence, position):
    """
    List the nine 9-residue windows of the translated sequence that cover one residue.

    Args:
    sequence: Sequence handed to h2m.Translate; the windows are cut from the
        string form of the translation (presumably coding DNA -> protein).
    position (list): Only the first element is used, as a 0-based index into
        the translated string.

    Returns:
    list: Nine strings. Entry i is the window starting at position[0] - 8 + i,
        or "" when that window would start before the beginning or run past
        the end of the translated string.
    """
    protein = str(h2m.Translate(sequence))
    residue = position[0]
    last_valid_start = len(protein) - 9

    return [
        protein[start:start + 9] if 0 <= start <= last_valid_start else ""
        for start in range(residue - 8, residue + 1)
    ]

def expand_row(row):
    """Turn one row's paired 9-mer lists into a small long-format DataFrame.

    Args:
        row: Mapping-like row providing 'human_mouse_9mers' (a dict with
            'human_9mers' and 'mouse_9mers' lists) plus the scalar fields
            'index', 'HLA' and 'Mutant Peptide'.

    Returns:
        pandas.DataFrame: One line per list entry; the scalar fields are
        repeated on every line.
    """
    nine_mers = row['human_mouse_9mers']
    columns = {
        'human_9mers': nine_mers['human_9mers'],
        'mouse_9mers': nine_mers['mouse_9mers'],
    }
    for scalar_key in ('index', 'HLA', 'Mutant Peptide'):
        columns[scalar_key] = row[scalar_key]
    return pd.DataFrame(columns)

In [100]:
def _h2m_row_9mers(row):
    """Collect the human and mouse 9-mer windows for one modelled row."""
    return {
        'human_9mers': find_9mers_containing_position(row['new_seq_h'], row['human_p_idx']),
        'mouse_9mers': find_9mers_containing_position(row['new_seq_m'], row['mouse_p_idx'])
    }


df_final_success['human_mouse_9mers'] = df_final_success.apply(_h2m_row_9mers, axis=1)

In [101]:
# One small frame per modelled row, stacked into a single long table.
per_row_frames = [expand_row(row) for _, row in df_final_success.iterrows()]
expanded_rows = pd.concat(per_row_frames).reset_index(drop=True)

In [110]:
# Keep only the window that reproduces the validated mutant peptide.
matches_validated = expanded_rows['human_9mers']==expanded_rows['Mutant Peptide']
o = expanded_rows[matches_validated].reset_index(drop=True)

In [103]:
df_result_h2m.drop_duplicates(subset=['HGVSp_h','gene_name_h','tx_id_m'])['class'].value_counts()

1    489
4    164
Name: class, dtype: int64

In [112]:
o

Unnamed: 0,human_9mers,mouse_9mers,index,HLA,Mutant Peptide
0,AYLEAIHKF,AYLEAIHKF,3,A24:02,AYLEAIHKF
1,FLYNLLTRV,FMYNLLTRV,8,A02:01,FLYNLLTRV
2,GLFGDIYLA,GLFGDIYLA,9,A02:01,GLFGDIYLA
3,GRIAFFLKY,GKIAFFLKF,10,B27:05,GRIAFFLKY
4,HMTEVVRHC,HMTEVVRHC,12,A02:01,HMTEVVRHC
...,...,...,...,...,...
378,YLSELLQTV,YLSELLQTV,880,A02:01,YLSELLQTV
379,YNTDDIEFY,YNTDDIEFY,881,A29:02,YNTDDIEFY
380,YSDNSNDSF,YSDSSNDSF,883,A01:01,YSDNSNDSF
381,YTTVRALTL,YTTVRALTL,884,B08:01,YTTVRALTL


In [127]:
len(o['human_9mers'].unique())

300

In [132]:
# Join the matched 9-mers back onto the modelling table (merge uses every
# column the two frames share), keep the reporting columns, sort by human gene.
output_columns = ['gene_name_h','HGVSp_h','status','flank_size','human_9mers','gene_name_m','HGVSp_m','mouse_9mers','index']
merged_9mers = df_final_success.merge(o)
df_output = merged_9mers[output_columns].sort_values(by='gene_name_h').reset_index(drop=True)
df_output

Unnamed: 0,gene_name_h,HGVSp_h,status,flank_size,human_9mers,gene_name_m,HGVSp_m,mouse_9mers,index
0,ABCB10,S405Y,True,39,GLYGNLIVL,Abcb10,S370Y,GLYGNLIVL,119
1,ABCC6,Q715E,True,3,GEELDPPWL,Abcc6,Q713E,REELDLPWL,739
2,ACE,S167F,True,11,FLDPDLTNI,Ace,S172F,FLDPELTNI,88
3,ACE2,D615Y,True,19,AYQSIKVRI,Ace2,D615Y,AYQSIKVRI,379
4,ACTG1,G366S,True,375,ESSPSIVHR,Actg1,G366S,ESSPSIVHR,711
...,...,...,...,...,...,...,...,...,...
378,ZNF611,D404G,True,1,KVCGTAFTW,Zfp966,D268G,KQCGKAFVR,224
379,ZNF611,D404G,True,0,KVCGTAFTW,Zfp975,D305G,NQCGKGFAQ,228
380,ZNF611,D404G,True,0,KVCGTAFTW,Zfp976,D333G,NQCGKGFAQ,229
381,ZNF611,D404G,True,1,KVCGTAFTW,5430403G16Rik,D330G,YKCGQCGKA,195


### Generate NetMHCpan inputs

In [133]:
# Write one '>index' header line plus one peptide line per row of `o`, human
# peptides to input_h_3.txt and mouse peptides to input_m_3.txt.
# Both files are opened once in write mode: the previous per-row append mode
# duplicated every record whenever this cell was run again.
with open('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/input_h_3.txt', 'w') as human_file, \
     open('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/input_m_3.txt', 'w') as mouse_file:
    for _, record in o.iterrows():
        record_id = record['index']
        human_peptide = record['human_9mers']
        mouse_peptide = record['mouse_9mers']
        human_file.write(f'>{record_id}\n{human_peptide}\n')
        mouse_file.write(f'>{record_id}\n{mouse_peptide}\n')

In [134]:
o

Unnamed: 0,human_9mers,mouse_9mers,index,HLA,Mutant Peptide
0,AYLEAIHKF,AYLEAIHKF,3,A24:02,AYLEAIHKF
1,FLYNLLTRV,FMYNLLTRV,8,A02:01,FLYNLLTRV
2,GLFGDIYLA,GLFGDIYLA,9,A02:01,GLFGDIYLA
3,GRIAFFLKY,GKIAFFLKF,10,B27:05,GRIAFFLKY
4,HMTEVVRHC,HMTEVVRHC,12,A02:01,HMTEVVRHC
...,...,...,...,...,...
378,YLSELLQTV,YLSELLQTV,880,A02:01,YLSELLQTV
379,YNTDDIEFY,YNTDDIEFY,881,A29:02,YNTDDIEFY
380,YSDNSNDSF,YSDSSNDSF,883,A01:01,YSDNSNDSF
381,YTTVRALTL,YTTVRALTL,884,B08:01,YTTVRALTL


### Human results (top score is selected for each peptide among different MHC alleles)

In [137]:
# Read the five human prediction outputs (tab-separated, '#' lines are
# comments) and stack them into one frame.
result_h_i = pd.concat([
    pd.read_csv(f'/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/output_h_mhci_3_{x}.txt', header=0, sep='\t', comment="#")
    for x in ['A','B1','B2','C1','C2']
]).reset_index(drop=True)

In [139]:
# Drop the first four characters of every allele name and remove any '*'.
# NOTE(review): not idempotent - running this cell twice strips four more
# characters; presumably the raw names carry a four-character 'HLA-' prefix.
result_h_i['allele'] = [str(name[4:]).replace('*', '') for name in result_h_i['allele']]

In [149]:
', '.join(result_h_i['allele'].sort_values().unique())

'A01:01, A02:01, A02:06, A03:01, A11:01, A23:01, A24:02, A25:01, A26:01, A29:02, A30:01, A30:02, A31:01, A32:01, A33:03, B07:02, B08:01, B13:01, B13:02, B14:02, B15:01, B15:02, B15:25, B18:01, B27:02, B27:05, B35:01, B35:03, B37:01, B38:01, B39:01, B40:01, B40:02, B44:02, B44:03, B46:01, B48:01, B49:01, B50:01, B51:01, B52:01, B53:01, B55:01, B56:01, B57:01, B58:01, B58:02, C01:02, C02:02, C02:09, C03:02, C03:03, C03:04, C04:01, C05:01, C06:02, C07:01, C07:02, C07:04, C08:01, C08:02, C12:02, C12:03, C14:02, C15:02, C16:01, C17:01, E01:01, E01:03, G01:01, G01:02, G01:03, G01:04, G01:06'

In [151]:
# Rank all predictions by score (best first) and keep the first row seen for
# each peptide, i.e. its top-scoring allele.
ranked_h = result_h_i.sort_values(by='score', ascending=False).reset_index(drop=True)
result_h_i_unique = ranked_h.drop_duplicates(subset='peptide')

In [159]:
# Attach the best human prediction to every output row; after the rename the
# only shared column (and therefore the join key) is 'human_9mers'.
human_best = result_h_i_unique[['allele','peptide','score','percentile_rank']].rename(
    columns={
        'peptide':'human_9mers',
        'allele':'HLA_allele',
        'score':'score_h',
        'percentile_rank':'percentile_rank_h',
    }
)
k = pd.merge(df_output, human_best, how='left')

In [160]:
k

Unnamed: 0,gene_name_h,HGVSp_h,status,flank_size,human_9mers,gene_name_m,HGVSp_m,mouse_9mers,index,HLA_allele,score_h,percentile_rank_h
0,ABCB10,S405Y,True,39,GLYGNLIVL,Abcb10,S370Y,GLYGNLIVL,119,A02:01,0.876096,0.04
1,ABCC6,Q715E,True,3,GEELDPPWL,Abcc6,Q713E,REELDLPWL,739,B40:01,0.750505,0.13
2,ACE,S167F,True,11,FLDPDLTNI,Ace,S172F,FLDPELTNI,88,A02:01,0.987693,0.01
3,ACE2,D615Y,True,19,AYQSIKVRI,Ace2,D615Y,AYQSIKVRI,379,A24:02,0.837559,0.04
4,ACTG1,G366S,True,375,ESSPSIVHR,Actg1,G366S,ESSPSIVHR,711,A33:03,0.909769,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...
378,ZNF611,D404G,True,1,KVCGTAFTW,Zfp966,D268G,KQCGKAFVR,224,B57:01,0.988189,0.01
379,ZNF611,D404G,True,0,KVCGTAFTW,Zfp975,D305G,NQCGKGFAQ,228,B57:01,0.988189,0.01
380,ZNF611,D404G,True,0,KVCGTAFTW,Zfp976,D333G,NQCGKGFAQ,229,B57:01,0.988189,0.01
381,ZNF611,D404G,True,1,KVCGTAFTW,5430403G16Rik,D330G,YKCGQCGKA,195,B57:01,0.988189,0.01


In [155]:
# Mouse prediction output: tab-separated, '#' lines are comments.
mouse_prediction_path = f'/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/output_m_mhci_3.txt'
result_m_i = pd.read_csv(mouse_prediction_path, header=0, sep='\t', comment="#")

In [156]:
# Drop the first four characters of every allele name and remove any '*'.
# NOTE(review): not idempotent - a second run strips four more characters.
result_m_i['allele'] = [str(name[4:]).replace('*', '') for name in result_m_i['allele']]

### Mouse results (top score is selected for each peptide among different MHC alleles)

In [162]:
# Best-scoring allele per mouse peptide: rank by score, keep the first hit.
ranked_m = result_m_i.sort_values(by='score', ascending=False).reset_index(drop=True)
result_m_i_unique = ranked_m.drop_duplicates(subset='peptide')

In [181]:
# Attach the best mouse prediction; after the rename the only shared column
# (and therefore the join key) is 'mouse_9mers'.
mouse_best = result_m_i_unique[['allele','peptide','score','percentile_rank']].rename(
    columns={
        'peptide':'mouse_9mers',
        'allele':'H2_allele',
        'percentile_rank':'percentile_rank_m',
        'score':'score_m',
    }
)
l_both_best = pd.merge(k, mouse_best, how='left')

### Mouse Kb/Db results (most common alleles: Kb/Db)  

In [187]:
# Restrict to the Kb and Db alleles, then keep the best-scoring row per peptide.
is_kb_or_db = result_m_i['allele'].isin(['Kb','Db'])
ranked_kbdb = result_m_i[is_kb_or_db].sort_values(by='score', ascending=False).reset_index(drop=True)
result_m_i_kb = ranked_kbdb.drop_duplicates(subset='peptide')

In [190]:
# Attach the best Kb/Db prediction; the join key is again 'mouse_9mers'.
kbdb_best = result_m_i_kb[['peptide','score','percentile_rank']].rename(
    columns={
        'peptide':'mouse_9mers',
        'percentile_rank':'percentile_rank_kbdb',
        'score':'score_kbdb',
    }
)
l_final = pd.merge(l_both_best, kbdb_best, how='left')

In [191]:
l_final

Unnamed: 0,gene_name_h,HGVSp_h,status,flank_size,human_9mers,gene_name_m,HGVSp_m,mouse_9mers,index,HLA_allele,score_h,percentile_rank_h,H2_allele,score_m,percentile_rank_m,score_kbdb,percentile_rank_kbdb
0,ABCB10,S405Y,True,39,GLYGNLIVL,Abcb10,S370Y,GLYGNLIVL,119,A02:01,0.876096,0.04,Qa1,0.346745,0.79,0.253319,0.24
1,ABCC6,Q715E,True,3,GEELDPPWL,Abcc6,Q713E,REELDLPWL,739,B40:01,0.750505,0.13,Qa2,0.192596,0.38,0.005281,7.80
2,ACE,S167F,True,11,FLDPDLTNI,Ace,S172F,FLDPELTNI,88,A02:01,0.987693,0.01,Qa1,0.464993,0.34,0.021126,3.00
3,ACE2,D615Y,True,19,AYQSIKVRI,Ace2,D615Y,AYQSIKVRI,379,A24:02,0.837559,0.04,Kd,0.917959,0.01,0.038557,2.60
4,ACTG1,G366S,True,375,ESSPSIVHR,Actg1,G366S,ESSPSIVHR,711,A33:03,0.909769,0.01,Qa1,0.089522,5.80,0.001162,21.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,ZNF611,D404G,True,1,KVCGTAFTW,Zfp966,D268G,KQCGKAFVR,224,B57:01,0.988189,0.01,Qa1,0.014362,20.00,0.000303,38.00
379,ZNF611,D404G,True,0,KVCGTAFTW,Zfp975,D305G,NQCGKGFAQ,228,B57:01,0.988189,0.01,Qa2,0.001335,33.00,0.000112,47.00
380,ZNF611,D404G,True,0,KVCGTAFTW,Zfp976,D333G,NQCGKGFAQ,229,B57:01,0.988189,0.01,Qa2,0.001335,33.00,0.000112,47.00
381,ZNF611,D404G,True,1,KVCGTAFTW,5430403G16Rik,D330G,YKCGQCGKA,195,B57:01,0.988189,0.01,Kk,0.000284,50.00,0.000108,55.00


In [194]:
# Save the final H2M table (model result plus human and mouse scores).
final_h2m_path = '/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/h2m_neoantigen_result_v3.csv'
l_final.to_csv(final_h2m_path, index=False)

## M2H  

### Load and pre-process data from 2 studies (N = 246)  

In [None]:
# Three supplementary tables: SNVs and indels from li_*.xlsx, and Table S3
# from niknafs.xlsx. `header` is the zero-based row holding the column names.
dms_1 = pd.read_excel(
    '/Users/kexindong/Documents/GitHub/Output/neoantigen/mouse_dms/li_snv.xlsx',
    sheet_name='Sup Data File 3 SNV Oncogenes',
    header=1,
)
dms_2 = pd.read_excel(
    '/Users/kexindong/Documents/GitHub/Output/neoantigen/mouse_dms/li_indels.xlsx',
    sheet_name='Sup Data File 4 INDEL Oncogenes',
    header=1,
)
dms_3 = pd.read_excel(
    '/Users/kexindong/Documents/GitHub/Output/neoantigen/mouse_dms/niknafs.xlsx',
    sheet_name='Table S3',
    header=3,
)

In [None]:
# Record which publication each table came from.
for study_frame, study_label in ((dms_1, 'Li 2020'), (dms_2, 'Li 2020'), (dms_3, 'Niknafs 2019')):
    study_frame['source'] = study_label

In [None]:
# Replace the positions with the coordinates from dms_1_bed.txt BEFORE the
# input columns are copied out. Previously the replacement happened after the
# copy, so on a fresh top-to-bottom run it never reached dms_input_1['start_m'].
# NOTE(review): the 'start' column is presumably the GRCm39 liftover of
# 'Position', row-aligned with dms_1 - confirm.
dms_coor_1 = pd.read_csv('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/dms_1_bed.txt', sep='\t',header=0)
dms_1['Position'] = dms_coor_1['start']
# .copy() makes dms_input_1 an independent frame, so the assignments below do
# not write into a slice of dms_1.
dms_input_1 = dms_1[['Gene symbol','Position','REF','Alt','Sample ID']].copy()
dms_input_1.columns = ['gene_name_m','start_m','ref_seq_m','alt_seq_m','index']
# Single-base substitutions start and end at the same coordinate.
dms_input_1['end_m'] = dms_input_1['start_m']
# The 'Sample ID' values in 'index' are replaced by a running '1_NNNN' identifier.
dms_input_1['index'] = [f'1_{(x+1):04}' for x in range(len(dms_input_1))]
dms_input_1['type_m'] = 'SNP'

In [None]:
# Convert every dms_2 interval from GRCm38 to GRCm39 and collect the new
# start/end values in two parallel lists (one entry per dms_2 row).
# NOTE(review): convert_coordinates is not defined or imported anywhere in the
# visible notebook, so this cell fails on a fresh kernel unless it is defined
# elsewhere - confirm and restore its definition.
species = 'mus_musculus'
input_assembly = 'GRCm38'
output_assembly = 'GRCm39'
list_start, list_end = [],[]
for i, x in dms_2.iterrows():
    # progress counter
    print(f'{i}/{len(dms_2)}')
    region = str(x['Chr'])
    start = x['Start']
    end = x['End']
    # element 0 of the result is used as the new start, element 1 as the new end
    tu = convert_coordinates(species, input_assembly, output_assembly, region, start, end)
    list_start.append(tu[0])
    list_end.append(tu[1])

In [None]:
# Store the converted coordinates on dms_2, then build the h2m input frame.
dms_2['Start'] = list_start
dms_2['End'] = list_end
# .copy() makes dms_input_2 an independent frame, so the assignments below do
# not write into a slice of dms_2; the columns are named once with their final
# mouse ('_m') names instead of being renamed twice.
dms_input_2 = dms_2[['Gene symbol','Start','End','Reference','Alt','Sample ID','Sequence variant type']].copy()
dms_input_2.columns =  ['gene_name_m','start_m','end_m','ref_seq_m','alt_seq_m','index' ,'type_m']
# The variant type from the sheet is replaced by the type h2m derives from the
# reference/alternate sequences.
dms_input_2['type_m'] = [h2m.get_type(x,y) for x,y in zip(dms_input_2['ref_seq_m'], dms_input_2['alt_seq_m'])]
# The 'Sample ID' values in 'index' are replaced by a running '2_NNNN' identifier.
dms_input_2['index'] = [f'2_{(x+1):04}' for x in range(len(dms_input_2))]

In [None]:
# Build the h2m input frame for the third table.
# NOTE(review): list_start / list_end are filled by the conversion loop that
# iterates over dms_2 in the visible notebook. Unless that loop was re-run for
# dms_3 (hidden state), the coordinates assigned below belong to dms_2, or the
# assignment fails on a length mismatch - confirm and make the dms_3
# conversion explicit.
dms_input_3 = dms_3[['Gene Symbol','Position (mm10)','Reference Base','Alternate Base']]
# dms_coor_3 = pd.read_csv('/Users/kexindong/Documents/GitHub/Output/neoantigen/P1xyCbSc9wkWYBBr.bed', sep='\t')
# dms_coor_3.columns = ['chr','start','end']
dms_input_3.columns = ['gene_name_m','start_m','ref_seq_m','alt_seq_m']
dms_input_3['start_m'] = list_start
dms_input_3['end_m'] = list_end
# running '3_NNNN' identifier
dms_input_3['index'] = [f'3_{(x+1):04}' for x in range(len(dms_input_3))]
dms_input_3['type_m'] = 'SNP'

In [None]:
# Re-issue the identifiers with five-digit zero padding ('<table>_NNNNN').
for table_no, input_frame in ((3, dms_input_3), (2, dms_input_2), (1, dms_input_1)):
    input_frame['index'] = [f'{table_no}_{(x+1):05}' for x in range(len(input_frame))]

In [None]:
# Stack the three input tables under a fresh 0..n-1 row index.
dms_input = pd.concat([dms_input_1, dms_input_2, dms_input_3], ignore_index=True)
# dms_input.to_csv('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/m2h_input.csv',index=False)

In [211]:
# Reload the combined input from disk; this replaces the frame assembled in
# the cells above with the previously saved m2h_input.csv.
dms_input = pd.read_csv('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/m2h_input.csv')

In [214]:
# Drop rows whose identifier starts with '4', then save the result.
# NOTE(review): unlike the other to_csv calls in this notebook, this one does
# not pass index=False, so the row index is written as an extra column.
not_table_4 = dms_input['index'].map(lambda tag: tag[0] != '4')
dms_input = dms_input[not_table_4]
dms_input.to_csv('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/m2h_input_v2.csv')

### M2H modeling (N = 137)  

In [None]:
# Mouse-side transcript lookup; only element [0] of the result is kept.
dms = h2m.get_tx_batch(dms_input,'m')[0]

There were rows that could not be processed.


In [None]:
# Query in the mouse-to-human direction ('m2h'); only element [0] is kept.
dms = h2m.query_batch(dms,'m2h')[0]

There were rows that could not be processed.


In [None]:
# Human-side transcript lookup with version 37 (the H2M section passes the
# same value as ver=37); only element [0] is kept.
dms = h2m.get_tx_batch(dms,'h',37)[0]

No error occurs.


In [None]:
# Run the batch model in the mouse-to-human direction. The whole return value
# is kept here; its first element is extracted in the next cell.
# Requires records_* / index_list_* / db_* from the H2M section above.
df_result = h2m.model_batch(dms, records_h, index_list_h, records_m, index_list_m, db_h, db_m, 37, direction='m2h', show_sequence=True)

There were rows that could not be processed.


In [None]:
# First element of the model_batch result (in the H2M section the same
# position is unpacked as the result frame).
# Note: this rebinds `o`, which held the matched 9-mer table in the H2M section.
o = df_result[0]

In [216]:
# Drop rows whose identifier starts with '4', renumber the rows and save.
not_table_4 = o['index'].map(lambda tag: tag[0] != '4')
o = o[not_table_4].reset_index(drop=True)
o.to_csv('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/m2h_output_v2.csv',index=False)

In [336]:
# Reload the saved M2H result. After a CSV round trip, list-valued columns
# come back as strings such as '[244]', which is why the M2H version of
# find_9mers_containing_position parses its position argument.
o = pd.read_csv('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/m2h_output_v2.csv')

In [337]:
len(o[o['status']==True]['index'].unique())

137

In [338]:
# One row per (mouse variant, mouse gene, human transcript); keep rows with
# status == True that have both flank sizes.
df_process = o.drop_duplicates(subset=['HGVSp_m','gene_name_m','tx_id_h']).reset_index(drop=True)
successful = df_process[df_process['status']==True].reset_index(drop=True)
df_final_success = successful.dropna(subset = ['flank_size_left','flank_size_right']).reset_index(drop=True)
# Rows with a missing flank were just dropped, so this fill has nothing left to do.
df_final_success['flank_size_left'] = df_final_success['flank_size_left'].fillna('0aa')
# Flank sizes are strings with a two-character suffix (e.g. '0aa'); sum both sides.
df_final_success['flank_size'] = [
    int(left[:-2]) + int(right[:-2])
    for left, right in zip(df_final_success['flank_size_left'], df_final_success['flank_size_right'])
]

In [227]:
def find_9mers_containing_position(sequence, position):
    """
    List the nine 9-residue windows of the translated sequence that cover one residue.

    Args:
    sequence: Sequence handed to h2m.Translate; the windows are cut from the
        string form of the translation (presumably coding DNA -> protein).
    position (str | list): A list of indices, or its string form as read back
        from CSV (e.g. "[244]"). Only the first element is used, as a 0-based
        index into the translated string.

    Returns:
    list: Nine strings. Entry i is the window starting at position[0] - 8 + i,
        or "" when that window would start before the beginning or run past
        the end of the translated string.

    Raises:
    ValueError: If `position` is neither a list nor a string that evaluates
        to a list.
    """
    def string_to_list(string):
        """Return `string` as a list, parsing it with ast.literal_eval if needed."""
        # An in-memory list needs no parsing (it used to be rejected).
        if isinstance(string, list):
            return string
        try:
            result = ast.literal_eval(string)
        except Exception as err:
            # `Exception` rather than a bare except: KeyboardInterrupt and
            # SystemExit are no longer converted into a ValueError.
            raise ValueError("Invalid string format for list conversion") from err
        # Checked outside the try block so this specific message is not
        # swallowed and replaced by the generic one above.
        if not isinstance(result, list):
            raise ValueError("The provided string does not evaluate to a list")
        return result

    sequence = str(h2m.Translate(sequence))
    position = string_to_list(position)[0]

    last_valid_start = len(sequence) - 9
    # Window i starts at position - 8 + i, so all nine windows cover `position`.
    return [
        sequence[start:start + 9] if 0 <= start <= last_valid_start else ""
        for start in range(position - 8, position + 1)
    ]


In [228]:
# Rows lacking either protein index cannot be sliced into 9-mers; drop them.
required_idx_columns = ['human_new_p_idx','mouse_new_p_idx']
with_both_indices = df_final_success.dropna(subset=required_idx_columns)
df_final_success = with_both_indices.reset_index(drop=True)

In [229]:
def _m2h_row_9mers(row):
    """Collect the human and mouse 9-mer windows for one modelled row."""
    return {
        'human_9mers': find_9mers_containing_position(row['new_seq_h'], row['human_new_p_idx']),
        'mouse_9mers': find_9mers_containing_position(row['new_seq_m'], row['mouse_new_p_idx'])
    }


df_final_success['human_mouse_9mers'] = df_final_success.apply(_m2h_row_9mers, axis=1)

In [230]:
def expand_row(row):
    """Unpack one variant row into a long-format frame, one line per 9-mer slot.

    ``row['human_mouse_9mers']`` must hold the ``'human_9mers'`` and
    ``'mouse_9mers'`` lists; ``row['index']`` is repeated on every line so each
    peptide pair can be traced back to its variant.
    """
    paired = row['human_mouse_9mers']
    columns = {
        'human_9mers': paired['human_9mers'],
        'mouse_9mers': paired['mouse_9mers'],
        'index': row['index'],
    }
    return pd.DataFrame(columns)

In [231]:
# Stack the per-variant long-format frames into one table of human/mouse 9-mer pairs.
per_variant_frames = [expand_row(record) for _, record in df_final_success.iterrows()]
expanded_rows = pd.concat(per_variant_frames).reset_index(drop=True)

In [232]:
# Flag pairs whose human and mouse 9-mers are exactly the same string.
expanded_rows['identical'] = expanded_rows['human_9mers'].eq(expanded_rows['mouse_9mers'])

In [233]:
# Drop every pair in which either peptide contains '*'
# (presumably the stop symbol emitted by the translation step -- confirm).
star_in_human = expanded_rows['human_9mers'].str.contains('*', regex=False)
star_in_mouse = expanded_rows['mouse_9mers'].str.contains('*', regex=False)
expanded_rows = expanded_rows[~(star_in_human | star_in_mouse)].reset_index(drop=True)

In [234]:
# Drop pairs where either side is the empty placeholder left for out-of-range windows.
both_present = expanded_rows['human_9mers'].ne('') & expanded_rows['mouse_9mers'].ne('')
expanded_rows = expanded_rows[both_present].reset_index(drop=True)

In [247]:
# Display the per-variant table, which now carries the 'human_mouse_9mers' column.
df_final_success

Unnamed: 0,gene_name_m,gene_id_m,tx_id_m,chr_m,exon_num_m,strand_m,matcm,start_m,end_m,ref_seq_m,...,human_tx_idx_ori,human_p_idx_ori,human_new_p_idx_ori,dist_h,new_seq_h,human_tx_idx,human_p_idx,human_new_p_idx,index,human_mouse_9mers
0,Trp53,ENSMUSG00000059552.14,ENSMUST00000108658.10,chr11,10,+,True,69480028,69480028,G,...,[733],[244],[244],,ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTC...,[733],[244],[244],1_00001,"{'human_9mers': ['MCNSSCMGA', 'CNSSCMGAM', 'NS..."
1,Trp53,ENSMUSG00000059552.14,ENSMUST00000108658.10,chr11,10,+,True,69480040,69480040,G,...,[745],[248],[248],,ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTC...,"[744, 745]","[248, 248]",[248],1_00002,"{'human_9mers': ['SCMGGMNRL', 'CMGGMNRLP', 'MG..."
2,Notch1,ENSMUSG00000026923.16,ENSMUST00000028288.5,chr2,34,-,True,26356613,26356613,A,...,[5032],[1677],[1677],,ATGCCGCCGCTCCTGGCGCCCCTGCTCTGCCTGGCGCTGCTGCCCG...,[5032],[1677],[1677],1_00004,"{'human_9mers': ['DVRGSIVYP', 'VRGSIVYPE', 'RG..."
3,Notch1,ENSMUSG00000026923.16,ENSMUST00000028288.5,chr2,34,-,True,26371174,26371174,T,...,[910],[303],[303],,ATGCCGCCGCTCCTGGCGCCCCTGCTCTGCCTGGCGCTGCTGCCCG...,[910],[303],[303],1_00005,"{'human_9mers': ['VDECQLMPS', 'DECQLMPSA', 'EC..."
4,Notch1,ENSMUSG00000026923.16,ENSMUST00000028288.5,chr2,34,-,True,26371195,26371195,T,...,[889],[296],[296],,ATGCCGCCGCTCCTGGCGCCCCTGCTCTGCCTGGCGCTGCTGCCCG...,[889],[296],[296],1_00006,"{'human_9mers': ['GQYCTEDVG', 'QYCTEDVGE', 'YC..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,Bcl9,ENSMUSG00000038256.16,ENSMUST00000046521.14,chr3,7,-,True,97115926,97115926,G,...,[2769],[923],[923],,ATGCATTCCAGTAACCCTAAAGTGAGGAGCTCTCCATCAGGAAACA...,[2769],[923],[923],3_00153,"{'human_9mers': ['ASPVHLKSS', 'SPVHLKSSS', 'PV..."
145,Sbno2,ENSMUSG00000035673.11,ENSMUST00000219260.2,chr10,31,-,True,79896637,79896637,G,...,[2613],[871],[871],,ATGCTTGCAGTGGGGCCCGCCATGGACAGGGATTACCCGCAGCATG...,"[2613, 2614, 2615]","[871, 871, 871]",[871],3_00154,"{'human_9mers': ['RFASIVAKW', 'FASIVAKWL', 'AS..."
146,Mamdc4,ENSMUSG00000026941.17,ENSMUST00000095117.10,chr2,29,-,True,25460022,25460022,C,...,[224],[74],[74],,ATGCCTCTGTCCAGCCACCTGCTGCCCGCCTTGGTCCTGTTCCTGG...,[224],[74],[74],3_00155,"{'human_9mers': ['DFEQDPCGC', 'FEQDPCGCR', 'EQ..."
147,Srxn1,ENSMUSG00000032802.9,ENSMUST00000041500.8,chr2,2,+,True,151947715,151947715,C,...,[54],[18],[18],,ATGGGGCTGCGTGCAGGAGGAACGCTGGGCAGGGCCGGCGCGGGTC...,[54],[18],[18],3_00156,"{'human_9mers': ['RAGAGRGAA', 'AGAGRGAAE', 'GA..."


In [248]:
# Preview one row per distinct human 9-mer (first occurrence kept).
# The result is only displayed; expanded_rows itself is not modified.
expanded_rows.drop_duplicates('human_9mers')

Unnamed: 0,human_9mers,mouse_9mers,index,identical
0,MCNSSCMGA,MCNSSCMGA,1_00001,True
1,CNSSCMGAM,CNSSCMGAM,1_00001,True
2,NSSCMGAMN,NSSCMGAMN,1_00001,True
3,SSCMGAMNR,SSCMGAMNR,1_00001,True
4,SCMGAMNRR,SCMGAMNRR,1_00001,True
...,...,...,...,...
1280,DSESQKEES,DSESQKEES,3_00157,True
1281,SESQKEESA,SESQKEESA,3_00157,True
1282,ESQKEESAE,ESQKEESAE,3_00157,True
1283,SQKEESAEE,SQKEESAEE,3_00157,True


### Generate NetMHCpan inputs (N = 1,231 unique human 9-mers / 1,089 unique mouse 9-mers)

In [252]:
# Number of distinct human 9-mers.
expanded_rows['human_9mers'].nunique(dropna=False)

1231

In [330]:
# Number of distinct mouse 9-mers.
expanded_rows['mouse_9mers'].nunique(dropna=False)

1089

In [253]:
# Write the peptide pairs as FASTA-style records ('>index' line, then the 9-mer),
# human peptides to one file and mouse peptides to the other.
# Both files are opened once instead of once per row.
# NOTE(review): append mode is kept from the original, so re-running this cell
# appends a second copy of every record -- delete the files first, or switch to 'w'
# if nothing else writes to them.
with open('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/m2h_input_h_sub_3.txt', 'a') as human_file, \
     open('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/m2h_input_m_sub_3.txt', 'a') as mouse_file:
    for index, x in expanded_rows.iterrows():
        k = x['index']
        l = x['human_9mers']
        human_file.write(f'>{k}\n{l}\n')
        l = x['mouse_9mers']
        mouse_file.write(f'>{k}\n{l}\n')

### Human results (top score is selected for each peptide among different MHC alleles)

In [267]:
# Read every per-batch human prediction table (tab-separated, '#' lines skipped)
# and stack them into a single frame.
result_h_i = pd.concat([
    pd.read_csv(f'/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/m2h_output_v3/m2h_output_h_{x}.txt', header=0, sep='\t', comment="#")
    for x in ['A1','A2','B1','B2','B3','B4','C1','C2','EG']
]).reset_index(drop=True)

In [268]:
# Cache the combined human prediction table as a single CSV (row index not written).
result_h_i.to_csv('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/m2h_output_v3/m2h_output_h_v3.csv',index=False)

In [316]:
# Reload the combined human prediction table from the cached CSV.
result_h_i = pd.read_csv('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/m2h_output_v3/m2h_output_h_v3.csv')

In [317]:
# Shorten allele names: drop the first four characters, then remove every '*'.
# Not idempotent -- running this twice strips four more characters, so reload
# result_h_i from the CSV before re-running.
result_h_i['allele'] = [str(name[4:]).replace('*', '') for name in result_h_i['allele']]

In [318]:
# Rank all human predictions by score (highest first) and keep the first, i.e. the
# best-scoring, row for each peptide across alleles.
ranked_h = result_h_i.sort_values(by='score', ascending=False).reset_index(drop=True)
result_h_i_unique = ranked_h.drop_duplicates(subset='peptide')

In [340]:
# Attach the best human allele/score to every peptide pair.
# No `on=` is given, so pandas joins on all column names the two frames share;
# after the rename that is expected to be just 'human_9mers' -- verify against df_output.
human_best = result_h_i_unique[['allele','peptide','score','percentile_rank']].rename(columns={
    'peptide':'human_9mers',
    'allele':'HLA_alleles',
    'score':'score_h',
    'percentile_rank':'percentile_rank_h',
})
g = pd.merge(df_output, human_best, how='left')

In [341]:
# Remove fully duplicated rows and renumber the index from 0.
g = g.drop_duplicates(ignore_index=True)

### Mouse results (top score is selected for each peptide among different MHC alleles)

In [352]:
# Read both mouse prediction tables (tab-separated, '#' lines skipped)
# and stack them into a single frame.
result_m_i = pd.concat([
    pd.read_csv(f'/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/m2h_output_v3/m2h_output_m_{x}.txt', header=0, sep='\t', comment="#")
    for x in ['1','2']
]).reset_index(drop=True)

In [280]:
# Cache the combined mouse prediction table as a single CSV (row index not written).
result_m_i.to_csv('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/m2h_output_v3/m2h_output_m_v3.csv',index=False)

In [281]:
# Reload the combined mouse prediction table from the cached CSV.
result_m_i = pd.read_csv('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/m2h_output_v3/m2h_output_m_v3.csv')

In [353]:
# Shorten allele names: drop the first four characters, then remove every '*'.
# Not idempotent -- running this twice strips four more characters, so reload
# result_m_i before re-running.
result_m_i['allele'] = [str(name[4:]).replace('*', '') for name in result_m_i['allele']]

In [354]:
# Rank all mouse predictions by score (highest first) and keep the first, i.e. the
# best-scoring, row for each peptide across alleles.
ranked_m = result_m_i.sort_values(by='score', ascending=False).reset_index(drop=True)
result_m_i_unique = ranked_m.drop_duplicates(subset='peptide')

In [355]:
# Attach the best mouse allele/score to every peptide pair.
# No `on=` is given, so pandas joins on all shared column names; after the rename
# that is expected to be just 'mouse_9mers'.
mouse_best = result_m_i_unique[['allele','peptide','score','percentile_rank']].rename(columns={
    'peptide':'mouse_9mers',
    'allele':'H2_allele',
    'percentile_rank':'percentile_rank_m',
    'score':'score_m',
})
l_both_best = pd.merge(g, mouse_best, how='left')

In [356]:
# Display the peptide pairs with their best human and best mouse allele scores.
l_both_best

Unnamed: 0,gene_name_h,HGVSp_h,status,flank_size,human_9mers,gene_name_m,HGVSp_m,mouse_9mers,index,HLA_alleles,score_h,percentile_rank_h,H2_allele,score_m,percentile_rank_m
0,ABCB9,Q520delinsQ,True,115,RPHTQVLQN,Abcb9,Q516delinsQ,RPHTQVLQN,3_00022,B55:01,0.058610,1.70,Dq,0.008150,14.00
1,ABCB9,Q520delinsQ,True,115,RTRPHTQVL,Abcb9,Q516delinsQ,RTRPHTQVL,3_00022,A30:01,0.925992,0.01,Qa1,0.804364,0.01
2,ABCB9,Q520delinsQ,True,115,TRPHTQVLQ,Abcb9,Q516delinsQ,TRPHTQVLQ,3_00022,B27:05,0.009969,4.40,Qa1,0.010245,24.00
3,ABCB9,Q520delinsQ,True,115,PHTQVLQNV,Abcb9,Q516delinsQ,PHTQVLQNV,3_00022,B38:01,0.212903,0.48,Kd,0.025846,2.40
4,ABCB9,Q520delinsQ,True,115,HTQVLQNVS,Abcb9,Q516delinsQ,HTQVLQNVS,3_00022,A30:01,0.007771,9.70,Qa1,0.003275,36.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3986,ZNF878,P23delinsP,True,0,LDPSQKNLC,Zfp455,P24delinsP,LEPAQWDLY,3_00036,B37:01,0.005016,6.90,Qa1,0.059734,8.40
3987,ZNF878,P23delinsP,True,0,PVQRNLYQD,Zfp455,P24delinsP,PAQWDLYRD,3_00036,A30:01,0.000029,77.00,Kq,0.000031,83.00
3988,ZNF878,P23delinsP,True,0,LDPVQRNLY,Zfp455,P24delinsP,LEPAQWDLY,3_00036,A01:01,0.079631,1.20,Qa1,0.059734,8.40
3989,ZNF878,P23delinsP,True,0,REEWALLGP,Zfp455,P24delinsP,PEEWECLEP,3_00036,B50:01,0.295977,0.32,Qa2,0.002958,24.00


### Mouse Kb/Db results (most common allele: Kb/Db)  

In [357]:
# Restrict the mouse predictions to the Kb and Db alleles, rank by score (highest
# first) and keep the best-scoring row per peptide.
# The index reset before sorting is omitted: sorting works on values and the index
# is reset afterwards anyway, so the outcome is the same.
kb_db_rows = result_m_i[result_m_i['allele'].isin(['Kb','Db'])]
kb_db_ranked = kb_db_rows.sort_values(by='score', ascending=False).reset_index(drop=True)
result_m_i_kb = kb_db_ranked.drop_duplicates(subset='peptide')

In [358]:
# Add the best Kb/Db score for each mouse peptide alongside the all-allele best.
# No `on=` is given, so pandas joins on all shared column names; after the rename
# that is expected to be just 'mouse_9mers'.
kbdb_best = result_m_i_kb[['peptide','score','percentile_rank']].rename(columns={
    'peptide':'mouse_9mers',
    'percentile_rank':'percentile_rank_kbdb',
    'score':'score_kbdb',
})
l_final = pd.merge(l_both_best, kbdb_best, how='left')

In [359]:
# Display the final table: peptide pairs with human, mouse and Kb/Db scores.
l_final

Unnamed: 0,gene_name_h,HGVSp_h,status,flank_size,human_9mers,gene_name_m,HGVSp_m,mouse_9mers,index,HLA_alleles,score_h,percentile_rank_h,H2_allele,score_m,percentile_rank_m,score_kbdb,percentile_rank_kbdb
0,ABCB9,Q520delinsQ,True,115,RPHTQVLQN,Abcb9,Q516delinsQ,RPHTQVLQN,3_00022,B55:01,0.058610,1.70,Dq,0.008150,14.00,0.000145,50.00
1,ABCB9,Q520delinsQ,True,115,RTRPHTQVL,Abcb9,Q516delinsQ,RTRPHTQVL,3_00022,A30:01,0.925992,0.01,Qa1,0.804364,0.01,0.176445,0.59
2,ABCB9,Q520delinsQ,True,115,TRPHTQVLQ,Abcb9,Q516delinsQ,TRPHTQVLQ,3_00022,B27:05,0.009969,4.40,Qa1,0.010245,24.00,0.000825,24.00
3,ABCB9,Q520delinsQ,True,115,PHTQVLQNV,Abcb9,Q516delinsQ,PHTQVLQNV,3_00022,B38:01,0.212903,0.48,Kd,0.025846,2.40,0.001880,16.00
4,ABCB9,Q520delinsQ,True,115,HTQVLQNVS,Abcb9,Q516delinsQ,HTQVLQNVS,3_00022,A30:01,0.007771,9.70,Qa1,0.003275,36.00,0.000603,25.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3986,ZNF878,P23delinsP,True,0,LDPSQKNLC,Zfp455,P24delinsP,LEPAQWDLY,3_00036,B37:01,0.005016,6.90,Qa1,0.059734,8.40,0.005304,7.70
3987,ZNF878,P23delinsP,True,0,PVQRNLYQD,Zfp455,P24delinsP,PAQWDLYRD,3_00036,A30:01,0.000029,77.00,Kq,0.000031,83.00,0.000004,98.00
3988,ZNF878,P23delinsP,True,0,LDPVQRNLY,Zfp455,P24delinsP,LEPAQWDLY,3_00036,A01:01,0.079631,1.20,Qa1,0.059734,8.40,0.005304,7.70
3989,ZNF878,P23delinsP,True,0,REEWALLGP,Zfp455,P24delinsP,PEEWECLEP,3_00036,B50:01,0.295977,0.32,Qa2,0.002958,24.00,0.000003,95.00


In [360]:
# Save the final table as CSV (row index not written).
l_final.to_csv('/Users/kexindong/Documents/GitHub/Output/neoantigen/final_result/m2h_neoantigen_result_v3_.csv',index=False)