In [3]:
#from Bio import SeqIO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', 100)
# None turns off cell-text truncation. The former sentinel -1 was deprecated
# in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

### Combine peptide coverage with binding affinities

In [5]:
# Merged annotation + affinity table written by main().
OUT = '../potential_neoantigens.txt'
# Peptide coverage annotation (output from variant_peptides.sh).
ANNOTATION = '../peptide_coverage.tsv'

# Predicted MHC I binding affinity for the expressed HLA alleles, one table each.
A0103 = '../A0103.tsv'
A6601 = '../A6601.tsv'
B0801 = '../B0801.tsv'

# Allele label -> affinity table path. main() iterates this mapping in
# insertion order and passes each label to getAffinity() as the column prefix.
SAMPLES = dict(
    A0103=A0103,
    A6601=A6601,
    B0801=B0801,
)

In [6]:
def importAnnot(path):
    """Read the tab-separated annotation table stored at ``path``.

    The first line of the file provides the column names, and no column is
    used as the row index, so rows receive the default integer index.

    Returns the parsed ``pandas.DataFrame``.
    """
    read_options = dict(sep='\t', header=0, index_col=False)
    return pd.read_csv(path, **read_options)

In [7]:
def getAffinity(path, annot, name):
    """Load one allele's predicted binding affinities, keyed by peptide.

    Parameters
    ----------
    path : str
        Tab-separated prediction table. Column names are taken from the
        second line of the file (the first line is skipped) and must include
        ``Peptide`` and ``nM``.
    annot : object
        Not used; retained so existing calls keep working.
    name : str
        Allele label, used as the prefix of the affinity column.

    Returns
    -------
    pandas.DataFrame
        Exactly two columns, in this order: ``peptide`` and ``<name>_nM``.
    """
    affinity = pd.read_csv(
        path,
        sep='\t',
        header=1,
        usecols=['Peptide', 'nM'])
    # usecols keeps the column order found in the file, so the columns are
    # selected and renamed by label rather than by position; a table that
    # lists nM before Peptide would otherwise be mislabelled.
    affinity = affinity[['Peptide', 'nM']].rename(
        columns={'Peptide': 'peptide', 'nM': name + '_nM'})
    return affinity

In [8]:
from functools import reduce


In [9]:
def main():
    """Join the peptide annotation with every allele's affinity table.

    Reads ``ANNOTATION`` and each table listed in ``SAMPLES``, merges them
    one after another on the ``peptide`` column (pandas' default merge type),
    writes the result to ``OUT`` as a tab-separated file with a header row
    and no index column, and returns the merged frame.
    """
    annot = importAnnot(ANNOTATION)
    # The annotation comes first so it is the left-most table in the fold.
    tables = [annot] + [
        getAffinity(path, annot, sample) for sample, path in SAMPLES.items()
    ]

    def join_on_peptide(left, right):
        return pd.merge(left, right, on='peptide')

    df_final = reduce(join_on_peptide, tables)
    df_final.to_csv(OUT, sep='\t', header=True, index=False)
    return df_final

In [10]:
out= main()

In [11]:
out.head(3)

Unnamed: 0,peptide,mut_index_len,orf_types,peptide_type,variant_ORF_ids,variant_ORF_id,ORF_ID,variant_qc,ORF_type,prot_mut,Mel11_1_TPM,Mel11_3_TPM,Mel11_4_TPM,mut_loc,mel11.1_wt_cov,mel11.1_mut_cov,mel11.3_wt_cov,mel11.3_mut_cov,mel11.4_wt_cov,mel11.4_mut_cov,13240-011.Nor.WES_wt_cov,13240-011.Nor.WES_mut_cov,13240-011.Tum.WES_wt_cov,13240-011.Tum.WES_mut_cov,13240-011.Tum.RNA_wt_cov,13240-011.Tum.RNA_mut_cov,Mel11_PBMCs_WGS_wt_cov,Mel11_PBMCs_WGS_mut_cov,cellline_WGS_wt_cov,cellline_WGS_mut_cov,wt_rpf_sum,mut_rpf_sum,A0103_nM,A6601_nM,B0801_nM
0,FFWSACSRD,3,"['canonical', 'within', 'three_prime_overlap']",annotated,"['ADAMTS6|ENST00000381055.7_1_5:64447665-64769500:-|canonical|5|64595854|G|A', 'ADAMTS6|ENST00000470597.5_1_5:64558740-64769500:-|within|5|64595854|G|A', 'ADAMTS6|ENST00000381052.8_1_5:64593037-64747992:-|three_prime_overlap|5|64595854|G|A']",ADAMTS6|ENST00000381055.7_1_5:64447665-64769500:-|canonical|5|64595854|G|A,ADAMTS6|ENST00000381055.7_1_5:64447665-64769500:-|canonical,PASS,canonical,S443F,0.092088,0.060707,0.050803,5_64595854_G_A,0.0,0.0,0.0,0.0,0.0,0.0,229.0,0.0,181.0,127.0,24.0,14.0,84.0,0.0,18.0,51.0,0.0,0.0,40805.9375,38017.832,30527.5684
1,FFWSACSRD,3,"['canonical', 'within', 'three_prime_overlap']",annotated,"['ADAMTS6|ENST00000381055.7_1_5:64447665-64769500:-|canonical|5|64595854|G|A', 'ADAMTS6|ENST00000470597.5_1_5:64558740-64769500:-|within|5|64595854|G|A', 'ADAMTS6|ENST00000381052.8_1_5:64593037-64747992:-|three_prime_overlap|5|64595854|G|A']",ADAMTS6|ENST00000470597.5_1_5:64558740-64769500:-|within|5|64595854|G|A,ADAMTS6|ENST00000470597.5_1_5:64558740-64769500:-|within,PASS,within,S443F,0.016841,0.044407,0.04054,5_64595854_G_A,0.0,0.0,0.0,0.0,0.0,0.0,229.0,0.0,181.0,127.0,24.0,14.0,84.0,0.0,18.0,51.0,0.0,0.0,40805.9375,38017.832,30527.5684
2,FFWSACSRD,3,"['canonical', 'within', 'three_prime_overlap']",annotated,"['ADAMTS6|ENST00000381055.7_1_5:64447665-64769500:-|canonical|5|64595854|G|A', 'ADAMTS6|ENST00000470597.5_1_5:64558740-64769500:-|within|5|64595854|G|A', 'ADAMTS6|ENST00000381052.8_1_5:64593037-64747992:-|three_prime_overlap|5|64595854|G|A']",ADAMTS6|ENST00000381052.8_1_5:64593037-64747992:-|three_prime_overlap|5|64595854|G|A,ADAMTS6|ENST00000381052.8_1_5:64593037-64747992:-|three_prime_overlap,PASS,three_prime_overlap,S145F,0.0,0.0,0.0,5_64595854_G_A,0.0,0.0,0.0,0.0,0.0,0.0,229.0,0.0,181.0,127.0,24.0,14.0,84.0,0.0,18.0,51.0,0.0,0.0,40805.9375,38017.832,30527.5684


In [12]:
out_dropna = out.dropna(axis=0, how='any')

In [17]:
out_dedup = out_dropna.drop_duplicates(subset='peptide')

In [12]:
out_dedup.columns

Index(['peptide', 'mut_index_len', 'orf_types', 'peptide_type',
       'variant_ORF_ids', 'variant_ORF_id', 'ORF_ID', 'variant_qc', 'ORF_type',
       'prot_mut', 'Mel11_1_TPM', 'Mel11_3_TPM', 'Mel11_4_TPM', 'mut_loc',
       'mel11.1_wt_cov', 'mel11.1_mut_cov', 'mel11.3_wt_cov',
       'mel11.3_mut_cov', 'mel11.4_wt_cov', 'mel11.4_mut_cov',
       '13240-011.Nor.WES_wt_cov', '13240-011.Nor.WES_mut_cov',
       '13240-011.Tum.WES_wt_cov', '13240-011.Tum.WES_mut_cov',
       '13240-011.Tum.RNA_wt_cov', '13240-011.Tum.RNA_mut_cov',
       'Mel11_PBMCs_WGS_wt_cov', 'Mel11_PBMCs_WGS_mut_cov',
       'cellline_WGS_wt_cov', 'cellline_WGS_mut_cov', 'wt_rpf_sum',
       'mut_rpf_sum', 'A0103_nM', 'A6601_nM', 'B0801_nM'],
      dtype='object')

In [19]:
def sum_cov(row):
    """Add coverage totals over the three ``mel11.*`` samples to ``row``.

    Each per-sample ``<sample>_wt_cov`` / ``<sample>_mut_cov`` value is
    truncated to ``int`` before summing. The totals are stored under
    ``ribo_wt_cov`` and ``ribo_mut_cov`` on the row that was passed in, and
    that same row is returned.
    """
    samples = ('mel11.1', 'mel11.3', 'mel11.4')
    for kind in ('wt', 'mut'):
        row['ribo_' + kind + '_cov'] = sum(
            int(row[sample + '_' + kind + '_cov']) for sample in samples)
    return row

In [20]:
out_dropna = out_dropna.apply(sum_cov, axis=1)

In [21]:
out_dedup = out_dedup.apply(sum_cov, axis=1)

In [25]:
# out_dedup_cov_mow = out_dedup[
#           ((out_dedup['ribo_mut_cov'] > minMut) &
#         (out_dedup['MoW'] > MoW))]

In [23]:
out_dropna['TPM'] = out_dropna.loc[:,['Mel11_1_TPM','Mel11_3_TPM','Mel11_4_TPM']].mean(axis=1)

In [24]:
out_dedup['TPM'] = out_dedup.loc[:,['Mel11_1_TPM','Mel11_3_TPM','Mel11_4_TPM']].mean(axis=1)

In [37]:
def MoW(row):
    """Store the mutant share of the summed coverage in ``row['MoW']``.

    The value is ``ribo_mut_cov / (ribo_wt_cov + ribo_mut_cov)``, with both
    counts truncated to ``int`` first. When the total is not positive the
    value is 0. The row that was passed in is updated and returned.

    NOTE: a later cell rebinds the name ``MoW`` to a float threshold, so this
    function has to be applied before that cell runs.
    """
    wt = int(row['ribo_wt_cov'])
    mut = int(row['ribo_mut_cov'])
    total = wt + mut
    row['MoW'] = mut / total if total > 0 else 0
    return row

In [38]:
out_dropna = out_dropna.apply(MoW, axis=1)

In [39]:
out_dedup = out_dedup.apply(MoW, axis=1)

In [32]:
out_dedup.to_csv('/ahg/regevdata/projects/Ribo-seq/MHC-I/variants/mel11/mel11_snvs/allCan/snv_coverage/potential_neoantigens_dedup.tsv', header=True, index=False, sep='\t')

In [33]:
out_dedup.head()

Unnamed: 0,peptide,mut_index_len,orf_types,peptide_type,variant_ORF_ids,variant_ORF_id,ORF_ID,variant_qc,ORF_type,prot_mut,Mel11_1_TPM,Mel11_3_TPM,Mel11_4_TPM,mut_loc,mel11.1_wt_cov,mel11.1_mut_cov,mel11.3_wt_cov,mel11.3_mut_cov,mel11.4_wt_cov,mel11.4_mut_cov,13240-011.Nor.WES_wt_cov,13240-011.Nor.WES_mut_cov,13240-011.Tum.WES_wt_cov,13240-011.Tum.WES_mut_cov,13240-011.Tum.RNA_wt_cov,13240-011.Tum.RNA_mut_cov,Mel11_PBMCs_WGS_wt_cov,Mel11_PBMCs_WGS_mut_cov,cellline_WGS_wt_cov,cellline_WGS_mut_cov,wt_rpf_sum,mut_rpf_sum,A0103_nM,A6601_nM,B0801_nM,ribo_wt_cov,ribo_mut_cov,TPM,MoW
0,FFWSACSRD,3,"['canonical', 'within', 'three_prime_overlap']",annotated,"['ADAMTS6|ENST00000381055.7_1_5:64447665-64769500:-|canonical|5|64595854|G|A', 'ADAMTS6|ENST00000470597.5_1_5:64558740-64769500:-|within|5|64595854|G|A', 'ADAMTS6|ENST00000381052.8_1_5:64593037-64747992:-|three_prime_overlap|5|64595854|G|A']",ADAMTS6|ENST00000381055.7_1_5:64447665-64769500:-|canonical|5|64595854|G|A,ADAMTS6|ENST00000381055.7_1_5:64447665-64769500:-|canonical,PASS,canonical,S443F,0.092088,0.060707,0.050803,5_64595854_G_A,0.0,0.0,0.0,0.0,0.0,0.0,229.0,0.0,181.0,127.0,24.0,14.0,84.0,0.0,18.0,51.0,0.0,0.0,40805.9375,38017.832,30527.5684,0,0,0.067866,0.0
3,NRFYMNSQG,1,['canonical'],annotated,['AC084219.2|ENST00000591793.1_1_19:44580542-44591410:+|canonical|19|44590849|G|C'],AC084219.2|ENST00000591793.1_1_19:44580542-44591410:+|canonical|19|44590849|G|C,AC084219.2|ENST00000591793.1_1_19:44580542-44591410:+|canonical,PASS,canonical,K406N,0.011564,0.0,0.0,19_44590849_G_C,0.0,0.0,0.0,0.0,0.0,1.0,223.0,0.0,190.0,74.0,18.0,0.0,60.0,0.0,13.0,36.0,0.0,1.0,44973.6172,39099.9727,21336.5156,0,1,0.003855,1.0
4,PDEDWLSPHP,1,['five_prime'],nuORF,['ACSM3|ENST00000501740.6_1_16:20685933-20686095:+|five_prime|16|20686058|G|A'],ACSM3|ENST00000501740.6_1_16:20685933-20686095:+|five_prime|16|20686058|G|A,ACSM3|ENST00000501740.6_1_16:20685933-20686095:+|five_prime,PASS,five_prime,G42E,0.0,0.0,0.0,16_20686058_G_A,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0,60.0,0.0,32.0,12.0,0.0,0.0,46492.582,47306.4766,46473.9648,0,0,0.0,0.0
5,TVVFPMLNL,1,['canonical'],annotated,['OR9G1|ENST00000312153.1_1_11:56467863-56468778:+|canonical|11|56468407|C|T'],OR9G1|ENST00000312153.1_1_11:56467863-56468778:+|canonical|11|56468407|C|T,OR9G1|ENST00000312153.1_1_11:56467863-56468778:+|canonical,PASS,canonical,P182S,0.0,0.0,0.0,11_56468407_C_T,0.0,0.0,0.0,0.0,0.0,0.0,361.0,0.0,340.0,59.0,0.0,0.0,132.0,1.0,85.0,40.0,0.0,0.0,31539.8633,1384.6987,23185.748,0,0,0.0,0.0
6,KTFLETSNST,1,['canonical'],annotated,['TRMT13|ENST00000370141.6_1_1:100598724-100614373:+|canonical|1|100613768|C|T'],TRMT13|ENST00000370141.6_1_1:100598724-100614373:+|canonical|1|100613768|C|T,TRMT13|ENST00000370141.6_1_1:100598724-100614373:+|canonical,PASS,canonical,S379F,0.071284,0.070489,0.064351,1_100613768_C_T,0.0,0.0,0.0,0.0,0.0,0.0,580.0,0.0,350.0,176.0,212.0,142.0,72.0,0.0,0.0,42.0,0.0,0.0,35871.0859,23690.8906,34287.5,0,0,0.068708,0.0


#### Filter the dedup list of neoantigens to keep only those that are supported by Ribo-seq reads, regardless of their MHC I binding predictions.

#### Keep mut_locs that are supported by Ribo-seq

In [80]:
MoW = 0.15
minMut = 9

In [81]:
out_dropna_pass = out_dropna[((out_dropna['ribo_mut_cov'] > minMut) & (out_dropna['MoW'] > MoW))]

In [84]:
mut_loc_counts_pass = pd.DataFrame(out_dropna_pass.groupby(['mut_loc','peptide_type']).size())

In [86]:
mut_locs_pass = list(set(mut_loc_counts_pass.index.get_level_values(0)))

In [87]:
len(mut_locs_pass)

217

In [90]:
# Label every passing mutation site by which peptide types occur at it:
# 'both', 'nuORF' only, or 'annotated' only. A site with neither type is
# left without a label.
for i in mut_locs_pass:
    types_at_site = mut_loc_counts_pass.loc[i,:].index
    has_nuorf = 'nuORF' in types_at_site
    has_annotated = 'annotated' in types_at_site
    if has_nuorf and has_annotated:
        label = 'both'
    elif has_nuorf:
        label = 'nuORF'
    elif has_annotated:
        label = 'annotated'
    else:
        continue
    mut_loc_counts_pass.loc[i,'category'] = label

In [91]:
mut_loc_counts_pass

Unnamed: 0_level_0,Unnamed: 1_level_0,0,category
mut_loc,peptide_type,Unnamed: 2_level_1,Unnamed: 3_level_1
10_129902338_G_A,annotated,38,annotated
10_22048133_T_C,annotated,19,annotated
10_46268741_G_A,annotated,114,annotated
10_50714025_C_G,annotated,47,annotated
10_5498922_C_T,annotated,57,annotated
10_79397445_C_T,annotated,256,both
10_79397445_C_T,nuORF,310,both
10_97393314_G_A,nuORF,12,nuORF
10_98743431_A_G,annotated,19,annotated
11_16838504_G_A,annotated,95,annotated


In [159]:
mut_loc_counts_pass_unique = mut_loc_counts_pass.groupby("category").agg(lambda x: x.index.get_level_values(0).nunique())
mut_loc_counts_pass_unique

Unnamed: 0_level_0,0
category,Unnamed: 1_level_1
annotated,131
both,39
nuORF,47


In [93]:
sum(mut_loc_counts_pass.groupby("category").agg(lambda x: x.index.get_level_values(0).nunique())[0])

217

In [94]:
len(set(mut_loc_counts_pass.index.get_level_values(0)))

217

In [95]:
47 / 217

0.21658986175115208

In [172]:
39 / 217

0.17972350230414746

In [101]:
mut_loc_counts_pass_reset = mut_loc_counts_pass.reset_index()

In [103]:
mut_loc_counts_pass_reset.drop_duplicates('mut_loc', inplace=True)

In [104]:
mut_loc_counts_pass_reset

Unnamed: 0,mut_loc,peptide_type,0,category
0,10_129902338_G_A,annotated,38,annotated
1,10_22048133_T_C,annotated,19,annotated
2,10_46268741_G_A,annotated,114,annotated
3,10_50714025_C_G,annotated,47,annotated
4,10_5498922_C_T,annotated,57,annotated
5,10_79397445_C_T,annotated,256,both
7,10_97393314_G_A,nuORF,12,nuORF
8,10_98743431_A_G,annotated,19,annotated
9,11_16838504_G_A,annotated,95,annotated
10,11_17352494_C_T,annotated,95,both


In [122]:
# Attach the per-site category to every row whose mut_loc passed the filters.
# The non-deduplicated merge is kept under its own name because the
# binding-affinity filter further down reads `out_dropna_filtered`; without
# this assignment that cell fails with a NameError on a fresh kernel.
out_dropna_filtered = out_dropna.merge(mut_loc_counts_pass_reset, on='mut_loc', how='inner')
# One representative row per mutation site.
out_dropna_filtered_dedup = out_dropna_filtered.drop_duplicates('mut_loc')

In [117]:
out_dropna_filtered_dedup[out_dropna_filtered_dedup['category'] == 'nuORF']

Unnamed: 0,peptide,mut_index_len,orf_types,peptide_type_x,variant_ORF_ids,variant_ORF_id,ORF_ID,variant_qc,ORF_type,prot_mut,Mel11_1_TPM,Mel11_3_TPM,Mel11_4_TPM,mut_loc,mel11.1_wt_cov,mel11.1_mut_cov,mel11.3_wt_cov,mel11.3_mut_cov,mel11.4_wt_cov,mel11.4_mut_cov,13240-011.Nor.WES_wt_cov,13240-011.Nor.WES_mut_cov,13240-011.Tum.WES_wt_cov,13240-011.Tum.WES_mut_cov,13240-011.Tum.RNA_wt_cov,13240-011.Tum.RNA_mut_cov,Mel11_PBMCs_WGS_wt_cov,Mel11_PBMCs_WGS_mut_cov,cellline_WGS_wt_cov,cellline_WGS_mut_cov,wt_rpf_sum,mut_rpf_sum,A0103_nM,A6601_nM,B0801_nM,ribo_wt_cov,ribo_mut_cov,TPM,MoW,peptide_type_y,0,category
399,MKEHQCISTA,1,['noncoding_other'],nuORF,['PAX8-AS1|ENST00000556070.5_1_2:113993137-114020997:+|noncoding_other|2|113993147|G|A'],PAX8-AS1|ENST00000556070.5_1_2:113993137-114020997:+|noncoding_other|2|113993147|G|A,PAX8-AS1|ENST00000556070.5_1_2:113993137-114020997:+|noncoding_other,PASS,noncoding_other,V4M,0.0,0.0,0.0,2_113993147_G_A,18.0,11.0,7.0,1.0,17.0,2.0,93.0,0.0,74.0,45.0,2.0,8.0,45.0,0.0,18.0,39.0,42.0,14.0,37926.6367,31665.002,12523.4141,42,14,0.0,0.25,nuORF,168,nuORF
1007,AGAQLLRVA,1,['iORF'],nuORF,['OAS3|ENST00000548514.1_1_12:113376504-113379546:+|iORF|12|113379404|C|T'],OAS3|ENST00000548514.1_1_12:113376504-113379546:+|iORF|12|113379404|C|T,OAS3|ENST00000548514.1_1_12:113376504-113379546:+|iORF,PASS,iORF,S13L,3.085882,4.332306,4.041036,12_113379404_C_T,3.0,12.0,2.0,3.0,2.0,4.0,222.0,0.0,109.0,74.0,141.0,50.0,44.0,0.0,22.0,24.0,7.0,19.0,40856.2852,31804.7441,17023.9609,7,19,3.819741,0.730769,nuORF,19,nuORF
1560,SFHSSEEER,1,['canonical_truncated'],nuORF,['AAK1|ENST00000606389.5_1_2:69692559-69736406:-|canonical_truncated|2|69693486|C|T'],AAK1|ENST00000606389.5_1_2:69692559-69736406:-|canonical_truncated|2|69693486|C|T,AAK1|ENST00000606389.5_1_2:69692559-69736406:-|canonical_truncated,PASS,canonical_truncated,G201R,3.661159,3.130718,2.706073,2_69693486_C_T,22.0,49.0,1.0,3.0,4.0,8.0,238.0,0.0,169.0,82.0,102.0,38.0,60.0,0.0,18.0,35.0,27.0,60.0,41445.3516,22471.2305,39874.9766,27,60,3.165983,0.689655,nuORF,19,nuORF
1835,VLRQWNGVEP,1,['five_prime'],nuORF,['FOXJ2|ENST00000162391.7_1_12:8185724-8186174:+|five_prime|12|8186017|C|T'],FOXJ2|ENST00000162391.7_1_12:8185724-8186174:+|five_prime|12|8186017|C|T,FOXJ2|ENST00000162391.7_1_12:8185724-8186174:+|five_prime,PASS,five_prime,P98L,2.731577,1.582234,1.272496,12_8186017_C_T,55.0,120.0,3.0,7.0,10.0,17.0,1.0,0.0,1.0,0.0,0.0,0.0,55.0,0.0,26.0,44.0,68.0,144.0,42192.8203,40813.8867,28528.0449,68,144,1.862102,0.679245,nuORF,19,nuORF
1930,LAVVRGTDLL,1,['five_prime_overlap'],nuORF,['RAB10|ENST00000495146.5_1_2:26257410-26257569:+|five_prime_overlap|2|26257436|C|T'],RAB10|ENST00000495146.5_1_2:26257410-26257569:+|five_prime_overlap|2|26257436|C|T,RAB10|ENST00000495146.5_1_2:26257410-26257569:+|five_prime_overlap,PASS,five_prime_overlap,P9L,3.299371,3.198585,3.69874,2_26257436_C_T,12.0,52.0,1.0,18.0,4.0,23.0,93.0,0.0,74.0,53.0,288.0,154.0,51.0,0.0,13.0,60.0,17.0,93.0,29964.7012,18669.7363,2224.7119,17,93,3.398899,0.845455,nuORF,16,nuORF
2098,QTAPGLAAPG,1,['Variant'],nuORF,['FBXO2|ENST00000354287.4_1_1:11708750-11714484:-|Variant|1|11714480|C|T'],FBXO2|ENST00000354287.4_1_1:11708750-11714484:-|Variant|1|11714480|C|T,FBXO2|ENST00000354287.4_1_1:11708750-11714484:-|Variant,PASS,Variant,R2Q,6.447326,6.482859,7.753978,1_11714480_C_T,0.0,8.0,0.0,3.0,0.0,14.0,4.0,0.0,3.0,6.0,0.0,5.0,47.0,0.0,0.0,27.0,0.0,25.0,32627.3203,19046.3848,32952.6562,0,25,6.894721,1.0,nuORF,4,nuORF
2294,GYAAGIENRC,2,"['three_prime_overlap', 'uoORF']",nuORF,"['MRPL49|ENST00000534078.1_1_11:64889784-64893341:+|three_prime_overlap|11|64889856|G|A', 'MRPL49|ENST00000526319.5_1_11:64889796-64892026:+|uoORF|11|64889856|G|A']",MRPL49|ENST00000534078.1_1_11:64889784-64893341:+|three_prime_overlap|11|64889856|G|A,MRPL49|ENST00000534078.1_1_11:64889784-64893341:+|three_prime_overlap,PASS,three_prime_overlap,M24I,0.998929,0.08899,0.162481,11_64889856_G_A,74.0,28.0,17.0,4.0,24.0,6.0,170.0,0.0,177.0,35.0,96.0,5.0,43.0,0.0,56.0,17.0,115.0,38.0,42591.3945,34142.75,41317.7344,115,38,0.4168,0.248366,nuORF,57,nuORF
2503,PSLAPFPGP,1,['five_prime_overlap'],nuORF,['LMNA|ENST00000368301.6_1_1:156084636-156084825:+|five_prime_overlap|1|156084656|C|T'],LMNA|ENST00000368301.6_1_1:156084636-156084825:+|five_prime_overlap|1|156084656|C|T,LMNA|ENST00000368301.6_1_1:156084636-156084825:+|five_prime_overlap,PASS,five_prime_overlap,P7L,6.476542,13.454367,10.399423,1_156084656_C_T,14.0,26.0,2.0,1.0,6.0,0.0,9.0,0.0,12.0,1.0,30.0,5.0,41.0,0.0,38.0,20.0,22.0,27.0,43865.8516,42303.9023,44447.7734,22,27,10.110111,0.55102,nuORF,128,nuORF
3480,SLILPEAIE,2,"['three_prime_overlap', 'three_prime_overlap']",nuORF,"['NPRL3|ENST00000399953.7_2_16:136756-188473:-|three_prime_overlap|16|150406|G|A', 'NPRL3|ENST00000620134.4_2_16:136756-188831:-|three_prime_overlap|16|150406|G|A']",NPRL3|ENST00000399953.7_2_16:136756-188473:-|three_prime_overlap|16|150406|G|A,NPRL3|ENST00000399953.7_2_16:136756-188473:-|three_prime_overlap,PASS,three_prime_overlap,P288L,3.261922,4.513082,4.674052,16_150406_G_A,53.0,11.0,7.0,3.0,15.0,9.0,96.0,1.0,69.0,3.0,213.0,26.0,41.0,0.0,31.0,38.0,75.0,23.0,42877.1289,28231.4219,35481.6133,75,23,4.149686,0.234694,nuORF,38,nuORF
3708,REDAARRARL,1,['five_prime_overlap'],nuORF,['CBFB|ENST00000290858.10_1_16:67063120-67063363:+|five_prime_overlap|16|67063332|C|T'],CBFB|ENST00000290858.10_1_16:67063120-67063363:+|five_prime_overlap|16|67063332|C|T,CBFB|ENST00000290858.10_1_16:67063120-67063363:+|five_prime_overlap,PASS,five_prime_overlap,P71L,0.698451,1.116214,1.401147,16_67063332_C_T,49.0,483.0,36.0,53.0,82.0,163.0,257.0,0.0,294.0,67.0,714.0,186.0,82.0,0.0,45.0,18.0,167.0,699.0,35013.2852,34263.0078,6043.3813,167,699,1.071937,0.807159,nuORF,19,nuORF


#### Now also incorporate binding affinity to select only those that could be potential neoantigens.

In [186]:
# Default thresholds for the neoantigen filter. All comparisons are strict.
maxNM = 500     # predicted affinity must be below this value (nM)
minTPM = 0      # currently not applied by filter_df
MoW = 0.15      # mutant share of summed coverage must exceed this
minMut = 9      # summed mutant coverage must exceed this

def filter_df(df, max_nm=None, min_mow=None, min_mut=None,
              affinity_cols=('A0103_nM', 'A6601_nM', 'B0801_nM')):
    """Keep rows that bind at least one allele and have enough mutant coverage.

    A row is kept when all of the following hold:
      * at least one column in ``affinity_cols`` is below ``max_nm``
      * ``ribo_mut_cov`` is greater than ``min_mut``
      * ``MoW`` is greater than ``min_mow``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ribo_mut_cov``, ``MoW`` and every column named in
        ``affinity_cols``.
    max_nm, min_mow, min_mut : number, optional
        Thresholds. When left as ``None`` the module-level ``maxNM``,
        ``MoW`` and ``minMut`` are read at call time, which is what the
        one-argument call has always done.
    affinity_cols : sequence of str, optional
        Affinity columns to test; defaults to the three alleles used here.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``, index preserved.
    """
    # Resolve defaults at call time so edits to the globals are picked up.
    if max_nm is None:
        max_nm = maxNM
    if min_mow is None:
        min_mow = MoW
    if min_mut is None:
        min_mut = minMut

    binds_any_allele = (df[list(affinity_cols)] < max_nm).any(axis=1)
    has_mutant_support = (df['ribo_mut_cov'] > min_mut) & (df['MoW'] > min_mow)
    return df[binds_any_allele & has_mutant_support]

In [187]:
out_dropna_filtered_pass_nM = filter_df(out_dropna_filtered)

In [188]:
len(out_dropna_filtered_pass_nM)

221

In [189]:
out_dropna_filtered_pass_nM_dedup = out_dropna_filtered_pass_nM.drop_duplicates(subset=['peptide','peptide_type_x','mut_loc','category'])

In [190]:
len(out_dropna_filtered_pass_nM_dedup)

75

In [191]:
out_dedup_pass = out_dropna_filtered_pass_nM_dedup.copy()

In [192]:
len(out_dedup_pass)

75

In [193]:
out_dedup_pass['mut_loc'].nunique()

47

In [143]:
out_dedup_pass.to_csv('/ahg/regevdata/projects/Ribo-seq/MHC-I/variants/mel11/mel11_snvs/allCan/snv_coverage/potential_neoantigens_pass_filt.tsv', header=True, index=False, sep='\t')

In [48]:
# Append the thresholds used for this run to the filter log. The context
# manager closes the file even if one of the writes fails.
# NOTE(review): the MoW value is computed as mut / (wt + mut), filter_df
# compares with strict '>', and the TPM condition in filter_df is commented
# out, so these labels describe the settings only loosely.
with open("/ahg/regevdata/projects/Ribo-seq/MHC-I/variants/mel11/mel11_snvs/allCan/snv_coverage/potential_neoantigen_filters.txt", "a") as f:
    f.write('maxNM: ' + str(maxNM) + 'nM' + '\n')
    f.write('minTPM: ' + str(minTPM) + '\n')
    f.write('Minimum mutant over wild-type read coverage ratio: ' + str(MoW) + '\n')
    f.write('Minimum mutant read coverage: ' + str(minMut) + '\n')

In [3]:
# Reload the filtered table from disk; this rebinds out_dedup_pass.
# NOTE(review): this path (.../mel11_snvs/snv_coverage/...) is not the one the
# table was written to above (.../mel11_snvs/allCan/snv_coverage/...), and the
# header displayed for the loaded file has no 'category' column, which a later
# cell groups by. Confirm which file is intended before re-running top to bottom.
out_dedup_pass = pd.read_csv('/ahg/regevdata/projects/Ribo-seq/MHC-I/variants/mel11/mel11_snvs/snv_coverage/potential_neoantigens_pass_filt.tsv', header=0, sep='\t')

In [4]:
len(out_dedup_pass)

75

In [5]:
out_dedup_pass.head()

Unnamed: 0,peptide,mut_index_len,orf_types,peptide_type,variant_ORF_ids,variant_ORF_id,ORF_ID,variant_qc,ORF_type,prot_mut,Mel11_1_TPM,Mel11_3_TPM,Mel11_4_TPM,Mel11pre_TPM,mut_loc,mel11.1_wt_cov,mel11.1_mut_cov,mel11.3_wt_cov,mel11.3_mut_cov,mel11.4_wt_cov,mel11.4_mut_cov,mel11_cellline.RNA_wt_cov,mel11_cellline.RNA_mut_cov,13240-011.Nor.WES_wt_cov,13240-011.Nor.WES_mut_cov,13240-011.Tum.WES_wt_cov,13240-011.Tum.WES_mut_cov,13240-011.Tum.RNA_wt_cov,13240-011.Tum.RNA_mut_cov,Mel11_PBMCs_WGS_wt_cov,Mel11_PBMCs_WGS_mut_cov,cellline_WGS_wt_cov,cellline_WGS_mut_cov,Nor_origWESbam_wt_cov,Nor_origWESbam_mut_cov,Tum_origWESbam_wt_cov,Tum_origWESbam_mut_cov,Tum_origRNAbam_wt_cov,Tum_origRNAbam_mut_cov,Mel11pre_wt_cov,Mel11pre_mut_cov,wt_rpf_sum,mut_rpf_sum,A0103_nM,A6601_nM,B0801_nM,ribo_wt_cov,ribo_mut_cov,TPM,MoW
0,NSLELLSSHY,2,"['CDS', 'canonical']",annotated,"['PIK3C3|ENST00000262039.8_1_18:39535256-39661101:+|CDS|18|39584419|G|A', 'PIK3C3|ENST00000398870.7_1_18:39535256-39661098:+|canonical|18|39584419|G|A']",PIK3C3|ENST00000262039.8_1_18:39535256-39661101:+|CDS|18|39584419|G|A,PIK3C3|ENST00000262039.8_1_18:39535256-39661101:+|CDS,PASS,CDS,D362N,0.192654,0.079721,0.060676,0.586291,18_39584419_G_A,7,5,0,3,2,2,7,5,97,0,61,4,172,22,66,0,18,20,180,0,167,27,164,27,20,0,9,10,453.0042,5873.3237,29070.5254,9,10,0.111017,0.526316
1,FTRSDSLFK,1,['canonical'],annotated,['PHB2|ENST00000399433.6_1_12:7074850-7079706:-|canonical|12|7074865|T|A'],PHB2|ENST00000399433.6_1_12:7074850-7079706:-|canonical|12|7074865|T|A,PHB2|ENST00000399433.6_1_12:7074850-7079706:-|canonical,PASS,canonical,I294F,28.571252,27.71494,31.38816,16.498377,12_7074865_T_A,57,52,3,1,9,3,8,9,258,0,221,73,1226,643,36,0,20,33,715,0,477,176,1229,796,218,0,69,56,10526.1924,448.5028,28936.209,69,56,29.224784,0.448
2,RTTGEILDFY,2,"['noncoding_other', 'canonical']",annotated,"['MGAM2|ENST00000496337.1_1_7:141870985-141908039:+|noncoding_other|7|141872500|G|A', 'MGAM2|ENST00000477922.3_1_7:141816673-141921856:+|canonical|7|141872500|G|A']",MGAM2|ENST00000496337.1_1_7:141870985-141908039:+|noncoding_other|7|141872500|G|A,MGAM2|ENST00000496337.1_1_7:141870985-141908039:+|noncoding_other,PASS,noncoding_other,G134E,0.007864,0.017174,0.015686,0.0,7_141872500_G_A,0,0,1,4,1,7,5,1,39,0,118,14,2,0,48,0,98,9,153,0,286,51,1,0,0,0,2,11,369.1694,7978.3311,39779.7344,2,11,0.013575,0.846154
3,HTFTPGNFKR,1,['iORF'],nuORF,['MGME1|ENST00000377710.9_1_20:17956345-17956408:+|iORF|20|17956391|C|T'],MGME1|ENST00000377710.9_1_20:17956345-17956408:+|iORF|20|17956391|C|T,MGME1|ENST00000377710.9_1_20:17956345-17956408:+|iORF,PASS,iORF,L16F,0.343012,1.685521,0.256574,0.0,20_17956391_C_T,15,13,3,3,4,4,11,6,211,0,190,59,242,75,46,0,33,31,546,0,554,199,229,84,40,0,22,20,23253.8223,149.0816,35789.2891,22,20,0.761702,0.47619
4,RPLLHGPPV,1,['canonical'],annotated,['PARP4|ENST00000381989.3_1_13:24995259-25077914:-|canonical|13|25060340|A|G'],PARP4|ENST00000381989.3_1_13:24995259-25077914:-|canonical|13|25060340|A|G,PARP4|ENST00000381989.3_1_13:24995259-25077914:-|canonical,PASS,canonical,S440P,2.978023,4.455289,5.078653,2.470887,13_25060340_A_G,4,16,0,1,2,7,8,19,97,0,53,10,597,189,42,0,20,20,311,0,202,58,575,209,15,0,6,24,35951.5195,27105.082,470.804,6,24,4.170655,0.8


In [49]:
out_dedup_pass_mut_locs = out_dedup_pass.drop_duplicates(subset=['mut_loc'])
print(str(len(out_dedup_pass_mut_locs)))

47


In [194]:
neoantigen_loc_count = out_dedup_pass.groupby('category')['mut_loc'].nunique()
neoantigen_loc_count

category
annotated    28
both         14
nuORF        5 
Name: mut_loc, dtype: int64

In [195]:
5 / (5+14+28)

0.10638297872340426

In [149]:
neoantigen_loc_count.to_csv('/ahg/regevdata/projects/Ribo-seq/MHC-I/variants/mel11/mel11_snvs/allCan/snv_coverage/pass_filt_annot_vs_nuORF_mut_locs.tsv', header=True, index=True, sep='\t')

In [7]:
neoantigen_count = out_dedup_pass.groupby('peptide_type')['peptide'].nunique()
neoantigen_count

peptide_type
annotated    56
nuORF        19
Name: peptide, dtype: int64

In [152]:
neoantigen_count.to_csv('/ahg/regevdata/projects/Ribo-seq/MHC-I/variants/mel11/mel11_snvs/allCan/snv_coverage/pass_filt_annot_vs_nuORF_peptides.tsv', header=True, index=True, sep='\t')