In [53]:
#! python3
import pandas as pd
import re

In [56]:
## Open exomes file
## The file is read as tab-separated with no header row, so columns are labelled 0, 1, 2, ...
exomes = pd.read_csv(r'TRPC6_NPHS1_NPHS2_LMX1B_196KCLexomes.txt', sep='\t', header=None)

gene_list = [       ## list of genes to be analysed
    'TRPC6',
    'NPHS1',
    'NPHS2',
    'LMX1B'
]

## Create empty dataframe for later use; it starts with the exomes columns only
all_pred_df = pd.DataFrame(columns=exomes.columns)

## Create protein_df dataframe to correlate genes with their respective protein IDs in EVE database
with open(r'identifiers.txt', 'r') as f:
    ## Trim surrounding whitespace so a trailing newline does not shift the [2:-2] slice below
    identifiers = f.read().strip()
## Drop the first two and last two characters (presumably '[{' and '}]' -- TODO confirm
## against the file), then split the remainder into one string per record
protein_list = identifiers[2:-2].split('},{')
protein_list = [x.split(',') for x in protein_list]
## Only the first two comma-separated fields of each record are used; each is split
## on ':' so a record becomes [key, value, key, value]
protein_list = [x[0].split(':') + x[1].split(':') for x in protein_list]
## Build the frame in one step (one row per record) instead of enlarging it cell by cell
## with .loc; whitespace and the surrounding double quotes are removed from keys and values
protein_df = pd.DataFrame([
    {
        parts[0].strip().strip('"'): parts[1].strip().strip('"'),
        parts[2].strip().strip('"'): parts[3].strip().strip('"'),
    }
    for parts in protein_list
])

In [57]:
## Use regex to extract variant from column in file
## The pattern looks in column 10 for <uppercase letter><digits><uppercase letter> followed
## by a comma at the very end of the string (e.g. 'P15S,'). Rows where it does not match
## are reported and given the placeholder 'NA'.
variant_pattern = re.compile(r'[A-Z]\d+[A-Z],$')   ## raw string: '\d' is not a valid str escape
variants = []
for index, annotation in exomes[10].items():
        ## Non-string cells (e.g. NaN from an empty field) cannot be searched; treat as no match
    match = variant_pattern.search(annotation) if isinstance(annotation, str) else None
    if match is None:
        variants.append('NA')
        print("Regex not found:", index, annotation)
    else:
            ## Drop the trailing comma that is part of the match
        variants.append(match.group(0)[:-1])
## Assign the whole column at once rather than one .loc write per row
exomes['variant'] = variants

## Merge EVE pred data with exomes data
## Results are collected in a list and concatenated once at the end, so re-running this
## cell rebuilds all_pred_df from scratch instead of appending duplicate rows to it.
merged_frames = []
for gene in gene_list:
        ## Select gene from gene list
    df = protein_df[protein_df['gene_symbol'] == gene]
    if df.empty:
        raise IndexError(f"No protein_symbol found in protein_df for gene {gene!r}")
        ## Correlate with EVE protein file
    protein = df['protein_symbol'].iloc[0]
    pred_file = pd.read_csv(f'{protein}.csv')
        ## Match EVE variant with exomes variant format
    pred_file['variant'] = pred_file['wt_aa'].astype(str)+pred_file['position'].astype(str)+pred_file['mt_aa'].astype(str)
        ## Merge one gene at a time to avoid duplicate columns
        ## (left join: exome rows without an EVE match are kept, with NaN scores)
    select_exomes = exomes[exomes[8] == gene]
    select_exomes = pd.merge(select_exomes, pred_file[['variant', 'EVE_scores_ASM', 'uncertainty_ASM']], on='variant', how='left')
        ## Collect the per-gene result for the final concat
    merged_frames.append(select_exomes)

## Combine info into all_pred_df
## DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
## Columns follow the order of the merged frames (exomes columns, then the EVE columns).
if merged_frames:
    all_pred_df = pd.concat(merged_frames, ignore_index=True)
else:
        ## pd.concat raises on an empty list, so fall back to an empty frame
    all_pred_df = pd.DataFrame(columns=exomes.columns)

Regex not found: 10 NPHS1:NM_004646:exon30:c.3595-9G>T
Regex not found: 20 LMX1B:NM_002316:exon2:c.326+7G>C,NM_001174146:exon2:c.326+7G>C,NM_001174147:exon2:c.326+7G>C
Regex not found: 35 NPHS1:NM_004646:exon15:c.1930+10C>T
Regex not found: 39 LMX1B:NM_002316:exon2:c.326+7G>C,NM_001174146:exon2:c.326+7G>C,NM_001174147:exon2:c.326+7G>C
Regex not found: 45 LMX1B:NM_002316:exon2:c.326+7G>C,NM_001174146:exon2:c.326+7G>C,NM_001174147:exon2:c.326+7G>C
Regex not found: 67 NPHS1:NM_004646:exon30:c.3595-9G>T
Regex not found: 68 LMX1B:NM_002316:exon2:c.326+7G>C,NM_001174146:exon2:c.326+7G>C,NM_001174147:exon2:c.326+7G>C
Regex not found: 79 LMX1B:NM_002316:exon2:c.326+7G>C,NM_001174146:exon2:c.326+7G>C,NM_001174147:exon2:c.326+7G>C
Regex not found: 84 LMX1B:NM_002316:exon2:c.326+7G>C,NM_001174146:exon2:c.326+7G>C,NM_001174147:exon2:c.326+7G>C
Regex not found: 96 LMX1B:NM_002316:exon2:c.326+7G>C,NM_001174146:exon2:c.326+7G>C,NM_001174147:exon2:c.326+7G>C
Regex not found: 98 NPHS1:NM_004646:exon30:

In [58]:
## Write all_pred_df to a tab-separated text file (row index included),
## then show the first rows as the cell output
pred_output_path = r'196KCLexomes_pred.txt'
all_pred_df.to_csv(pred_output_path, sep='\t')
all_pred_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,EVE_scores_ASM,uncertainty_ASM,variant
0,S0618_annovar_coding_annotated.txt:S0618,chr11,101454192,101454192,G,A,HET,exonic,TRPC6,nonsynonymous SNV,...,6_hom,45_het,PASS,212.0,"DP=47;VDB=0.0281;AF1=0.5;AC1=1;DP4=8,7,9,15;MQ...",GT:PL:DP:SP:GQ,"0/1:242,0,175:39:3:99",,,P15S
1,S0620_annovar_coding_annotated.txt:S0620,chr11,101359750,101359750,G,A,HET,exonic,TRPC6,nonsynonymous SNV,...,7_hom,155_het,PASS,225.0,"DP=51;VDB=0.0399;AF1=0.5;AC1=1;DP4=12,8,18,11;...",GT:PL:DP:SP:GQ,"0/1:255,0,215:49:0:99",0.13168,0.389568,A404V
2,S0620_annovar_coding_annotated.txt:S0620,chr11,101323770,101323770,C,T,HET,exonic,TRPC6,synonymous SNV,...,12_hom,146_het,PASS,225.0,"DP=207;VDB=0.0355;AF1=0.5;AC1=1;DP4=32,68,34,7...",GT:PL:DP:SP:GQ,"0/1:255,0,255:205:0:99",,,Q904Q
3,S0620_annovar_coding_annotated.txt:S0620,chr11,101375177,101375177,G,A,HET,exonic,TRPC6,nonsynonymous SNV,...,.,.,PASS,201.0,"DP=167;VDB=0.0355;AF1=0.5;AC1=1;DP4=78,16,59,1...",GT:PL:DP:SP:GQ,"0/1:231,0,255:164:0:99",0.613749,0.667041,R175W
4,S0620_annovar_coding_annotated.txt:S0620,chr11,101347093,101347093,A,G,HET,exonic,TRPC6,synonymous SNV,...,41_hom,274_het,PASS,220.0,"DP=95;VDB=0.0374;AF1=0.5;AC1=1;DP4=38,13,30,14...",GT:PL:DP:SP:GQ,"0/1:250,0,255:95:3:99",,,N561N
