In [None]:
#! python3
import pandas as pd
import re

In [None]:
## Genes to be analysed
gene_list = ['TRPC6', 'NPHS1', 'NPHS2', 'LMX1B']

## Open exomes file: tab-separated and read with header=None, so the columns
## are labelled by integer position (0, 1, 2, ...) in the cells below
exomes = pd.read_csv(
    r'TRPC6_NPHS1_NPHS2_LMX1B_196KCLexomes.txt',
    sep='\t',
    header=None,
)

## Empty placeholders that later cells fill in
all_pred_df = pd.DataFrame(columns=exomes.columns)
protein_df = pd.DataFrame()

## Create protein_df dataframe to correlate genes with their respective protein IDs in EVE database
with open(r'identifiers.txt', 'r') as f:
    identifiers = f.read()

## The slicing/splitting below treats the file content as one string shaped like
##   [{"key":"value","key":"value"},{...}]
## i.e. drop the outer '[{' and '}]', cut records on '},{', fields on ','
## and key/value on ':'. Only the first two fields of each record are used.
protein_list = [record.split(',') for record in identifiers[2:-2].split('},{')]
protein_list = [fields[0].split(':') + fields[1].split(':') for fields in protein_list]

## Build the frame from all records at once instead of enlarging an empty
## frame one cell at a time with .loc (slow, and it only worked when
## protein_df had just been reset to an empty frame by an earlier cell).
## Each record is [key1, value1, key2, value2]; surrounding double quotes
## are stripped from keys and values.
protein_df = pd.DataFrame(
    [
        {
            parts[0].strip('"'): parts[1].strip('"'),
            parts[2].strip('"'): parts[3].strip('"'),
        }
        for parts in protein_list
    ]
)

In [None]:
## Use regex to extract variant from column in file
## Pattern: one uppercase letter, one or more digits, one uppercase letter,
## then a comma at the very end of the string (e.g. '...A123B,').
## Written as a raw string so that \d reaches the regex engine as "digit";
## without the backslash the pattern only matched literal 'd' characters.
variant_pattern = re.compile(r'[A-Z]\d+[A-Z],$')

variants = []
for index, annotation in exomes[10].items():
    ## Non-string cells (e.g. NaN from an empty field) cannot be searched;
    ## report them the same way as a failed match instead of raising TypeError
    match = variant_pattern.search(annotation) if isinstance(annotation, str) else None
    if match is None:
        variants.append('NA')
        print("Regex not found:", index, annotation)
    else:
        ## Drop the trailing comma that is part of the match
        variants.append(match.group(0)[:-1])

## One column assignment instead of a .loc write per row
exomes['variant'] = variants

## Merge EVE pred data with exomes data
## Per-gene results are collected in a list and concatenated once at the end:
## DataFrame.append was removed in pandas 2.0, and rebuilding all_pred_df from
## scratch keeps this cell safe to re-run without duplicating rows.
merged_per_gene = []
for gene in gene_list:
    ## Select gene from gene list
    gene_matches = protein_df.loc[protein_df['gene_symbol'] == gene, 'protein_symbol']
    if gene_matches.empty:
        ## Same exception type as the former `.to_list()[0]`, but with a usable message
        raise IndexError(f"No protein_symbol found for gene {gene!r} in protein_df")
    ## Correlate with EVE protein file (first match is used)
    protein = gene_matches.iloc[0]
    pred_file = pd.read_csv(f'{protein}.csv')
    ## Match EVE variant with exomes variant format: <wt_aa><position><mt_aa>
    pred_file['variant'] = (
        pred_file['wt_aa'].astype(str)
        + pred_file['position'].astype(str)
        + pred_file['mt_aa'].astype(str)
    )
    ## Merge one gene at a time to avoid duplicate columns
    select_exomes = exomes[exomes[8] == gene]
    select_exomes = pd.merge(
        select_exomes,
        pred_file[['variant', 'EVE_scores_ASM', 'uncertainty_ASM']],
        on='variant',
        how='left',
    )
    merged_per_gene.append(select_exomes)

## Combine info into all_pred_df; with an empty gene_list there is nothing to
## concatenate (pd.concat would raise), so all_pred_df is left as it was
if merged_per_gene:
    all_pred_df = pd.concat(merged_per_gene, ignore_index=True)

In [None]:
## Write all_pred_df out as a tab-separated file, then show the first
## five rows as this cell's output
all_pred_df.to_csv(path_or_buf=r'196KCLexomes_pred.txt', sep='\t')
all_pred_df.head(n=5)