In [None]:
import pandas as pd
import altair as alt
import numpy as np
from scipy import stats
import statsmodels.api as sm

In [None]:
# Input workbooks; main() passes these three paths to read_data().
# NOTE(review): the Ambry path is absolute and user-specific, unlike the two
# relative paths below, so it will not resolve on another machine.
ambry = '/Users/ivan/Downloads/Ambry_BARD1.xlsx'
sge = '../Data/20250122_BARD1_SGEscores_wAAsub.xlsx'
gnomad = '../Data/20240905_BARD1_gnomADv4.1.0_SNVs.xlsx'

# Functional class cutoffs applied to snv_score by classify_vars():
# scores <= path_max are labeled 'NF', scores >= benign_min are labeled 'F'.
path_max = 0.689682159032362 
benign_min = 0.807231141721117


# Three-letter to one-letter amino-acid codes; 'Ter' maps to '*'.
aa_dict = {
    'Ala': 'A',
    'Cys': 'C',
    'Asp': 'D',
    'Glu': 'E',
    'Phe': 'F',
    'Gly': 'G',
    'His': 'H',
    'Ile': 'I',
    'Lys': 'K',
    'Leu': 'L',
    'Met': 'M',
    'Asn': 'N',
    'Pro': 'P',
    'Gln': 'Q',
    'Arg': 'R',
    'Ser': 'S',
    'Thr': 'T',
    'Val': 'V',
    'Trp': 'W',
    'Tyr': 'Y',
    'Ter': '*'   # Termination codon
}

In [None]:
def read_data(ambry, gnomad, sge):
    """Load the three Excel workbooks and keep only the columns used downstream.

    Parameters
    ----------
    ambry, gnomad, sge : path-like
        Locations of the Ambry, gnomAD and SGE workbooks, each read with
        ``pd.read_excel``.

    Returns
    -------
    tuple of DataFrame
        ``(ambry, sge, gnomad)`` -- note that the return order differs from
        the parameter order.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from a workbook.
    """
    ambry_columns = ['Gene', 'c_variant', 'p_variant', 'Classification']
    sge_columns = ['chrom', 'target','AAsub', 'pos_id','Consequence', 'snv_score']
    gnomad_columns = ['Chromosome', 'Position', 'Reference', 'Alternate',
                      'Allele Frequency', 'Protein Consequence']

    # Workbooks are read in the order Ambry, SGE, gnomAD.
    ambry_table = pd.read_excel(ambry)[ambry_columns]
    sge_table = pd.read_excel(sge)[sge_columns]
    gnomad_table = pd.read_excel(gnomad)[gnomad_columns]

    return ambry_table, sge_table, gnomad_table

In [None]:
def classify_vars(sge, path_max, benign_min):
    """Label every row of ``sge`` with a functional class based on ``snv_score``.

    The ``Classification`` column is (re)written in place on ``sge``:

    * ``'NF'`` when ``snv_score <= path_max``
    * ``'F'``  when ``snv_score >= benign_min``
    * ``'I'``  for every remaining row, i.e. scores strictly between the two
      cutoffs and rows whose score is missing.

    Parameters
    ----------
    sge : DataFrame
        Must contain an ``snv_score`` column. Modified in place.
    path_max : float
        Upper (inclusive) score bound for the ``'NF'`` class.
    benign_min : float
        Lower (inclusive) score bound for the ``'F'`` class.

    Returns
    -------
    DataFrame
        The same ``sge`` object, with the ``Classification`` column set.
    """
    scores = sge['snv_score']

    # Start from the fallback label and overwrite the two tails. The previous
    # approach filled the column with None and then tested `== None`, but
    # pandas evaluates that comparison elementwise as all-False, so the 'I'
    # label was never assigned.
    sge['Classification'] = 'I'
    sge.loc[scores <= path_max, 'Classification'] = 'NF'
    # Applied last so that 'F' wins if the two cutoffs ever overlap.
    sge.loc[scores >= benign_min, 'Classification'] = 'F'

    return sge

In [None]:
def get_missense_stats(df):
    """Print how many missense variants were labeled 'F' and 'NF'.

    Parameters
    ----------
    df : DataFrame
        Must contain ``Consequence`` and ``Classification`` columns. Only rows
        whose ``Consequence`` is ``'missense_variant'`` are counted.

    Returns
    -------
    None
        The three counts are written to stdout.
    """
    missense_df = df[df['Consequence'].isin(['missense_variant'])]

    # Count once; .get(..., 0) keeps the summary working when a class does not
    # occur among the missense rows (plain [] indexing raised KeyError).
    class_counts = missense_df['Classification'].value_counts()
    benign = class_counts.get('F', 0)
    pathogenic = class_counts.get('NF', 0)
    total_classified = benign + pathogenic

    print('Total Missense Classified: ', total_classified)
    print('Benign Missense: ', benign)
    print('Pathogenic Missense: ', pathogenic)

In [None]:
def convert_aa(aa_string, aa_dict):
    """Translate the first and last three characters of ``aa_string`` via ``aa_dict``.

    The string is split into a leading 3-character token, a trailing
    3-character token and whatever lies between them. Each token is replaced
    by its ``aa_dict`` entry when one exists and is kept unchanged otherwise.

    Parameters
    ----------
    aa_string : str
        Substitution text, e.g. ``'Ala123Gly'``.
    aa_dict : dict
        Mapping from 3-character tokens to their replacement.

    Returns
    -------
    str
        Translated leading token + middle part + translated trailing token.
    """
    ref_token, middle, alt_token = aa_string[:3], aa_string[3:-3], aa_string[-3:]

    ref_symbol = aa_dict.get(ref_token, ref_token)
    alt_symbol = aa_dict.get(alt_token, alt_token)

    return "{}{}{}".format(ref_symbol, middle, alt_symbol)

In [None]:
def process_gnomad(gnomad):
    """Add a ``pos_id`` join key (``'<Position>:<Alternate>'``) to ``gnomad``.

    The frame is modified in place: ``Position`` is converted to strings and
    a new ``pos_id`` column is written. No amino-acid column is derived here.

    Parameters
    ----------
    gnomad : DataFrame
        Must contain ``Position`` and ``Alternate`` columns.

    Returns
    -------
    DataFrame
        The same ``gnomad`` object that was passed in.
    """
    position_text = gnomad['Position'].astype(str)
    gnomad['Position'] = position_text

    # Key format is position, a colon, then the alternate allele.
    gnomad['pos_id'] = position_text + ':' + gnomad['Alternate']

    return gnomad

In [None]:
def process_ambry(ambry, aa_dict):
    """Derive an ``AAsub`` column from ``p_variant`` and drop rows without one.

    ``AAsub`` is ``p_variant`` with its first two characters removed
    (presumably an HGVS ``p.`` prefix -- confirm against the workbook). The
    new column is written onto the input frame before filtering.

    Parameters
    ----------
    ambry : DataFrame
        Must contain a ``p_variant`` column. Gains an ``AAsub`` column in place.
    aa_dict : dict
        Accepted but currently not used by this function.

    Returns
    -------
    DataFrame
        A copy containing only rows with a non-missing ``AAsub``, re-indexed
        from 0.
    """
    ambry['AAsub'] = ambry['p_variant'].str.slice(start=2)

    with_protein_change = ambry.dropna(subset=['AAsub'])
    return with_protein_change.copy().reset_index(drop=True)

In [None]:
def get_oddsratio_beta(sge, gnomad, ambry):
    """Fisher's exact test of NF/F counts for Ambry-matched vs gnomAD-matched SGE rows.

    SGE rows are inner-joined to gnomAD on ``pos_id`` and to Ambry on
    ``AAsub``. The 'NF' and 'F' labels from the SGE ``Classification`` column
    are counted in each join and arranged as a 2x2 table (rows: Ambry, gnomAD;
    columns: NF, F). The table is displayed and the odds ratio and p-value
    from ``scipy.stats.fisher_exact`` are printed.

    Parameters
    ----------
    sge : DataFrame
        Needs ``pos_id``, ``AAsub`` and ``Classification`` columns.
    gnomad : DataFrame
        Needs a ``pos_id`` column.
    ambry : DataFrame
        Needs an ``AAsub`` column.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): both joins count one row per matching key pair, so duplicate
    keys on either side inflate the counts -- confirm this is intended.
    """
    sge_gnomad = pd.merge(sge, gnomad, on = 'pos_id', how = 'inner')
    gnomad_counts = sge_gnomad['Classification'].value_counts()

    # Join against the key column only, so the SGE label keeps the name
    # 'Classification' whether or not `ambry` has its own column of that name
    # (previously the code relied on the 'Classification_x' merge suffix).
    sge_ambry = pd.merge(sge, ambry[['AAsub']], on = 'AAsub', how = 'inner')
    ambry_counts = sge_ambry['Classification'].value_counts()

    # .get(..., 0): a class that never occurs in a join is a zero cell in the
    # table, not a KeyError.
    ambry_nf = ambry_counts.get('NF', 0)
    ambry_func = ambry_counts.get('F', 0)
    gnomad_nf = gnomad_counts.get('NF', 0)
    gnomad_func = gnomad_counts.get('F', 0)

    contingency_table = np.array([[ambry_nf, ambry_func],
                                  [gnomad_nf, gnomad_func]])

    df = pd.DataFrame(contingency_table, columns = ['NF', 'F'], index = ['Ambry', 'gnomAD'])
    display(df)
    oddsratio, p_value = stats.fisher_exact(contingency_table)

    print(f"Odds Ratio: {oddsratio}")
    print(f"P-value: {p_value}")

In [None]:
def get_oddsratio_logistic(sge, gnomad, ambry):
    """Unfinished stub: joins SGE to gnomAD on ``pos_id`` and does nothing else.

    The joined frame is discarded and the function returns ``None``; ``ambry``
    is not used yet.

    TODO: implement the logistic-regression analysis the name refers to.
    """
    gnomad_overlap = pd.merge(left=sge, right=gnomad, how='inner', on='pos_id')
    

In [None]:
def _safe_ratio(numerator, denominator):
    """Divide, returning NaN for 0/0 and +inf for x/0 instead of raising."""
    if denominator == 0:
        return float('nan') if numerator == 0 else float('inf')
    return numerator / denominator


def get_oddsratio(sge, gnomad, ambry):
    """Compare how often 'NF' SGE rows match Ambry versus gnomAD.

    For each cohort the value reported as "odds" is a simple proportion:
    the number of rows in the inner join of the 'NF' SGE rows with that
    cohort, divided by the cohort's total row count. The join uses ``pos_id``
    for gnomAD and ``AAsub`` for Ambry. ``ratio`` is the Ambry proportion
    divided by the gnomAD proportion.

    The totals and match counts are printed, and a one-row table with the
    columns ``gnomad_odds``, ``ambry_odds`` and ``ratio`` is displayed.

    Parameters
    ----------
    sge : DataFrame
        Needs ``Classification``, ``pos_id`` and ``AAsub`` columns.
    gnomad : DataFrame
        Needs a ``pos_id`` column.
    ambry : DataFrame
        Needs an ``AAsub`` column.

    Returns
    -------
    None
    """
    sge_nf = sge.loc[sge['Classification'].isin(['NF'])]

    total_gnomad = len(gnomad)
    sge_nf_gnomad = len(pd.merge(sge_nf, gnomad, on = 'pos_id', how = 'inner'))

    total_ambry = len(ambry)
    sge_nf_ambry = len(pd.merge(sge_nf, ambry, on = 'AAsub', how = 'inner'))

    # Guarded divisions: an empty cohort, or no 'NF' match in gnomAD, used to
    # abort the whole cell with ZeroDivisionError.
    odds_gnomad = _safe_ratio(sge_nf_gnomad, total_gnomad)
    odds_ambry = _safe_ratio(sge_nf_ambry, total_ambry)
    odds_ratio = _safe_ratio(odds_ambry, odds_gnomad)

    odds_df = pd.DataFrame({'gnomad_odds': odds_gnomad,
                            'ambry_odds': odds_ambry,
                            'ratio': odds_ratio,
                           }, index = [0])
    print(total_gnomad, total_ambry)
    print(sge_nf_gnomad, sge_nf_ambry)
    display(odds_df)

In [None]:
def main():
    """Run the analysis end to end using the module-level paths and cutoffs.

    Steps: load the three workbooks, label SGE rows by score, print the
    missense summary, build the gnomAD and Ambry join keys, then run the
    Fisher's exact comparison in get_oddsratio_beta().
    """
    ambry_raw, sge_raw, gnomad_raw = read_data(ambry, gnomad, sge)

    sge_labeled = classify_vars(sge_raw, path_max, benign_min)
    get_missense_stats(sge_labeled)

    gnomad_keyed = process_gnomad(gnomad_raw)
    ambry_keyed = process_ambry(ambry_raw, aa_dict)

    # get_oddsratio() is the alternative comparison; it is not run here.
    get_oddsratio_beta(sge_labeled, gnomad_keyed, ambry_keyed)

In [None]:
# Execute the full pipeline defined in main().
main()