In [None]:
import pandas as pd
import altair as alt
import numpy as np
import statsmodels.api as sm
from scipy import stats

In [None]:
# Functional class cutoffs applied to `snv_score` in classify_vars (described there as GMM cutoffs):
# scores <= path_max are labelled 'NF', scores >= benign_min are labelled 'F'.
path_max = 0.689682159032362 
benign_min = 0.807231141721117

# Excel workbook with the SGE scores; passed to read_data by main().
sge = '../Data/20250122_BARD1_SGEscores_wAAsub.xlsx'
# Excel workbook with the case-control data; passed to read_data by main().
# NOTE(review): absolute, user-specific path -- the notebook will not run on another machine as-is.
cc = '/Users/ivan/Downloads/7_genes_for_Starita_02282025_hg38.xlsx'


In [None]:
def read_data(sge, cc):
    """Load the SGE score workbook and the case-control workbook.

    Parameters
    ----------
    sge : path to the Excel file holding the SGE scores.
    cc : path to the Excel file holding the case-control data.

    Returns
    -------
    (scores, cohort) : tuple of pandas.DataFrame
        scores -- the SGE table without the three replicate score columns.
        cohort -- BARD1 rows of the case-control table, reduced to the columns
        used downstream and to rows whose ER_status1 is 0 or 777.
    """
    replicate_cols = ['R1_score', 'R2_score', 'R3_score']
    scores = pd.read_excel(sge).drop(columns=replicate_cols)

    kept_cols = [
        'Classification', '#CHROM', 'REF', 'ALT', 'CAVA_GENE', 'CAVA_CSN',
        'CAVA_SO', 'Sample_AAF', 'Sample_ID', 'CaseControl', 'ER_status1',
        'hg38_start',
    ]
    cohort = pd.read_excel(cc)
    # BARD1 rows only, restricted to the columns listed above.
    is_bard1 = cohort['CAVA_GENE'].isin(['BARD1'])
    cohort = cohort.loc[is_bard1, kept_cols].copy()

    # Original author's note: "ER negative cases only".
    # NOTE(review): the meaning of code 777 is not visible here -- confirm.
    er_selected = cohort['ER_status1'].isin([0, 777])
    cohort = cohort[er_selected]

    return scores, cohort

In [None]:
def classify_vars(sge, path_max, benign_min, label_intermediate=False):
    """Filter SGE data to the variants of interest and assign functional classes.

    Keeps missense variants whose `target` matches the `ard_brct` pattern below,
    then labels each by `snv_score` against the two cutoffs.

    Parameters
    ----------
    sge : pandas.DataFrame
        SGE table with 'Consequence', 'target' and 'snv_score' columns.
        It is not modified; all work happens on a filtered copy.
    path_max : float
        Scores <= path_max are labelled 'NF'.
    benign_min : float
        Scores >= benign_min are labelled 'F'.
    label_intermediate : bool, default False
        If True, scores strictly between the cutoffs are labelled 'I'.
        The default leaves them as None, which is what the previous code did in
        practice: its `Classification == None` comparison is elementwise and
        never True in pandas, so 'I' was never assigned. `analyze` drops
        unclassified rows and needs a 2x2 table, so the default keeps that
        pipeline working unchanged.

    Returns
    -------
    (sge, sge_func, sge_nf) : tuple of pandas.DataFrame
        The filtered, classified table, its 'F' rows, and its 'NF' rows.
    """
    # Regex alternation of target names to keep (str.contains does a regex search).
    # Region choice is still under development; the other candidates were
    # brct = 'X8|X9|X10|X11', ard = 'X5|X6|X7', ring = 'X1|X2|X3'.
    ard_brct = 'X4L|X5|X6|X7|X8|X9|X10|X11'

    sge = sge[sge['Consequence'].isin(['missense_variant'])]
    # na=False: a row with a missing target cannot match and is excluded rather
    # than producing an NA in the boolean mask. The copy keeps the caller's frame
    # untouched and makes the .loc writes below target a real frame, not a slice.
    sge = sge[sge['target'].str.contains(ard_brct, na=False)].copy()

    # Classify by the (GMM-derived) cutoffs. Order matters if the cutoffs ever
    # overlap: the 'F' assignment runs last and wins, as before.
    sge['Classification'] = None
    sge.loc[sge['snv_score'] <= path_max, 'Classification'] = 'NF'
    sge.loc[sge['snv_score'] >= benign_min, 'Classification'] = 'F'
    if label_intermediate:
        between = (sge['snv_score'] > path_max) & (sge['snv_score'] < benign_min)
        sge.loc[between, 'Classification'] = 'I'

    sge_nf = sge[sge['Classification'].isin(['NF'])]
    sge_func = sge[sge['Classification'].isin(['F'])]

    return sge, sge_func, sge_nf

In [None]:
def process_cc(cc):
    """Add a position ID to the case-control data and split it into cases and controls.

    Parameters
    ----------
    cc : pandas.DataFrame
        Case-control table with 'ALT', 'hg38_start' and 'CaseControl' columns.
        It is not modified; all work happens on a filtered copy.

    Returns
    -------
    (cc, cc_control, cc_case) : tuple of pandas.DataFrame
        cc -- rows whose ALT is a single character, with 'hg38_start' converted
        to str and a new 'pos_id' column of the form '<hg38_start>:<ALT>'.
        cc_control / cc_case -- the rows of cc whose 'CaseControl' is
        'Control' / 'Case'.
    """
    # Filter and copy before writing any column, so the caller's frame never
    # gains a 'pos_id' column or has 'hg38_start' retyped.
    # NOTE(review): only ALT length is checked; a record with a multi-base REF
    # and single-base ALT would pass -- confirm whether REF should be checked too.
    cc = cc[cc['ALT'].str.len() == 1].copy()

    # NOTE(review): if hg38_start is read as float, str() yields e.g. '123.0' and
    # the IDs will not match integer-style IDs -- confirm the column dtype.
    cc['hg38_start'] = cc['hg38_start'].astype(str)
    cc['pos_id'] = cc['hg38_start'] + ':' + cc['ALT']

    cc_control = cc[cc['CaseControl'].isin(['Control'])].copy()
    cc_case = cc[cc['CaseControl'].isin(['Case'])].copy()

    return cc, cc_control, cc_case

In [None]:
def analyze(cc, sge):
    """Test association between SGE class (NF vs F) and case/control status.

    Joins the case-control records to the SGE classifications on 'pos_id',
    builds a 2x2 table (rows Case, Control; columns NF, F), and reports the
    odds ratio, its confidence interval and Fisher's exact p-value.

    Parameters
    ----------
    cc : pandas.DataFrame
        Case-control table with 'pos_id' and 'CaseControl' columns.
    sge : pandas.DataFrame
        SGE table with 'pos_id' and 'Classification' columns.

    Returns
    -------
    None. The contingency table and summary are displayed and the statistics printed.

    Raises
    ------
    ValueError
        If no case-control record matches an SGE variant classified NF or F.
    """
    # Carry over only the SGE class, under an explicit name. Relying on the
    # merge-generated 'Classification_y' only works when cc also happens to
    # have a 'Classification' column.
    sge_classes = sge[['pos_id', 'Classification']].rename(
        columns={'Classification': 'sge_class'}
    )
    merged = pd.merge(cc, sge_classes, on='pos_id', how='inner')

    # Only NF and F take part in the 2x2 test; unclassified rows (None) and any
    # other label are excluded.
    merged = merged[merged['sge_class'].isin(['NF', 'F'])]
    if merged.empty:
        raise ValueError(
            "No case-control records match an SGE variant classified as 'NF' or 'F'."
        )

    contingency_tab = pd.crosstab(
        merged['CaseControl'], merged['sge_class'], colnames=['Classification']
    )
    # Fix the layout explicitly instead of reversing alphabetical order:
    # rows Case/Control, columns NF/F. A combination that never occurs becomes
    # a 0 count rather than a missing row or column.
    contingency_tab = contingency_tab.reindex(
        index=['Case', 'Control'], columns=['NF', 'F'], fill_value=0
    )
    table_array = contingency_tab.values

    # p-value from Fisher's exact test; odds ratio and its CI from Table2x2.
    _, p_value = stats.fisher_exact(table_array)
    table = sm.stats.Table2x2(table_array)
    ci_low, ci_high = table.oddsratio_confint()

    display(contingency_tab)  # contingency table
    display(table.summary())  # summary table with odds and risk ratios

    print(f"Odds Ratio: {table.oddsratio:.2f}")
    print(f"95% CI: ({ci_low:.2f}, {ci_high:.2f})")
    print(f"P-value (Fisher's exact): {p_value:.4f}")

In [None]:
def main():
    """Run the whole analysis: load both workbooks, classify the SGE variants,
    build position IDs for the case-control records, then run the association test.

    Reads the module-level paths `sge` and `cc` and the cutoffs `path_max` and
    `benign_min`. Only the full tables are passed on; the F/NF and case/control
    subsets returned by the helpers are not needed here.
    """
    scores_raw, cohort_raw = read_data(sge, cc)
    classified = classify_vars(scores_raw, path_max, benign_min)[0]
    cohort = process_cc(cohort_raw)[0]
    analyze(cohort, classified)

In [None]:
# Run the full pipeline; analyze() displays the contingency table and prints the statistics.
main()