In [None]:
import pandas as pd
import altair as alt

In [None]:
# Input workbooks; main() hands both paths to read_data()
cc = "/Users/ivan/Downloads/7_genes_for_Starita_02282025_hg38.xlsx"  # case-control calls. NOTE(review): user-specific absolute path
sge = "../Data/20250122_BARD1_SGEscores_wAAsub.xlsx"  # SGE scores

# Functional-class score cutoffs; main() hands both to classify_vars()
benign_min = 0.807231141721117  # snv_score >= benign_min is labelled 'F'
path_max = 0.689682159032362  # snv_score <= path_max is labelled 'NF'

In [None]:
def read_data(sge, cc):
    """Load the SGE score table and the BARD1 case-control table from Excel.

    Parameters
    ----------
    sge : path of the SGE score workbook, passed to ``pd.read_excel``.
    cc : path of the case-control workbook, passed to ``pd.read_excel``.

    Returns
    -------
    (scores, bard1_calls)
        ``scores`` is the SGE table without the three replicate score columns.
        ``bard1_calls`` is an independent copy of the case-control rows whose
        ``CAVA_GENE`` is 'BARD1', restricted to the columns listed below.
    """
    replicate_cols = ['R1_score', 'R2_score', 'R3_score']
    keep_cols = [
        'Classification', '#CHROM', 'REF', 'ALT', 'CAVA_GENE',
        'CAVA_CSN', 'CAVA_SO', 'Sample_ID', 'CaseControl', 'hg38_start',
    ]

    # SGE scores: only the per-replicate columns are discarded
    scores = pd.read_excel(sge).drop(columns=replicate_cols)

    # Case-control calls: keep BARD1 rows and the columns used downstream
    all_calls = pd.read_excel(cc)
    is_bard1 = all_calls['CAVA_GENE'].isin(['BARD1'])
    bard1_calls = all_calls.loc[is_bard1, keep_cols].copy()

    return scores, bard1_calls

In [None]:
def classify_vars(sge, path_max, benign_min):
    """Label every SGE variant by score class and return the two extreme classes.

    Adds (or overwrites) a 'Classification' column on ``sge`` in place:

    * 'NF' where ``snv_score <= path_max``
    * 'F'  where ``snv_score >= benign_min``
    * 'I'  for every other row, including rows whose score is missing

    (Presumably non-functional / functional / intermediate -- confirm naming.)

    Parameters
    ----------
    sge : DataFrame with an ``snv_score`` column; mutated in place.
    path_max : upper score bound (inclusive) for 'NF'.
    benign_min : lower score bound (inclusive) for 'F'.

    Returns
    -------
    (sge_func, sge_nf)
        The rows labelled 'F' and the rows labelled 'NF', in that order.
    """
    scores = sge['snv_score']

    # Start from 'I' and overwrite the two tails. The earlier approach filled
    # the column with None and then tested `== None`; pandas evaluates that
    # comparison element-wise to False for every row, so 'I' was never set.
    sge['Classification'] = 'I'
    sge.loc[scores <= path_max, 'Classification'] = 'NF'
    # Applied last, so 'F' wins if the two cutoffs ever overlap
    sge.loc[scores >= benign_min, 'Classification'] = 'F'

    sge_nf = sge[sge['Classification'] == 'NF']
    sge_func = sge[sge['Classification'] == 'F']

    return sge_func, sge_nf

In [None]:
def process_cc(cc):
    """Attach a position ID to the case-control table and split it by status.

    ``cc`` is modified in place: ``hg38_start`` is converted to ``str`` and a
    ``pos_id`` column of the form ``'<hg38_start>:<ALT>'`` is written.

    Returns
    -------
    (controls, cases)
        Rows whose ``CaseControl`` is 'Control', then rows whose
        ``CaseControl`` is 'Case'. Both include the new ``pos_id`` column.
    """
    # String coordinates are needed so they can be concatenated with the ALT allele
    coordinate = cc['hg38_start'].astype(str)
    cc['hg38_start'] = coordinate
    cc['pos_id'] = coordinate + ':' + cc['ALT']

    # Split only after pos_id exists so both subsets carry it
    status = cc['CaseControl']
    controls = cc[status.isin(['Control'])]
    cases = cc[status.isin(['Case'])]

    return controls, cases

In [None]:
def _matched_fraction(group, variants):
    """Return len(inner merge of group and variants on 'pos_id') / len(group), or NaN if group is empty."""
    total = len(group)
    if total == 0:
        # No observations: the fraction is undefined, so report NaN instead of
        # raising ZeroDivisionError
        return float('nan')
    matched = pd.merge(group, variants, on='pos_id', how='inner')
    return len(matched) / total


def analyze(controls, cases, sge_func, sge_nf):
    """Print and return the share of control and case rows matching each SGE class.

    Each value is the row count of an inner merge on ``pos_id`` divided by the
    row count of the group, i.e. a proportion rather than an odds.
    NOTE(review): the numerator counts merged rows, so a ``pos_id`` that
    appears more than once in the SGE table is counted once per match --
    confirm ``pos_id`` is unique there.

    Parameters
    ----------
    controls, cases : DataFrames with a ``pos_id`` column.
    sge_func, sge_nf : SGE variant tables with a ``pos_id`` column (the
        classify_vars outputs when called from main()).

    Returns
    -------
    dict
        Keys 'control_nf', 'control_func', 'case_nf', 'case_func'. A value is
        NaN when the corresponding group has no rows.

    Two lines are printed: the control fractions (NF, then F) followed by the
    case fractions (NF, then F).
    """
    control_nf = _matched_fraction(controls, sge_nf)
    control_func = _matched_fraction(controls, sge_func)
    print(control_nf, control_func)

    case_nf = _matched_fraction(cases, sge_nf)
    case_func = _matched_fraction(cases, sge_func)
    print(case_nf, case_func)

    return {
        'control_nf': control_nf,
        'control_func': control_func,
        'case_nf': case_nf,
        'case_func': case_func,
    }

In [None]:
def main():
    """Run the whole analysis using the module-level paths and cutoffs.

    Steps, in order: load both workbooks, classify the SGE variants, build
    position IDs and split the case-control rows, then hand everything to
    analyze(). Nothing is returned.
    """
    scores, case_control = read_data(sge, cc)
    functional, nonfunctional = classify_vars(scores, path_max, benign_min)
    control_rows, case_rows = process_cc(case_control)
    analyze(control_rows, case_rows, functional, nonfunctional)

In [None]:
# Run the pipeline end to end; the resulting fractions are printed by analyze()
main()