In [None]:
import pandas as pd
import matplotlib as plt
import altair as alt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.stats import gaussian_kde

In [None]:
# ---- Input files (paths are relative to the notebook's working directory) ----
gnomAD = '../Data/20240905_BARD1_gnomADv4.1.0_SNVs.xlsx'  # gnomAD export, read with pd.read_excel
sge = '../Data/20250122_BARD1_PillarProjectScores_filtered.xlsx'  # SGE scores, read with pd.read_excel
file = '../Data/20250128_BARD1_ClinVar_SNVsOnly_1Starplus.txt'  # ClinVar export, tab-delimited

# ---- GMM cutoffs on the SGE score ----
# merge() labels scores below path_max 'Pathogenic' and scores above
# benign_min 'Benign'; anything in between is 'Intermediate'.
benign_min = 0.807231141721117
path_max = 0.689682159032362

In [None]:
def read_data(file):
    """Load the tab-delimited ClinVar export and keep the columns used downstream.

    Parameters
    ----------
    file : str or path-like
        Path to the ClinVar tabular .txt file.

    Returns
    -------
    pandas.DataFrame
        Columns 'Name', 'Protein change', 'GRCh38Chromosome', 'GRCh38Location'
        (as int), 'Germline classification', plus an empty 'Base Change'
        placeholder column. Rows without a GRCh38 coordinate are removed and
        the index is renumbered 0..n-1.
    """
    keep = ['Name', 'Protein change', 'GRCh38Chromosome', 'GRCh38Location', 'Germline classification']
    df = pd.read_csv(file, delimiter='\t')[keep]

    # Drop variants without a genomic coordinate, then renumber the rows:
    # leaving gaps in the index breaks any later positional-style lookup
    # such as df['Name'][k] for k in range(len(df)).
    df = df.dropna(subset=['GRCh38Location']).reset_index(drop=True)

    df['GRCh38Location'] = df['GRCh38Location'].astype(int)  # coordinates as integers
    df['Base Change'] = None  # filled in later by get_base_changes
    return df

In [None]:
def get_pair(base):
    """Return the complementary base for `base`.

    Per the original author's note, ClinVar reports the base change on the
    negative-sense strand while the SGE pos_id uses the positive-sense strand,
    so the alternate base has to be complemented before the two can be joined.

    'A', 'T' and 'C' map to 'T', 'A' and 'G'. Every other input (including
    'G') returns 'C'.
    """
    complement = {'A': 'T', 'T': 'A', 'C': 'G'}
    # Callers pass arbitrary characters here, so the catch-all default matters.
    return complement.get(base, 'C')

In [None]:
def prep_sge(sge):
    """Read the SGE score workbook and keep in-range scores.

    Parameters
    ----------
    sge : str or path-like
        Path to the SGE Excel file.

    Returns
    -------
    pandas.DataFrame
        Columns 'target', 'Consequence', 'pos_id', 'snv_score', restricted to
        rows whose snv_score lies strictly between -3 and 2.
    """
    wanted = ['target', 'Consequence', 'pos_id', 'snv_score']
    scores = pd.read_excel(sge)[wanted]

    # Both bounds are exclusive; missing scores fail both comparisons and drop out.
    above_floor = scores['snv_score'].gt(-3)
    below_ceiling = scores['snv_score'].lt(2)
    return scores.loc[above_floor & below_ceiling]

In [None]:
def get_base_changes(df):
    """Fill 'Base Change' with a pos_id key in the format used by the SGE file.

    For every row the variant name is searched for a substitution marker
    ('>' with at least one character on each side). The character following
    the last such marker is complemented with get_pair and joined to the
    GRCh38 coordinate as '<coordinate>:<base>'.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Name' and 'GRCh38Location' columns. An existing 'Base Change'
        column is kept for rows where no marker is found.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, updated in place.

    Notes
    -----
    Rows are processed by position, so the frame's index labels do not
    matter, and each row is written independently of any other row that
    happens to share the same Name. Names that are not strings (for example
    missing values) are skipped.
    """
    names = df['Name'].tolist()
    coords = df['GRCh38Location'].tolist()
    if 'Base Change' in df.columns:
        changes = df['Base Change'].tolist()
    else:
        changes = [None] * len(df)

    for row, (name, coord) in enumerate(zip(names, coords)):
        if not isinstance(name, str):
            continue
        # Last '>' that has a neighbour on both sides, i.e. positions 1..len-2.
        marker = name.rfind('>', 1, len(name) - 1)
        if marker == -1:
            continue
        changes[row] = str(coord) + ":" + get_pair(name[marker + 1])

    df['Base Change'] = changes
    return df

In [None]:
def merge(clin,sge, nf_cutoff, func_cutoff):
    """Join ClinVar variants to SGE scores and label each by functional class.

    Parameters
    ----------
    clin : pandas.DataFrame
        ClinVar variants with a 'Base Change' key column.
    sge : pandas.DataFrame
        SGE scores with 'pos_id', 'snv_score', 'target' and 'Consequence'.
    nf_cutoff : float
        Highest score still labelled 'Pathogenic'.
    func_cutoff : float
        Lowest score still labelled 'Benign'.

    Returns
    -------
    pandas.DataFrame
        One row per matched variant with columns 'target', 'Name',
        'Protein change', 'Base Change', 'Consequence', 'SGE Score',
        'Germline classification', 'Function Type'.

    Notes
    -----
    Scores exactly equal to a cutoff are assigned to the outer class
    (<= nf_cutoff is 'Pathogenic', >= func_cutoff is 'Benign') instead of
    being left unlabelled. Missing scores keep Function Type None.
    """
    df = pd.merge(clin, sge, left_on = 'Base Change', right_on = 'pos_id', how = 'inner')

    scores = df['snv_score']
    df['Function Type'] = None
    # Written lowest-priority first so that, if the cutoffs ever overlap, the
    # precedence is Pathogenic > Intermediate > Benign.
    df.loc[scores >= func_cutoff, 'Function Type'] = 'Benign'
    df.loc[(scores > nf_cutoff) & (scores < func_cutoff), 'Function Type'] = 'Intermediate'
    df.loc[scores <= nf_cutoff, 'Function Type'] = 'Pathogenic'

    # Final column order for the merged table.
    ordered = ['target', 'Name', 'Protein change', 'Base Change', 'Consequence', 'snv_score', 'Germline classification', 'Function Type']
    return df[ordered].rename(columns = {'snv_score': 'SGE Score'})

In [None]:
def _sge_score_histogram(data, title, nf_line, func_line):
    """Build one binned SGE-score histogram coloured by ClinVar class, layered with the two cutoff rules."""
    ticks = list(range(-3,3))
    bars = alt.Chart(data).mark_bar().encode(
        alt.X('SGE Score', axis = alt.Axis(values = ticks, title = 'SGE Score', labelFontSize = 16, titleFontSize = 20), bin = alt.Bin(maxbins = 50),
             scale = alt.Scale(domain = [-3,2])),
        alt.Y('count()', axis = alt.Axis(title = 'Number of Variants', labelFontSize = 16, titleFontSize = 20)),
        color = alt.Color('Germline classification:N')
    ).properties(
        width = 553,
        height = 322,
        title = alt.TitleParams(text = title, fontSize = 22)
    )
    return nf_line + func_line + bars


def histogram(df, nf_cutoff, func_cutoff):
    """Show two histograms of SGE score for the ClinVar variants in `df`.

    The first chart covers P/LP and B/LB variants, the second covers VUS and
    conflicting variants. Both carry a red rule at `nf_cutoff` and a blue
    rule at `func_cutoff`.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'SGE Score' and 'Germline classification' columns. It is not
        modified: each subset is copied before its labels are shortened.
    nf_cutoff, func_cutoff : float
        x positions of the two vertical rules.
    """
    nf_line = alt.Chart(pd.DataFrame({'x': [nf_cutoff]})).mark_rule(color = 'red').encode(
        x = 'x')
    func_line = alt.Chart(pd.DataFrame({'x': [func_cutoff]})).mark_rule(color = 'blue').encode(
        x = 'x')

    label_col = 'Germline classification'

    # P/LP and B/LB variants; the combined ClinVar labels are folded into the "likely" class.
    plp = ['Benign', 'Benign/Likely benign', 'Likely benign', 'Pathogenic', 'Likely pathogenic', 'Pathogenic/Likely pathogenic']
    p_df = df.loc[df[label_col].isin(plp)].copy()  # copy so the relabelling never touches df
    p_df.loc[p_df[label_col] == 'Benign/Likely benign', label_col] = 'Likely benign'
    p_df.loc[p_df[label_col] == 'Pathogenic/Likely pathogenic', label_col] = 'Likely pathogenic'
    plp_hist = _sge_score_histogram(
        p_df, f'P/LP and B/LB Variants vs. SGE Score (n= {len(p_df)})', nf_line, func_line)
    plp_hist.show()

    # VUS and conflicting variants, with the long conflicting label shortened.
    vus = ['Uncertain significance', 'Conflicting classifications of pathogenicity']
    v_df = df.loc[df[label_col].isin(vus)].copy()
    v_df.loc[v_df[label_col] == 'Conflicting classifications of pathogenicity', label_col] = 'Conflicting'
    vus_hist = _sge_score_histogram(
        v_df, f'VUS and Conflicting Variants vs. SGE Score (n={len(v_df)})', nf_line, func_line)
    vus_hist.show()



In [None]:
def merge_gnomAD(sge_df, clinvar_df, file):
    """Attach SGE scores to gnomAD variants and append them to the ClinVar table.

    Used to build the gnomAD track of the ridgeline plot.

    Parameters
    ----------
    sge_df : pandas.DataFrame
        SGE scores with 'pos_id' and 'snv_score' columns.
    clinvar_df : pandas.DataFrame
        ClinVar/SGE table (as returned by merge) that the gnomAD rows are
        stacked on top of.
    file : str or path-like
        gnomAD Excel export with a 'gnomAD ID' column. IDs are expected to be
        dash-separated with the position second and the alternate allele
        fourth (e.g. '2-214728633-T-C') -- TODO confirm against the export.

    Returns
    -------
    pandas.DataFrame
        gnomAD rows (Germline classification == 'gnomAD', score column named
        'SGE Score') followed by the ClinVar rows, with a fresh 0..n-1 index.

    Notes
    -----
    The ID is split on '-' rather than sliced at fixed character offsets, so
    it no longer depends on the chromosome or position width. Repeated or
    unparseable IDs are dropped so each variant contributes one row.
    """
    raw_gnomAD = pd.read_excel(file) #Reads gnomAD output

    # '<chrom>-<pos>-<ref>-<alt>'  ->  '<pos>:<alt>', the SGE pos_id format.
    id_parts = raw_gnomAD['gnomAD ID'].astype(str).str.split('-')
    pos_ids = (id_parts.str[1] + ':' + id_parts.str[3]).dropna().drop_duplicates()

    gnomad_vars = pd.DataFrame({'gnomAD ID': pos_ids})
    gnomad_vars['Germline classification'] = 'gnomAD' #Sets 'classification' to gnomAD

    gnomad_sge = pd.merge(sge_df, gnomad_vars, left_on = 'pos_id', right_on = 'gnomAD ID', how = 'inner')
    gnomad_sge = gnomad_sge.rename(columns = {'snv_score': 'SGE Score'}) #same column name as the ClinVar-SGE table

    # ignore_index avoids repeated row labels in the stacked result.
    return pd.concat([gnomad_sge, clinvar_df], ignore_index = True)
    

In [None]:
def rename_germline(df):
    """Return a copy of `df` with ClinVar germline classes shortened for plotting.

    Pathogenic-type labels become 'P/LP', benign-type labels become 'B/LB',
    'Uncertain significance' becomes 'VUS' and 'Conflicting classifications
    of pathogenicity' becomes 'Conflicting'. Any other value is left alone,
    and the input frame is not modified.
    """
    column = 'Germline classification'
    short_labels = {
        'P/LP': ['Pathogenic', 'Likely pathogenic', 'Pathogenic/Likely pathogenic'],
        'B/LB': ['Benign', 'Likely benign', 'Benign/Likely benign'],
        'VUS': ['Uncertain significance'],
        'Conflicting': ['Conflicting classifications of pathogenicity'],
    }

    relabeled = df.copy()  # keep the caller's frame untouched
    for short, originals in short_labels.items():
        relabeled.loc[relabeled[column].isin(originals), column] = short
    return relabeled

In [None]:
def compute_density(df, category):
    """Estimate the SGE-score density for one classification category.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single category, with an 'SGE Score' column.
    category : str
        Label written to the 'Germline classification' column of the result
        and used in warning messages.

    Returns
    -------
    pandas.DataFrame or None
        200 rows with columns 'SGE_score' (evenly spaced between the smallest
        and largest score), 'density' and 'Germline classification'. None,
        after printing a warning, when there are no rows, fewer than two
        non-missing scores, or all scores are identical (a Gaussian KDE
        cannot be fitted to zero-variance data).
    """
    if len(df) == 0:
        print(f"Warning: No data found for category {category}")
        return None

    # dropna + explicit float conversion also copes with object-dtype columns holding None.
    valid_values = df['SGE Score'].dropna().to_numpy(dtype=float)
    if len(valid_values) <= 1:
        print(f"Warning: Not enough valid data points for category {category}")
        return None

    lowest, highest = valid_values.min(), valid_values.max()
    if lowest == highest:
        print(f"Warning: All scores are identical for category {category}; density not estimated")
        return None

    density = gaussian_kde(valid_values)
    xs = np.linspace(lowest, highest, 200)
    ys = density(xs)

    # Note the column is 'SGE_score' (underscore), distinct from the raw 'SGE Score' column.
    return pd.DataFrame({'SGE_score': xs, 'density': ys, 'Germline classification': category})


In [None]:
def ridgeline(df,rawdf):
    """Render the ridgeline plot: one density row per germline classification.

    Parameters
    ----------
    df : pandas.DataFrame
        Density table with 'SGE_score', 'density' and 'Germline classification'.
    rawdf : pandas.DataFrame
        Per-variant table with 'SGE Score', 'target', 'Consequence' and
        'Germline classification'; drawn as tick marks.
    """
    # Faceting requires a single data source, so the density rows and the raw
    # rows are stacked into one frame; rows without a classification are dropped.
    plot_data = pd.concat([df, rawdf], ignore_index = True)
    plot_data = plot_data.dropna(subset = ['Germline classification'])

    # Density curves, drawn from the 'SGE_score' / 'density' columns.
    density_x = alt.X('SGE_score:Q', axis = alt.Axis(title = 'SGE Score', titleFontSize = 20, labelFontSize = 16))
    density_layer = alt.Chart(plot_data).mark_area(
        interpolate = 'monotone',
        line = True
    ).encode(
        x = density_x,
        y = alt.Y('density:Q', axis = None),
        color = alt.Color('Germline classification:N', legend = None)
    )

    # One tick per variant, drawn from the raw 'SGE Score' column at a fixed y pixel value.
    tick_tooltip = [
        alt.Tooltip('target', title = 'SGE Region: '),
        alt.Tooltip('Consequence', title = 'Consequence: '),
        alt.Tooltip('SGE Score:Q', title = 'Score: '),
    ]
    tick_layer = alt.Chart(plot_data).mark_tick(
        color = 'black',
        thickness = 1,
        size = 5
    ).encode(
        x = alt.X('SGE Score:Q', title = ''),
        y = alt.value(77.5),
        tooltip = tick_tooltip
    )

    layered = alt.layer(tick_layer, density_layer).properties(
        width = 1000,
        height = 75
    )

    # One facet row per germline classification, in a fixed order.
    row_order = ['P/LP', 'B/LB', 'gnomAD', 'VUS', 'Conflicting']
    faceted = layered.facet(
        row = alt.Row('Germline classification:N', title = '', sort = row_order),
        spacing = 2
    ).properties(
        title = '',
        bounds = 'flush'
    )

    styled = faceted.configure_facet(spacing = 1)
    styled = styled.configure_header(titleFontSize = 20, labelFontSize = 16)
    styled = styled.configure_title(anchor = 'start')
    styled = styled.configure_axis(grid = False)
    styled = styled.configure_view(stroke = None)

    styled.show()

In [None]:
def roc_df(df):
    """Build the concordance table comparing ClinVar calls with SGE calls.

    Only P/LP and B/LB variants are kept. Each is encoded numerically and
    flagged as concordant when the two encodings agree.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Base Change', 'Germline classification' and 'Function Type'.

    Returns
    -------
    pandas.DataFrame
        Index 0..n-1 with columns 'Base Change', 'Germline classification',
        'Function Type' and three float columns:
        'Germline Num' (1 = benign-type ClinVar label, 0 = pathogenic-type),
        'SGE Num' (0 = 'Pathogenic', 1 = 'Benign' or 'Intermediate', NaN
        otherwise) and 'target' (1 when the two agree, else 0; a missing
        'SGE Num' counts as disagreement).

    Notes
    -----
    'target' is computed per row, so rows that share a 'Base Change' value
    no longer overwrite one another.
    """
    benign_labels = ['Benign', 'Benign/Likely benign', 'Likely benign']
    pathogenic_labels = ['Pathogenic', 'Likely pathogenic', 'Pathogenic/Likely pathogenic']

    classified = df['Germline classification'].isin(benign_labels + pathogenic_labels)
    nv_df = df.loc[classified, ['Base Change', 'Germline classification', 'Function Type']].reset_index(drop = True)

    # Every remaining row is benign-type or pathogenic-type, so one membership test is enough.
    nv_df['Germline Num'] = nv_df['Germline classification'].isin(benign_labels).astype(float)

    # 'Intermediate' is grouped with 'Benign'; unlabelled variants become NaN.
    sge_codes = {'Pathogenic': 0.0, 'Benign': 1.0, 'Intermediate': 1.0}
    nv_df['SGE Num'] = nv_df['Function Type'].map(sge_codes).astype(float)

    # NaN never compares equal, so missing SGE calls are scored as discordant.
    nv_df['target'] = (nv_df['Germline Num'] == nv_df['SGE Num']).astype(float)
    return nv_df
    

In [None]:
def roc_qc(df):
    """Print how many rows of `df` are concordant (target == 1) and how many are not.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'target' column. Any value other than 1 counts as discordant.
        The count is positional, so the frame's index labels do not matter.
    """
    concordant = int((df['target'] == 1).sum())
    discordant = len(df) - concordant
    print('Concordant: ' + str(concordant))
    print('Discordant: ' + str(discordant))
In [None]:
def concor_stats(df):
    """Print concordance counts overall and separately for P/LP and B/LB variants.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'target' (1 = concordant) and 'Germline classification'.
        Rows whose classification is neither pathogenic-type nor benign-type
        are ignored; previously such a row stalled the loop forever.
    """
    plp_labels = ['Pathogenic', 'Likely pathogenic', 'Pathogenic/Likely pathogenic']
    blb_labels = ['Benign', 'Likely benign', 'Benign/Likely benign']

    concordant = df['target'] == 1
    is_plp = df['Germline classification'].isin(plp_labels)
    is_blb = df['Germline classification'].isin(blb_labels)

    plp_total = int(is_plp.sum())
    plp_concor = int((is_plp & concordant).sum())
    blb_total = int(is_blb.sum())
    blb_concor = int((is_blb & concordant).sum())

    total = plp_total + blb_total
    total_concor = plp_concor + blb_concor

    print(str(total_concor),' of ', str(total), ' variants concordant')
    print(str(plp_concor), ' of ', str(plp_total), ' P/LP variants concordant')
    print(str(blb_concor), ' of ', str(blb_total), ' B/LB variants concordant')

In [None]:
def roc(df):
    """Fit a logistic regression on the concordance table and show its precision-recall curve.

    Despite the name, this draws a precision-recall curve, not a ROC curve.
    The two categorical columns are one-hot encoded, the data are split
    70/30 with a fixed seed, and the printed 'AUC' is the area under the
    precision-recall curve.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Germline classification', 'Function Type' and 'target'.
    """
    X = df[['Germline classification', 'Function Type']]
    y = df['target']

    # One-hot encode the categorical features. handle_unknown='ignore' keeps
    # prediction from failing when the test split contains a category that
    # never appeared in the training split.
    column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Germline classification', 'Function Type'])
    ],
    remainder='passthrough'
    )

    pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('classifier', LogisticRegression())
    ])

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    print("Training set size:", X_train.shape)
    print("Testing set size:", X_test.shape)

    # Train a classification model
    pipeline.fit(X_train, y_train)

    # Predicted probability of the positive (concordant) class for the test set
    y_prob = pipeline.predict_proba(X_test)[:, 1]

    # Precision-recall curve, its area, and the average precision
    precision, recall, thresholds = precision_recall_curve(y_test,y_prob)
    pr_auc = auc(recall, precision)
    print('AUC: ', str(pr_auc))
    average_precision = average_precision_score(y_test, y_prob)

    # thresholds is one element shorter than precision/recall, so pad it with 1.
    to_plot = pd.DataFrame({'Precision': precision, 'Recall': recall, 'Threshold': np.append(thresholds, 1)})

    pr_chart = alt.Chart(to_plot).mark_line(color='darkorange').encode(
    x=alt.X('Recall:Q', scale=alt.Scale(domain=[0, 1.05])),
    y=alt.Y('Precision:Q', scale=alt.Scale(domain=[0, 1.05])),
    tooltip=['Recall:Q', 'Precision:Q', 'Threshold:Q']
    ).properties(
    title=f'Precision-Recall Curve (AP = {average_precision:.2f})'
    )

    pr_chart.show()

In [None]:
def main():
    """Run the full ClinVar / SGE / gnomAD comparison.

    Reads the three input files named by the module-level `file`, `sge` and
    `gnomAD` paths, prints the ClinVar variant count and the concordance
    statistics, then shows the two histograms and the ridgeline plot.
    """
    alt.data_transformers.disable_max_rows()

    # ClinVar variants, keyed by pos_id
    clinvar_raw = read_data(file)
    print('ClinVar Variants: ', len(clinvar_raw))
    clinvar_keyed = get_base_changes(clinvar_raw)

    # Join with SGE scores, then add the gnomAD rows
    sge_scores = prep_sge(sge)
    clinvar_scored = merge(clinvar_keyed, sge_scores, path_max, benign_min)
    all_variants = merge_gnomAD(sge_scores, clinvar_scored, gnomAD)
    relabeled = rename_germline(all_variants)

    # Concordance between ClinVar and SGE calls
    concordance = roc_df(all_variants)
    concor_stats(concordance)

    histogram(clinvar_scored, path_max, benign_min)

    # One density table per classification, stacked for the ridgeline plot
    per_category = []
    for category in relabeled['Germline classification'].unique():
        members = relabeled[relabeled['Germline classification'] == category]
        per_category.append(compute_density(members, category))
    density_data = pd.concat(per_category)

    ridgeline(density_data, relabeled)
    # Optional diagnostics, currently disabled: roc(concordance), roc_qc(concordance)
    

In [None]:
# Run the whole pipeline: prints the variant and concordance counts, then shows the charts.
main()