In [None]:
import pandas as pd

In [None]:
# SGE score spreadsheet consumed by the processing step (relative path).
sge_scores = "../Data/20250508_BARD1scores_update_FILTERED.xlsx"

In [None]:
def process_scores(file): #Reads and processes score file
    """
    Load an SGE score spreadsheet and keep only the functionally abnormal variants.

    Parameters:
    -----------
    file : str or path-like
        Path to the Excel score file. It must provide the columns
        simplified_consequence, functional_consequence, chrom, pos, ref, allele.

    Returns:
    --------
    pandas.DataFrame
        Columns chrom, pos, ref, alt (renamed from allele), one row per retained
        variant. The original row index is preserved.
    """
    scores = pd.read_excel(file) #Reads the file that was passed in, not the notebook-level global

    wanted_consequences = ['missense_variant', 'synonymous_variant', 'splicing_variant'] #Pick desired variants here
    is_wanted = scores['simplified_consequence'].isin(wanted_consequences)
    is_abnormal = scores['functional_consequence'].isin(['functionally_abnormal']) #Selects for only abnormal variants

    # .copy() so the column assignment below acts on an independent frame
    # instead of a slice of `scores` (avoids SettingWithCopyWarning).
    variants = scores.loc[is_wanted & is_abnormal, ['chrom', 'pos', 'ref', 'allele']].copy()

    # Drops chr from chromosome name; regex=False keeps this a literal replacement on every pandas version
    variants['chrom'] = variants['chrom'].astype(str).str.replace('chr', '', regex=False)

    return variants.rename(columns = {'allele': 'alt'}) #Renames allele column

In [None]:
def dataframe_to_vcf(df, output_file): #Builds VCF
    """
    Create a minimal VCF file from a pandas DataFrame

    All validation and normalisation happens before the output file is opened,
    so invalid input never leaves a partially written VCF behind.

    Parameters:
    -----------
    df : pandas.DataFrame
        Must contain columns: chrom, pos, ref, alt
    output_file : str
        Path to output VCF file

    Raises:
    -------
    ValueError
        If a required column is absent, if any required column contains
        missing values, or if pos holds non-numeric or fractional values.

    Example:
    --------
    df = pd.DataFrame({
        'chrom': ['2', '2', '2'],
        'pos': [214796956, 214796957, 214796958],
        'ref': ['G', 'A', 'A'],
        'alt': ['A', 'C', 'G']
    })
    dataframe_to_vcf(df, 'output.vcf')
    """

    # Validate required columns
    required = ['chrom', 'pos', 'ref', 'alt']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Reject missing values up front rather than failing halfway through the write
    incomplete = [col for col in required if df[col].isna().any()]
    if incomplete:
        raise ValueError(f"Missing values in required columns: {incomplete}")

    # POS must be written as an integer. A float column (e.g. 214796956.0)
    # would otherwise be emitted with a trailing '.0'.
    positions = pd.to_numeric(df['pos'])
    if (positions % 1 != 0).any():
        raise ValueError("Column 'pos' must contain whole-number positions")
    positions = positions.astype('int64')

    chroms = df['chrom'].astype(str).str.replace('chr', '', regex=False)  # Remove 'chr' prefix if present
    refs = df['ref'].astype(str).str.upper()
    alts = df['alt'].astype(str).str.upper()

    with open(output_file, 'w') as f:
        # Write minimal VCF header
        f.write("##fileformat=VCFv4.3\n")
        f.write("##reference=GRCh38\n")
        f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")

        # Write variants (ID, QUAL and INFO are left as '.', FILTER is PASS)
        for chrom, pos, ref, alt in zip(chroms, positions, refs, alts):
            f.write(f"{chrom}\t{pos}\t.\t{ref}\t{alt}\t.\tPASS\t.\n")

    print(f"VCF file written to: {output_file}")
    print(f"Total variants: {len(df)}")

In [None]:
def main(scores_file=None, output_file='/Users/ivan/Desktop/20250707_BARD1_Abnormal_MisSyn_wSpliceRegion.vcf'):
    """
    Run the notebook pipeline: process the score file, write the VCF, print the variants.

    Parameters:
    -----------
    scores_file : str, optional
        Path handed to `process_scores`. Defaults to the notebook-level
        `sge_scores` setting.
    output_file : str, optional
        Destination handed to `dataframe_to_vcf`. The default is the original
        hard-coded location; pass a different path when running on another machine.
    """
    if scores_file is None:
        # Looked up at call time so re-running the config cell takes effect
        scores_file = sge_scores

    df = process_scores(scores_file)
    dataframe_to_vcf(df, output_file)
    print(df)

In [None]:
# Run the pipeline: process the score file, write the VCF, and print the selected variants.
main()