In [5]:
import pandas as pd

# Scan an annotated, tab-separated VCF-style file, keep the variants whose
# CLNSIG annotation is one of the clinically relevant terms, then tabulate,
# save and print them.

# Define relevant clinical significance terms
clinically_relevant_terms = {'Pathogenic', 'Likely_pathogenic', 'Risk_factor', 'association'}
# CLNSIG capitalisation is not uniform (ClinVar writes `risk_factor` in
# lowercase, for example), so terms are compared case-insensitively.
# NOTE(review): only whole-value matches are kept; combined values such as
# `Pathogenic/Likely_pathogenic` are still excluded - confirm that is intended.
relevant_terms_normalized = {term.lower() for term in clinically_relevant_terms}

vcf_file = 'clinically_relevant_snps_filtered.txt'
clinically_relevant_snps = []

# The genotype is read from column index 9, so a data row needs at least
# this many tab-separated columns.
min_columns = 10

with open(vcf_file, 'r') as file:
    for line_number, line in enumerate(file, start=1):
        # Header and meta-information lines carry no variant records.
        if line.startswith('#'):
            continue
        line = line.strip()
        # Blank lines (e.g. a trailing newline at end of file) are not records.
        if not line:
            continue
        parts = line.split('\t')
        if len(parts) < min_columns:
            raise ValueError(
                f"{vcf_file}: line {line_number} has {len(parts)} tab-separated "
                f"columns, expected at least {min_columns}"
            )
        chrom, pos, rsid, ref, alt = parts[:5]
        # NOTE(review): column index 9 is stored verbatim as the genotype;
        # this assumes the sample field holds only the GT value - confirm.
        info, genotype = parts[7], parts[9]

        # INFO is a `;`-separated list of `key=value` pairs. Entries without
        # `=` are ignored, and only the first `=` separates key from value so
        # that values which themselves contain `=` are kept whole.
        info_fields = dict(kv.split('=', 1) for kv in info.split(';') if '=' in kv)
        clinical_significance = info_fields.get('CLNSIG', 'unknown')

        # Check if clinical significance is relevant
        if clinical_significance.lower() in relevant_terms_normalized:
            disease = info_fields.get('CLNDN', 'not_specified')
            gene = info_fields.get('GENEINFO', 'not_specified')
            clinically_relevant_snps.append(
                [chrom, pos, rsid, ref, alt, genotype, clinical_significance, disease, gene]
            )

# Create DataFrame
columns = ['Chromosome', 'Position', 'rsID', 'REF', 'ALT', 'Genotype', 'Clinical_Significance', 'Disease', 'Gene']
clinical_df = pd.DataFrame(clinically_relevant_snps, columns=columns)

# Save results
# NOTE(review): the file is tab-separated despite its .csv extension; pass
# sep='\t' when reading it back.
clinical_df.to_csv('clinically_relevant_snps.csv', sep='\t', index=False)

# Display DataFrame
print(clinical_df)



   Chromosome   Position               rsID REF ALT Genotype  \
0           1   31349647    rs2491132;12756   C   T      0/1   
1           2   33682737   rs2124437;236028   G   T      0/1   
2           2  103092513  rs1468788;1120031   C   T      0/1   
3           2  103125984  rs4851608;1120028   C   T      0/1   
4           4    9966380     rs7442295;4593   A   G      0/1   
5           4  145460230  rs13147758;870129   A   G      0/1   
6           4  145480780   rs1828591;870131   A   G      0/1   
7           4  145486389  rs13118928;870130   A   G      0/1   
8           5  167845791    rs17070145;1213   C   T      0/1   
9           6   31794592   rs2763979;694519   C   T      0/1   
10          7   28532464  rs4722804;1691115   G   T      0/1   
11          8   61452046  rs2272620;1691117   A   G      0/1   
12         10   96797470  rs1934953;1693596   C   T      0/1   
13         15   28365618    rs12913832;4745   A   G      0/1   
14         15   28530182     rs1667394;4

In [3]:
# Add a plain-language label per variant based on its clinical significance:
# pathogenic / likely pathogenic variants are labelled as carrier status,
# everything else as an increased-risk association.
#
# This is computed column-wise rather than with a row-wise `apply(axis=1)`:
# on a frame with no rows, `apply` returns a DataFrame instead of a Series,
# which cannot be assigned to a single column. The form below works for any
# number of rows, including zero.
#
# NOTE(review): the label is derived from Clinical_Significance only; the
# Genotype column is not consulted despite the column name - confirm intent.
pathogenic_labels = ['pathogenic', 'likely_pathogenic']
is_pathogenic = clinical_df['Clinical_Significance'].str.lower().isin(pathogenic_labels)
clinical_df['Genotype_Interpretation'] = is_pathogenic.map({
    True: 'Carrier (possible risk)',
    False: 'Increased risk/association',
})


In [4]:
# Render the annotated variant table as this cell's output.
clinical_df

Unnamed: 0,Chromosome,Position,rsID,REF,ALT,Genotype,Clinical_Significance,Disease,Gene,Genotype_Interpretation
0,1,31349647,rs2491132;12756,C,T,0/1,association,"Obesity,_association_with",SDC3:9672,Increased risk/association
1,2,33682737,rs2124437;236028,G,T,0/1,association,Lip_and_oral_cavity_carcinoma,RASGRP3:25780,Increased risk/association
2,2,103092513,rs1468788;1120031,C,T,0/1,association,Ascending_aortic_dissection,SLC9A4:389015,Increased risk/association
3,2,103125984,rs4851608;1120028,C,T,0/1,association,Ascending_aortic_dissection,SLC9A4:389015,Increased risk/association
4,4,9966380,rs7442295;4593,A,G,0/1,association,"Uric_acid_concentration,_serum,_quantitative_t...",SLC2A9:56606,Increased risk/association
5,4,145460230,rs13147758;870129,A,G,0/1,association,Chronic_obstructive_pulmonary_disease|Chronic_...,HHIP:64399,Increased risk/association
6,4,145480780,rs1828591;870131,A,G,0/1,association,Chronic_obstructive_pulmonary_disease|Chronic_...,HHIP:64399,Increased risk/association
7,4,145486389,rs13118928;870130,A,G,0/1,association,Chronic_obstructive_pulmonary_disease|Chronic_...,HHIP:64399,Increased risk/association
8,5,167845791,rs17070145;1213,C,T,0/1,association,Memory_quantitative_trait_locus,WWC1:23286,Increased risk/association
9,6,31794592,rs2763979;694519,C,T,0/1,association,Chronic_obstructive_pulmonary_disease,HSPA1B:3304,Increased risk/association
