In [1]:
import pandas as pd
import numpy as np
import gzip

In [2]:
pd.set_option('display.max_rows',500)
pd.set_option('display.min_rows',200)

In [3]:
def get_vcf_names(vcf_path):
    with gzip.open(vcf_path, "rt") as ifile:
          for line in ifile:
            if line.startswith("#CHROM"):
                  vcf_names = [x for x in line.split('\t')]
                  break
    ifile.close()
    return vcf_names

vcf_file = '/ibex/scratch/penaguka/run_pipeline/clinvar_20230702.vcf.gz'
names = get_vcf_names(vcf_file)
vcf = pd.read_csv(vcf_file, compression='gzip', comment='#', chunksize=10000, delim_whitespace=True, header=None, names=names)

In [4]:
variants_df = vcf.read()
variants_df

  variants_df = vcf.read()


Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO\n
0,1,69134,2205837,A,G,.,.,"ALLELEID=2193183;CLNDISDB=MeSH:D030342,MedGen:..."
1,1,69581,2252161,C,G,.,.,"ALLELEID=2238986;CLNDISDB=MeSH:D030342,MedGen:..."
2,1,69682,2396347,G,A,.,.,"ALLELEID=2386655;CLNDISDB=MeSH:D030342,MedGen:..."
3,1,69769,2288999,T,C,.,.,"ALLELEID=2278803;CLNDISDB=MeSH:D030342,MedGen:..."
4,1,69995,2351346,G,C,.,.,"ALLELEID=2333177;CLNDISDB=MeSH:D030342,MedGen:..."
5,1,925946,1924157,C,G,.,.,ALLELEID=1983057;CLNDISDB=MedGen:CN517202;CLND...
6,1,925952,1019397,G,A,.,.,ALLELEID=1003021;CLNDISDB=MedGen:CN517202;CLND...
7,1,925956,1543320,C,T,.,.,ALLELEID=1632777;CLNDISDB=MedGen:CN517202;CLND...
8,1,925961,2069387,A,T,.,.,ALLELEID=2129477;CLNDISDB=MedGen:CN517202;CLND...
9,1,925969,1648427,C,T,.,.,ALLELEID=1600580;CLNDISDB=MedGen:CN517202;CLND...


In [5]:
variants_df.shape

(2181902, 8)

In [6]:
variants_df['#CHROM'].value_counts()

2                 210029
1                 189728
17                137694
11                128996
3                 116627
16                115756
19                112068
5                 111569
7                 104223
9                  98263
6                  97149
12                 97104
10                 80922
X                  80869
15                 76484
4                  74842
8                  74463
14                 69309
13                 47572
22                 46394
20                 43823
18                 38317
21                 25820
MT                  2904
22                   895
Y                     65
NT_187633.1            6
NT_187693.1            6
NT_187661.1            3
NT_113889.1            1
NW_009646201.1         1
Name: #CHROM, dtype: int64

In [5]:
annotations_split = variants_df['INFO\n'].str.split(';')
list_sizes = annotations_split.apply(len)

# Get the unique sizes
unique_sizes = list_sizes.unique()
# Find the index with the highest number
max_index = np.argmax(list_sizes)

# Output the index with the highest number
# Extract the words before the '=' sign from the lists
column_names = list({item.split('=')[0] for sublist in annotations_split for item in sublist})

variants_add = variants_df.reindex(columns=variants_df.columns.tolist() + column_names)

def extract_value(row):
    info = row['INFO\n']
    for item in info.split(';'):
        key_value = item.split('=')
        if len(key_value) == 2 and key_value[0] in column_names:
            row[key_value[0]] = key_value[1]
    return row

# Apply the custom function to fill the empty columns in 'variants_add'
variants_add = variants_add.apply(extract_value, axis=1)

In [6]:
variants_add

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO\n,CLNVI,DBVARID,...,CLNVC,CLNDISDBINCL,CLNDISDB,CLNSIGINCL,AF_TGP,CLNVCSO,CLNREVSTAT,AF_ESP,CLNHGVS,MC
0,1,69134,2205837,A,G,.,.,"ALLELEID=2193183;CLNDISDB=MeSH:D030342,MedGen:...",,,...,single_nucleotide_variant,,"MeSH:D030342,MedGen:C0950123",,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.69134A>G,SO:0001583|missense_variant
1,1,69581,2252161,C,G,.,.,"ALLELEID=2238986;CLNDISDB=MeSH:D030342,MedGen:...",,,...,single_nucleotide_variant,,"MeSH:D030342,MedGen:C0950123",,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.69581C>G,SO:0001583|missense_variant
2,1,69682,2396347,G,A,.,.,"ALLELEID=2386655;CLNDISDB=MeSH:D030342,MedGen:...",,,...,single_nucleotide_variant,,"MeSH:D030342,MedGen:C0950123",,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.69682G>A,SO:0001583|missense_variant
3,1,69769,2288999,T,C,.,.,"ALLELEID=2278803;CLNDISDB=MeSH:D030342,MedGen:...",,,...,single_nucleotide_variant,,"MeSH:D030342,MedGen:C0950123",,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.69769T>C,SO:0001583|missense_variant
4,1,69995,2351346,G,C,.,.,"ALLELEID=2333177;CLNDISDB=MeSH:D030342,MedGen:...",,,...,single_nucleotide_variant,,"MeSH:D030342,MedGen:C0950123",,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.69995G>C,SO:0001583|missense_variant
5,1,925946,1924157,C,G,.,.,ALLELEID=1983057;CLNDISDB=MedGen:CN517202;CLND...,,,...,single_nucleotide_variant,,MedGen:CN517202,,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.925946C>G,SO:0001583|missense_variant
6,1,925952,1019397,G,A,.,.,ALLELEID=1003021;CLNDISDB=MedGen:CN517202;CLND...,,,...,single_nucleotide_variant,,MedGen:CN517202,,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.925952G>A,SO:0001583|missense_variant
7,1,925956,1543320,C,T,.,.,ALLELEID=1632777;CLNDISDB=MedGen:CN517202;CLND...,,,...,single_nucleotide_variant,,MedGen:CN517202,,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.925956C>T,SO:0001819|synonymous_variant
8,1,925961,2069387,A,T,.,.,ALLELEID=2129477;CLNDISDB=MedGen:CN517202;CLND...,,,...,single_nucleotide_variant,,MedGen:CN517202,,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.925961A>T,SO:0001583|missense_variant
9,1,925969,1648427,C,T,.,.,ALLELEID=1600580;CLNDISDB=MedGen:CN517202;CLND...,,,...,single_nucleotide_variant,,MedGen:CN517202,,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.925969C>T,SO:0001583|missense_variant


In [7]:
variants_add['CLNVC'].value_counts()

single_nucleotide_variant    1992119
Deletion                       96283
Duplication                    43991
Microsatellite                 27211
Indel                          11500
Insertion                       9391
Inversion                       1067
Variation                        340
Name: CLNVC, dtype: int64

In [8]:
variants_filter=variants_add[variants_add['CLNVC']=='single_nucleotide_variant']    

In [9]:
variants_add['CLNDN'].value_counts()

not_provided                                                                                                                                                                                                                                                                                                                           635046
Inborn_genetic_diseases                                                                                                                                                                                                                                                                                                                277937
Hereditary_cancer-predisposing_syndrome                                                                                                                                                                                                                                                                                                 4546

In [10]:
variants_add['CLNSIG'].value_counts()

Uncertain_significance                                                  1018377
Likely_benign                                                            598042
Benign                                                                   193999
Pathogenic                                                               132922
Conflicting_interpretations_of_pathogenicity                             100746
Likely_pathogenic                                                         65579
Benign/Likely_benign                                                      36746
Pathogenic/Likely_pathogenic                                              19219
not_provided                                                              10430
drug_response                                                              1881
other                                                                      1601
risk_factor                                                                 449
association                             

In [11]:
variants_filter

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO\n,CLNVI,DBVARID,...,CLNVC,CLNDISDBINCL,CLNDISDB,CLNSIGINCL,AF_TGP,CLNVCSO,CLNREVSTAT,AF_ESP,CLNHGVS,MC
0,1,69134,2205837,A,G,.,.,"ALLELEID=2193183;CLNDISDB=MeSH:D030342,MedGen:...",,,...,single_nucleotide_variant,,"MeSH:D030342,MedGen:C0950123",,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.69134A>G,SO:0001583|missense_variant
1,1,69581,2252161,C,G,.,.,"ALLELEID=2238986;CLNDISDB=MeSH:D030342,MedGen:...",,,...,single_nucleotide_variant,,"MeSH:D030342,MedGen:C0950123",,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.69581C>G,SO:0001583|missense_variant
2,1,69682,2396347,G,A,.,.,"ALLELEID=2386655;CLNDISDB=MeSH:D030342,MedGen:...",,,...,single_nucleotide_variant,,"MeSH:D030342,MedGen:C0950123",,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.69682G>A,SO:0001583|missense_variant
3,1,69769,2288999,T,C,.,.,"ALLELEID=2278803;CLNDISDB=MeSH:D030342,MedGen:...",,,...,single_nucleotide_variant,,"MeSH:D030342,MedGen:C0950123",,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.69769T>C,SO:0001583|missense_variant
4,1,69995,2351346,G,C,.,.,"ALLELEID=2333177;CLNDISDB=MeSH:D030342,MedGen:...",,,...,single_nucleotide_variant,,"MeSH:D030342,MedGen:C0950123",,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.69995G>C,SO:0001583|missense_variant
5,1,925946,1924157,C,G,.,.,ALLELEID=1983057;CLNDISDB=MedGen:CN517202;CLND...,,,...,single_nucleotide_variant,,MedGen:CN517202,,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.925946C>G,SO:0001583|missense_variant
6,1,925952,1019397,G,A,.,.,ALLELEID=1003021;CLNDISDB=MedGen:CN517202;CLND...,,,...,single_nucleotide_variant,,MedGen:CN517202,,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.925952G>A,SO:0001583|missense_variant
7,1,925956,1543320,C,T,.,.,ALLELEID=1632777;CLNDISDB=MedGen:CN517202;CLND...,,,...,single_nucleotide_variant,,MedGen:CN517202,,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.925956C>T,SO:0001819|synonymous_variant
8,1,925961,2069387,A,T,.,.,ALLELEID=2129477;CLNDISDB=MedGen:CN517202;CLND...,,,...,single_nucleotide_variant,,MedGen:CN517202,,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.925961A>T,SO:0001583|missense_variant
9,1,925969,1648427,C,T,.,.,ALLELEID=1600580;CLNDISDB=MedGen:CN517202;CLND...,,,...,single_nucleotide_variant,,MedGen:CN517202,,,SO:0001483,"criteria_provided,_single_submitter",,NC_000001.11:g.925969C>T,SO:0001583|missense_variant


In [16]:
variants_filter['CLNSIG'].value_counts()

Uncertain_significance                                                  976583
Likely_benign                                                           574570
Benign                                                                  169561
Conflicting_interpretations_of_pathogenicity                             96376
Pathogenic                                                               66594
Likely_pathogenic                                                        46294
Benign/Likely_benign                                                     34601
Pathogenic/Likely_pathogenic                                             13699
not_provided                                                              8600
drug_response                                                             1686
other                                                                     1563
risk_factor                                                                378
association                                         

In [17]:
variants_add['CLNSIG'].value_counts()

Uncertain_significance                                                  1018377
Likely_benign                                                            598042
Benign                                                                   193999
Pathogenic                                                               132922
Conflicting_interpretations_of_pathogenicity                             100746
Likely_pathogenic                                                         65579
Benign/Likely_benign                                                      36746
Pathogenic/Likely_pathogenic                                              19219
not_provided                                                              10430
drug_response                                                              1881
other                                                                      1601
risk_factor                                                                 449
association                             

In [6]:
def get_vcf_names(vcf_path):
    with gzip.open(vcf_path, "rt") as ifile:
          for line in ifile:
            if line.startswith("#CHROM"):
                  vcf_names = [x for x in line.split('\t')]
                  break
    ifile.close()
    return vcf_names

vcf_file = 'gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz'
names = get_vcf_names(vcf_file)
vcf = pd.read_csv(vcf_file, compression='gzip', comment='#', chunksize=10000, delim_whitespace=True, header=None, names=names)


In [None]:
benign_df = vcf.read()
benign_df

In [None]:
benign_df.shape