### There are two related tab-delimited VCF files:
- regular
- annotated

### This notebook cleans up the regular (basic-info) dataframe and saves it as a pickled dataframe.

In [1]:
import csv
import gzip
import re

import pandas as pd

In [2]:
# Collect the INFO field names and their descriptions from the VCF
# meta-information header, i.e. lines shaped like
#   ##INFO=<ID=NAME,...,Description="free text">
# Result: cv_columns maps each INFO ID to its description text.
cv_columns = {}
with gzip.open("raw_vcf/clinvar.vcf.gz", "rt") as f:
    for metaline in f:
        if not metaline.startswith("#"):
            # Header lines precede all records in a VCF file, so the first
            # non-"#" line means there is nothing left to collect; stopping
            # here avoids decompressing and scanning every record.
            break
        if not metaline.startswith("##INFO"):
            continue
        info_line = metaline.strip("#\n")
        # Raw strings keep "\w" a regex escape rather than an (invalid)
        # Python string escape.
        colname = re.search(r"ID=(\w+),", info_line)
        coldesc = re.search(r".*Description=(.*)>", info_line)
        if colname is None or coldesc is None:
            # Fail with the offending line instead of an opaque
            # "'NoneType' object has no attribute 'group'".
            raise ValueError(f"Malformed ##INFO header line: {metaline!r}")
        # The description is quoted in the header; store it without the quotes.
        cv_columns[colname.group(1)] = coldesc.group(1).strip('"')

In [3]:
# Load the VCF records into a DataFrame. The file is tab-separated and has no
# column-name row that pandas can use, so columns are labelled by position.
# Only positions 0-4 and 7 are loaded; 5 and 6 are skipped.
# NOTE(review): comment="#" makes pandas ignore everything from a "#" onward,
# which skips the header lines but would also cut a record short if a field
# ever contains "#" -- confirm the data never does.
cv_df = pd.read_table(
    "raw_vcf/clinvar.vcf.gz",
    header=None,
    comment="#",
    usecols=[0, 1, 2, 3, 4, 7],
)

In [4]:
# convert the long dictionary in column 7 to actual columns
def list_to_dict(l):
    """Convert a list of ``KEY=VALUE`` strings into a dict.

    Each entry is split on its first ``=`` only, so values that themselves
    contain ``=`` are kept intact. An entry without ``=`` (a valueless flag)
    is stored with the value ``True``. Empty entries (e.g. produced by a
    trailing ``;``) and the ``.`` missing-value placeholder are skipped.

    Args:
        l: Iterable of strings, typically the result of splitting a
            ``;``-delimited field.

    Returns:
        dict mapping each key to its string value, or to ``True`` for
        entries that carry no value. If a key occurs more than once, the
        last occurrence wins.
    """
    parsed = {}
    for entry in l:
        if not entry or entry == ".":
            continue
        key, sep, value = entry.partition("=")
        # sep is "" when the entry had no "=" at all, i.e. it is a bare flag.
        parsed[key] = value if sep else True
    return parsed

# Replace the raw ";"-delimited column 7 with one column per key found in it.
# The per-row dicts are turned into a frame in a single DataFrame construction:
# `.apply(pd.Series)` would build a separate Series object for every row,
# which is extremely slow on a file with many records.
cv_df = pd.concat(
    [
        cv_df.drop(columns=[7]),
        pd.DataFrame(
            cv_df[7].str.split(";").apply(list_to_dict).tolist(),
            # Reuse the original row labels so concat aligns row-for-row.
            index=cv_df.index,
        ),
    ],
    axis=1,
)

In [5]:
# Give the positional columns 0-4 their names, then discard CHROM, POS, REF
# and ALT, which are not needed. ID is deliberately kept, as is every other
# column.
cv_df = cv_df.rename(
    columns={0: "CHROM", 1: "POS", 2: "ID", 3: "REF", 4: "ALT"}
).drop(["CHROM", "POS", "REF", "ALT"], axis=1)