## This notebook extracts data from the annotated vcf, cleans up a little, and saves it as pickled dataframe

### Import vcf and convert into dataframe. Extract features inside cells as necessary.

In [1]:
import csv
import gzip
import re
import pickle

import pandas as pd

In [2]:
# read tab delimited
cv_df = pd.read_csv(
    "clinvar.annotated.vcf.gz",
    sep="\t",
    skiprows=35,
    usecols=[0, 1, 2, 3, 4, 7], # rid of columns 5, 6
    header=None,
)

In [3]:
cv_df.rename(columns={0: "CHROM", 1: "POS", 2: "ID", 3: "REF", 4: "ALT"}, inplace=True)

In [4]:
cv_df.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,7
0,1,1149118,96692,G,A,AF_EXAC=0.00004;ALLELEID=102585;CLNDISDB=MedGe...
1,1,1167674,60493,C,T,"ALLELEID=75088;CLNDISDB=MedGen:C3809210,OMIM:6..."
2,1,1167851,60488,A,G,"ALLELEID=75083;CLNDISDB=MedGen:C0432243,OMIM:2..."
3,1,1167858,60489,C,T,"ALLELEID=75084;CLNDISDB=MedGen:C0432243,OMIM:2..."
4,1,1168124,60486,G,A,"ALLELEID=75081;CLNDISDB=MedGen:C0432243,OMIM:2..."


In [5]:
# column 7 has a different format and actually contains dozens of more columns
# convert the long dictionary in column 7 to actual columns
def list_to_dict(l):
    """Convert list to dict."""
    return {k: v for k, v in (x.split("=") for x in l)}

cv_df = pd.concat(
    [
        cv_df.drop([7], axis=1),
        cv_df[7].str.split(";").apply(list_to_dict).apply(pd.Series),
    ],
    axis=1,
)

In [6]:
cv_df.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,AF_EXAC,ALLELEID,CLNDISDB,CLNDN,CLNHGVS,...,RS,CSQ,AF_ESP,AF_TGP,CLNSIGCONF,CLNDISDBINCL,CLNDNINCL,CLNSIGINCL,SSR,DBVARID
0,1,1149118,96692,G,A,4e-05,102585,"MedGen:C3810053,OMIM:615593,Orphanet:ORPHA431149",Immunodeficiency_16,NC_000001.10:g.1149118G>A,...,587777075,A|5_prime_UTR_variant|MODIFIER|TNFRSF4|7293|Tr...,,,,,,,,
1,1,1167674,60493,C,T,,75088,"MedGen:C3809210,OMIM:615349","Ehlers-Danlos_syndrome,_progeroid_type,_2",NC_000001.10:g.1167674C>T,...,397514722,T|missense_variant|MODERATE|B3GALT6|126792|Tra...,,,,,,,,
2,1,1167851,60488,A,G,,75083,"MedGen:C0432243,OMIM:271640,Orphanet:ORPHA9335...",Spondyloepimetaphyseal_dysplasia_with_joint_la...,NC_000001.10:g.1167851A>G,...,397514719,G|missense_variant|MODERATE|B3GALT6|126792|Tra...,,,,,,,,
3,1,1167858,60489,C,T,,75084,"MedGen:C0432243,OMIM:271640,Orphanet:ORPHA9335...",Spondyloepimetaphyseal_dysplasia_with_joint_la...,NC_000001.10:g.1167858C>T,...,397514720,T|missense_variant|MODERATE|B3GALT6|126792|Tra...,,,,,,,,
4,1,1168124,60486,G,A,,75081,"MedGen:C0432243,OMIM:271640,Orphanet:ORPHA9335...",Spondyloepimetaphyseal_dysplasia_with_joint_la...,NC_000001.10:g.1168124G>A,...,397514718,A|missense_variant|MODERATE|B3GALT6|126792|Tra...,,,,,,,,


In [7]:
# the CSQ column (9th column from right side) has a bit different syntax. Unpack and convert into actual columns

# get column names for CSQ
with gzip.open("clinvar.annotated.vcf.gz", "rt") as f:
        for line in f:
            if line.startswith("##INFO=<ID=CSQ"):
                m = re.search(r'.*Format: (.*)">', line)
                cols = m.group(1).split("|")

# pipe to dict
def CSQ_to_dict(l):
    '''
    Convert the pipe_separated values in the CSQ column to dict
    '''
    annotation_data = {}
    for csq_column, csq_value in zip(cols, l):
        annotation_data[csq_column] = csq_value
    return annotation_data
        
# convert and concat
cv_df = pd.concat(
    [
        cv_df.drop(['CSQ'], axis=1),
        cv_df['CSQ'].str.split("|").apply(CSQ_to_dict).apply(pd.Series),
    ],
    axis=1,
)

In [8]:
cv_df.shape

(354199, 67)

### Only extract rows that have unambiguous clinical diagnoses. Then, clean-up the numerous targets into three different bins (benign, uncertain, or pathogenic).

In [9]:
# narrow rows down to just unambiguous clinical diagnoses
unamb_stat = ['criteria_provided,_multiple_submitters,_no_conflicts','reviewed_by_expert_panel','practice_guideline ']
cv_df = cv_df.loc[cv_df['CLNREVSTAT'].isin(unamb_stat)]

In [10]:
# drop rows with just drug_response as a CLNSIG
cv_df = cv_df.drop(cv_df[cv_df['CLNSIG'] == 'drug_response'].index)

#convert ['CLNSIG'] into just three catagories
cv_df.loc[cv_df['CLNSIG'].str.startswith("Benign") | cv_df['CLNSIG'].str.startswith("Likely_benign"), ['CLNSIG']] = 'Benign_cat'
cv_df.loc[cv_df['CLNSIG'].str.startswith("Uncertain_significance"), ['CLNSIG']] = 'Uncertain_significance_cat'
cv_df.loc[cv_df['CLNSIG'].str.startswith("Pathogenic") | cv_df['CLNSIG'].str.startswith("Likely_pathogenic"), ['CLNSIG']] = 'Pathogenic_cat'

In [11]:
cv_df['CLNSIG'].value_counts()

Benign_cat                    28086
Uncertain_significance_cat    17223
Pathogenic_cat                12354
Name: CLNSIG, dtype: int64

### Drop columns absolutely not needed, and save cleaned up file as pickled file

In [12]:
#drop columns not needed
cv_df = cv_df.drop(columns=['ALLELEID', 'CLNDISDB',
       'CLNHGVS', 'CLNREVSTAT', 'CLNVCSO', 'CLNVI',
       'GENEINFO', 'RS', 'CLNSIGCONF',
       'CLNDISDBINCL', 'CLNSIGINCL', 'SSR', 'DBVARID','Allele', 
       'IMPACT', 'SYMBOL', 'Gene','Feature_type','Feature','HGVSc','HGVSp','Existing_variation','DISTANCE',
       'STRAND','FLAGS','SYMBOL_SOURCE','HGNC_ID','TSL','APPRIS','REFSEQ_MATCH',
       'GIVEN_REF','USED_REF','MOTIF_NAME','MOTIF_POS','HIGH_INF_POS','MOTIF_SCORE_CHANGE','MPC'])

In [13]:
# shape of df to be used for training and testing
cv_df.shape

(57663, 30)

In [14]:
file = open('cv_df_extracted', 'wb')
pickle.dump(cv_df, file)
file.close()