In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the project's `settings` and `utils`) are re-imported automatically
# when their source changes.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
import re
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import pairwise_distances

import settings as conf
from utils import is_number, chunker

# Load metadata

In [3]:
# Load the two simplified gene-mapping pickles from the genes metadata directory.
# NOTE(review): pickle.load can execute arbitrary code; only point this at
# files produced by a trusted pipeline.
genes_mappings = []
for mapping_filename in ('genes_mapping_simplified-0.pkl', 'genes_mapping_simplified-1.pkl'):
    mapping_path = os.path.join(conf.GENES_METADATA_DIR, mapping_filename)
    with open(mapping_path, 'rb') as mapping_file:
        genes_mappings.append(pickle.load(mapping_file))

genes_mapping_0, genes_mapping_1 = genes_mappings

# ClinVar data loading

In [4]:
# Location of the ClinVar gene/condition table, taken from the project settings.
clinvar_data_file = os.path.join(conf.CLINVAR_DATA_FILE)
display(clinvar_data_file)

# The table is read as tab-separated text.
clinvar_data = pd.read_csv(clinvar_data_file, delimiter='\t')

'/mnt/phenomexcan_base/data/2019-07-16-gene_condition_source_id'

In [5]:
# Show the table dimensions followed by its first rows.
display(clinvar_data.shape, clinvar_data.head())

(11303, 9)

Unnamed: 0,#GeneID,AssociatedGenes,RelatedGenes,ConceptID,DiseaseName,SourceName,SourceID,DiseaseMIM,LastUpdated
0,2,A2M,,C0002395,Alzheimer's disease,SNOMED CT,26929004.0,104300.0,16 Feb 2016
1,2,A2M,,C3279661,Alpha-2-macroglobulin deficiency,NCBI curation,,614036.0,16 Feb 2016
2,144568,A2ML1,,C1833692,"Otitis media, susceptibility to",NCBI curation,,166760.0,16 Feb 2016
3,53947,A4GALT,,C3549485,p phenotype,SNOMED CT,24403008.0,111400.0,16 Feb 2016
4,8086,AAAS,,C0271742,Glucocorticoid deficiency with achalasia,SNOMED CT,45414006.0,231550.0,16 Feb 2016


In [6]:
# Most frequent values of the SourceName column (top of the value counts).
clinvar_data['SourceName'].value_counts().head()

NCBI curation               3683
SNOMED CT                   2213
Office of Rare Diseases     1816
Human Phenotype Ontology     662
OMIM                         580
Name: SourceName, dtype: int64

# Prepare dataframe with gene associations

In [7]:
# Keep only the records that actually name an associated gene.
has_associated_gene = clinvar_data['AssociatedGenes'].notna()
clinvar_non_empty_genes_data = clinvar_data.loc[has_associated_gene]
display(clinvar_non_empty_genes_data.shape)

(7829, 9)

In [8]:
# Distinct disease names among the rows that have an associated gene; these
# become the columns of the gene/trait association matrix built below.
clinvar_unique_traits = pd.Index(clinvar_non_empty_genes_data['DiseaseName'].unique())
display(clinvar_unique_traits)

Index(['Alzheimer's disease', 'Alpha-2-macroglobulin deficiency',
       'Otitis media, susceptibility to', 'p phenotype',
       'Glucocorticoid deficiency with achalasia',
       'Keratosis palmoplantaris papulosa',
       'Charcot-Marie-Tooth disease, type 2N',
       'Epileptic encephalopathy, early infantile, 29',
       'Combined oxidative phosphorylation deficiency 8',
       'Leukoencephalopathy, progressive, with ovarian failure',
       ...
       'Myopia 21, autosomal dominant', 'Paget disease of bone 6',
       'ZNF711-Related X-linked Mental Retardation',
       'Seborrhea-like dermatitis with psoriasiform elements', 'PEHO syndrome',
       'Oocyte maturation defect 1', 'OOCYTE MATURATION DEFECT 6',
       'OOCYTE MATURATION DEFECT 3', 'Acromelic frontonasal dysostosis',
       'NEURODEVELOPMENTAL DISORDER WITH MOVEMENT ABNORMALITIES, ABNORMAL GAIT, AND AUTISTIC FEATURES'],
      dtype='object', length=5586)

In [9]:
# Distinct gene symbols among the rows that have an associated gene; these
# become the rows (index) of the gene/trait association matrix built below.
clinvar_unique_genes = pd.Index(clinvar_non_empty_genes_data['AssociatedGenes'].unique())
display(clinvar_unique_genes)

Index(['A2M', 'A2ML1', 'A4GALT', 'AAAS', 'AAGAB', 'AARS', 'AARS2', 'AASS',
       'ABAT', 'ABCA1',
       ...
       'ZNF513', 'ZNF644', 'ZNF687', 'ZNF711', 'ZNF750', 'ZNHIT3', 'ZP1',
       'ZP2', 'ZP3', 'ZSWIM6'],
      dtype='object', length=4194)

In [10]:
# For every disease, build a Series of ones indexed by the distinct genes
# associated with it; each Series later becomes one column of the matrix.
diseases_columns = {
    disease_name: pd.Series(data=1, index=disease_data['AssociatedGenes'].unique())
    for disease_name, disease_data in clinvar_non_empty_genes_data.groupby('DiseaseName')
}

In [11]:
# Assemble the genes x traits matrix: 1 where a gene is associated with a
# trait, 0 elsewhere (gene/trait pairs absent from a column come out as NaN
# after alignment and are filled with 0 before the cast to uint8).
clinvar_genes_associations = (
    pd.DataFrame(
        data=diseases_columns,
        index=clinvar_unique_genes,
        columns=clinvar_unique_traits,
    )
    .fillna(0)
    .astype('uint8')
)

In [12]:
# Sanity check: one row per unique gene and one column per unique trait.
assert clinvar_genes_associations.shape == (len(clinvar_unique_genes), len(clinvar_unique_traits))

In [13]:
# Matrix dimensions as (number of genes, number of traits).
clinvar_genes_associations.shape

(4194, 5586)

In [14]:
# Preview the first rows of the association matrix. No fillna is needed here:
# the frame was already filled and cast to uint8 when it was built, so it
# cannot contain NaN, and filling again would only copy the whole matrix.
clinvar_genes_associations.head()

Unnamed: 0,Alzheimer's disease,Alpha-2-macroglobulin deficiency,"Otitis media, susceptibility to",p phenotype,Glucocorticoid deficiency with achalasia,Keratosis palmoplantaris papulosa,"Charcot-Marie-Tooth disease, type 2N","Epileptic encephalopathy, early infantile, 29",Combined oxidative phosphorylation deficiency 8,"Leukoencephalopathy, progressive, with ovarian failure",...,"Myopia 21, autosomal dominant",Paget disease of bone 6,ZNF711-Related X-linked Mental Retardation,Seborrhea-like dermatitis with psoriasiform elements,PEHO syndrome,Oocyte maturation defect 1,OOCYTE MATURATION DEFECT 6,OOCYTE MATURATION DEFECT 3,Acromelic frontonasal dysostosis,"NEURODEVELOPMENTAL DISORDER WITH MOVEMENT ABNORMALITIES, ABNORMAL GAIT, AND AUTISTIC FEATURES"
A2M,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2ML1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A4GALT,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAS,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAGAB,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Sanity check: the association matrix must not contain any missing value.
assert not clinvar_genes_associations.isna().any().any()

In [16]:
# Spot check on one trait: sorting in descending order puts the genes flagged
# (value 1) for Alzheimer's disease at the top of the listing.
clinvar_genes_associations["Alzheimer's disease"].sort_values(ascending=False).head(10)

A2M      1
APP      1
MPO      1
NOS3     1
PLAU     1
HFE      1
FOXN1    0
FOXO1    0
FTSJ1    0
FTO      0
Name: Alzheimer's disease, dtype: uint8

In [17]:
# Column sums give the number of associated genes per trait; show the traits
# with the largest counts.
clinvar_genes_associations.sum().sort_values(ascending=False).head()

Diabetes mellitus type 2          28
Primary dilated cardiomyopathy    27
Familial cancer of breast         23
Mitochondrial diseases            21
Acute myeloid leukemia            20
dtype: int64

In [18]:
# Output path for the pickled association matrix; the target directory is
# created if it does not exist yet.
clinvar_genes_associations_filename = os.path.join(
    conf.GENE_ASSOC_DIR,
    'clinvar-gene_associations.pkl.xz',
)
os.makedirs(conf.GENE_ASSOC_DIR, exist_ok=True)
display(clinvar_genes_associations_filename)

'/mnt/phenomexcan_base/gene_assoc/clinvar-gene_associations.pkl.xz'

In [19]:
# Name the row index "gene".
# rename_axis returns a new frame with a renamed copy of the index instead of
# mutating the existing Index object in place. That Index may be the very
# object held by `clinvar_unique_genes` (it was passed as `index=` when the
# frame was built), so an in-place rename could silently rename it as well.
clinvar_genes_associations = clinvar_genes_associations.rename_axis('gene')

In [20]:
# Sanity check: every column of the matrix shares one and the same dtype.
assert clinvar_genes_associations.dtypes.nunique() == 1

In [21]:
# Number of columns per dtype (a single entry is expected, see the assertion above).
clinvar_genes_associations.dtypes.value_counts()

uint8    5586
dtype: int64

In [22]:
# Persist the association matrix as a pickle at the path defined earlier.
# NOTE(review): the '.pkl.xz' suffix presumably relies on pandas inferring xz
# compression from the file extension (default compression='infer') — confirm
# for the pandas version in use.
clinvar_genes_associations.to_pickle(clinvar_genes_associations_filename)

# Export to TSV (gzip-compressed)

In [None]:
# Make sure the export directory exists before writing; with exist_ok=True
# this is a no-op when the directory is already there.
os.makedirs(conf.GENE_ASSOC_DIR, exist_ok=True)

In [23]:
# Destination of the tab-separated export of the association matrix.
export_path = os.path.join(conf.GENE_ASSOC_DIR, 'clinvar-gene_associations.tsv.gz')
display(export_path)

# Write the matrix as tab-separated text (index included).
# NOTE(review): no explicit compression is passed; the '.gz' suffix presumably
# relies on pandas' default compression='infer' — confirm for the pandas
# version in use.
clinvar_genes_associations.to_csv(export_path, sep='\t')

'/mnt/phenomexcan_base/gene_assoc/clinvar-gene_associations.tsv.gz'

## Distribution of number of genes per trait

In [24]:
# import matplotlib.pyplot as plt
# import seaborn as sns

In [25]:
# clinvar_traits_n_genes = clinvar_genes_associations.sum()
# display(clinvar_traits_n_genes.shape)
# display(clinvar_traits_n_genes.sort_values(ascending=False).head())

In [26]:
# sns.countplot(clinvar_traits_n_genes[clinvar_traits_n_genes <= 10])