In [1]:
%run utils.py
import codecs
import json
import hashlib
import tqdm
import os.path as osp
import os
import numpy as np
import pandas as pd
import collections
import more_itertools
from mrtarget.common import IO
export_dir = '/lab/data/raw/evidence-data-stage-1'

In [2]:
# Load the data configuration and keep every input file whose path does not
# contain 'epmc'
cfg = get_data_config()
files = [path for path in cfg['input-file'] if 'epmc' not in path]
# Display only the file names (last path component) of the retained inputs
pd.Series(files).apply(lambda path: path.split('/')[-1])

0               chembl-2019-08-16.json.gz
1              uniprot-2019-08-13.json.gz
2           slapenrich-2018-11-29.json.gz
3            phenodigm-2019-08-20.json.gz
4               sysbio-2019-01-31.json.gz
5     genomics_england-2018-10-02.json.gz
6              progeny-2018-07-23.json.gz
7       phewas_catalog-2018-11-28.json.gz
8               crispr-2019-08-21.json.gz
9       gene2phenotype-2019-08-19.json.gz
10                gwas-2019-08-14.json.gz
11            reactome-2019-08-15.json.gz
12             intogen-2019-08-16.json.gz
13                 eva-2019-08-20.json.gz
14              cosmic-2019-08-05.json.gz
15               atlas-2019-08-13.json.gz
dtype: object

In [3]:
! [ -d $export_dir ] && rm -rf $export_dir

In [4]:
def to_rec(line):
    """Decode one raw input line into a record dict.

    Args:
        line: A ``(filename, (line_number, raw_bytes))`` tuple.

    Returns:
        The deserialized JSON object with two extra keys added --
        ``line_number`` and ``filename`` (base name only) -- or ``None`` when
        the line is not valid JSON or does not deserialize to a JSON object.
    """
    # Decode bytes and deserialize json
    # See: https://github.com/opentargets/data_pipeline/blob/f0b508d0e0a6e7c5627dfa9be5ff37d2f2173cf8/mrtarget/modules/Evidences.py#L117
    (filename, (line_n, l)) = line
    # 'replace' substitutes undecodable bytes instead of raising
    line_str = codecs.decode(l, 'utf-8', 'replace')
    try:
        rec = json.loads(line_str)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; pathological nesting can
        # surface as RecursionError. Both mean "unparseable line".
        return None
    if not isinstance(rec, dict):
        # Valid JSON that is not an object (list, number, string, null)
        # cannot carry the bookkeeping keys below; treat it as a failed line
        # rather than crashing on item assignment.
        return None
    rec['line_number'] = line_n
    rec['filename'] = osp.basename(filename)
    return rec

def to_value(v):
    """Return scalars unchanged; serialize any other object to a JSON string."""
    if np.isscalar(v):
        return v
    # Non-scalars (dicts, lists, None, ...) are stored as their JSON text
    return json.dumps(v)

# Export the records of every input file as feather files, one file per
# (sourceID, batch) pair
cts = collections.Counter()
# Create the export directory once up front instead of before every write
os.makedirs(export_dir, exist_ok=True)
for f in tqdm.tqdm(files):
    # Process each file in chunks of 100,000 lines
    batches = more_itertools.chunked(IO.open_to_read(f), 100000)
    for i, batch in enumerate(batches):
        recs = []
        for line in batch:
            rec = to_rec(line)
            if rec is None:
                cts['failure'] += 1
                continue
            cts['success'] += 1
            # Nested values are flattened to JSON strings so each column
            # holds scalars only
            recs.append({k: to_value(v) for k, v in rec.items()})
        if not recs:
            # Every line in this batch failed to parse. An empty frame has no
            # 'sourceID' column, so the groupby below would raise KeyError.
            continue
        df = pd.DataFrame(recs)
        df = df[sorted(df.columns.tolist())]
        df['batch'] = i
        for k, g in df.groupby('sourceID'):
            # The batch index restarts for each input file, so two files that
            # share a sourceID would map to the same name; fail loudly rather
            # than overwrite.
            export_file = osp.join(export_dir, f'sourceID-{k}:batch-{i}.feather')
            if osp.exists(export_file):
                raise AssertionError(f'File "{export_file}" already exists')
            g.reset_index(drop=True).to_feather(export_file)
dict(cts)

100%|██████████| 16/16 [05:39<00:00, 21.22s/it]


{'success': 1681344, 'failure': 1}

In [5]:
# `df` is the last batch frame left over from the export loop above, not the
# full dataset; this summarizes its columns and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15075 entries, 0 to 15074
Data columns (total 11 columns):
access_level                 15075 non-null object
disease                      15075 non-null object
evidence                     15075 non-null object
filename                     15075 non-null object
line_number                  15075 non-null int64
literature                   7227 non-null object
sourceID                     15075 non-null object
target                       15075 non-null object
type                         15075 non-null object
unique_association_fields    15075 non-null object
batch                        15075 non-null int64
dtypes: int64(2), object(9)
memory usage: 1.3+ MB


In [6]:
# Preview the first rows of the last exported batch (`df` from the export loop)
df.head()

Unnamed: 0,access_level,disease,evidence,filename,line_number,literature,sourceID,target,type,unique_association_fields,batch
0,public,"{""id"": ""http://www.ebi.ac.uk/efo/EFO_0003834"",...","{""is_associated"": true, ""unique_experiment_ref...",atlas-2019-08-13.json.gz,200001,,expression_atlas,"{""id"": ""http://identifiers.org/ensembl/ENSG000...",rna_expression,"{""geneID"": ""http://identifiers.org/ensembl/ENS...",2
1,public,"{""id"": ""http://www.ebi.ac.uk/efo/EFO_0003834"",...","{""is_associated"": true, ""unique_experiment_ref...",atlas-2019-08-13.json.gz,200002,,expression_atlas,"{""id"": ""http://identifiers.org/ensembl/ENSG000...",rna_expression,"{""geneID"": ""http://identifiers.org/ensembl/ENS...",2
2,public,"{""id"": ""http://www.ebi.ac.uk/efo/EFO_0003834"",...","{""is_associated"": true, ""unique_experiment_ref...",atlas-2019-08-13.json.gz,200003,,expression_atlas,"{""id"": ""http://identifiers.org/ensembl/ENSG000...",rna_expression,"{""geneID"": ""http://identifiers.org/ensembl/ENS...",2
3,public,"{""id"": ""http://www.ebi.ac.uk/efo/EFO_0003834"",...","{""is_associated"": true, ""unique_experiment_ref...",atlas-2019-08-13.json.gz,200004,,expression_atlas,"{""id"": ""http://identifiers.org/ensembl/ENSG000...",rna_expression,"{""geneID"": ""http://identifiers.org/ensembl/ENS...",2
4,public,"{""id"": ""http://www.ebi.ac.uk/efo/EFO_0003834"",...","{""is_associated"": true, ""unique_experiment_ref...",atlas-2019-08-13.json.gz,200005,,expression_atlas,"{""id"": ""http://identifiers.org/ensembl/ENSG000...",rna_expression,"{""geneID"": ""http://identifiers.org/ensembl/ENS...",2


In [7]:
# Report the total on-disk size of the export directory
!du -ch $export_dir

5.6G	/lab/data/raw/evidence-data-stage-1
5.6G	total
