In [1]:
import os
import os.path as osp
import glob
import tqdm
import tarfile
import pandas as pd
%run env.py
%run src/integration.py

In [2]:
oadir = osp.join(DATA_DIR, 'pmc_oa', 'bulk', 'files')
oadir

'/lab/data/pmc_oa/bulk/files'

In [3]:
archives = glob.glob(osp.join(oadir, '*.gz'))
archives

['/lab/data/pmc_oa/bulk/files/comm_use.0-9A-B.txt.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.C-H.txt.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.C-H.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.I-N.txt.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.I-N.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.O-Z.txt.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.O-Z.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.0-9A-B.txt.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.A-B.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.C-H.txt.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.C-H.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.I-N.txt.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.I-N.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.O-Z.txt.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.O-Z.xml.tar.gz']

In [8]:
def parse_path(p, archive):
    parts = p.split('/')
    assert len(parts) == 2, f'Path {p} in archive {archive} has more or less than two parts'
    venue, name = parts
    aid = name.replace('.txt', '') if name.startswith('PMC') else None
    return dict(id=aid, path=p, venue=venue, name=name, archive=archive)

def file_meta_stream(archives):
    for a in archives:
        tar = tarfile.open(a, "r:gz")
        files = [f for f in tar.getmembers() if f.isfile()]
        for f in tqdm.tqdm(files):
            yield tar, f, parse_path(f.path, a)
                
def doc_stream(file_meta):
    for tar, file, meta in file_meta:
        text = tar.extractfile(file).read().decode('utf-8')
        text = text.replace('==== Front', '')
        parts = text.split('==== Body')
        front, body = None, None
        if len(parts) >= 2:
            front, body = parts[0], ''.join(parts[1:])
        else:
            body = text
        yield meta, front, body
        
SEARCH_TERMS = ['human', 't cell', 't-cell', 't lymphocyte', 't-lymphocyte']

def row_stream(docs, terms=SEARCH_TERMS):
    for meta, front, body in docs:
        row = dict(meta)
        for t in SEARCH_TERMS:
            row[f'term:{t}'] = t in front or t in body
        yield row

In [9]:
import itertools
stream = row_stream(doc_stream(file_meta_stream(archives)))
res = list(itertools.islice(stream, 5))


  0%|          | 0/276158 [00:00<?, ?it/s][A

In [78]:
res[0]

{'id': None,
 'path': 'BMC_Cancer/BMC_Cancer_2006_Jan_4_6_1.txt',
 'venue': 'BMC_Cancer',
 'name': 'BMC_Cancer_2006_Jan_4_6_1.txt',
 'archive': '/lab/data/pmc_oa/bulk/files/comm_use.0-9A-B.txt.tar.gz',
 'term:human': True,
 'term:t cell': False,
 'term:t-cell': False,
 'term:t lymphocyte': False,
 'term:t-lymphocyte': False}

In [7]:
tar = tarfile.open(osp.join(oadir, 'non_comm_use.0-9A-B.txt.tar.gz'), "r:gz")

In [9]:
files = tar.getmembers()

In [10]:
len(files)

165826

In [24]:
files[:5]

[<TarInfo 'Breast_Cancer_Res' at 0x7f7091e19c00>,
 <TarInfo 'Breast_Cancer_Res/Breast_Cancer_Res_2000_Dec_17_2(1)_1.txt' at 0x7f7091e19cc8>,
 <TarInfo 'Breast_Cancer_Res/Breast_Cancer_Res_2000_Dec_17_2(1)_2-7.txt' at 0x7f7091e19d90>,
 <TarInfo 'Breast_Cancer_Res/Breast_Cancer_Res_2000_Dec_17_2(1)_10-12.txt' at 0x7f70888ec4f8>,
 <TarInfo 'Breast_Cancer_Res/Breast_Cancer_Res_2000_Dec_17_2(1)_8-9.txt' at 0x7f70888ec5c0>]

In [26]:
import pandas as pd

In [28]:
pd.Series([f.name for f in files]).sample(35).tolist()

['ACG_Case_Rep_J/ACG_Case_Rep_J_2013_Oct_8_1(1)_4-6.txt',
 'Br_J_Cancer/Br_J_Cancer_1955_Mar_9(1)_21-36.txt',
 'Ann_Med_Health_Sci_Res/Ann_Med_Health_Sci_Res_2014_Nov-Dec_4(6)_858-862.txt',
 'Br_J_Cancer/Br_J_Cancer_1992_Aug_66(2)_225-226.txt',
 'Am_J_Respir_Crit_Care_Med/PMC6444665.txt',
 'Adv_Sci_(Weinh)/PMC6325595.txt',
 'Br_J_Cancer/Br_J_Cancer_1984_Jun_49(6)_795-799.txt',
 'Br_J_Cancer/Br_J_Cancer_1991_Oct_64(4)_705-709.txt',
 'Br_J_Cancer/Br_J_Cancer_1980_Oct_42(4)_542-550.txt',
 'Anc_Sci_Life/Anc_Sci_Life_2009_Oct-Dec_29(2)_1.txt',
 'Arthritis_Res/Arthritis_Res_2000_Aug_3_2(6)_441-445.txt',
 'Biochemistry/Biochemistry_2014_May_13_53(18)_2979-2992.txt',
 'Ann_Dermatol/Ann_Dermatol_2014_Jun_12_26(3)_395-398.txt',
 'Am_J_Respir_Crit_Care_Med/PMC6444650.txt',
 'Allergy_Asthma_Clin_Immunol/Allergy_Asthma_Clin_Immunol_2010_Dec_22_6(Suppl_2)_P34.txt',
 'BMJ_Open/BMJ_Open_2012_Mar_15_2(2)_e000860.txt',
 'AMIA_Jt_Summits_Transl_Sci_Proc/PMC5543387.txt',
 'Anesth_Essays_Res/Anesth_Essays_

In [None]:
tar = tarfile.open(osp.join(oadir, 'non_comm_use.0-9A-B.txt.tar.gz'), "r:gz")
files = tar.getmembers()

In [67]:
#tar.extractfile([f for f in files if f.name == 'Am_J_Respir_Crit_Care_Med/PMC6444650.txt'][0]).read()
tar.extractfile([f for f in files if f.name == 'Bioinformatics/PMC6084620.txt'][0]).read().decode('utf-8')

'\n==== Front\nBioinformaticsBioinformaticsbioinformaticsBioinformatics1367-48031367-4811Oxford University Press 10.1093/bioinformatics/bty145bty145Original PapersSequence AnalysisNimbus: a design-driven analyses suite for amplicon-based NGS data Brouwer R W W 1van den Hout M C G N 1Kockx C E M 1Brosens E 2Eussen B 2de Klein A 2Sleutels F 1van IJcken W F J 1Berger Bonnie Associate Editor1 Center for Biomics, Department of Cell Biology, Erasmus MC, 3000CA Rotterdam, The Netherlands2 Department of Clinical Genetics, Erasmus MC, 3000CA Rotterdam, The NetherlandsTo whom correspondence should be addressed. Email: w.vanijcken@erasmusmc.nl15 8 2018 10 3 2018 10 3 2018 34 16 2732 2739 01 9 2017 14 2 2018 09 3 2018 © The Author(s) 2018. Published by Oxford University Press.2018This is an Open Access article distributed under the terms of the Creative Commons Attribution Non-Commercial License (http://creativecommons.org/licenses/by-nc/4.0/), which permits non-commercial re-use, distribution, an

In [35]:
df = pd.DataFrame([parse_path(f.name) for f in files if f.isfile()])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164726 entries, 0 to 164725
Data columns (total 3 columns):
id       63119 non-null object
name     164726 non-null object
venue    164726 non-null object
dtypes: object(3)
memory usage: 3.8+ MB


In [38]:
df[df['name'].str.contains('4337382')]

Unnamed: 0,id,name,venue


In [21]:
f = files[0]
f.isdir(), f.name, f.path, 

(True, 'Breast_Cancer_Res', 'Breast_Cancer_Res')

In [22]:
f = files[1]
f.isdir(), f.name, f.path

(False,
 'Breast_Cancer_Res/Breast_Cancer_Res_2000_Dec_17_2(1)_1.txt',
 'Breast_Cancer_Res/Breast_Cancer_Res_2000_Dec_17_2(1)_1.txt')

In [None]:
for member in tar.getmembers():
    f = tar.extractfile(member)
    if f is not None:
        content = f.read()