In [2]:
import os
import os.path as osp
import glob
import tqdm
import tarfile
import pandas as pd
import re
from tcre.env import *

In [3]:
oadir = osp.join(DATA_DIR, 'pmc_oa', 'bulk', 'files')
oadir

'/lab/data/pmc_oa/bulk/files'

In [3]:
archives = glob.glob(osp.join(oadir, '*.xml.tar.gz'))
archives

['/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.C-H.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.I-N.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.O-Z.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.A-B.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.C-H.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.I-N.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.O-Z.xml.tar.gz']

In [1]:
SEARCH_TERMS = [
    'human', 'mouse', 'murine', 
    ' t cell', ' t-cell', ' t lymphocyte', ' t-lymphocyte',
    'cd3', 'cd4', 'cd8', 
    'expression', 'cytokine', 'phenotype', 'surface',
    'differentiate', 'differentiation', 'differentiated',
    'polarization', 'polarize', 'induce', 'induction'
]

In [4]:
def parse_path(p, archive):
    parts = p.split('/')
    # Curr_HIV/AIDS_Rep/Curr_HIV_AIDS_Rep_2014_Dec_4_11_487-495.txt
    venue, name = '/'.join(parts[:-1]), parts[-1]
    aid = name.replace('.txt', '').replace('.nxml', '') if name.startswith('PMC') else None
    return dict(id=aid, path=p, venue=venue, name=name, archive=archive)

def file_meta_stream(archives):
    for a in tqdm.tqdm(archives):
        tar = tarfile.open(a, "r:gz")
        files = [f for f in tar.getmembers() if f.isfile()]
        for f in tqdm.tqdm(files, mininterval=10):
            yield tar, f, parse_path(f.path, a)
                
BODY_REGEX = r'<body>.*</body>'
ABSTRACT_REGEX = r'<abstract>.*</abstract>'

def doc_stream(file_meta):
    for tar, file, meta in file_meta:
        try:
            xml = tar.extractfile(file).read().decode('utf-8', errors='ignore')
            text = (re.findall(BODY_REGEX, xml) or [''])[0] + (re.findall(ABSTRACT_REGEX, xml) or [''])[0] 
            yield meta, text
        except Exception as e:
            raise type(e)(e.message + f'; (archive = {tar.name}, file = {file.path}')

def row_stream(docs, terms=SEARCH_TERMS):
    for meta, text in docs:
        row = dict(meta)
        ltext = text.lower()
        for t in SEARCH_TERMS:
            row[f'term:{t}'] = t in ltext
        yield row

In [None]:
stream = row_stream(doc_stream(file_meta_stream(archives)))
rows = list(stream)
len(rows)

In [6]:
pd.set_option('display.max_info_rows', 10000000)
df = pd.DataFrame(rows)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2459147 entries, 0 to 2459146
Data columns (total 26 columns):
archive                 2459147 non-null object
id                      2459146 non-null object
name                    2459147 non-null object
path                    2459147 non-null object
term: t cell            2459147 non-null bool
term: t lymphocyte      2459147 non-null bool
term: t-cell            2459147 non-null bool
term: t-lymphocyte      2459147 non-null bool
term:cd3                2459147 non-null bool
term:cd4                2459147 non-null bool
term:cd8                2459147 non-null bool
term:cytokine           2459147 non-null bool
term:differentiate      2459147 non-null bool
term:differentiated     2459147 non-null bool
term:differentiation    2459147 non-null bool
term:expression         2459147 non-null bool
term:human              2459147 non-null bool
term:induce             2459147 non-null bool
term:induction          2459147 non-null bool
term:

### Export and Reload

In [4]:
export_file = osp.join(DATA_DIR, 'pmc_oa', 'bulk', 'file_meta.feather')
export_file

'/lab/data/pmc_oa/bulk/file_meta.feather'

In [7]:
df.to_feather(export_file)
export_file

In [7]:
pd.set_option('display.max_info_rows', int(3E6))
df = pd.read_feather(export_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2459147 entries, 0 to 2459146
Data columns (total 26 columns):
archive                 2459147 non-null object
id                      2459146 non-null object
name                    2459147 non-null object
path                    2459147 non-null object
term: t cell            2459147 non-null bool
term: t lymphocyte      2459147 non-null bool
term: t-cell            2459147 non-null bool
term: t-lymphocyte      2459147 non-null bool
term:cd3                2459147 non-null bool
term:cd4                2459147 non-null bool
term:cd8                2459147 non-null bool
term:cytokine           2459147 non-null bool
term:differentiate      2459147 non-null bool
term:differentiated     2459147 non-null bool
term:differentiation    2459147 non-null bool
term:expression         2459147 non-null bool
term:human              2459147 non-null bool
term:induce             2459147 non-null bool
term:induction          2459147 non-null bool
term:

In [9]:
grps = {
    'cd': ['cd3', 'cd4', 'cd8'],
    'differentiate': ['differentiate', 'differentiated', 'differentiation'],
    'induce': ['induce', 'induction'],
    'mouse': ['mouse', 'murine'],
    'polarize': ['polarization', 'polarize'],
    't cell': [' t cell', ' t lymphocyte', ' t-cell', ' t-lymphocyte']
}
dfg = df.copy()
for k, g in grps.items():
    k = 'term:' + k
    g = ['term:' + v for v in g]
    dfg[k] = dfg[g].any(axis=1)
    dfg = dfg.drop([c for c in g if c != k], axis=1)
dfg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2459147 entries, 0 to 2459146
Data columns (total 16 columns):
archive               2459147 non-null object
id                    2459146 non-null object
name                  2459147 non-null object
path                  2459147 non-null object
term:cytokine         2459147 non-null bool
term:differentiate    2459147 non-null bool
term:expression       2459147 non-null bool
term:human            2459147 non-null bool
term:induce           2459147 non-null bool
term:mouse            2459147 non-null bool
term:phenotype        2459147 non-null bool
term:polarize         2459147 non-null bool
term:surface          2459147 non-null bool
venue                 2459147 non-null object
term:cd               2459147 non-null bool
term:t cell           2459147 non-null bool
dtypes: bool(11), object(5)
memory usage: 119.6+ MB


In [8]:
#dfg.groupby(dfg.filter(regex='term:').columns.tolist()).size()

In [9]:
dfg.groupby(['term:' + c for c in ['t cell', 'human', 'mouse', 'cd']]).size()

term:t cell  term:human  term:mouse  term:cd
False        False       False       False      1712822
                                     True         14809
                         True        False        47241
                                     True          2368
             True        False       False       400022
                                     True         13460
                         True        False       138904
                                     True         18836
True         False       False       False        12775
                                     True         11734
                         True        False         2911
                                     True          3923
             True        False       False        14353
                                     True         16028
                         True        False        16295
                                     True         32666
dtype: int64

In [10]:
dfg.groupby(['term:' + c for c in ['human', 't cell', 'cd', 'phenotype', 'surface']]).size()

term:human  term:t cell  term:cd  term:phenotype  term:surface
False       False        False    False           False           1576049
                                                  True             128187
                                  True            False             46113
                                                  True               9714
                         True     False           False             11337
                                                  True               3062
                                  True            False              1673
                                                  True               1105
            True         False    False           False             11288
                                                  True               2391
                                  True            False              1541
                                                  True                466
                         True     False          

In [11]:
dfg.groupby(['term:' + c for c in [
    'human', 't cell', 'cd', 'differentiate', 'polarize', 'cytokine', 'induce'
]]).size().loc[(True, True, True)]

term:differentiate  term:polarize  term:cytokine  term:induce
False               False          False          False           3695
                                                  True            5803
                                   True           False           1258
                                                  True            9301
                    True           False          False             62
                                                  True             276
                                   True           False             52
                                                  True            1194
True                False          False          False           1169
                                                  True            4189
                                   True           False            845
                                                  True           15273
                    True           False          False             34
               

In [12]:
corpa = {
    'corpus_02': (lambda df: df['term:t cell'] & df['term:human'] & (~df['term:mouse']) & df['term:cd'],),
    'corpus_03': (lambda df: df['term:t cell'] & df['term:human'] & df['term:cd'],)
}

def get_mask(df):
    return corpa[target_corpus][0](df)

In [14]:
target_corpus = 'corpus_02'
mask = get_mask(dfg)
dfe = dfg[mask].drop(dfg.filter(regex='term:').columns.tolist(), axis=1)
mask.value_counts()

False    2443119
True       16028
dtype: int64

In [15]:
target_corpus = 'corpus_03'
mask = get_mask(dfg)
dfe = dfg[mask].drop(dfg.filter(regex='term:').columns.tolist(), axis=1)
mask.value_counts()

False    2410453
True       48694
dtype: int64

In [16]:
dfe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48694 entries, 54 to 2459109
Data columns (total 5 columns):
archive    48694 non-null object
id         48694 non-null object
name       48694 non-null object
path       48694 non-null object
venue      48694 non-null object
dtypes: object(5)
memory usage: 2.2+ MB


In [14]:
dfe.head(3)

Unnamed: 0,archive,id,name,path,venue
54,/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.t...,PMC4075150,PMC4075150.nxml,Alzheimers_Res_Ther/PMC4075150.nxml,Alzheimers_Res_Ther
552,/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.t...,PMC1397804,PMC1397804.nxml,Biol_Direct/PMC1397804.nxml,Biol_Direct
583,/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.t...,PMC1636025,PMC1636025.nxml,Biol_Direct/PMC1636025.nxml,Biol_Direct


In [15]:
dfe['venue'].nunique(), dfe['archive'].nunique()

(2194, 8)

In [16]:
dfe[dfe['id'].notnull()].head(3)

Unnamed: 0,archive,id,name,path,venue
54,/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.t...,PMC4075150,PMC4075150.nxml,Alzheimers_Res_Ther/PMC4075150.nxml,Alzheimers_Res_Ther
552,/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.t...,PMC1397804,PMC1397804.nxml,Biol_Direct/PMC1397804.nxml,Biol_Direct
583,/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.t...,PMC1636025,PMC1636025.nxml,Biol_Direct/PMC1636025.nxml,Biol_Direct


## Export Corpus

In [17]:
import pyarrow.parquet as pq
import pyarrow as pa

def filtered_file_meta_stream(df, meta_stream):
    paths = df['path'].unique()
    for tar, file, meta in meta_stream:
        if meta['path'] in paths:
            yield tar, file, meta

def full_text_stream(file_meta):
    for tar, file, meta in file_meta:
        try:
            text = tar.extractfile(file).read().decode('utf-8', errors='ignore')
            row = {'arch_' + k:v for k, v in meta.items()}
            yield row, text
        except Exception as e:
            raise type(e)(e.message + f'; (archive = {tar.name}, file = {file.path}')

def extract_corpus(stream, output_file, batch_size=1000):
    dfs = []
    writer = None
    
    def flush(dfs, writer):
        dfs = pd.concat(dfs)
        table = pa.Table.from_pandas(dfs, preserve_index=False)
        if writer is None:
            writer = pq.ParquetWriter(output_file, table.schema)
        writer.write_table(table)
        return writer
    
    for row, text in stream:
        df = parse_nxml(text)
        df = df.assign(**{k:v for k, v in row.items() if k.startswith('arch_')})
        
        # Convert to string to avoid issue with all null vs datetime type fields
        for c in df.filter(regex='date_'):
            df[c] = df[c].astype(str)

        if len(df) > 0:
            dfs.append(df)
        if len(dfs) >= batch_size:
            writer = flush(dfs, writer)
            dfs = []
    if len(dfs) > 0:
        writer = flush(dfs, writer)
    if writer is not None:
        writer.close()

In [18]:
output_file = osp.join(DATA_DIR, 'articles', 'import', '20190621', f'{target_corpus}.parquet')
output_file

'/lab/data/articles/import/20190621/corpus_03.parquet'

In [19]:
target_archives = list(dfe['archive'].unique())
target_archives

['/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.C-H.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.I-N.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.O-Z.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.A-B.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.C-H.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.I-N.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.O-Z.xml.tar.gz']

In [20]:
stream = full_text_stream(filtered_file_meta_stream(dfe, file_meta_stream(target_archives)))
extract_corpus(stream, output_file)     

  0%|          | 0/8 [00:00<?, ?it/s]
  0%|          | 0/291368 [00:00<?, ?it/s][A
  1%|          | 3348/291368 [00:10<14:20, 334.77it/s][A
  2%|▏         | 6696/291368 [00:23<15:41, 302.50it/s][A
  2%|▏         | 6696/291368 [00:34<15:41, 302.50it/s][A
  3%|▎         | 8609/291368 [00:34<19:20, 243.64it/s][A
  4%|▍         | 11722/291368 [00:44<17:52, 260.63it/s][A
  4%|▍         | 11722/291368 [00:55<17:52, 260.63it/s][A
  5%|▌         | 15331/291368 [00:55<16:18, 282.12it/s][A
  5%|▌         | 15331/291368 [01:15<16:18, 282.12it/s][A
  6%|▋         | 18801/291368 [01:15<19:04, 238.16it/s][A
  6%|▋         | 18801/291368 [01:25<19:04, 238.16it/s][A
  7%|▋         | 20155/291368 [01:25<23:31, 192.10it/s][A
  8%|▊         | 22036/291368 [01:35<23:33, 190.50it/s][A
  9%|▉         | 25501/291368 [01:45<20:07, 220.25it/s][A
 10%|█         | 29701/291368 [01:55<16:58, 256.90it/s][A
 12%|█▏        | 33927/291368 [02:05<14:44, 291.14it/s][A
 13%|█▎        | 38570/291368 [02:1

In [27]:
pf = pq.ParquetFile(output_file)
pf.num_row_groups

49

In [25]:
pf.read_row_group(0).to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
abstract          985 non-null object
body              1000 non-null object
date_accepted     1000 non-null object
date_pub          1000 non-null object
date_received     1000 non-null object
id_doi            1000 non-null object
id_pmc            1000 non-null object
id_pmid           987 non-null object
journal_ids       1000 non-null object
journal_titles    1000 non-null object
title             1000 non-null object
arch_id           1000 non-null object
arch_path         1000 non-null object
arch_venue        1000 non-null object
arch_name         1000 non-null object
arch_archive      1000 non-null object
dtypes: object(16)
memory usage: 125.1+ KB


In [36]:
dfcts = [pf.read_row_group(i).to_pandas().notnull().sum() for i in range(pf.num_row_groups)]
dfcts = pd.concat(dfcts, axis=1).sum(axis=1)
dfcts

abstract          46886
body              48304
date_accepted     48694
date_pub          48694
date_received     48694
id_doi            46623
id_pmc            48694
id_pmid           47880
journal_ids       48694
journal_titles    48694
title             48692
arch_id           48694
arch_path         48694
arch_venue        48694
arch_name         48694
arch_archive      48694
dtype: int64

In [37]:
assert dfcts.max() == len(dfe), f'Expecting {len(dfe)} documents, found {dfcts.max()}'

### Dev

In [1]:
tar = tarfile.open(osp.join(oadir, 'comm_use.C-H.xml.tar.gz'), "r:gz")
text = tar.extractfile(tar.getmember('Front_Immunol/PMC6546853.nxml')).read().decode('utf-8', errors='ignore')
text[:10000]