In [1]:
import os
import os.path as osp
import glob
import tqdm
import tarfile
import pandas as pd
import re
%run env.py
%run src/integration.py

In [2]:
# Directory that holds the PMC Open Access bulk archives.
# DATA_DIR is not defined in this notebook; presumably it comes from `%run env.py` (TODO confirm).
oadir = osp.join(DATA_DIR, 'pmc_oa', 'bulk', 'files')
oadir

'/lab/data/pmc_oa/bulk/files'

In [3]:
# Collect every bulk XML archive in `oadir`.
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to keep the processing order -- and therefore the row order of
# every frame built from it -- reproducible across machines and runs.
archives = sorted(glob.glob(osp.join(oadir, '*.xml.tar.gz')))
archives

['/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.C-H.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.I-N.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.O-Z.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.A-B.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.C-H.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.I-N.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.O-Z.xml.tar.gz']

In [4]:
def parse_path(p, archive):
    """Describe one archive member by splitting its path into venue and file name.

    Args:
        p: Member path inside the tar archive, using '/' as separator.
        archive: Path of the archive the member belongs to; stored unchanged.

    Returns:
        dict with keys ``id``, ``path``, ``venue``, ``name``, ``archive``.
        ``venue`` is everything before the last '/', ``name`` everything after
        it. ``id`` is the file name with '.txt' and '.nxml' removed when the
        name starts with 'PMC', otherwise None.
    """
    # The venue part can itself contain slashes, e.g.
    # Curr_HIV/AIDS_Rep/Curr_HIV_AIDS_Rep_2014_Dec_4_11_487-495.txt
    venue, _, name = p.rpartition('/')
    if name.startswith('PMC'):
        aid = name.replace('.txt', '').replace('.nxml', '')
    else:
        aid = None
    return {'id': aid, 'path': p, 'venue': venue, 'name': name, 'archive': archive}

def file_meta_stream(archives):
    """Yield ``(tar, member, meta)`` for every regular file in each archive.

    Args:
        archives: Iterable of paths to gzip-compressed tar archives.

    Yields:
        Tuples of the open ``TarFile``, the ``TarInfo`` member, and the dict
        produced by ``parse_path(member.path, archive_path)``.

    Note:
        Each archive is closed as soon as the consumer asks for the first item
        of the next archive (or the generator is exhausted/closed), so a
        yielded ``tar`` handle must be read from before advancing past the
        last member of its archive. Previously the handles were never closed.
    """
    for a in tqdm.tqdm(archives):
        # The context manager releases the file handle once this archive's
        # members have all been consumed, instead of leaking one per archive.
        with tarfile.open(a, "r:gz") as tar:
            # getmembers() returns the full member list; directories and other
            # non-file entries are filtered out here.
            files = [f for f in tar.getmembers() if f.isfile()]
            for f in tqdm.tqdm(files, mininterval=10):
                yield tar, f, parse_path(f.path, a)
                
# Patterns used to pull the article body and abstract out of the raw XML.
# NOTE(review): without re.DOTALL, `.` does not match newlines, so an element
# spanning several lines is not matched; tags carrying attributes (e.g.
# `<abstract abstract-type=...>`) are not matched either -- confirm this is intended.
BODY_REGEX = r'<body>.*</body>'
ABSTRACT_REGEX = r'<abstract>.*</abstract>'

def doc_stream(file_meta):
    """Yield ``(meta, text)`` where text is the matched body plus abstract markup.

    Args:
        file_meta: Iterable of ``(tar, file, meta)`` tuples; ``tar`` must provide
            ``extractfile`` and ``name``, ``file`` must provide ``path``.

    Yields:
        ``(meta, text)``; ``text`` is the first BODY_REGEX match followed by the
        first ABSTRACT_REGEX match, each replaced by '' when there is no match.

    Raises:
        An exception of the same type as the underlying failure when that type
        can be built from a single message (otherwise RuntimeError), with the
        archive name and member path appended and the original error chained.
    """
    for tar, file, meta in file_meta:
        try:
            # Undecodable bytes are dropped rather than failing the whole file.
            xml = tar.extractfile(file).read().decode('utf-8', errors='ignore')
            body = re.findall(BODY_REGEX, xml) or ['']
            abstract = re.findall(ABSTRACT_REGEX, xml) or ['']
            text = body[0] + abstract[0]
        except Exception as e:
            # Python 3 exceptions have no `.message`; use str(e) so the handler
            # reports the real failure instead of raising AttributeError.
            detail = str(e) + f'; (archive = {tar.name}, file = {file.path})'
            try:
                wrapped = type(e)(detail)
            except Exception:
                # Some exception types cannot be constructed from one string.
                wrapped = RuntimeError(detail)
            raise wrapped from e
        # Yield outside the try block so errors are attributed to extraction only.
        yield meta, text

# Lower-case substrings looked up in each document. The leading space on the
# "t cell" variants is part of the search string.
SEARCH_TERMS = [
    'human', 'mouse', 'murine', 
    ' t cell', ' t-cell', ' t lymphocyte', ' t-lymphocyte',
    'cd3', 'cd4', 'cd8', 
    'expression', 'cytokine', 'phenotype', 'surface',
    'differentiate', 'differentiation', 'differentiated',
    'polarization', 'polarize', 'induce', 'induction'
]

def row_stream(docs, terms=SEARCH_TERMS):
    """Yield one flat record per document with a boolean flag per search term.

    Args:
        docs: Iterable of ``(meta, text)`` pairs, ``meta`` being a mapping.
        terms: Substrings to look for in the lower-cased text. Defaults to
            SEARCH_TERMS. They are compared as given, so they should be
            lower-case.

    Yields:
        A copy of ``meta`` extended with ``'term:<t>'`` keys whose value is
        True when ``t`` occurs in the lower-cased text.
    """
    for meta, text in docs:
        row = dict(meta)
        ltext = text.lower()
        # Iterate the `terms` argument; the global list was used here before,
        # which silently ignored any caller-supplied terms.
        for t in terms:
            row[f'term:{t}'] = t in ltext
        yield row

In [5]:
# Chain the generators: archive members -> extracted text -> per-term flags.
# list() drains the whole pipeline, so this cell reads every file of every archive.
stream = row_stream(doc_stream(file_meta_stream(archives)))
rows = list(stream)
len(rows)

  0%|          | 0/8 [00:00<?, ?it/s]
  0%|          | 0/291368 [00:00<?, ?it/s][A
  4%|▍         | 11888/291368 [00:10<03:55, 1188.75it/s][A
  8%|▊         | 23776/291368 [00:25<04:19, 1032.43it/s][A
 12%|█▏        | 33816/291368 [00:35<04:11, 1023.72it/s][A
 16%|█▌        | 46395/291368 [00:45<03:45, 1084.26it/s][A
 20%|██        | 58974/291368 [00:57<03:39, 1059.29it/s][A
 24%|██▎       | 69027/291368 [01:07<03:34, 1038.16it/s][A
 24%|██▎       | 69027/291368 [01:18<03:34, 1038.16it/s][A
 27%|██▋       | 78202/291368 [01:18<03:37, 979.87it/s] [A
 31%|███       | 89800/291368 [01:28<03:16, 1027.68it/s][A
 34%|███▍      | 99186/291368 [01:38<03:12, 999.11it/s] [A
 34%|███▍      | 99186/291368 [01:38<03:12, 999.11it/s][A
 38%|███▊      | 110402/291368 [01:48<02:55, 1032.94it/s][A
 42%|████▏     | 122115/291368 [01:58<02:38, 1070.82it/s][A
 45%|████▌     | 131651/291368 [02:08<02:34, 1032.58it/s][A
 45%|████▌     | 131651/291368 [02:08<02:34, 1032.58it/s][A
 45%|████▌   

 36%|███▌      | 193033/542020 [03:04<06:33, 885.88it/s][A
 38%|███▊      | 206218/542020 [03:14<05:41, 982.59it/s][A
 38%|███▊      | 206218/542020 [03:24<05:41, 982.59it/s][A
 40%|████      | 219274/542020 [03:24<05:04, 1061.19it/s][A
 43%|████▎     | 232325/542020 [03:34<04:35, 1124.21it/s][A
 43%|████▎     | 232325/542020 [03:44<04:35, 1124.21it/s][A
 45%|████▌     | 245302/542020 [03:44<04:13, 1170.96it/s][A
 48%|████▊     | 258541/542020 [03:54<03:53, 1212.99it/s][A
 48%|████▊     | 258541/542020 [04:04<03:53, 1212.99it/s][A
 49%|████▉     | 267795/542020 [04:04<04:07, 1109.21it/s][A
 51%|█████     | 274946/542020 [04:14<04:40, 951.66it/s] [A
 51%|█████     | 274946/542020 [04:14<04:40, 951.66it/s][A
 51%|█████     | 274946/542020 [04:24<04:40, 951.66it/s][A
 52%|█████▏    | 282068/542020 [04:24<05:00, 864.26it/s][A
 54%|█████▎    | 290185/542020 [04:34<04:57, 847.77it/s][A
 55%|█████▌    | 298865/542020 [04:44<04:44, 853.73it/s][A
 57%|█████▋    | 308249/542020 [

2459147

In [6]:
# Raise pandas' row limit for the null-count check so df.info() still reports
# non-null counts for a frame with millions of rows.
pd.set_option('display.max_info_rows', 10000000)
# One row per archive member: parse_path metadata plus one boolean column per term.
df = pd.DataFrame(rows)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2459147 entries, 0 to 2459146
Data columns (total 26 columns):
archive                 2459147 non-null object
id                      2459146 non-null object
name                    2459147 non-null object
path                    2459147 non-null object
term: t cell            2459147 non-null bool
term: t lymphocyte      2459147 non-null bool
term: t-cell            2459147 non-null bool
term: t-lymphocyte      2459147 non-null bool
term:cd3                2459147 non-null bool
term:cd4                2459147 non-null bool
term:cd8                2459147 non-null bool
term:cytokine           2459147 non-null bool
term:differentiate      2459147 non-null bool
term:differentiated     2459147 non-null bool
term:differentiation    2459147 non-null bool
term:expression         2459147 non-null bool
term:human              2459147 non-null bool
term:induce             2459147 non-null bool
term:induction          2459147 non-null bool
term:

### Export and Reload

In [7]:
# Write the per-file term flags to a Feather file next to the bulk archives.
export_file = osp.join(DATA_DIR, 'pmc_oa', 'bulk', 'file_meta.feather')
df.to_feather(export_file)
export_file

'/lab/data/pmc_oa/bulk/file_meta.feather'

In [8]:
# Reload the exported table; this rebinds `df` to the on-disk copy.
df = pd.read_feather(export_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2459147 entries, 0 to 2459146
Data columns (total 26 columns):
archive                 2459147 non-null object
id                      2459146 non-null object
name                    2459147 non-null object
path                    2459147 non-null object
term: t cell            2459147 non-null bool
term: t lymphocyte      2459147 non-null bool
term: t-cell            2459147 non-null bool
term: t-lymphocyte      2459147 non-null bool
term:cd3                2459147 non-null bool
term:cd4                2459147 non-null bool
term:cd8                2459147 non-null bool
term:cytokine           2459147 non-null bool
term:differentiate      2459147 non-null bool
term:differentiated     2459147 non-null bool
term:differentiation    2459147 non-null bool
term:expression         2459147 non-null bool
term:human              2459147 non-null bool
term:induce             2459147 non-null bool
term:induction          2459147 non-null bool
term:

In [9]:
# Synonym groups: each key names a combined flag, each value lists the
# per-term columns (without the 'term:' prefix) that are OR-ed into it.
grps = {
    'cd': ['cd3', 'cd4', 'cd8'],
    'differentiate': ['differentiate', 'differentiated', 'differentiation'],
    'induce': ['induce', 'induction'],
    'mouse': ['mouse', 'murine'],
    'polarize': ['polarization', 'polarize'],
    't cell': [' t cell', ' t lymphocyte', ' t-cell', ' t-lymphocyte']
}
# Work on a copy so `df` keeps the ungrouped per-term columns.
dfg = df.copy()
for k, g in grps.items():
    k = 'term:' + k
    g = ['term:' + v for v in g]
    # Combined flag is True when any member term was found in the document.
    dfg[k] = dfg[g].any(axis=1)
    # Drop the member columns, but keep the combined one when it shares a name
    # with a member (e.g. 'term:mouse', 'term:induce').
    dfg = dfg.drop([c for c in g if c != k], axis=1)
dfg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2459147 entries, 0 to 2459146
Data columns (total 16 columns):
archive               2459147 non-null object
id                    2459146 non-null object
name                  2459147 non-null object
path                  2459147 non-null object
term:cytokine         2459147 non-null bool
term:differentiate    2459147 non-null bool
term:expression       2459147 non-null bool
term:human            2459147 non-null bool
term:induce           2459147 non-null bool
term:mouse            2459147 non-null bool
term:phenotype        2459147 non-null bool
term:polarize         2459147 non-null bool
term:surface          2459147 non-null bool
venue                 2459147 non-null object
term:cd               2459147 non-null bool
term:t cell           2459147 non-null bool
dtypes: bool(11), object(5)
memory usage: 119.6+ MB


In [18]:
#dfg.groupby(dfg.filter(regex='term:').columns.tolist()).size()

In [10]:
# Count documents for every True/False combination of the four grouped flags.
dfg.groupby(['term:' + c for c in ['t cell', 'human', 'mouse', 'cd']]).size()

term:t cell  term:human  term:mouse  term:cd
False        False       False       False      1712822
                                     True         14809
                         True        False        47241
                                     True          2368
             True        False       False       400022
                                     True         13460
                         True        False       138904
                                     True         18836
True         False       False       False        12775
                                     True         11734
                         True        False         2911
                                     True          3923
             True        False       False        14353
                                     True         16028
                         True        False        16295
                                     True         32666
dtype: int64

In [11]:
# Same kind of breakdown, now including the 'phenotype' and 'surface' flags.
dfg.groupby(['term:' + c for c in ['human', 't cell', 'cd', 'phenotype', 'surface']]).size()

term:human  term:t cell  term:cd  term:phenotype  term:surface
False       False        False    False           False           1576049
                                                  True             128187
                                  True            False             46113
                                                  True               9714
                         True     False           False             11337
                                                  True               3062
                                  True            False              1673
                                                  True               1105
            True         False    False           False             11288
                                                  True               2391
                                  True            False              1541
                                                  True                466
                         True     False          

In [12]:
# Group by seven flags, then use .loc[(True, True, True)] to keep only the
# groups whose first three keys (human, t cell, cd) are all True; the result
# is indexed by the remaining four flags.
dfg.groupby(['term:' + c for c in [
    'human', 't cell', 'cd', 'differentiate', 'polarize', 'cytokine', 'induce'
]]).size().loc[(True, True, True)]

term:differentiate  term:polarize  term:cytokine  term:induce
False               False          False          False           3695
                                                  True            5803
                                   True           False           1258
                                                  True            9301
                    True           False          False             62
                                                  True             276
                                   True           False             52
                                                  True            1194
True                False          False          False           1169
                                                  True            4189
                                   True           False            845
                                                  True           15273
                    True           False          False             34
               

In [13]:
def get_mask(df):
    """Select documents that mention T cells, human and a CD marker, but not mouse.

    Args:
        df: Frame with boolean columns 'term:t cell', 'term:human',
            'term:mouse' and 'term:cd'.

    Returns:
        Boolean Series aligned with ``df``; True for rows to keep.
    """
    human_t_cell = df['term:t cell'] & df['term:human']
    not_mouse = ~df['term:mouse']
    # Alternative kept from the original cell: drop the `not_mouse` condition
    # to also accept documents that mention mouse.
    return human_t_cell & not_mouse & df['term:cd']
# Boolean selector over the grouped frame.
mask = get_mask(dfg)
# Keep the selected rows and drop every 'term:' flag column, leaving only the
# file metadata columns.
dfe = dfg[mask].drop(dfg.filter(regex='term:').columns.tolist(), axis=1)
mask.value_counts()

False    2443119
True       16028
dtype: int64

In [14]:
# Column summary of the filtered frame (the term flags were dropped when dfe was built).
dfe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16028 entries, 583 to 2459096
Data columns (total 5 columns):
archive    16028 non-null object
id         16028 non-null object
name       16028 non-null object
path       16028 non-null object
venue      16028 non-null object
dtypes: object(5)
memory usage: 751.3+ KB


In [15]:
# Preview the first rows of the filtered frame.
dfe.head(3)

Unnamed: 0,archive,id,name,path,venue
583,/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.t...,PMC1636025,PMC1636025.nxml,Biol_Direct/PMC1636025.nxml,Biol_Direct
618,/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.t...,PMC2390528,PMC2390528.nxml,Biol_Direct/PMC2390528.nxml,Biol_Direct
773,/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.t...,PMC3203257,PMC3203257.nxml,Biol_Direct/PMC3203257.nxml,Biol_Direct


In [16]:
# Number of distinct venues and distinct source archives among the selected files.
dfe['venue'].nunique(), dfe['archive'].nunique()

(1600, 8)

In [17]:
# Same preview, restricted to rows whose `id` is not null.
dfe[dfe['id'].notnull()].head(3)

Unnamed: 0,archive,id,name,path,venue
583,/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.t...,PMC1636025,PMC1636025.nxml,Biol_Direct/PMC1636025.nxml,Biol_Direct
618,/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.t...,PMC2390528,PMC2390528.nxml,Biol_Direct/PMC2390528.nxml,Biol_Direct
773,/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.t...,PMC3203257,PMC3203257.nxml,Biol_Direct/PMC3203257.nxml,Biol_Direct


## Export Corpus

In [34]:
import pyarrow.parquet as pq
import pyarrow as pa

def filtered_file_meta_stream(df, meta_stream):
    """Pass through only the archive members whose path appears in ``df['path']``.

    Args:
        df: Frame with a 'path' column listing the member paths to keep.
        meta_stream: Iterable of ``(tar, file, meta)`` tuples where ``meta`` is
            a mapping with a 'path' key.

    Yields:
        The ``(tar, file, meta)`` tuples whose ``meta['path']`` is in ``df['path']``,
        in their original order.
    """
    # A set gives constant-time membership tests; testing against the array
    # returned by .unique() scans every wanted path for each archive member.
    paths = set(df['path'].unique())
    for tar, file, meta in meta_stream:
        if meta['path'] in paths:
            yield tar, file, meta

def full_text_stream(file_meta):
    """Yield ``(row, text)`` with the full decoded content of each archive member.

    Args:
        file_meta: Iterable of ``(tar, file, meta)`` tuples; ``tar`` must provide
            ``extractfile`` and ``name``, ``file`` must provide ``path``, and
            ``meta`` must be a mapping.

    Yields:
        ``(row, text)`` where ``row`` is ``meta`` with every key prefixed by
        'arch_' and ``text`` is the member decoded as UTF-8 (undecodable bytes
        are dropped).

    Raises:
        An exception of the same type as the underlying failure when that type
        can be built from a single message (otherwise RuntimeError), with the
        archive name and member path appended and the original error chained.
    """
    for tar, file, meta in file_meta:
        try:
            text = tar.extractfile(file).read().decode('utf-8', errors='ignore')
            row = {'arch_' + k: v for k, v in meta.items()}
        except Exception as e:
            # Python 3 exceptions have no `.message`; use str(e) so the handler
            # reports the real failure instead of raising AttributeError.
            detail = str(e) + f'; (archive = {tar.name}, file = {file.path})'
            try:
                wrapped = type(e)(detail)
            except Exception:
                # Some exception types cannot be constructed from one string.
                wrapped = RuntimeError(detail)
            raise wrapped from e
        # Yield outside the try block so errors are attributed to extraction only.
        yield row, text

def extract_corpus(stream, output_file, batch_size=1000):
    """Parse each document and append the results to a Parquet file in batches.

    Args:
        stream: Iterable of ``(row, text)`` pairs; ``row`` is a mapping whose
            'arch_'-prefixed entries are attached to every parsed record and
            ``text`` is passed to ``parse_nxml``.
        output_file: Destination path handed to ``pq.ParquetWriter``.
        batch_size: Number of non-empty parsed frames to accumulate before
            writing them to the file.

    Returns:
        None. No file is created when nothing is parsed.

    Note:
        The writer is closed in a ``finally`` block, so an error or interrupt
        part-way through still finalizes the file with the batches written so
        far; previously the writer was only closed after a complete run.
    """
    pending = []
    writer = None

    def flush(frames):
        """Write the accumulated frames as one table, creating the writer lazily."""
        nonlocal writer
        batch = pd.concat(frames)
        table = pa.Table.from_pandas(batch, preserve_index=False)
        if writer is None:
            # The schema of the first batch becomes the schema of the file.
            # Binding it before write_table lets the outer `finally` close it
            # even if the write itself fails.
            writer = pq.ParquetWriter(output_file, table.schema)
        writer.write_table(table)

    try:
        for row, text in stream:
            # parse_nxml is defined outside this notebook cell; it is used here
            # as returning a DataFrame (TODO confirm against src/integration.py).
            df = parse_nxml(text)
            df = df.assign(**{k: v for k, v in row.items() if k.startswith('arch_')})

            # Convert to string to avoid issue with all null vs datetime type fields
            for c in df.filter(regex='date_'):
                df[c] = df[c].astype(str)

            if len(df) > 0:
                pending.append(df)
            if len(pending) >= batch_size:
                flush(pending)
                pending = []
        # Write whatever is left over from the last, partial batch.
        if len(pending) > 0:
            flush(pending)
    finally:
        if writer is not None:
            writer.close()

In [35]:
# Destination of the exported corpus; the directory name carries an import date stamp.
output_file = osp.join(DATA_DIR, 'articles', 'import', '20190621', 'corpus.parquet')
output_file

'/lab/data/articles/import/20190621/corpus.parquet'

In [36]:
# Only archives that contain at least one selected file need to be read again.
target_archives = list(dfe['archive'].unique())
target_archives

['/lab/data/pmc_oa/bulk/files/comm_use.A-B.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.C-H.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.I-N.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/comm_use.O-Z.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.A-B.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.C-H.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.I-N.xml.tar.gz',
 '/lab/data/pmc_oa/bulk/files/non_comm_use.O-Z.xml.tar.gz']

In [37]:
# Second pass over the archives: keep only the selected paths, read their full
# text, and write the parsed records to `output_file`. This rebinds `stream`.
stream = full_text_stream(filtered_file_meta_stream(dfe, file_meta_stream(target_archives)))
extract_corpus(stream, output_file)     



  0%|          | 0/8 [00:00<?, ?it/s][A[A


  0%|          | 0/291368 [00:00<?, ?it/s][A[A[A


  2%|▏         | 5864/291368 [00:10<08:19, 571.55it/s][A[A[A


  2%|▏         | 5864/291368 [00:24<08:19, 571.55it/s][A[A[A


  4%|▎         | 10351/291368 [00:24<10:11, 459.65it/s][A[A[A


  5%|▌         | 15598/291368 [00:34<09:38, 476.53it/s][A[A[A


  7%|▋         | 20810/291368 [00:48<10:21, 435.51it/s][A[A[A


 10%|▉         | 27706/291368 [00:58<08:58, 489.63it/s][A[A[A


 12%|█▏        | 35841/291368 [01:10<07:51, 542.19it/s][A[A[A


 15%|█▌        | 45028/291368 [01:20<06:38, 618.19it/s][A[A[A


 15%|█▌        | 45028/291368 [01:34<06:38, 618.19it/s][A[A[A


 18%|█▊        | 52534/291368 [01:34<06:50, 581.29it/s][A[A[A


 20%|█▉        | 57637/291368 [01:45<07:01, 554.29it/s][A[A[A


 21%|██▏       | 62638/291368 [02:03<09:05, 418.95it/s][A[A[A


 24%|██▍       | 69350/291368 [02:13<07:50, 472.18it/s][A[A[A


 27%|██▋       | 77821/291368

 90%|█████████ | 303658/336804 [11:09<00:47, 692.06it/s][A[A[A


 95%|█████████▌| 320671/336804 [11:19<00:19, 841.88it/s][A[A[A


100%|██████████| 336804/336804 [11:30<00:00, 487.91it/s][A[A[A

 25%|██▌       | 2/8 [24:41<1:08:39, 686.61s/it][A[A


  0%|          | 0/343696 [00:00<?, ?it/s][A[A[A


  2%|▏         | 6050/343696 [00:10<09:21, 601.06it/s][A[A[A


  4%|▎         | 12061/343696 [00:20<09:18, 593.61it/s][A[A[A


  4%|▎         | 12061/343696 [00:31<09:18, 593.61it/s][A[A[A


  4%|▍         | 14362/343696 [00:31<14:13, 385.73it/s][A[A[A


  5%|▍         | 16485/343696 [00:42<18:22, 296.87it/s][A[A[A


  6%|▌         | 20671/343696 [00:52<16:33, 325.24it/s][A[A[A


  8%|▊         | 28953/343696 [01:02<13:11, 397.70it/s][A[A[A


 11%|█         | 37235/343696 [01:12<10:57, 466.31it/s][A[A[A


 13%|█▎        | 45041/343696 [01:25<09:55, 501.20it/s][A[A[A


 15%|█▍        | 51114/343696 [01:36<09:26, 516.42it/s][A[A[A


 17%|█▋        | 

 42%|████▏     | 227252/542020 [07:55<09:48, 534.91it/s][A[A[A


 43%|████▎     | 232942/542020 [07:55<09:32, 540.27it/s][A[A[A


 44%|████▍     | 239215/542020 [08:05<08:58, 562.52it/s][A[A[A


 45%|████▌     | 245439/542020 [08:17<08:55, 554.08it/s][A[A[A


 46%|████▋     | 251451/542020 [08:27<08:32, 567.42it/s][A[A[A


 48%|████▊     | 257837/542020 [08:37<08:04, 587.04it/s][A[A[A


 49%|████▊     | 264223/542020 [08:49<08:04, 573.88it/s][A[A[A


 50%|████▉     | 269677/542020 [09:02<08:45, 518.52it/s][A[A[A


 51%|█████     | 273910/542020 [09:12<09:13, 484.05it/s][A[A[A


 51%|█████▏    | 278397/542020 [09:22<09:27, 464.75it/s][A[A[A


 52%|█████▏    | 283090/542020 [09:32<09:15, 466.10it/s][A[A[A


 52%|█████▏    | 283090/542020 [09:45<09:15, 466.10it/s][A[A[A


 53%|█████▎    | 285865/542020 [09:45<12:23, 344.55it/s][A[A[A


 53%|█████▎    | 285865/542020 [09:55<12:23, 344.55it/s][A[A[A


 53%|█████▎    | 286772/542020 [09:55<23:03, 184

 98%|█████████▊| 211507/215696 [05:02<00:04, 894.44it/s][A[A[A


100%|██████████| 215696/215696 [05:05<00:00, 706.71it/s][A[A[A

 75%|███████▌  | 6/8 [1:12:28<20:49, 624.87s/it][A[A


  0%|          | 0/426134 [00:00<?, ?it/s][A[A[A


  1%|          | 4430/426134 [00:10<15:57, 440.35it/s][A[A[A


  2%|▏         | 10340/426134 [00:20<14:35, 474.83it/s][A[A[A


  4%|▍         | 16744/426134 [00:30<13:20, 511.39it/s][A[A[A


  5%|▌         | 22978/426134 [00:42<13:00, 516.46it/s][A[A[A


  7%|▋         | 31024/426134 [00:52<11:24, 577.51it/s][A[A[A


  9%|▉         | 40020/426134 [01:02<10:01, 642.25it/s][A[A[A


 11%|█▏        | 48718/426134 [01:20<10:36, 592.55it/s][A[A[A


 13%|█▎        | 56416/426134 [01:30<09:41, 635.34it/s][A[A[A


 15%|█▌        | 64313/426134 [01:40<08:56, 674.91it/s][A[A[A


 17%|█▋        | 72210/426134 [01:50<08:30, 693.01it/s][A[A[A


 19%|█▊        | 79756/426134 [02:00<08:07, 710.41it/s][A[A[A


 19%|█▊        | 7

In [24]:
# Open the exported corpus for row-group-level access.
# NOTE(review): the recorded output of this cell is an ArrowIOError ("Corrupt
# footer"); presumably the file was opened before the writer had been closed -- TODO confirm.
pf = pq.ParquetFile(output_file)
#pf = pq.read_pandas(output_file)

ArrowIOError: Invalid parquet file. Corrupt footer.

In [189]:
# Load a single row group (index 1) as a spot check of the exported records.
dftmp = pf.read_row_group(1).to_pandas()
dftmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 899 entries, 0 to 898
Data columns (total 16 columns):
abstract          856 non-null object
body              899 non-null object
date_accepted     804 non-null datetime64[ns]
date_pub          899 non-null datetime64[ns]
date_received     794 non-null datetime64[ns]
id_doi            862 non-null object
id_pmc            899 non-null object
id_pmid           881 non-null object
journal_ids       899 non-null object
journal_titles    899 non-null object
title             899 non-null object
arch_id           899 non-null object
arch_path         899 non-null object
arch_venue        899 non-null object
arch_name         899 non-null object
arch_archive      899 non-null object
dtypes: datetime64[ns](3), object(13)
memory usage: 112.5+ KB


In [192]:
# Inspect the body text of the first record in the sampled row group.
dftmp.iloc[0]['body']

"BackgroundGlobal gene expression has been used successfully to elicit biological behavior in different soft tissue tumors. Pterygium as a human disease, noted to be more prevalent than 20% of some populations, is of immense biological interest for a few reasons.First, the pathogenesis of this condition is hotly debated. Hypothesis driven approaches have not resolved the relative importance of competing mechanisms for this disease. Theories that have been proposed include inflammatory influence, degeneration of connective tissue, genetic instability, angiogenesis, redox-related toxicity, cellular proliferation, aberration of apoptosis, exuberant wound healing, altered lipid metabolism, mast cell infiltration. and stem cell dysfunction. Conventional approaches to disease mechanism, by virtue of their narrow focus, were not helpful to assess relative contribution of widely heterogenous processes. Furthermore, a fundamental issue about the diseased tissue remains un-resolved in this conte

In [None]:
#df[df['id'] == 'PMC5052263']

### Dev

In [14]:
# Dev: open one archive directly and pull the raw XML of a single article.
# Note that this rebinds the notebook-level names `tar` and `text`.
tar = tarfile.open(osp.join(oadir, 'comm_use.C-H.xml.tar.gz'), "r:gz")
text = tar.extractfile(tar.getmember('Front_Immunol/PMC6546853.nxml')).read().decode('utf-8', errors='ignore')
text

'<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">\n<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" article-type="research-article"><?properties open_access?><front><journal-meta><journal-id journal-id-type="nlm-ta">Front Immunol</journal-id><journal-id journal-id-type="iso-abbrev">Front Immunol</journal-id><journal-id journal-id-type="publisher-id">Front. Immunol.</journal-id><journal-title-group><journal-title>Frontiers in Immunology</journal-title></journal-title-group><issn pub-type="epub">1664-3224</issn><publisher><publisher-name>Frontiers Media S.A.</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="pmid">31191530</article-id><article-id pub-id-type="pmc">6546853</article-id><article-id pub-id-type="doi">10.3389/fimmu.2019.01148</article-id><article-categories><subj-group subj-group-type="heading"><subject>Immuno

In [108]:
# Re-run the integration script; presumably to pick up edits to parse_nxml (TODO confirm).
%run src/integration.py

In [109]:
# Parse the sample article. parse_nxml is not defined in this notebook;
# presumably it comes from src/integration.py (TODO confirm).
dft = parse_nxml(text)

In [110]:
# Column summary of the parsed sample article.
dft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 11 columns):
abstract          1 non-null object
body              1 non-null object
date_accepted     1 non-null datetime64[ns]
date_pub          1 non-null datetime64[ns]
date_received     1 non-null datetime64[ns]
id_doi            1 non-null object
id_pmc            1 non-null object
id_pmid           1 non-null object
journal_ids       1 non-null object
journal_titles    1 non-null object
title             1 non-null object
dtypes: datetime64[ns](3), object(8)
memory usage: 168.0+ bytes


In [112]:
# Show the journal_ids column of the parsed sample.
dft['journal_ids']

0    Front. Immunol.||Front Immunol
Name: journal_ids, dtype: object

In [97]:
# Show the body value of the parsed sample.
dft['body'].iloc[0]

"IntroductionSevere viral infections have a major impact on the clinical course of immunocompromised patients. Despite availability of powerful antiviral medication, cytomegalovirus (CMV) still accounts for significant morbidity and mortality in solid organ transplant (SOT) recipients. CMV can trigger direct and indirect morbidities such as chronic allograft rejection or in the case of kidney transplantation (KTx) chronic nephropathy. Therapeutic control of CMV may be hampered by the development of anti-viral drug resistance. Moreover, after discontinuation of anti-viral prophylaxis, late-onset CMV disease frequently occurs and overall mortality is significantly higher in CMV-infected compared to uninfected KTx patients. Of note, T-cell-mediated anti-CMV immunity was reported to be predictive for the development of late-onset disease  and anti-CMVIE−1-specific CD8+ T-cell responses stratify risk of CMV disease in heart and lung transplant as well as KTx patients. In addition, the magni

In [6]:
# Display the parsed frame.
# NOTE(review): the recorded output lists different columns than the dft.info()
# output shown earlier in this section, so it is presumably from an older run -- TODO confirm.
dft

Unnamed: 0,abstract,date,has_text,id,title,xml
0,Viral infections have a major impact on morbid...,2019-01-08,True,6546853,Comprehensive Characterization of a Next-Gener...,"<article article-type=""research-article"" xmlns..."


In [7]:
# import itertools
# stream = row_stream(doc_stream(file_meta_stream(archives)))
# rows = list(itertools.islice(stream, 5))

In [11]:
#tar = tarfile.open(osp.join(oadir, 'non_comm_use.0-9A-B.txt.tar.gz'), "r:gz")
#/lab/data/pmc_oa/bulk/files/comm_use.0-9A-B.txt.tar.gz, file BMC_Cancer/PMC5481910.txt
#tar = tarfile.open(osp.join(oadir, 'comm_use.0-9A-B.txt.tar.gz'), "r:gz")

In [22]:
# f = tar.getmember('BMC_Cancer/PMC5481910.txt')
# f.path

In [23]:
# arr = tar.extractfile(f).read()
# len(arr)

In [25]:
#arr.decode('utf-8', errors='strict')

In [None]:
# tar = tarfile.open(osp.join(oadir, 'non_comm_use.0-9A-B.txt.tar.gz'), "r:gz")
# files = tar.getmembers()
# pd.Series([f.name for f in files]).sample(35).tolist()

In [62]:
#tar.extractfile([f for f in files if f.name == 'Am_J_Respir_Crit_Care_Med/PMC6444650.txt'][0]).read()
#tar.extractfile([f for f in files if f.name == 'Bioinformatics/PMC6084620.txt'][0]).read().decode('utf-8')