# Preprocess Cochrane abstracts and references
- Load raw CSVs produced by obtain_cochrane_abstracts.ipynb.
- Basic validation (row counts, nulls, duplicates).
- Light cleaning: strip and collapse whitespace, drop empty abstracts and duplicate PMIDs.
- Save cleaned parquet files for faster I/O.
- Write metadata JSON (query, pull date, counts, paths).

In [1]:
import json
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd

# Locations of every file this notebook reads or writes.
# The project root is the parent of the current working directory, made absolute.
project_root = Path('..').resolve()
data_dir = project_root.joinpath('Data')

# Inputs: raw CSV exports.
raw_abs = data_dir.joinpath('cochrane_pubmed_abstracts.csv')
raw_refs = data_dir.joinpath('cochrane_pubmed_references.csv')

# Outputs: cleaned parquet files and the metadata JSON.
clean_abs = data_dir.joinpath('cochrane_pubmed_abstracts_clean.parquet')
clean_refs = data_dir.joinpath('cochrane_pubmed_references_clean.parquet')
meta_path = data_dir.joinpath('cochrane_pubmed_metadata.json')

# Displayed as the cell result: whether each raw input is present on disk.
tuple(path.exists() for path in (raw_abs, raw_refs))

(True, True)

In [2]:
# Load
# The abstracts CSV is read unconditionally; the references CSV is optional,
# and an empty DataFrame stands in for it so later cells can test `refs_df.empty`.
abs_df = pd.read_csv(raw_abs)
if raw_refs.exists():
    refs_df = pd.read_csv(raw_refs)
else:
    refs_df = pd.DataFrame()

# Preview the first rows of each table (or a placeholder when there are no references).
(abs_df.head(), 'no refs' if refs_df.empty else refs_df.head())

(       pmid                                              title  \
 0  41527994  Surgical interventions for treating vesicovagi...   
 1  41524153  Physiology- versus angiography-guided percutan...   
 2  41510790     Cladribine for people with multiple sclerosis.   
 3  41510785  Oral iron supplements for children in malaria-...   
 4  41500513                           Exercise for depression.   
 
                                             abstract  \
 0  This is a protocol for a Cochrane Review (inte...   
 1  This is a protocol for a Cochrane Review (inte...   
 2  RATIONALE: Multiple sclerosis (MS) is a chroni...   
 3  RATIONALE: Iron deficiency anaemia is a common...   
 4  RATIONALE: Depression is a common cause of mor...   
 
                                        journal  year  \
 0  The Cochrane database of systematic reviews  2026   
 1  The Cochrane database of systematic reviews  2026   
 2  The Cochrane database of systematic reviews  2026   
 3  The Cochrane databas

In [3]:
# Basic validation
# Report table size, missing abstracts and repeated PMIDs for the abstracts,
# then size and missing citing PMIDs for the references when any were loaded.
print('Abstracts rows:', abs_df.shape[0])
print('Null abstracts:', abs_df['abstract'].isnull().sum())
print('Duplicate PMIDs:', abs_df.duplicated(subset=['pmid']).sum())
if not refs_df.empty:
    print('References rows:', refs_df.shape[0])
    print('Ref citing pmid nulls:', refs_df['citing_pmid'].isnull().sum())

Abstracts rows: 17092
Null abstracts: 0
Duplicate PMIDs: 0
References rows: 1182678
Ref citing pmid nulls: 0


In [4]:
def clean_text(s: str) -> str:
    """Return *s* as a single-spaced string with no leading/trailing whitespace.

    Missing values (anything ``pd.isna`` reports as missing, e.g. ``None`` or
    NaN) become the empty string. Any other value is converted with ``str()``
    first, so non-string scalars are accepted as well.
    """
    if not pd.isna(s):
        # split() with no argument breaks on any whitespace run and discards
        # empty pieces, so re-joining with one space normalizes the spacing.
        tokens = str(s).split()
        return ' '.join(tokens)
    return ''

# Abstracts: normalize whitespace in the text columns, then keep only rows
# with a non-empty abstract, one row per PMID, on a fresh 0..n-1 index.
for col in ('title', 'abstract'):
    abs_df[col] = abs_df[col].map(clean_text)
abs_df = (
    abs_df.loc[abs_df['abstract'].ne('')]
    .drop_duplicates(subset=['pmid'])
    .reset_index(drop=True)
)

# References: same whitespace normalization on whichever of the expected
# columns are present, then drop rows whose citing PMID came out empty.
# NOTE(review): clean_text() stringifies every value; if an ID column was
# parsed as float (e.g. because it contains NaN), IDs would presumably come
# out like '123.0' — confirm against the raw CSV dtypes.
if not refs_df.empty:
    for col in ('citing_pmid', 'ref_pmid', 'ref_doi', 'ref_title'):
        if col not in refs_df:
            continue
        refs_df[col] = refs_df[col].map(clean_text)
    refs_df = refs_df.loc[refs_df['citing_pmid'].ne('')].reset_index(drop=True)

# Row counts after cleaning, shown as the cell result.
(len(abs_df), 0 if refs_df.empty else len(refs_df))

(17092, 1182678)

In [5]:
# Save cleaned
# Make sure the output directory exists before writing anything.
clean_abs.parent.mkdir(exist_ok=True, parents=True)

# Write without the index: it was reset during cleaning and carries no information.
abs_df.to_parquet(clean_abs, index=False)
if refs_df.empty:
    # Nothing to write for references.
    pass
else:
    refs_df.to_parquet(clean_refs, index=False)

# Output locations, shown as the cell result.
(clean_abs, 'no refs' if refs_df.empty else clean_refs)

(WindowsPath('C:/Users/juanx/Documents/LSE-UKHSA Project/Data/cochrane_pubmed_abstracts_clean.parquet'),
 WindowsPath('C:/Users/juanx/Documents/LSE-UKHSA Project/Data/cochrane_pubmed_references_clean.parquet'))

In [6]:
# Metadata
# Run-level provenance written as JSON next to the data files: the query
# string, a UTC timestamp, row counts and the input/output paths.
#
# NOTE(review): the counts are taken from abs_df / refs_df as they stand when
# this cell runs; once the cleaning cell has run, the 'raw_*' values are
# post-cleaning counts. Key names are left unchanged for existing readers.
# NOTE(review): 'pulled_at_utc' is the time this cell executes, which is
# presumably not the time the data was fetched from PubMed — confirm.
metadata = {
    'query': '("Cochrane Database Syst Rev"[Journal]) AND hasabstract[text]',
    # datetime.utcnow() is deprecated (Python 3.12+). Take an aware UTC
    # timestamp and drop tzinfo so the serialized value keeps the same
    # "<naive ISO 8601>Z" shape as before.
    'pulled_at_utc': datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + 'Z',
    'raw_abstracts': len(abs_df),
    'raw_references': len(refs_df) if not refs_df.empty else 0,
    'paths': {
        # Paths are stringified Path objects built from the resolved project
        # root, so they are absolute and specific to the machine that ran this.
        'raw_abstracts_csv': str(raw_abs),
        'raw_references_csv': str(raw_refs),
        'clean_abstracts_parquet': str(clean_abs),
        # null in the JSON when no references were available to save.
        'clean_references_parquet': str(clean_refs) if not refs_df.empty else None,
    },
}
with open(meta_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

# Metadata file location and abstract count, shown as the cell result.
meta_path, metadata['raw_abstracts']

  'pulled_at_utc': datetime.utcnow().isoformat() + 'Z',


(WindowsPath('C:/Users/juanx/Documents/LSE-UKHSA Project/Data/cochrane_pubmed_metadata.json'),
 17092)