# DeMetRA - literature review

Read and clean dataset

In [1]:
import pandas as pd
import numpy as np

from dateutil.parser import parse as parse_date

In [2]:
# Folder that holds the review workbook read below
assets_directory = '../assets/'

# Read the first three sheets of the review workbook in a single call; read_excel
# returns a dict keyed by the requested sheet positions.
review_sheets = pd.read_excel(
    f'{assets_directory}MPS_review_systematic_2025-02-14.xlsx',
    sheet_name=[0, 1, 2],
)
lit, base_ss, base_va = (review_sheets[position] for position in (0, 1, 2))

# No filter on the 'Include' column is applied: all rows are included.
# lit = lit.loc[lit.Include == 'Yes'].drop('Include', axis=1)

# Quick checks of what was loaded:
# print(lit.shape, '\n', list(lit.columns))
# print(base_ss.shape, '\n', list(base_ss.columns))
# print(base_va.shape, '\n', list(base_va.columns))


  warn(msg)
  warn(msg)


### Data inspection and cleaning

In [3]:
# Inspect values 
def checklvl(var):
    """Print the value counts of column `var` pooled over lit, base_ss and base_va.

    Missing values are counted too (dropna=False). A KeyError is raised when any of
    the three tables lacks the column.
    """
    pooled = pd.concat([sheet[var] for sheet in (lit, base_ss, base_va)])
    level_counts = pooled.value_counts(dropna=False)
    print(level_counts)

# Print the level counts of every variable and, where it exists, of its companion
# 'Multiple_<variable>' column.
for v in ['Tissue', 'Array', 'Ancestry', 'Developmental_period']:
    checklvl(v)
    print('\n')
    try:
        checklvl(f'Multiple_{v.lower()}')
        print('\n')
    except KeyError:
        # Some variables have no 'Multiple_<variable>' column: skip those, but let any
        # other error surface instead of hiding it.
        pass

# NOTE: this is to guide manual cleaning of the excel file!

Tissue
Peripheral blood          497
Whole blood               153
Saliva                    120
Cord blood                 48
Dried bloodspot            26
Blood-clots                26
Buccal cells               21
Placenta                   15
Tumour cells                7
Nasal epithelial cells      7
Multiple                    4
Leukocytes                  4
Not reported                1
Cervical cells              1
Name: count, dtype: int64


Multiple_tissue
NaN                            926
Whole blood, HPCs                2
Cord blood, Dried bloodspot      1
Cord blood, Whole blood          1
Name: count, dtype: int64


Array
Multiple    436
EPICv1      289
450K        191
WGBS         14
Name: count, dtype: int64


Multiple_array
NaN                                    494
450K, EPICv1                           426
450K, GMEL (~3000 CpGs from EPICv1)      6
450K, EPICv2                             2
450K, EPICv3                             1
450K, EPICv4                     

In [4]:
# Recode "multiple" categories
def replace_multiples(df):
    """Recode the generic 'Multiple' level of Tissue, Array and Ancestry.

    For Tissue and Array, 'Multiple' becomes 'Multiple (<details>)', where the details
    come from the matching 'Multiple_tissue' / 'Multiple_array' column of the same row.
    For Ancestry, 'Multiple' simply becomes 'Mixed'.

    The frame is modified in place and also returned.
    """
    for column in ('Tissue', 'Array', 'Ancestry'):
        is_multiple = df[column] == 'Multiple'

        if column == 'Ancestry':
            df.loc[is_multiple, column] = 'Mixed'
            continue

        details = df.loc[is_multiple, f'Multiple_{column.lower()}']
        df.loc[is_multiple, column] = [f'Multiple ({entry})' for entry in details]

    return df

# Recode the 'Multiple' levels in the literature table and in both base tables
lit, base_ss, base_va = [replace_multiples(sheet) for sheet in (lit, base_ss, base_va)]

In [5]:
# Preview the first three rows of the literature table after recoding
lit.head(3)

Unnamed: 0,Identifier,Type,Author,Year,Title,Journal,DOI,Include,pdf,Sample_size_total,...,Including_CpGs_4,Including_CpGs_5,Number of CpGs,Determining_weights_1,Train_test,Independent_validation,Comparison,Missing_value_note,Reflect_phenotype,Link
0,1,Journal Article,S. Abrishamcar; J. Chen; D. Feil; A. Kilanowsk...,2022,DNA methylation as a potential mediator of the...,Transl Psychiatry,10.1038/s41398-022-02195-3,Yes,Abrishamcar (2022) - DNA methylation as a pote...,262,...,,,Not reported,Discovery EWAS | linear regression,No,No,No,Not reported,Reported | Explained variance | R2 = 0.23,http://dx.doi.org/10.1038/s41398-022-02195-3_x...
1,1,Journal Article,S. Abrishamcar; J. Chen; D. Feil; A. Kilanowsk...,2022,DNA methylation as a potential mediator of the...,Transl Psychiatry,10.1038/s41398-022-02195-3,Yes,Abrishamcar (2022) - DNA methylation as a pote...,262,...,,,Not reported,Discovery EWAS | linear regression,No,No,No,Not reported,Reported | Explained variance | R = 0.0006,http://dx.doi.org/10.1038/s41398-022-02195-3_x...
2,3,Journal Article,R. Al-Jawahiri; A. Foroutan; J. Kerkhof; H. Mc...,2022,SOX11 variants cause a neurodevelopmental diso...,Gen Med,10.1016/j.gim.2022.02.013,Yes,Al-Jawahiri (2022) - SOX11 variants cause a ne...,126,...,Pruning | Pairwise correlation | 0.6,,224,Machine learning | Support vector machine,"Split 25/75%, 10-fold cross-validation",No,No,No,No actual number reported,https://www.embase.com/search/results?subactio...


### Parse bibliography for publication dates, abstracts and keywords

In [6]:
# Parsing bibliography file (this should be a RIS file, or a TXT file in RIS format)

# RIS tag -> field name. Only the tags listed here are read; all others are ignored.
ris_tags = {
    'TY': 'Reference Type',
    'AU': 'Author_list',
    'PY': 'Year',
    'TI': 'Title',    # ('T1' is not mapped)
    'T2': 'Journal',  # ('JO' is not mapped)
    'J2': 'Journal',  # For pre-prints
    'AB': 'Abstract',
    'DO': 'DOI',
    'UR': 'URL',
    'KW': 'Keywords',
    'DA': 'Date'
}


def _add_field(entry, key, value):
    """Store `value` under `key`; a repeated key turns the field into a list of values."""
    if key not in entry:
        entry[key] = value
    elif isinstance(entry[key], list):
        entry[key].append(value)
    else:
        entry[key] = [entry[key], value]


def _starts_with_year(text):
    """True if `text` begins with a four-digit year (19xx or 20xx) followed by a hyphen."""
    year = text[:4]
    return (year[:2] in ('19', '20') and year.isascii() and year.isdigit()
            and text[4:5] == '-')


def parse_ris(file_path):
    """Parse a RIS-formatted text file into a list of records.

    Parameters
    ----------
    file_path : str or path-like
        File to read; decoded as UTF-8 (a leading byte-order mark is tolerated).

    Returns
    -------
    list of dict
        One dict per record, keyed by the field names in `ris_tags`. A field holds a
        string when its tag occurs once in the record and a list of strings when the
        tag is repeated (so a single-author record has a string 'Author_list').

    Notes
    -----
    - A record ends at an 'ER  -' line; a final record without that line is kept too.
    - A line that starts with a year ('YYYY-') instead of a tag is taken as the
      record's 'Date', unless a date was already stored (first occurrence wins).
    - Other untagged lines are appended to 'Keywords' when the last tag seen was KW;
      continuation lines of any other field are skipped.
    """
    # Built once: maps the 5-character line prefix ('AU  -') to its field name
    known_prefixes = {f'{t}  -': field for t, field in ris_tags.items()}

    references = []
    current_entry = {}
    current_tag = None

    # 'utf-8-sig' reads plain UTF-8 as well, but strips a BOM that would otherwise
    # hide the tag on the first line
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            stripped = line.strip()

            if not stripped:  # Skip empty lines
                continue

            if stripped == 'ER  -':  # ER marks the end of the record
                references.append(current_entry)
                current_entry = {}
                current_tag = None
                continue

            tag = line[:5]

            if tag in known_prefixes:
                current_tag = tag
                _add_field(current_entry, known_prefixes[tag], line[5:].strip())

            # Handle dates that span multiple lines
            elif _starts_with_year(tag):
                # Only keep first occurrence (i.e. publication date)
                current_entry.setdefault('Date', stripped)

            # Handle keywords that span multiple lines
            elif current_tag == 'KW  -':
                _add_field(current_entry, 'Keywords', stripped)

    # Keep a last record that is not closed by an 'ER  -' line
    if current_entry:
        references.append(current_entry)

    return references


# Fields kept from the parsed bibliography, in display order
bib_columns = ['Author_list', 'Year', 'Title', 'Journal', 'Keywords', 'Abstract',
               'Date', 'DOI', 'URL']

parsed_data = parse_ris(f'{assets_directory}Bibliography_2024-09-19.txt')
bib = pd.DataFrame(parsed_data)[bib_columns]

bib.head(5)

Unnamed: 0,Author_list,Year,Title,Journal,Keywords,Abstract,Date,DOI,URL
0,"[Abrishamcar, S., Chen, J., Feil, D., Kilanows...",2022,DNA methylation as a potential mediator of the...,Transl Psychiatry,,Prenatal tobacco exposure (PTE) and prenatal a...,09 30,10.1038/s41398-022-02195-3,http://dx.doi.org/10.1038/s41398-022-02195-3
1,"[Ács, O., Péterfia, B., Hollósi, P., Luczay, A...",2017,Methylation Status of CYP27B1 and IGF2 Correla...,Obes Facts,"[25 hydroxyvitamin D, bisulfite, cytochrome P4...",Objective: Worldwide increasing childhood obes...,2017-08-16,10.1159/000477462,https://www.embase.com/search/results?subactio...
2,"[Adcock, R., Nedjai, B., Lorincz, A. T., Scibi...",2022,DNA methylation testing with S5 for triage of ...,Int J Cancer,"[adolescent, adult, aged, article, biopsy, can...",Methylation of host and viral genes is promisi...,2022-06-27,10.1002/ijc.34050,https://www.embase.com/search/results?subactio...
3,"[Al-Jawahiri, R., Foroutan, A., Kerkhof, J., M...",2022,SOX11 variants cause a neurodevelopmental diso...,Gen Med,"[nucleic acid genotyping array kit, growth dif...",Purpose: This study aimed to undertake a multi...,2022-04-05,10.1016/j.gim.2022.02.013,https://www.embase.com/search/results?subactio...
4,"[Alfano, R., Zugna, D., Barros, H., Bustamante...",2023,Cord blood epigenome-wide meta-analysis in six...,BMC Med,"[aurora C kinase, long untranslated RNA, ACTG1...",Background: Rapid postnatal growth may result ...,2023-01-17,10.1186/s12916-022-02685-7,https://www.embase.com/search/results?subactio...


In [7]:
# Clean bibliography file

def _short_author(authors):
    """Return 'Surname et al.' for several authors and 'Surname' for a single one.

    `authors` is a list of 'Surname, Initials' strings when the record has several
    authors, but a plain string when it has exactly one. Indexing that string directly
    would pick its first character, so it is wrapped in a list first. Anything else
    (e.g. a missing value) is returned unchanged.
    """
    if isinstance(authors, str):
        authors = [authors]
    if not isinstance(authors, list) or not authors:
        return authors
    surname = authors[0].split(',')[0]
    return f'{surname} et al.' if len(authors) > 1 else surname


# Only selected papers ----------------------------------------------------------------
bib_incl = bib.loc[bib.Title.isin(lit.Title.unique()), ].reset_index(drop=True)
# bib_incl.shape

# Correct dates -----------------------------------------------------------------------
print('Note', bib_incl.Date.isna().sum(), 'NaN Date values will be set to 01/01 of respective year')

# Append the publication year to dates that do not contain it (e.g. '09 30' -> '09 30 2022');
# a missing date reads 'nan <year>' after this step
date_tmp = pd.Series([' '.join([d, y]) if y not in d else d for d, y in
                      zip(bib_incl.Date.map(str), bib_incl.Year.map(str))])

# Normalise to YYYY-MM-DD; for missing dates the leading 'nan' is swapped for '01 01'
bib_incl.loc[:, 'Date'] = date_tmp.apply(lambda date:
                                         parse_date(date).strftime('%Y-%m-%d') if 'nan' not in date else
                                         parse_date('01 01' + date[3:]).strftime('%Y-%m-%d'))

# Get Short Author titles ------------------------------------------------------------
bib_incl['Author'] = [_short_author(fa) for fa in bib_incl['Author_list']]

# Fix missing Journal names ----------------------------------------------------------
# DOIs are matched literally (regex=False) so that '.' cannot match any character
journal_by_doi = {
    '10.3390/toxics9100262': 'Toxics',
    '10.1007/s00787-024-02390-1': 'Eur Child Adolesc Psych',
    '10.3390/ijms22031111': 'Int J Mol Sci',
    '10.3390/ijms22168611': 'Int J Mol Sci',
}
for doi, journal in journal_by_doi.items():
    bib_incl.loc[bib_incl.DOI.str.contains(doi, regex=False, na=False), 'Journal'] = journal

bib_incl.head(5)


Note 9 NaN Date values will be set to 01/01 of respective year


Unnamed: 0,Author_list,Year,Title,Journal,Keywords,Abstract,Date,DOI,URL,Author
0,"[Abrishamcar, S., Chen, J., Feil, D., Kilanows...",2022,DNA methylation as a potential mediator of the...,Transl Psychiatry,,Prenatal tobacco exposure (PTE) and prenatal a...,2022-09-30,10.1038/s41398-022-02195-3,http://dx.doi.org/10.1038/s41398-022-02195-3,Abrishamcar et al.
1,"[Al-Jawahiri, R., Foroutan, A., Kerkhof, J., M...",2022,SOX11 variants cause a neurodevelopmental diso...,Gen Med,"[nucleic acid genotyping array kit, growth dif...",Purpose: This study aimed to undertake a multi...,2022-04-05,10.1016/j.gim.2022.02.013,https://www.embase.com/search/results?subactio...,Al-Jawahiri et al.
2,"[Alfano, R., Zugna, D., Barros, H., Bustamante...",2023,Cord blood epigenome-wide meta-analysis in six...,BMC Med,"[aurora C kinase, long untranslated RNA, ACTG1...",Background: Rapid postnatal growth may result ...,2023-01-17,10.1186/s12916-022-02685-7,https://www.embase.com/search/results?subactio...,Alfano et al.
3,"[Aref-Eshghi, E., Bend, E. G., Colaiacovo, S.,...",2019,Diagnostic Utility of Genome-wide DNA Methylat...,Am J Hum Genet,"[genomic DNA, adult, article, case report, chi...",Conventional genetic testing of individuals wi...,2019-04-05,10.1016/j.ajhg.2019.03.008,https://www.embase.com/search/results?subactio...,Aref-Eshghi et al.
4,"[Aref-Eshghi, E., Kerkhof, J., Pedro, V. P.]",2020,Evaluation of DNA methylation episignatures fo...,The American Journal of …,,,2020-01-01,,,Aref-Eshghi et al.


In [11]:
print(lit.shape)

# Attach the bibliography fields to every literature row. The sheet's own Author and
# Journal columns are renamed to '*_dirty' so that the bibliography versions keep the
# plain names. validate='many_to_one' makes the merge fail loudly if a title occurs
# more than once in bib_incl, which would otherwise silently multiply literature rows.
lit_main = (
    lit
    .rename(columns={'Author': 'Author_dirty', 'Journal': 'Journal_dirty'})
    .merge(bib_incl, on='Title', how='left', suffixes=['', '_BIB'],
           validate='many_to_one')
)

# CHECKUPS 
# lit_main[['Author','Author_list','Author_dirty']]
# np.where(lit_main['Year'].map(int) != lit_main['Year_BIB'].map(int))
# pd.set_option('display.max_rows', None)
# lit_main.loc[lit_main['DOI'].map(str) != lit_main['DOI_BIB'].map(str), ['Title','DOI','DOI_BIB']]

# Clean DOI values -------------------------------------------------------------------
# Prefer the bibliography DOI and fall back on the sheet's DOI where it is missing
lit_main['DOI'] = lit_main['DOI_BIB'].fillna(lit_main['DOI'])
lit_main = lit_main.drop(columns=['DOI_BIB', 'Year_BIB'])

print(lit_main.shape)
print(lit_main.columns)

# Save cleaned data ------------------------------------------------------------------
lit_main.to_csv(f'{assets_directory}MPS_literature_cleaned.csv', index=False)

(760, 40)
(760, 47)
Index(['Identifier', 'Type', 'Author_dirty', 'Year', 'Title', 'Journal_dirty',
       'DOI', 'Include', 'pdf', 'Sample_size_total', 'Sample_size_control',
       'Sample_size_case', 'Sample_type', 'Category', 'Phenotype',
       'What_is_available', 'Identifier_base', 'Multiple_identifier_base',
       'Tissue', 'Multiple_tissue', 'Array', 'Multiple_array', 'Ancestry',
       'Multiple_ancestry', 'Developmental_period', 'Covariates',
       'Sample_overlap_target_base', 'Including_CpGs_1', 'Including_CpGs_2',
       'Including_CpGs_3', 'Including_CpGs_4', 'Including_CpGs_5',
       'Number of CpGs', 'Determining_weights_1', 'Train_test',
       'Independent_validation', 'Comparison', 'Missing_value_note',
       'Reflect_phenotype', 'Link', 'Author_list', 'Journal', 'Keywords',
       'Abstract', 'Date', 'URL', 'Author'],
      dtype='object')


In [14]:
# Number of rows per array type in the merged literature table
lit_main.Array.value_counts()

Array
Multiple (450K, EPICv1)                           393
EPICv1                                            283
450K                                               63
WGBS                                               14
Multiple (450K, GMEL (~3000 CpGs from EPICv1))      6
Multiple (450K, EPICv2)                             1
Name: count, dtype: int64

### Merge base and target info 

In [9]:
# Merge BASE and TARGET data ---------------------------------------------------------
# Columns taken from the two base tables
d2_vars = ['Identifier',  # 'Identifier_base', 'Multiple_identifier_base'
           'Category', 'Phenotype', 'Tissue', 'Array', 'Ancestry',
           'Developmental_period', 'Covariates']

# Columns taken from the literature (target) table
d1_vars = ['Identifier_base',  # 'Multiple_identifier_base' ??
           'Title', 'Year', 'What_is_available', 'Sample_overlap_target_base',
           *d2_vars[1:]]

# The two base tables are stacked as they are; they are assumed not to share references.
# To verify: base_ss.Identifier.isin(base_va.Identifier).any()
# The key used for matching is the identifier followed by the phenotype.
d_base = (
    pd.concat([base_ss[d2_vars], base_va[d2_vars]], axis=0)
    .assign(MPS_id=lambda base: base['Identifier'] + base['Phenotype'])
)
print(d_base.shape, '\n', list(d_base.columns))

# Target rows: everything except the rows marked 'Only phenotype'
has_base_info = lit.What_is_available != 'Only phenotype'
d_targ = (
    lit.loc[has_base_info, d1_vars]
    .assign(MPS_id=lambda targ: targ['Identifier_base'] + targ['Phenotype'])
    .rename(columns={'Identifier_base': 'Identifier'})
)

print(d_targ.shape, '\n', list(d_targ.columns))

# Merge info about target with info about base (for summary statistics)
d = d_targ.merge(d_base, on='MPS_id',
                 # left_on='Identifier_base', right_on='Identifier',
                 how='left', suffixes=['_targ','_base'])

# TMP: remove mismatches -------------------------------------------------------------
# After the merge 'Identifier_base' is the identifier coming from the base tables, so
# it is missing exactly for the target rows that found no base match
no_base_match = d['Identifier_base'].isna()
mismatch = d.loc[no_base_match]
d = d.loc[~no_base_match]  # Effectively inner join...

# Save cleaned data ------------------------------------------------------------------
d.to_csv(f'{assets_directory}MPS_base_target.csv', index=False)

(170, 9) 
 ['Identifier', 'Category', 'Phenotype', 'Tissue', 'Array', 'Ancestry', 'Developmental_period', 'Covariates', 'MPS_id']
(283, 13) 
 ['Identifier', 'Title', 'Year', 'What_is_available', 'Sample_overlap_target_base', 'Category', 'Phenotype', 'Tissue', 'Array', 'Ancestry', 'Developmental_period', 'Covariates', 'MPS_id']


In [10]:
# OPEN PROBLEMS
# "Identifier_base" in the validated_algorithm - ignore?

# What to do with the "Multiple" bases? Excluded for now.
lit.loc[lit.Identifier_base == 'Multiple', ['Title', 'Multiple_identifier_base']]

# mismatch

# 21 mismatches... dropping them for now
# Drop mismatches for now -- discuss with Is how to handle them
# Richmond (2018) - DNA methylation as a marker --- does not match because there are two kinds of maternal smoking ...
# Gondolia (2019) - Methylome-wide association study provides --- does not match because Air Pollution vs PM
# Stevenson (2021) - Creating and Validating a DNA --- IL6 vs IL-6: CHANGED IT
# Levy (2022) - Novel diagnostic DNA methylation --- different disorders mismatch...?
# Awamleh (2022) - DNA methylation signature ass --- different syndromes?
# Portales-Casamar (2016) - DNA methylation sign --- fetal alcohol syndrome ONLY? or also prenatal alcohol exposure
# Aref-Eshghi (2019) - Evaluation of DNA Methyla --- Genitopatellar syndrome (GTPTS) and Ohdo syndrome, SBBYSS variant (SBBYSS)... matching which?
# ...

# Identifier base == "Multiple": what does that mean, combined?

Unnamed: 0,Title,Multiple_identifier_base
81,Blood-based DNA methylation study of alcohol c...,"Liu (2018) - A DNA methylation biomarker of, D..."
434,Nasal DNA methylation at three CpG sites predi...,Xu (2021) - Shared DNA methylation signatures ...
435,Nasal DNA methylation at three CpG sites predi...,Xu (2021) - Shared DNA methylation signatures ...
436,Nasal DNA methylation at three CpG sites predi...,Xu (2021) - Shared DNA methylation signatures ...
437,Nasal DNA methylation at three CpG sites predi...,Xu (2021) - Shared DNA methylation signatures ...
438,Nasal DNA methylation at three CpG sites predi...,Xu (2021) - Shared DNA methylation signatures ...
439,Nasal DNA methylation at three CpG sites predi...,Xu (2021) - Shared DNA methylation signatures ...
