In [1]:
import os
import pandas as pd
import numpy as np

os.chdir('../')

In [2]:
path_data = 'data/'

### Dataset based on Andaur Navarro et al. 2022:
##### Completeness of reporting of clinical prediction models developed using supervised machine learning: a systematic review)

Load the data:

In [3]:
df1_scores = pd.read_csv(path_data + 'AndaurNavarro/20201127_DATA_TRIPOD.csv')
df1_labels = pd.read_excel(path_data + 'AndaurNavarro/Prog_reporting_labeled_ids.xlsx')

The dataset containing the TRIPOD scores:

In [4]:
df1_scores.head()

Unnamed: 0,article_id,Unnamed: 1,gen_clinicalarea,gen_study_1,gen_studydesign,gen_outcomeformat,gen_outcome,gen_publicationyear,t_background_1,t_background_2,...,t_results_126___1,t_results_126___2,t_results_126___3,t_results_126___4,t_results_48,t_methods_60,t_methods_126,t_methods_79,gen_methods_153,gen_methods_153___1
0,172,X Jiang,Oncology,Prognosis,Development with external validation (same model),Binary,5-year breast cancer metastasis,2019,1,0,...,0,0,0,0,2.0,0.0,,0.0,0,0
1,174,L Adhikari,Nephrology,Prognosis,Development only (including internal validation),Binary,acute kidney injury at first 7 days after surgery,2019,1,1,...,0,0,0,0,,,,,1,1
2,175,WP Chen,Dentistry,Diagnosis,Development only (including internal validation),Binary,periodontitis,2018,1,0,...,0,0,0,0,,,,,0,0
3,177,G Lorenzoni,Cardiovascular medicine,Prognosis,Development only (including internal validation),Binary,first hospitalizacion in heart failure patients,2019,1,1,...,0,0,0,0,,,,,0,0
4,178,L-K Pries,Psychiatry,Prognosis,Development with external validation (same model),Binary,Schizophrenia,2019,1,1,...,0,0,0,0,1.0,0.0,,1.0,0,0


The dataset containing the title-abstract inclusions:

In [5]:
df1_labels['article_id'] = df1_labels['article_id'].fillna(0)
df1_labels['article_id'] = df1_labels['article_id'].astype(int)
df1_labels.rename(columns={'label_included': 'label_ta_included'}, inplace=True)
df1_labels['label_included'] = np.where(df1_labels['label_ta_included'] != 0, 1, 0)
df1_labels.head()

Unnamed: 0.1,Unnamed: 0,type,authors,year,title,journal,pmid,keywords,abstract,label_ta_included,article_id,label_included
0,2278,JOUR,"['Csato V', 'Kadir SZSA', 'Khavandi K', 'Benne...",2019.0,"""A Step and a Ceiling"": mechanical properties ...",,,"['eppi-reviewer4', 'Ca2+ spark', 'oxidant sign...",We investigated the biomechanical relationship...,0,0,0
1,1242,JOUR,,2019.0,"""Implications of emotion regulation strategies...",,,['eppi-reviewer4'],"Reports an error in ""Implications of emotion r...",0,0,0
2,1632,JOUR,"['Moyano J', 'Mases L', 'Izeta T', 'Flores T',...",2019.0,"""In Vitro"" Study About Variables that Influenc...",,,"['eppi-reviewer4', 'conventional brackets', 'f...",Many advantages have been described surroundin...,0,0,0
3,187,JOUR,"['Song J', 'Han K', 'Lee D', 'Kim SW']",2018.0,"""Is a picture really worth a thousand words?"":...",,,"['eppi-reviewer4', 'Adolescent', 'Age Factors'...",Because using social media has become a major ...,0,0,0
4,2406,JOUR,"['Rodrigues MAV', 'Olmos RD', 'Kira CM', 'Lotu...",2019.0,"""Shadow"" OSCE examiner. A cross-sectional stud...",,,['eppi-reviewer4'],OBJECTIVES: Feedback is a powerful learning to...,0,0,0


In [6]:
df1 = pd.merge(df1_labels, df1_scores, on='article_id', how='outer')

In [7]:
df1.head()

Unnamed: 0.1,Unnamed: 0,type,authors,year,title,journal,pmid,keywords,abstract,label_ta_included,...,t_results_126___1,t_results_126___2,t_results_126___3,t_results_126___4,t_results_48,t_methods_60,t_methods_126,t_methods_79,gen_methods_153,gen_methods_153___1
0,2278,JOUR,"['Csato V', 'Kadir SZSA', 'Khavandi K', 'Benne...",2019.0,"""A Step and a Ceiling"": mechanical properties ...",,,"['eppi-reviewer4', 'Ca2+ spark', 'oxidant sign...",We investigated the biomechanical relationship...,0,...,,,,,,,,,,
1,1242,JOUR,,2019.0,"""Implications of emotion regulation strategies...",,,['eppi-reviewer4'],"Reports an error in ""Implications of emotion r...",0,...,,,,,,,,,,
2,1632,JOUR,"['Moyano J', 'Mases L', 'Izeta T', 'Flores T',...",2019.0,"""In Vitro"" Study About Variables that Influenc...",,,"['eppi-reviewer4', 'conventional brackets', 'f...",Many advantages have been described surroundin...,0,...,,,,,,,,,,
3,187,JOUR,"['Song J', 'Han K', 'Lee D', 'Kim SW']",2018.0,"""Is a picture really worth a thousand words?"":...",,,"['eppi-reviewer4', 'Adolescent', 'Age Factors'...",Because using social media has become a major ...,0,...,,,,,,,,,,
4,2406,JOUR,"['Rodrigues MAV', 'Olmos RD', 'Kira CM', 'Lotu...",2019.0,"""Shadow"" OSCE examiner. A cross-sectional stud...",,,['eppi-reviewer4'],OBJECTIVES: Feedback is a powerful learning to...,0,...,,,,,,,,,,


### Dataset based on Heus et al. 2018:

In [38]:
# n=74
df2a_scores = pd.read_excel(path_data + 'Heus/170509_Data_set_for_SPSS.xlsx', 
                            sheet_name='Developm for SPSS',
                            header=1) 
#df2a_scores['origin'] = 'Development'

# n=43
df2b_scores = pd.read_excel(path_data + 'Heus/170509_Data_set_for_SPSS.xlsx', 
                            sheet_name='Validation for SPSS',
                            header=1)
#df2b_scores['origin'] = 'Validation'

# n=33
df2c_scores = pd.read_excel(path_data + 'Heus/170509_Data_set_for_SPSS.xlsx', 
                            sheet_name='IV for SPSS_2',
                            header=1)
#df2c_scores['origin'] = 'IV'

# n=22
df2d_scores = pd.read_excel(path_data + 'Heus/170509_Data_set_for_SPSS.xlsx', 
                            sheet_name='D&V for SPSS_2',
                            header=1)
#df2d_scores['origin'] = 'D&V'

  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():


In [40]:
df2 = pd.concat([df2a_scores,
                 df2b_scores,
                 df2c_scores,
                 df2d_scores], axis = 0)
#df2 = df2[['origin'] + [col for col in df2.columns if col != 'origin']]


df2 = df2.drop_duplicates()

df2.sort_values(by='Author name')[20:40]

Unnamed: 0,Author name,3a,3ai,3aii,3b,3bi,4a,4ai,4a_design,4a_design_coded,...,2vi.1,2vii.1,2viii.1,2ix.1,2x.1,2xi.1,2xii.1,2xiii.1,2xiv.1,2xv.1
3,Butte,1,1.0,1.0,1,1.0,1,1.0,Prospectively designed,1.0,...,,,,,,,,,,
5,Chen,1,1.0,1.0,1,1.0,1,1.0,Existing cohort,2.0,...,0.0,1.0,1.0,0.0,,1.0,3.0,0.0,3.0,1.0
4,Cheng,0,1.0,0.0,1,1.0,1,1.0,Other,7.0,...,,,,,,,,,,
9,Chung-Esaki,1,1.0,1.0,1,1.0,1,1.0,Existing cohort,2.0,...,,,,,,,,,,
4,Chung-Esaki,1,1.0,1.0,1,1.0,1,1.0,Existing cohort,2.0,...,,,,,,,,,,
10,Colecchia,1,1.0,1.0,0,0.0,1,1.0,Prospectively designed,1.0,...,,,,,,,,,,
5,Conger,1,1.0,1.0,1,1.0,1,1.0,Prospectively designed,1.0,...,,,,,,,,,,
11,Conger,1,1.0,1.0,1,1.0,1,1.0,Prospectively designed,1.0,...,,,,,,,,,,
5,Corfield,1,1.0,1.0,0,0.0,1,1.0,Registry / medical records,4.0,...,,,,,,,,,,
12,Corrigan,1,1.0,1.0,0,0.0,1,1.0,Registry / medical records,4.0,...,,,,,,,,,,


In [41]:
import pandas as pd


# Step 1: Parse the .txt (RIS-formatted) file
def parse_ris_file(file_path):
    references = []
    entry = {}

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip() == "":  # Skip empty lines
                continue

            # Split the line based on RIS format (Tag - Value)
            if line.startswith("TY  -"):  # New reference entry starts with TY tag
                if entry:  # If we already have an entry, append it
                    references.append(entry)
                entry = {}  # Start a new entry
            try:
                tag, value = line.split('  - ', 1)  # Split by '  - ' to get tag and value
                entry[tag] = value.strip()  # Strip any excess spaces/newlines
            except ValueError:
                continue  # Handle lines that don't match the pattern

        # Add the last entry if it exists
        if entry:
            references.append(entry)

    return references

# Step 2: Convert parsed data into a pandas DataFrame
file_path = path_data + 'Heus/TRIPOD adherence included_final-Converted.txt'
parsed_data = parse_ris_file(file_path)

# Convert list of dictionaries to a DataFrame
df = pd.DataFrame(parsed_data)

# Display the DataFrame
df.to_excel('TRIPOD adherence included.xlsx', index=False)
df

Unnamed: 0,﻿TY,AB,AD,AN,AU,DA,DO,DP,ET,J2,...,TI,UR,ID,ER,TY,IS,SP,VL,KW,C2
0,JOUR,BACKGROUND: -Vascular adhesion protein-1 (VAP-...,"MediCity Research Laboratory, University of Tu...",24850810,"Salmi, M.",May 21,10.1161/circgenetics.113.000543,NLM,2014/05/23,Circulation. Cardiovascular genetics,...,Soluble Vascular Adhesion Protein-1 Predicts I...,http://circgenetics.ahajournals.org/content/7/...,1,,,,,,,
1,,Previous studies have shown that hippocampal v...,"Biomedical Imaging Group Rotterdam, Department...",24039001,"de Bruijne, M.",May,10.1002/hbm.22333,NLM,2013/09/17,Human brain mapping,...,Hippocampal shape is predictive for the develo...,http://onlinelibrary.wiley.com/store/10.1002/h...,2,,JOUR,5,2359-71,35,,
2,,BACKGROUND: Many of the common equations for w...,"From the Department of Anesthesiology, Univers...",24681659,"Nafiu, O. O.",May,10.1213/ane.0000000000000163,NLM,2014/04/01,Anesthesia and analgesia,...,Assessing the accuracy of common pediatric age...,,3,,JOUR,5,1027-33,118,Aging/ physiology,
3,,The Model for End-Stage Liver Disease (MELD) s...,"Department of Surgery, Dumont-UCLA Transplant ...",24854341,"Busuttil, R. W.",Jul,10.1111/ajt.12759,NLM,2014/05/24,American journal of transplantation : official...,...,Liver transplantation in recipients receiving ...,http://onlinelibrary.wiley.com/store/10.1111/a...,4,,JOUR,7,1638-47,14,,
4,,OBJECTIVES: Psoriasis is a chronic inflammator...,"Department of Cardiology, Copenhagen Universit...",24860914,"Hansen, P. R.",May 26,10.1111/joim.12272,NLM,2014/05/28,Journal of internal medicine,...,Risk of thromboembolism and fatal stroke in pa...,http://onlinelibrary.wiley.com/store/10.1111/j...,5,,JOUR,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,,Abstract Traumatic brain injury (TBI) is commo...,"1 Department of Biostatistics, University of W...",24552494,"Newgard, C. D.",Jun 1,10.1089/neu.2013.3122,NLM,2014/02/21,Journal of neurotrauma,...,Addressing the challenges of obtaining functio...,http://online.liebertpub.com/doi/abs/10.1089/n...,176,,JOUR,11,1029-38,31,,4043258
176,,BACKGROUND: Acute kidney injury (AKI) is a fre...,,24293449,"Zhang, Y.",May,10.1515/cclm-2013-0823,NLM,2013/12/03,Clinical chemistry and laboratory medicine : C...,...,Performance of urinary NGAL and L-FABP in pred...,http://www.degruyter.com/view/j/cclm.2014.52.i...,177,,JOUR,5,671-8,52,,
177,,BACKGROUND & AIMS: Anemia is a common adverse ...,Johann Wolfgang Goethe University Medical Cent...,24486089,"Witek, J.",Jun,10.1016/j.jhep.2014.01.013,NLM,2014/02/04,Journal of hepatology,...,Risk factors predictive of anemia development ...,http://www.journal-of-hepatology.eu/article/S0...,178,,JOUR,6,1112-7,60,,
178,,BACKGROUND: Early warning scores (EWS) are des...,"Division of Biomedical Informatics, Cincinnati...",24813568,"Solti, I.",May 9,10.1016/j.resuscitation.2014.04.009,NLM,2014/05/13,Resuscitation,...,Developing and evaluating a machine learning b...,http://www.resuscitationjournal.com/article/S0...,179,,JOUR,,,,,
