In [1]:
# Preparation of the data from intervention reviews for ASReview simulation

# The data were retrieved from the CLEF 2019 challenge:
#      One part of the sets: https://github.com/CLEF-TAR/tar/tree/master/2019-TAR/Task2/Testing/Intervention/topics
#      Other part of the sets: https://github.com/CLEF-TAR/tar/tree/master/2019-TAR/Task2/Training/Intervention/topics
#      The reviews are named by CDxxxxxx number
#      For each review that was used, the PubMed IDs were copied and pasted in PubMed 
#      The selection ("All results") was saved in the "PubMed" format by clicking "Create file"
#      The saved file was opened in a new EndNote library
#      The EndNote library was then exported as "Text Only" and "Tab Delimited"
#      The exported .txt file was opened and its content was copied and pasted into Excel: 'Intervention_CDxxxxxx_raw.xlsx'
#      Rows with an empty PubMed ID in column 'AI' were merged with the preceding row (these are faulty rows; confirmed by comparing counts)
#      Then the dataset was saved as 'Intervention_CDxxxxxx_final.xlsx' file

# The labels were retrieved:
#      For one part of the sets: https://github.com/CLEF-TAR/tar/blob/master/2019-TAR/Task2/Testing/Intervention/qrels/full.test.intervention.abs.2019.qrels
#      And for the other part of the sets: https://github.com/CLEF-TAR/tar/blob/master/2019-TAR/Task2/Training/Intervention/qrels/full.train.intervention.abs.2019.qrels
#      These data were copied and pasted in Excel, merged, and manually converted into a usable format as a .csv file
#      And saved as 'Intervention_inclusions_all_sets.csv'

# Import the packages
import pandas as pd
import os 

# Root folder that holds both the raw and the cleaned datasets
datasets_root = '/Users/ispiero2/Documents/Research/Datasets/Systematic_Reviews/Datasets_final/'
# Directory with the raw intervention datasets (input)
path_data = datasets_root + 'Raw_datasets/Intervention/'
# Directory that receives the clean, labeled datasets (output)
path_results = datasets_root + 'Clean_datasets/'

In [2]:
# Load the raw export of every intervention review

os.chdir(path_data)


def _read_review(cd_number):
    """Read 'Intervention_<cd_number>.xlsx' from the working directory without a header row."""
    return pd.read_excel('Intervention_{}.xlsx'.format(cd_number), header = None)


# For each review below: its title, then n = number of records and the number of inclusions

# Educational interventions for improving primary caregiver complementary feeding practices for children aged 24 months and under
# n = 9160, inclusions = 54
CD011768 = _read_review('CD011768')

# Face-to-face interventions for informing or educating parents about early childhood vaccination
# n = 8867, inclusions = 23
CD010038 = _read_review('CD010038')

# Anticoagulation for people with cancer and central venous catheters
# n = 3874, inclusions = 52
CD006468 = _read_review('CD006468')

# Methylphenidate for attention deficit hyperactivity disorder (ADHD) in children and adolescents - assessment of adverse events in non-randomised studies
# n = 3479, inclusions = 320
CD012069 = _read_review('CD012069')

# Psychological therapies for treatment-resistant depression in adults
# n = 2815, inclusions = 37
CD010558 = _read_review('CD010558')

# Antidepressants for insomnia in adults
# n = 2539, inclusions = 29
CD010753 = _read_review('CD010753')

# Comparison of a therapeutic-only versus prophylactic platelet transfusion policy for people with congenital or acquired bone marrow failure disorders
# n = 2353, inclusions = 6
CD012342 = _read_review('CD012342')

# First-line drugs inhibiting the renin angiotensin system versus other first-line antihypertensive drug classes for hypertension
# n = 12319, inclusions = 88
CD008170 = _read_review('CD008170')

# Anti-vascular endothelial growth factor for neovascular age-related macular degeneration
# n = 5392, inclusions = 112
CD005139 = _read_review('CD005139')

# Interventions for implementation of thromboprophylaxis in hospitalized patients at risk for venous thromboembolism
# n = 3574, inclusions = 11
CD008201 = _read_review('CD008201')


In [3]:
# Collect the loaded reviews in one mapping: review ID -> raw DataFrame

int_reviews = dict(
    CD011768 = CD011768,
    CD010038 = CD010038,
    CD006468 = CD006468,
    CD012069 = CD012069,
    CD010558 = CD010558,
    CD010753 = CD010753,
    CD012342 = CD012342,
    CD008170 = CD008170,
    CD005139 = CD005139,
    CD008201 = CD008201,
)

In [5]:
# Load the abstract-level and the full-text-level inclusion labels (each file covers all reviews)

os.chdir(path_data)

# Both files are read with identical options.
# NOTE(review): with 'latin-1' the first header comes out as 'ï»¿Topic', presumably because
# the files start with a UTF-8 byte-order mark; the column rename that follows relies on it.
csv_options = dict(sep = ',', encoding = 'latin-1', index_col = False)
data_incl = pd.read_csv('Intervention_inclusions_all_sets.csv', **csv_options)
data_ft_incl = pd.read_csv('Intervention_fulltext_inclusions_all_sets.csv', **csv_options)

In [8]:
# Normalize the column names of both inclusion tables

# The first header can arrive in different spellings depending on how the CSV was decoded:
# 'ï»¿Topic' (byte-order mark read as latin-1), '\ufeffTopic' (mark kept as one character)
# or plain 'Topic'. All of them are mapped to 'review'.
topic_headers = ['ï»¿Topic', '\ufeffTopic', 'Topic']
shared_columns = {header: 'review' for header in topic_headers}
shared_columns.update({'Iteration' : 'iteration',
                       'Document' : 'pmid'})

# rename() ignores mapping keys that are not present, so listing every spelling is safe
data_incl = data_incl.rename(columns = {**shared_columns, 'Relevance' : 'label_included'})
data_ft_incl = data_ft_incl.rename(columns = {**shared_columns, 'Included' : 'label_ft_included'})

# Fail here with a clear message instead of with a KeyError in a later cell
required_columns = (('data_incl', data_incl, {'review', 'pmid', 'label_included'}),
                    ('data_ft_incl', data_ft_incl, {'review', 'pmid', 'label_ft_included'}))
for table_name, table, expected in required_columns:
    missing = expected - set(table.columns)
    if missing:
        raise KeyError('{} lacks the column(s) {}; found {}'.format(
            table_name, sorted(missing), list(table.columns)))

data_ft_incl

Unnamed: 0,review,iteration,pmid,label_ft_included,NA
0,CD000996,0,27720881,0.0,
1,CD000996,0,4209914,0.0,
2,CD000996,0,24625938,0.0,
3,CD000996,0,20635793,0.0,
4,CD000996,0,7638431,0.0,
...,...,...,...,...,...
73636,CD012930,0,26194213,0.0,
73637,CD012930,0,27576232,0.0,
73638,CD012930,0,30141826,0.0,
73639,CD012930,0,28375647,0.0,


In [9]:
# Attach the abstract-level inclusion labels to every review and write the result to disk

os.chdir(path_results)

# Positions of the columns kept from the raw export, listed in the same order as the
# names they receive (position 34 corresponds to spreadsheet column 'AI')
kept_positions = [0,1,2,3,5,34,37,38,51]
kept_names = ['type','authors','year','title','journal','pmid','keywords','abstract','language']

int_reviews_labeled = {}
for review in int_reviews:
    # Reduce the raw export to the selected columns and name them
    data = int_reviews[review].iloc[:, kept_positions]
    data.columns = kept_names
    # Rows of the inclusion table that belong to this review
    inclusions = data_incl.loc[data_incl['review'] == review]
    # Left join on the PubMed ID, so every record of the review is kept
    data = data.merge(inclusions[['pmid', 'label_included']], how = 'left', on = ['pmid'])
    # Keep the labeled review in memory and write it to an Excel file
    int_reviews_labeled[review] = data
    data.to_excel('Int_{x}_labeled.xlsx'.format(x=review))
    # Check: number of records, number of labels for this review, number of inclusions
    print(len(data), len(inclusions), sum(data['label_included']))

9160 9160 54
8867 8867 23
3874 3874 52
3479 3479 320
2815 2815 37
2539 2539 29
2353 2353 6
12319 12320 88
5392 5392 112
3574 3574 11


In [25]:
# Retrieve the full-text inclusion numbers

for review in int_reviews:
    # Start from the labeled review built by the labeling loop (selected columns plus
    # 'label_included') instead of repeating the column selection and the first merge
    data = int_reviews_labeled[review]
    # Take the full-text inclusions subset of the respective review
    ft_inclusions = data_ft_incl[data_ft_incl['review'] == review]
    # Left join on the PubMed ID; merge returns a new frame, so the stored labeled review is not modified
    data = pd.merge(data, ft_inclusions[['pmid', 'label_ft_included']],
                    how = 'left', on = ['pmid'])
    # Records without a full-text label count as not included
    data['label_ft_included'] = data['label_ft_included'].fillna(0)
    # Print the check (nothing is saved here): review, number of records, abstract-level
    # inclusions, full-text inclusions. skipna=False lets a missing label show up as nan
    # instead of being silently skipped.
    print(review, len(data),
          data['label_included'].sum(skipna = False),
          data['label_ft_included'].sum(skipna = False))

CD011768 9160 54 25.0
CD010038 8867 23 9.0
CD006468 3874 52 12.0
CD012069 3479 320 245.0
CD010558 2815 37 10.0
CD010753 2539 29 20.0
CD012342 2353 6 0.0
CD008170 12319 88 81.0
CD005139 5392 112 68.0
CD008201 3574 11 8.0


In [8]:
# Spot check: display the labeled records of one review
review_to_check = 'CD010038'
int_reviews_labeled[review_to_check]

Unnamed: 0,type,authors,year,title,journal,pmid,keywords,abstract,language,label_included
0,Journal Article,,1969,Immunization of man against rubella. Discussio...,Am J Dis Child,4978511,Adolescent_x000D_\nAdult_x000D_\nAnimals_x000D...,,eng,0
1,Journal Article,,1969,[Therapy of juvenile diabetes],Presse Med (1893),4977371,Adolescent_x000D_\nChild_x000D_\nDiabetes Mell...,,fre,0
2,Journal Article,,1972,[BCG vaccine and the vaccine prepared with BRC...,Bol Oficina Sanit Panam,4264521,Adolescent_x000D_\nAdult_x000D_\n*BCG Vaccine/...,,spa,0
3,Journal Article,,1976,Recommendation of the Public Health Service ad...,IMJ Ill Med J,10256,Adolescent_x000D_\nAdult_x000D_\nAging_x000D_\...,,eng,0
4,Journal Article,,1978,Influenza vaccine: recommendation of the Publi...,Ann Intern Med,717939,Adolescent_x000D_\nAdult_x000D_\nChild_x000D_\...,Influenza vaccine for 1978--79 will consist of...,eng,0
...,...,...,...,...,...,...,...,...,...,...
8862,Journal Article,J. Zuckerman; B. Langer,2005,Hepatitis B vaccination in a school age popula...,J Med Virol,15778966,Adolescent_x000D_\nChild_x000D_\nDrug Administ...,There remains no consensus on whether to adopt...,eng,0
8863,Journal Article,B. Zülfikar,2002,Two patients with haemophilia and acute leukaemia,Haemophilia,12199682,Adolescent_x000D_\nCost of Illness_x000D_\n*De...,Acute leukaemia is the commonest form of malig...,eng,0
8864,Journal Article,M. L. Zúñiga de Nuncio; P. R. Nader; M. H. Saw...,2003,A prenatal intervention study to improve timel...,J Community Health,12705316,Adult_x000D_\nCalifornia_x000D_\nCohort Studie...,This was a prospective randomized cohort study...,eng,0
8865,Journal Article,J. A. Zupancic; D. K. Richardson; B. J. O'Brie...,2003,Daily cost prediction model in neonatal intens...,Int J Technol Assess Health Care,12862190,"Ancillary Services, Hospital/*economics/statis...",OBJECTIVES: One barrier to economic evaluation...,eng,0
