In [1]:
############# DATA PREPARATION PROGNOSIS REVIEWS #############

# Import the packages
import pandas as pd
import numpy as np
import os
import rispy
from pathlib import Path

# Set a path to the directory with the raw datasets
path_data = '/Users/ispiero2/Documents/Research/Datasets/Systematic_Reviews/Datasets_final/Raw_datasets/'
# Set a path to the directory to store the clean output datasets
path_results = '/Users/ispiero2/Documents/Research/Datasets/Systematic_Reviews/Datasets_final/Clean_datasets/'

In [2]:
#### Prognosis review 1: Cardiac risk (Damen et al. (2016)) ####
# Preparation of the data for ASReview simulation

#      The raw EndNote file 'Prognosis_1.enl' was opened in a new EndNote library
#      The EndNote library was exported as "Text Only" and "Tab Delimited", for the inclusions and exclusions separately
#      The contents of the exported 'Prognosis_1_incl.txt' and 'Prognosis_1_excl.txt' files were each copied and pasted in Excel
#             For the exclusions, the some rows were manually fixed (merged): rows 240-241, rows 420-421, and rows 426-427
#             For the inclusions, the some rows were manually fixed (merged): rows 70-71 and rows 72-73
#      And saved as 'Prognosis_1_excl.xlsx' and 'Prognosis_1_incls.xlsx' files

os.chdir(path_data)

# All records (n = 777)
# TA exclusions (n = 686)
data_prog1_excl_orig = pd.read_excel('Prognosis_1_cardio/Prognosis_1_excl.xlsx', header = None)
# TA inclusions (n = 91)
data_prog1_incl_orig = pd.read_excel('Prognosis_1_cardio/Prognosis_1_incl.xlsx', header = None)

In [3]:
# Choose the columns to be saved in the output
data_prog1_excl = data_prog1_excl_orig.iloc[:,[0,1,2,3,5,34,37,38,51]]
data_prog1_incl = data_prog1_incl_orig.iloc[:,[0,1,2,3,5,34,37,38,51]]

# Name the respective columns
data_prog1_excl.columns = ['type','authors','year','title','journal','pmid','keywords','abstract','language']
data_prog1_incl.columns = ['type','authors','year','title','journal','pmid','keywords','abstract','language']

# Add the labels column
data_prog1_excl['label_included'] = list(np.repeat(0, len(data_prog1_excl)))
data_prog1_incl['label_included'] = list(np.repeat(1, len(data_prog1_incl)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prog1_excl['label_included'] = list(np.repeat(0, len(data_prog1_excl)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prog1_incl['label_included'] = list(np.repeat(1, len(data_prog1_incl)))


In [4]:
# Combine the exclusions and inclusions to one final dataframe and save the output
data_prog1_final = pd.concat([data_prog1_excl, data_prog1_incl], ignore_index=True)
data_prog1_final = data_prog1_final.sort_values(by=['authors']).reset_index(drop=True)

# Save the output
os.chdir(path_results)
data_prog1_final.to_excel('Prog_cardio_labeled.xlsx')

In [5]:
#### Prognosis review 2: rcri (Vernooij et al. (2023)) ####
# Preparation of the data for ASReview simulation

#      The raw EndNote 'Search_addedvalueRCRI.enlx' file was opened and exported as "Text Only" and "Tab Delimited"
#      The content of the exported 'Prognosis_2_all.txt' file was copied and pasted in Excel (in the 'Prognosis_2_all_raw.xlsx' file)
#      Each of the records that were spread over multiple rows were manually merged to correct these records:
#             These could be recognized by rows that do not have a year in the year column
#             After merging, this lead to the correct amount of rows of 3999
#      -> The file was saved as 'Prognosis_2_all_final.xlsx' file

#      The raw 'Full-text-screening_part1.xlsx' file was opened in excel
#      The entries of Nummer 556, 644, 647, 654, and 887 were corrected manually (authors, title, journal)
#      -> The file was saved as 'Full-text-screening_part1_adapted.xlsx'

#      The raw 'Full-text-screening_part1_updatesearch2020.xlsx' file was opened in excel
#      A column "Titel" was added with the titels corresponding

os.chdir(path_data)

# All records (n = 3999)
data_prog2_all_orig = pd.read_excel('Prognosis_2_rcri/Prognosis_2_all_final.xlsx', header = None)

# Inclusions based on original search (n = 942)
data_prog2_incl1_orig = pd.read_excel('Prognosis_2_rcri/Full-text-screening_part1_adapted.xlsx')
# Inclusions based on search update (n = 123)
data_prog2_incl2_orig = pd.read_excel('Prognosis_2_rcri/Full_text_screening_part1_updatesearch2020_adapted.xlsx')

In [6]:
# Choose the columns to be saved in the output
data_prog2_all = data_prog2_all_orig.iloc[:,[0,1,2,3,5,34,37,38,51]]

# Name the respective columns
data_prog2_all.columns = ['type','authors','year','title','journal','pmid','keywords','abstract','language']

data_prog2_all['year'] = data_prog2_all['year'].astype(str)
data_prog2_incl1_orig['Jaar'] = data_prog2_incl2_orig['Jaar'].astype(str)
data_prog2_incl2_orig['Jaar'] = data_prog2_incl2_orig['Jaar'].astype(str)

labels = []
for i in range(0,len(data_prog2_all)):
    title = data_prog2_all['title'][i]
    if len(data_prog2_incl1_orig[data_prog2_incl1_orig['Titel'].str.contains(title, regex = False)]) == 1:
        labels.append(1)
    else:
        labels.append(0)
        
labels2 = []
for i in range(0,len(data_prog2_all)):
    title = data_prog2_all['title'][i]
    if len(data_prog2_incl2_orig[data_prog2_incl2_orig['Titel'].str.contains(title, regex = False)]) == 1:
        labels2.append(1)
    else:
        labels2.append(0)

labels_comb = []
for i in range(0,len(data_prog2_all)):
    if labels[i] == 0 and labels2[i] == 0:
        labels_comb.append(0)
    else:
        labels_comb.append(1)
        
data_prog2_all['label_included'] = labels_comb        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prog2_all['year'] = data_prog2_all['year'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prog2_all['label_included'] = labels_comb


In [7]:
sum(data_prog2_all['label_included']) # This should be 942+133=1075

1057

In [8]:
# Manually compared and corrected in excel:
#      data_prog2_all['labels_included1'] = labels
#      data_prog2_all['labels_included2'] = labels2
#      os.chdir(path_data)
#      data_prog2_all.to_excel('Prog2_test.xlsx')
#      Then manually corrected
#      And saved as 'Prognosis_2_all_manually_corrected_labels_final.xlsx'

# Load the correct labels:
os.chdir(path_data)
data_prog2_labels = pd.read_excel('Prognosis_2_rcri/Prognosis_2_all_manually_corrected_labels_final.xlsx').sort_values(by=['authors']).reset_index()

# Sort the data_prog2_all by title to match with the data_prog2_labels set:
data_prog2_final = data_prog2_all.sort_values(by=['authors']).reset_index()

data_prog2_final['label_included'] = data_prog2_labels['label_included']
sum(data_prog2_final['label_included']) # 1064 was actually correct, since some were labeled as inclusion in both the original and updated search

1064

In [9]:
# Save the output
os.chdir(path_results)
data_prog2_final.to_excel('Prog_rcri_labeled.xlsx')

In [10]:
#### Prognosis review 3: ecmo (Pladet et al. (2023)) ####
# Preparation of the data for ASReview simulation

#      The raw EndNote file 'articles total.enw' was opened in a new EndNote library
#      The EndNote library was exported as "Text Only" and "Tab Delimited"
#      The exported 'Prognosis_3_all.txt' file content was copied and pasted in Excel
#      And saved as 'Prognosis_3_all.xlsx' file

#      The raw EndNote file 'articles included.enw' was opened in a new EndNote library
#      The EndNote library was exported as "Text Only" and "Tab Delimited"
#      The exported 'Prognosis_3_incl.txt' file content copied and pasted in Excel
#      And saved as 'Prognosis_3_all.xlsx' file


os.chdir(path_data)

# All records (n = 4274)
data_prog3_all_orig = pd.read_excel('Prognosis_3_ecmo/Prognosis_3_all.xlsx', header = None)
# TA inclusions (n = 377)
data_prog3_incl_orig = pd.read_excel('Prognosis_3_ecmo/Prognosis_3_incl.xlsx', header = None)

In [11]:
# Remove row 287 due to faulty entry:
data_prog3_all = data_prog3_all_orig.drop(index=287)
data_prog3_all = data_prog3_all.reset_index(drop=True)

In [12]:
# Choose the columns to be saved in the output
data_prog3_all = data_prog3_all.iloc[:,[0,1,2,3,5,34,37,38,51]]
data_prog3_incl = data_prog3_incl_orig.iloc[:,[0,1,2,3,5,34,37,38,51]]

# Name the respective columns
data_prog3_all.columns = ['type','authors','year','title','journal','pmid','keywords','abstract','language']
data_prog3_incl.columns = ['type','authors','year','title','journal','pmid','keywords','abstract','language']

In [13]:
# Get the labels based on title comparison
labels = []
for i in range(0,len(data_prog3_all)):
    title = data_prog3_all['title'][i]
    if len(data_prog3_incl[data_prog3_incl['title'].str.contains(title, regex = False)]) == 1:
        labels.append(1)
    else:
        labels.append(0)

In [14]:
# Check labels
sum(labels) # This should be 377. By manually checking in excel, the following should not have been labeled:
# Falk et al. (2019) Extracorporeal Membrane Oxygenation for Septic Shock 

378

In [15]:
# Add the labels to the dataset
data_prog3_all['label_included'] = labels

# Check the label that was wrong
data_prog3_all.loc[1054] # The location of Falk et al. (2019)

type                                                Journal Article
authors                           L. Falk; J. Hultman; L. M. Broman
year                                                         2019.0
title             Extracorporeal Membrane Oxygenation for Septic...
journal                                      Critical Care Medicine
pmid                                               rayyan-164934298
keywords          adult_x000D_\narticle_x000D_\ncardiomyopathy_x...
abstract          Objectives: Septic shock carries a high mortal...
language                                                    English
label_included                                                    1
Name: 1054, dtype: object

In [16]:
# Unlabel that record
data_prog3_all.at[1054, 'label_included'] = 0

In [17]:
sum(data_prog3_all['label_included'])

377

In [18]:
# Save the output
os.chdir(path_results)
data_prog3_all.to_excel('Prog_ecmo_labeled.xlsx')

In [19]:
#### Prognosis review 4: model reporting (Andaur Navarrow et al. (2022)) ####
# Preparation of the data for ASReview simulation

# The dataset consisted of 10 samples of the original dataset of 24814 records
# To get the total set of records that was TA-screened, the 10 original files of records were loaded and merged
# To get the included set of records, the 10 original files of inclusions were loaded and merged

os.chdir(path_data)

group1 = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/1. [T&A] Random samples/[T&A] ML_References_group1.txt'), encoding='utf-8'))
group2 = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/1. [T&A] Random samples/[T&A] ML_References_group2.txt'), encoding='utf-8'))
group3 = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/1. [T&A] Random samples/[T&A] ML_References_group3.txt'), encoding='utf-8'))
group4 = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/1. [T&A] Random samples/[T&A] ML_References_group4.txt'), encoding='utf-8'))
group5 = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/1. [T&A] Random samples/[T&A] ML_References_group5.txt'), encoding='utf-8'))
group6 = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/1. [T&A] Random samples/[T&A] ML_References_group6.txt'), encoding='utf-8'))
group7 = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/1. [T&A] Random samples/[T&A] ML_References_group7.txt'), encoding='utf-8'))
group8 = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/1. [T&A] Random samples/[T&A] ML_References_group8.txt'), encoding='utf-8'))
group9 = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/1. [T&A] Random samples/[T&A] ML_References_group9.txt'), encoding='utf-8'))
group10 = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/1. [T&A] Random samples/[T&A] ML_References_group10.txt'), encoding='utf-8'))

# Merge all the samples (n=2482)
groups = [group1, group2, group3, group4, group5, group6, group7, group8, group9, group10]
data_prog4_all_orig = pd.concat(groups).reset_index(drop=True)

os.chdir(path_data)

group1_incl = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/2. [T&A] Included References/[T&A] Ref_included_group1.ris'), encoding='utf-8'))
group2_incl = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/2. [T&A] Included References/[T&A] Ref_included_group2.ris'), encoding='utf-8'))
group3_incl = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/2. [T&A] Included References/[T&A] Ref_included_group3.ris'), encoding='utf-8'))
group4_incl = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/2. [T&A] Included References/[T&A] Ref_included_group4.ris'), encoding='utf-8'))
group5_incl = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/2. [T&A] Included References/[T&A] Ref_included_group5.ris'), encoding='utf-8'))
group6_incl = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/2. [T&A] Included References/[T&A] Ref_included_group6.ris'), encoding='utf-8'))
group7_incl = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/2. [T&A] Included References/[T&A] Ref_included_group7.ris'), encoding='utf-8'))
group8_incl = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/2. [T&A] Included References/[T&A] Ref_included_group8.ris'), encoding='utf-8'))
group9_incl = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/2. [T&A] Included References/[T&A] Ref_included_group9.ris'), encoding='utf-8'))
group10_incl = pd.DataFrame(rispy.load(Path('Prognosis_4_reporting/Review_Constanza/2. [T&A] Included References/[T&A] Ref_included_group10.ris'), encoding='utf-8'))

# Merge all the inclusions (n=312)
groups_incl = [group1_incl, group2_incl, group3_incl, group4_incl, group5_incl, group6_incl, group7_incl, group8_incl, group9_incl, group10_incl]
data_prog4_incl_orig = pd.concat(groups_incl).reset_index(drop=True)

In [20]:
# Choose the columns to be saved in the output
data_prog4_all = data_prog4_all_orig.iloc[:,[0,3,5,1,4,9]]
data_prog4_incl = data_prog4_incl_orig.iloc[:,[0,9,3,2,11,12]]

# Name the respective columns
data_prog4_all.columns = ['type','authors','year','title','keywords','abstract']
data_prog4_incl.columns = ['type','authors','year','title','keywords','abstract']

In [21]:
# Get the labels based on title comparison
labels = []
for i in range(0,len(data_prog4_all)):
    title = data_prog4_all['title'][i]
    if len(data_prog4_incl[data_prog4_incl['title'].str.contains(title, regex = False)]) == 1:
        labels.append(1)
    else:
        labels.append(0)
        
sum(labels) # = 312

# Add the labels to the dataset
data_prog4_all['label_included'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prog4_all['label_included'] = labels


In [22]:
# Insert columns to match with the other reviews datasets
data_prog4_all.insert(4, 'journal', np.repeat(np.NaN, len(data_prog4_all)))
data_prog4_all.insert(5, 'pmid', np.repeat(np.NaN, len(data_prog4_all)))
data_prog4_all.insert(9, 'language', np.repeat(np.NaN, len(data_prog4_all)))

In [23]:
# Save the output
os.chdir(path_results)
data_prog4_all.to_excel('Prog_reporting_labeled.xlsx')

In [24]:
# Prognosis review 5: TRIPOD 
# Preparation of the data for ASReview simulation

#      The raw EndNote file 'TRIPOD search 20140704-Converted.enlx' was opened in a new EndNote library
#      The EndNote library was exported as "Text Only" and "Tab Delimited"
#      The content of the exported 'Prognosis_5_all.txt' file was copied and pasted in Excel
#      Each of the records that were spread over multiple rows were manually merged to correct these records:
#             These could be recognized by rows that do not have a year in the year column
#             After merging, this lead to the correct amount of rows of 4871
#      And saved as 'Prognosis_5_all.xlsx' file 

#      The raw EndNote file 'TRIPOD full text selection_PH-Converted.enlx' was opened in a new EndNote library
#      The EndNote library was exported as "Text Only" and "Tab Delimited"
#      The content of the exported 'Prognosis_5_incl.txt' file was copied and pasted in Excel
#      Each of the records that were spread over multiple rows were manually merged to correct these records:
#             These could be recognized by rows that do not have a year in the year column
#             After merging, this lead to the correct amount of rows of 347
#      And saved as 'Prognosis_5_incl.xlsx' file 

os.chdir(path_data)

# All records (n=4871)
data_prog5_all_orig = pd.read_excel('Prognosis_5_tripod/Prognosis_5_all.xlsx', header = None)
# TA inclusions (n=347)
data_prog5_incl_orig = pd.read_excel('Prognosis_5_tripod/Prognosis_5_incl.xlsx', header = None)

In [25]:
# Choose the columns to be saved in the output
data_prog5_all = data_prog5_all_orig.iloc[:,[0,1,2,3,5,34,37,38,51]]
data_prog5_incl = data_prog5_incl_orig.iloc[:,[0,1,2,3,5,34,37,38,51]]

# Name the respective columns
data_prog5_all.columns = ['type','authors','year','title','journal','pmid','keywords','abstract','language']
data_prog5_incl.columns = ['type','authors','year','title','journal','pmid','keywords','abstract','language']

In [26]:
# Get the labels based on title comparison
labels = []
for i in range(0,len(data_prog5_all)):
    title = data_prog5_all['title'][i]
    if len(data_prog5_incl[data_prog5_incl['title'].str.contains(title, regex = False)]) == 1:
        labels.append(1)
    else:
        labels.append(0)

In [27]:
sum(labels) # Two labels were missed; manually compared in excel

345

In [28]:
# Add the labels to the dataset
data_prog5_all['label_included'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prog5_all['label_included'] = labels


In [29]:
# One missed label is this:
data_prog5_all.loc[data_prog5_all['title'] == '6-Minute walk distance is an independent predictor of mortality in patients with idiopathic pulmonary fibrosis', ['label_included']] = 1

# The other missed label is this:
data_prog5_all.loc[data_prog5_all['title'] == '6-minute walk distance as a predictor of outcome in idiopathic pulmonary fibrosis', ['label_included']] = 1

In [30]:
sum(data_prog5_all['label_included'])

347

In [31]:
# Save the output
os.chdir(path_results)
data_prog5_all.to_excel('Prog_tripod_labeled.xlsx')

In [32]:
# Prognosis review 6: ntcp (Takada et al. (2023)?)
# Preparation of the data for ASReview simulation

# To get the excluded set of records, the 4 original files of records were loaded and merged
# To get the included set of records, the original file was loaded

os.chdir(path_data)

excl1 = pd.DataFrame(rispy.load(Path('Prognosis_6_ntcp/Archief/ExportedRis_excluded_relevant_review_tiab.txt'), encoding='utf-8'))
excl2 = pd.DataFrame(rispy.load(Path('Prognosis_6_ntcp/Archief/ExportedRis_excluded_tiab1.txt'), encoding='utf-8'))
excl3 = pd.DataFrame(rispy.load(Path('Prognosis_6_ntcp/Archief/ExportedRis_excluded_tiab2.txt'), encoding='utf-8'))
excl4 = pd.DataFrame(rispy.load(Path('Prognosis_6_ntcp/Archief/ExportedRis_excluded_tiab3.txt'), encoding='utf-8'))


# All records (n=10664)

# TA exclusions (n=9711)
excls = [excl1, excl2, excl3, excl4]
data_prog6_excl_orig = pd.concat(excls).reset_index(drop=True)

# TA inclusions (n=953)
data_prog6_incl_orig = pd.DataFrame(rispy.load(Path('Prognosis_6_ntcp/Archief/ExportedRis_included_tiab.txt'), encoding='utf-8'))

In [33]:
# Choose the columns to be saved in the output
data_prog6_excl = data_prog6_excl_orig.iloc[:,[0,14,4,1,3,7]]#[0,1,2,3,5,34,37,38,51]]
data_prog6_incl = data_prog6_incl_orig.iloc[:,[0,3,5,1,4,8]]

# Name the respective columns
data_prog6_excl.columns = ['type','authors','year','title','keywords','abstract']
data_prog6_incl.columns = ['type','authors','year','title','keywords','abstract']

In [34]:
# Add the labels to the data
data_prog6_excl['label_included'] = np.repeat(0,len(data_prog6_excl))
data_prog6_incl['label_included'] = np.repeat(1,len(data_prog6_incl))

# Merge both datasets and sort according to authors
data_prog6_all = pd.concat([data_prog6_excl, data_prog6_incl])
data_prog6_all = data_prog6_all.sort_values(by=['authors']).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prog6_excl['label_included'] = np.repeat(0,len(data_prog6_excl))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prog6_incl['label_included'] = np.repeat(1,len(data_prog6_incl))


In [35]:
# Insert columns to match with the other reviews datasets
data_prog6_all.insert(4, 'journal', np.repeat(np.NaN, len(data_prog6_all)))
data_prog6_all.insert(5, 'pmid', np.repeat(np.NaN, len(data_prog6_all)))
data_prog6_all.insert(9, 'language', np.repeat(np.NaN, len(data_prog6_all)))

In [36]:
# Save the output
os.chdir(path_results)
data_prog6_all.to_excel('Prog_ntcp_labeled.xlsx')

In [14]:
# Prognosis review 7: CLEF Diabetes
# Preparation of the data for ASReview simulation

# The data are retrieved from the CLEF 2019 challenge:
# From: https://github.com/CLEF-TAR/tar/tree/master/2019-TAR/Task2/Testing/Prognosis/topics/CD012661
#      All PubMed IDs were copied and pasted PubMed 
#      The selection ("All results") was saved in format ("PubMed") and then by clicking "Create file"
#      The resulting raw EndNote file 'Prognosis_review_7.enl' was opened in a new EndNote library
#      The EndNote library was exported as "Text Only" and "Tab Delimited"
#      The content of the exported 'Prognosis_7_all.txt' file was copied and pasted in Excel
#      Each of the records that were spread over multiple rows were manually merged to correct these records:
#             These could be recognized by rows that do not have a year in the year column
#             After merging, this lead to the correct amount of rows of 3367
#      And saved as 'Prognosis_7_all.xlsx' file 

# From: https://github.com/CLEF-TAR/tar/blob/master/2019-TAR/Task2/Testing/Prognosis/qrels/full.test.prognosis.abs.2019.qrels
#      These data were copied and pasted in Excel and manually converted into a .csv file
#      And saved as 'Clef_prognosis_inclusions.csv

os.chdir(path_data)

# All records (n=3366)
data_prog7_all_orig = pd.read_excel('Prognosis_7_diabetes/Prognosis_7_all.xlsx')

# TA inclusions (n=192)
data_prog7_incl_orig = pd.read_csv('Prognosis_7_diabetes/Clef_prognosis_inclusions.csv', header = None)

# FT inclusions 
data_prog7_ft_incl_orig = pd.read_csv('Prognosis_7_diabetes/Prognosis_fulltext_inclusions.csv')

In [3]:
# Choose the columns to be saved in the output
data_prog7_all = data_prog7_all_orig.iloc[:,[0,1,2,3,5,34,37,38,51]]

# Name the respective columns
data_prog7_all.columns = ['type','authors','year','title','journal','pmid','keywords','abstract','language']

In [4]:
# Retrieve the included records
data_prog7_incl = data_prog7_incl_orig
data_prog7_incl.columns = ['review_id', 'unknown1', 'pmid', 'label_included', 'unknown2']
data_prog7_incl = data_prog7_incl.loc[data_prog7_incl['label_included'] == 1]

In [5]:
# Get the labels based on pmid's
labels = []
for i in range(0,len(data_prog7_all)):
    p = data_prog7_all['pmid'][i]
    if any(data_prog7_incl['pmid'] == p):
        labels.append(1)
    else:
        labels.append(0)

In [6]:
# Check the labels
sum(labels) 

192

In [7]:
# Add the labels to the dataset
data_prog7_all['label_included'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prog7_all['label_included'] = labels


In [43]:
# Save the output
os.chdir(path_results)
data_prog7_all.to_excel('Prog_diabetes_labeled.xlsx')

In [17]:
# Get the full-text labels based on pmid's
data_prog7_ft_incl = data_prog7_ft_incl_orig.loc[data_prog7_ft_incl_orig['Included'] == 1]

ft_labels = []
for i in range(0,len(data_prog7_all)):
    p = data_prog7_all['pmid'][i]
    if any(data_prog7_ft_incl['Document'] == p):
        ft_labels.append(1)
    else:
        ft_labels.append(0)
sum(ft_labels)

94