# Title and Abstract Review
**Author:** Jack Galbraith-Edge

In [6]:
from sklearn.metrics import cohen_kappa_score

from msc_code.scripts.notebook_setup import *

In [7]:
# Read the Google Scholar results and the database search results into dataframes.
google_scholar_csv = os.path.join(
    RAW_DATA_DIR, 'google_scholar',
    'cleaned_google_scholar_title_abstract_screen_end_1.csv',
)
database_search_csv = os.path.join(
    RAW_DATA_DIR, 'database_search',
    'database_search_title_abstract_screen_end.csv',
)

google_df = pd.read_csv(google_scholar_csv)
database_df = pd.read_csv(database_search_csv)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/jackgedge/Projects/msc_dissertation/iifo_motivation/input/raw_data/google_scholar/cleaned_google_scholar_title_abstract_screen_end_1.csv'

In [None]:
# Keep only the Google Scholar rows that were not flagged as duplicates
# during title and abstract review (rows with no flag are kept).
not_flagged_duplicate = google_df['Duplicate'].ne(True)
google_df = google_df.loc[not_flagged_duplicate]

In [None]:
# Merge all Google Scholar and Database search results.
# The outer join keeps every record from both sources; rows are matched on the
# columns the two exports have in common.
all_df = pd.merge(
    database_df, google_df,
    how='outer',
    on=["Publication Year", "First Author", "Authors", "Publication Title",
        "Title", "Abstract", "Database", "Exclude", "Reason ID", "DOI"])

# Specify desired column order
desired_column_order = ["Publication Year", "First Author", "Authors",
                        "Publication Title", "Summary", "Database", "DOI",
                        "URL", "Title", "Abstract", "Abstract2", "Exclude",
                        "Reason ID"]

# Apply to dataframe
all_df = all_df[desired_column_order]

# Fix capitalisation of text fields.
# `.str.title()` returns a new Series rather than modifying the column, so the
# result must be assigned back for the change to take effect.
desired_cols = ['Title', 'First Author', 'Authors', 'Abstract']
for col in desired_cols:
    all_df[col] = all_df[col].str.title()

# Sort by Publication Year ascending and renumber the rows from zero.
all_df = all_df.sort_values(by="Publication Year", ascending=True).reset_index(drop=True)

In [None]:
# Use the row index as the unique study identifier by naming it 'Study_ID'.
all_df.index = all_df.index.rename('Study_ID')

In [None]:
# Write every article from JGE's title and abstract review to CSV
# (the index is written too, so Study_ID ends up in the file).
jge_start_csv = os.path.join(PROC_DATA_DIR, 'title_abstract_review', 'ta_review_start_jge.csv')
all_df.to_csv(jge_start_csv)

In [None]:
# Build a copy for the other reviewers in which the previously recorded
# exclusion decisions and reasons are blanked out.
clean_df = all_df.assign(**{'Exclude': None, 'Reason ID': None})

# Export the blanked copy to CSV
clean_review_csv = os.path.join(PROC_DATA_DIR, 'title_abstract_review', 'ta_review_clean.csv')
clean_df.to_csv(clean_review_csv)

In [None]:
# Create sample of 10% of papers
# clean_sample_df = create_results_sample(clean_df, 0.1, 42) # the '42' argument specifies the random seed used to create the sample

# export clean data to raw data directory as ms_title_abstract review start.csv
# clean_sample_df.to_csv("/".join([PROC_DATA_DIR, "title_abstract_review", "ta_review_ms_start.csv"]))

In [None]:
# Load the second author's (MS) completed title and abstract review.
ms_review_csv = os.path.join(RAW_DATA_DIR, 'title_abstract_review', 'ta_review_end_ms.csv')
ms_reviewed_df = pd.read_csv(ms_review_csv)

NameError: name 'REPORT_RAW_DATA_DIR' is not defined

In [None]:
# Load JGE's completed title and abstract review, tagging the decision columns
# with the reviewer's initials ahead of the merge with the MS review data.
jge_review_csv = os.path.join(RAW_DATA_DIR, 'title_abstract_review', 'ta_review_end_jge.csv')
jge_column_names = {'Exclude': 'Exclude_JGE', 'Reason ID': 'Reason ID_JGE'}
jge_reviewed_df = pd.read_csv(jge_review_csv).rename(columns=jge_column_names)

NameError: name 'REPORT_RAW_DATA_DIR' is not defined

In [None]:
# From the MS review keep only the join key and the two decision columns,
# tagging the decision columns with the reviewer's initials.
ms_reviewed_df = ms_reviewed_df[['Study_ID', 'Exclude', 'Reason ID']].rename(
    columns={'Exclude': 'Exclude_MS', 'Reason ID': 'Reason ID_MS'}
)

# A right merge keeps one row per paper present in the MS review, with JGE's
# columns attached where the Study_ID matches.
jge_ms_reviewed_df = jge_reviewed_df.merge(ms_reviewed_df, on='Study_ID', how='right')

In [None]:
# True where JGE and MS recorded the same exclude decision for a paper.
decisions_match = jge_ms_reviewed_df['Exclude_JGE'] == jge_ms_reviewed_df['Exclude_MS']

# Calculate agreement
jge_ms_agree = jge_ms_reviewed_df[decisions_match]
print(f"JGE and MS agree on {len(jge_ms_agree)}/{len(jge_ms_reviewed_df)} results reviewed by MS.")

# Calculate disagreement
jge_ms_disagree = jge_ms_reviewed_df[~decisions_match]
print(f"JGE and MS disagree on {len(jge_ms_disagree)}/{len(jge_ms_reviewed_df)} results reviewed by MS.")

# Export to CSV (paths built with os.path.join, as in the other export cells)
review_dir = os.path.join(PROC_DATA_DIR, 'title_abstract_review')
jge_ms_agree.to_csv(os.path.join(review_dir, 'ta_review_agree_jge_ms.csv'), index=False)
jge_ms_disagree.to_csv(os.path.join(review_dir, 'ta_review_disagree_jge_ms.csv'), index=False)

# Cohen's Kappa between JGE and MS.
# cohen_kappa_score is imported in the import cell at the top of the notebook.
y1 = jge_ms_reviewed_df['Exclude_JGE']
y2 = jge_ms_reviewed_df['Exclude_MS']

cohen_kappa_score(y1, y2)

JGE and MS agree on 34/50 results reviewed by MS.
JGE and MS disagree on 16/50 results reviewed by MS.


np.float64(0.3808049535603716)

At this point, the disagreements were sent to my supervisor (GC) for a third review.

In [None]:
# Load the third reviewer's (GC) results, keeping only the join key and GC's own columns.
gc_columns = ["Study_ID", "Exclude_GC", "Reason ID_GC", "Review_GC", "Comments_GC"]
gc_review_csv = os.path.join(RAW_DATA_DIR, "title_abstract_review", "ta_review_end_gc.csv")
gc_reviewed_df = pd.read_csv(gc_review_csv)[gc_columns]

In [None]:
# Start from every JGE-reviewed article and left-join the MS columns, then the
# GC columns, matching on Study_ID each time.
reviewed_df = (
    jge_reviewed_df
    .merge(ms_reviewed_df, on='Study_ID', how='left')
    .merge(gc_reviewed_df, on='Study_ID', how='left')
)

In [None]:
# Merge the datasets rightwards, so only papers reviewed by GC are kept,
# with JGE's columns attached where the Study_ID matches.
jge_gc_reviewed_df = pd.merge(jge_reviewed_df, gc_reviewed_df, on='Study_ID', how='right')

# Calculate agreement
jge_gc_agree = jge_gc_reviewed_df[jge_gc_reviewed_df['Exclude_JGE'] == jge_gc_reviewed_df['Exclude_GC']]
print(f"JGE and GC agree on {len(jge_gc_agree)}/{len(jge_gc_reviewed_df)} results reviewed by GC.")

# Calculate disagreement
jge_gc_disagree = jge_gc_reviewed_df[jge_gc_reviewed_df['Exclude_JGE'] != jge_gc_reviewed_df['Exclude_GC']]
print(f"JGE and GC disagree on {len(jge_gc_disagree)}/{len(jge_gc_reviewed_df)} results reviewed by GC.")

# Export to CSV.
# These must be the JGE-GC frames: writing the JGE-MS frames here would put the
# wrong reviewer comparison into the *_jge_gc.csv files.
review_dir = os.path.join(PROC_DATA_DIR, 'title_abstract_review')
jge_gc_agree.to_csv(os.path.join(review_dir, 'ta_review_agree_jge_gc.csv'), index=False)
jge_gc_disagree.to_csv(os.path.join(review_dir, 'ta_review_disagree_jge_gc.csv'), index=False)

# Calculate Cohen's Kappa between JGE and GC
y1 = jge_gc_reviewed_df['Exclude_JGE']
y2 = jge_gc_reviewed_df['Exclude_GC']

cohen_kappa_score(y1, y2)

JGE and GC agree on 10/16 results reviewed by GC.
JGE and GC disagree on 6/16 results reviewed by GC.


np.float64(0.18644067796610164)

In [None]:
# Merge the datasets rightwards, so only papers reviewed by GC are kept,
# with MS's columns attached where the Study_ID matches.
ms_gc_reviewed_df = pd.merge(ms_reviewed_df, gc_reviewed_df, on='Study_ID', how="right")

# Calculate agreement
ms_gc_agree = ms_gc_reviewed_df[ms_gc_reviewed_df['Exclude_MS'] == ms_gc_reviewed_df['Exclude_GC']]
print(f"MS and GC agree on {len(ms_gc_agree)}/{len(ms_gc_reviewed_df)} results reviewed by GC.")

# Calculate disagreement
ms_gc_disagree = ms_gc_reviewed_df[ms_gc_reviewed_df['Exclude_MS'] != ms_gc_reviewed_df['Exclude_GC']]
print(f"MS and GC disagree on {len(ms_gc_disagree)}/{len(ms_gc_reviewed_df)} results reviewed by GC.")

# Export to CSV (paths built with os.path.join, as in the other export cells)
review_dir = os.path.join(PROC_DATA_DIR, 'title_abstract_review')
ms_gc_agree.to_csv(os.path.join(review_dir, 'ta_review_agree_ms_gc.csv'), index=False)
ms_gc_disagree.to_csv(os.path.join(review_dir, 'ta_review_disagree_ms_gc.csv'), index=False)

# Calculate Cohen's Kappa between MS and GC
y1 = ms_gc_reviewed_df['Exclude_MS']
y2 = ms_gc_reviewed_df['Exclude_GC']

cohen_kappa_score(y1, y2)

MS and GC agree on 6/16 results reviewed by GC.
MS and GC disagree on 10/16 results reviewed by GC.


np.float64(-0.1594202898550725)

Following GC's adjudication of the JGE-MS title and abstract review disagreements, GC commented:

"Just gone through them. I’m afraid I’ve probably muddied the waters still further. I’ve put ‘unclear’ for a few of them, that might be resolved by looking at the full text. A key question for your exclusion criteria is: are you including children who swallow things like coins and magnets? It’s debatable whether these are ‘intentional’ – children often just put them in their mouths and accidentally swallow them. Even when they do intentionally swallow, it’s unlikely to be DSH. So it depends a bit on what you mean by ‘intentional’. Intending to put it in their mouth? Intending to swallow? Intending to cause themselves harm?"

In [None]:
# Add an empty 'Exclude_FINAL' column that will hold the final decision.
reviewed_df = reviewed_df.assign(Exclude_FINAL=None)

In [None]:
# Ensure 'Exclude_FINAL' exists in the DataFrame
reviewed_df['Exclude_FINAL'] = np.nan


def _final_exclude_decision(row):
    """Pick the final exclude decision for one row from the three reviewers' columns."""
    # MS has no decision for this row: take JGE's decision.
    if pd.isna(row['Exclude_MS']):
        return row['Exclude_JGE']
    # JGE and MS agree: take their shared decision.
    if row['Exclude_JGE'] == row['Exclude_MS']:
        return row['Exclude_JGE']
    # JGE and MS disagree: take GC's decision.
    return row['Exclude_GC']


reviewed_df['Exclude_FINAL'] = reviewed_df.apply(_final_exclude_decision, axis=1)

# Swap the space in each reviewer's reason column name for an underscore.
reason_column_names = {
    "Reason ID_MS": "Reason_ID_MS",
    "Reason ID_JGE": "Reason_ID_JGE",
    "Reason ID_GC": "Reason_ID_GC",
}
reviewed_df = reviewed_df.rename(columns=reason_column_names)

def resolve_reason_id(row):
    """Select the exclusion reason that goes with a row's final decision.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'Exclude_FINAL', 'Exclude_JGE', 'Exclude_MS', 'Reason_ID_JGE'
            and 'Reason_ID_GC'.

    Returns:
        None when the row is not excluded or has no final decision.
        Otherwise JGE's reason when MS has no decision or agrees with JGE,
        and GC's reason when JGE and MS disagree.
    """
    final_decision = row['Exclude_FINAL']

    # NaN is truthy, so `not final_decision` alone would treat a missing
    # decision as an exclusion; check for missing values explicitly.
    if pd.isna(final_decision) or not final_decision:
        return None  # No reason needed if not excluded

    # CASE 1: MS is missing, so JGE's exclusion decision was used
    if pd.isna(row['Exclude_MS']):
        return row['Reason_ID_JGE']

    # CASE 2: JGE and MS agree, so their shared exclusion decision was used
    if row['Exclude_JGE'] == row['Exclude_MS']:
        return row['Reason_ID_JGE']

    # CASE 3: JGE and MS disagree, so GC's exclusion decision was used
    return row['Reason_ID_GC']

# Resolve the final reason for every row.
reviewed_df = reviewed_df.assign(
    Reason_ID_FINAL=lambda frame: frame.apply(resolve_reason_id, axis=1)
)

# Write the full reviewed table to CSV without the index.
final_review_csv = os.path.join(PROC_DATA_DIR, 'title_abstract_review', 'ta_review_final.csv')
reviewed_df.to_csv(final_review_csv, index=False)

In [None]:
# Reviewer decisions only: the paper ID plus each reviewer's columns and the final decision.
decision_columns = [
    'Study_ID',
    'Exclude_JGE', 'Reason_ID_JGE',
    'Exclude_MS', 'Reason_ID_MS',
    'Exclude_GC', 'Reason_ID_GC',
    'Exclude_FINAL',
]

# Index the table by paper ID while keeping 'Study_ID' as a regular column as well.
reviewer_decisions_df = reviewed_df[decision_columns].set_index('Study_ID', drop=False)

# Export decisions to CSV (index not written; Study_ID is still present as a column)
decisions_csv = os.path.join(PROC_DATA_DIR, 'title_abstract_review', 'ta_review_decisions_final.csv')
reviewer_decisions_df.to_csv(decisions_csv, index=False)

In [None]:
# Split the reviewed results by final decision.
final_decision = reviewed_df['Exclude_FINAL']
title_abstract_include_final_df = reviewed_df[final_decision.eq(False)]  # Included
title_abstract_exclude_final_df = reviewed_df[final_decision.eq(True)]  # Excluded

# Count inclusions and exclusions and report them.
title_abstract_inclusion_count = len(title_abstract_include_final_df)
title_abstract_exclusion_count = len(title_abstract_exclude_final_df)
print(f"{title_abstract_exclusion_count} results were excluded during title and abstract screening.")
print(f"{title_abstract_inclusion_count} results were included during title and abstract screening.")

# Write each subset to its own CSV.
review_dir_parts = (PROC_DATA_DIR, 'title_abstract_review')
title_abstract_include_final_df.to_csv(
    os.path.join(*review_dir_parts, 'ta_review_include_final.csv'), index=False
)
title_abstract_exclude_final_df.to_csv(
    os.path.join(*review_dir_parts, 'ta_review_exclude_final.csv'), index=False
)

176 results were excluded during title and abstract screening.
316 results were included during title and abstract screening.


In [None]:
# Write the included results to an RIS file, for import into the Zotero
# reference manager ahead of full text review.
ris_path = os.path.join(PROC_DATA_DIR, 'title_abstract_review', 'ta_results.ris')
dataframe_to_ris(title_abstract_include_final_df, ris_path)

#