# Second Author Title and Abstract Review

In [10]:
# Import Libraries
from notebook_setup import *
from helpers import *

In [12]:
# Load the two title/abstract screening exports from the processed-data directory.
google_df = pd.read_csv(
    PROC_DATA_DIR + "/" + "cleaned_google_scholar_title_abstract_screen_end_1.csv"
)
database_df = pd.read_csv(
    PROC_DATA_DIR + "/" + "database_search_title_abstract_screen_end.csv"
)

In [13]:
# Inspect the column labels of the Google Scholar results frame.
google_df.columns

Index(['Publication Year', 'First Author', 'Authors', 'Summary',
       'Publication Title', 'Title', 'Abstract', 'URL', 'Database', 'DOI',
       'Duplicate', 'Exclude', 'Reason ID', 'Abstract2', 'Unnamed: 14'],
      dtype='object')

In [14]:
# Inspect the column labels of the database search results frame.
database_df.columns

Index(['Publication Year', 'First Author', 'Authors', 'Publication Title',
       'Title', 'Abstract', 'DOI', 'Database', 'Exclude', 'Reason ID',
       'Reason', 'Unnamed: 11', 'Unnamed: 12'],
      dtype='object')

In [30]:
# Combine the database-search and Google Scholar screening results.
# An outer join keeps every row from both sources; the join keys are the
# columns listed below.
merge_keys = [
    "Publication Year",
    "First Author",
    "Authors",
    "Publication Title",
    "Title",
    "Abstract",
    "Database",
    "Exclude",
    "Reason ID",
    "DOI",
]
all_df = database_df.merge(google_df, how="outer", on=merge_keys)

# Keep only the columns needed for review, in the order they should appear.
desired_column_order = [
    "Publication Year",
    "First Author",
    "Authors",
    "Publication Title",
    "Summary",
    "Database",
    "DOI",
    "URL",
    "Title",
    "Abstract",
    "Abstract2",
    "Exclude",
    "Reason ID",
]
all_df = all_df[desired_column_order]

# Normalise capitalisation of the free-text fields to Title Case.
for column_name in ("Title", "First Author", "Authors", "Abstract"):
    all_df[column_name] = all_df[column_name].str.title()

# Order records from oldest to newest publication year and renumber the rows.
all_df = all_df.sort_values("Publication Year").reset_index(drop=True)

# Display the merged dataframe
all_df


Unnamed: 0,Publication Year,First Author,Authors,Publication Title,Summary,Database,DOI,URL,Title,Abstract,Abstract2,Exclude,Reason ID
0,0,"Listed, N.",Yee A,,A Yee,Google Scholar,,https://scholar.google.com/scholar?hl=en&as_sd...,K9 Tactical Emergency Casualty Care,,Abstract not available,True,4.0
1,1849,"Ri, Q.",Q Ri,,Q RI - search.proquest.com,Google Scholar,,https://www.proquest.com/openview/eb41c2191c53...,Quarterly Summary Of The Improvements And Disc...,Impr Ovments And D 1Scov Erie S Page 1 Quarte ...,Abstract not available,True,4.0
2,1883,"Howe, A.",Aj Howe,,AJ Howe - 1883 - books.google.com,Google Scholar,,https://books.google.lk/books?hl=en&lr=&id=Ytb...,A Practical And Systematic Treatise On Fractur...,37. Fracture Of The Shaft Of The Humerus.........,Abstract not available,True,2.0
3,1886,"Richardson, M.",Mh Richardson,The Boston Medical And Surgical Journal,MH RICHARDSON - The Boston Medical and Surgica...,Google Scholar,10.1056/NEJM188612161152403,https://www.nejm.org/doi/pdf/10.1056/nejm18861...,A Case Of Gastrotomy. Digital Exploration Of S...,Many Cases Have Been Recorded Where Foreign Bo...,Abstract not available,False,
4,1892,"Otologists, L.",L Otologists,,L OTOLOGISTS - 1892 - cambridge.org,Google Scholar,10.1017/S1755146300162523,https://www.cambridge.org/core/journals/journa...,Association Meetings.,About Twenty-Five Belgian Physicians Comprised...,Abstract not available,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,2024,"Beatrice..., R.","R Soloperto, G Festa, M Beatrice...",Archives Of ...,"R Soloperto, G Festa, M Beatrice... - Archives...",Google Scholar,,https://journals.sbmu.ac.ir/aaem/index.php/aae...,Life-Threatening Carotid Complications Caused ...,Carotid Complications Resulting From Extra-Lum...,Carotid complications resulting from extra-lum...,False,
491,2024,"Ardila, S.","Ardila, Sara; Woodley, Lucille; Ulloa, Emily; ...",Journal Of Laparoendoscopic & Advanced Surgica...,,PubMed,10.1089/lap.2023.0394,,Utilization Of Single-Incision Laparoscopy In ...,Background: A Ban On Neodymium Magnets Was Lif...,,False,
492,2024,"Aourarh, B.","Aourarh, Benayad; Belkouchi, Lina; Saouab, Rac...",Radiology Case Reports,,PubMed,10.1016/j.radcr.2024.07.017,,Hematemesis In A Young Patient: When The Cause...,Foreign Bodies May Be Ingested Accidentally Or...,,False,
493,2024,"Marano, M.","Marano M., Goffredo B.M., Faraci S., Torroni F...",Toxicology Reports,,Embase,10.1016/j.toxrep.2024.101683,,Pharmacokinetic Effects Of Endoscopic Gastric ...,Introduction: Intentional Multiple Drugs Overd...,,True,7.0


In [37]:
# Create a clean dataframe for the second author's review.
# .copy() is required here: a plain `clean_df = all_df` would only create a
# second name for the same object, so blanking the columns below would also
# erase the first reviewer's decisions from all_df.
clean_df = all_df.copy()

# Remove reasons for exclusion and previously marked exclusions so the
# exported sheet carries no earlier screening decisions.
clean_df['Exclude'] = None
clean_df['Reason ID'] = None

# Export to CSV
clean_df.to_csv("/".join([RAW_DATA_DIR, "all_results_title_abstract_start.csv"]))

In [39]:
# Write the full set of JGE-reviewed title/abstract results to the processed-data directory.
# NOTE(review): this relies on all_df still holding the 'Exclude' / 'Reason ID'
# values at this point - confirm no earlier cell has modified it in place.
all_df.to_csv(
    PROC_DATA_DIR + "/" + "all_results_title_abstract_reviewed_jge_end.csv"
)

In [57]:
# Create a reproducible percentage-based sample for review by the second author.
# The default fraction is 5%; pass sample_pct_size=0.1 for a 10% sample.
def create_results_sample(df, sample_pct_size=0.05):

    """
    Creates a random sample of the given dataframe based on a percentage.

    The number of rows is computed up front (rounded to the nearest integer)
    and drawn with a fixed random seed, so repeated calls on the same
    dataframe return the same rows.

    Parameters:
    - df (pd.DataFrame): Results dataframe
    - sample_pct_size (float): A percentage as a decimal value strictly
      between 0 and 1 (e.g. 0.1 for 10%).

    Returns:
    - pd.DataFrame: A DataFrame containing the sampled rows.

    Raises:
    - TypeError: If df is not a pandas DataFrame.
    - ValueError: If sample_pct_size is not strictly between 0 and 1.
    """

    # Validate that a dataframe was given.
    # Raise (not return) so a bad call fails here instead of handing the
    # caller an exception object that breaks later on e.g. `.to_csv`.
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"Expected DataFrame, was given {type(df)}")

    # Validate that percentage given as decimal
    if not 0 < sample_pct_size < 1:
        raise ValueError("Sample percentage must be a decimal between 0 and 1.")

    sample_size = len(df)  # Calculate size of given dataframe
    print(f"Given sample size: {sample_size}")  # Print result
    new_sample_size = int(round(sample_size * sample_pct_size))  # Calculate desired sample size.

    # Round before converting: 0.29 * 100 is 28.999... in floating point and
    # plain int() would report 28%.
    desired_percentage = int(round(sample_pct_size * 100))
    print(f"Calculating desired sample size... {desired_percentage}% of {sample_size} = {new_sample_size}")

    # Create the sample DataFrame
    print("Creating Sample Dataframe")
    sample_df = df.sample(n=new_sample_size, random_state=42)  # Use n instead of frac for precise row count

    return sample_df
    

In [64]:
# Draw a 10% sample of the papers in clean_df.
clean_sample_df = create_results_sample(clean_df, sample_pct_size=0.1)

# Save the sample to the raw-data directory.
clean_sample_df.to_csv(
    RAW_DATA_DIR + "/" + "ms_title_abstract_review_start.csv"
)

Given sample size: 495
Calculating desired sample size... 10% of 495 = 50
Creating Sample Dataframe
