### Test on ADVP 1

In [None]:
import os

import pandas as pd
from gwas_table_extraction import *

Load the ADVP reference table (hg38 variant records)

In [None]:
# Read the tab-separated ADVP reference records and preview the first rows.
advp_path = "advp.variant.records.hg38.tsv"
advp = pd.read_csv(advp_path, sep="\t")
advp.head()

In [None]:
# Print a concise summary of the reference table (columns, non-null counts, dtypes).
advp.info()

Test papers, each listed as: PDF file name - PMID - PMCID
- 1-s2.0-S0197458018303816-main.pdf - 30448613 - PMC6331247
- 1-s2.0-S0197458019300727-main.pdf - 30979435 - PMC6783343
- s11357-019-00071-5.pdf - 31055733 - PMC6544706
- s41588-018-0311-9.pdf - 30617256 - PMC6836675
- s41588-019-0358-2.pdf - 30820047 - PMC6463297

In [None]:
# Each entry: (paper PDF path, PMID, PMCID, 1-based ids of the tables to export).
test_papers_info = [
    ("test_papers/1-s2.0-S0197458018303816-main.pdf", 30448613, "PMC6331247", [1, 2, 3]),
    ("test_papers/1-s2.0-S0197458019300727-main.pdf", 30979435, "PMC6783343", [1]),
    ("test_papers/s11357-019-00071-5.pdf", 31055733, "PMC6544706", [1, 3, 4]), # got error in PMC
    ("test_papers/s41588-018-0311-9.pdf", 30617256, "PMC6836675", [1]),
    ("test_papers/s41588-019-0358-2.pdf", 30820047, "PMC6463297", [1, 2])
]

# Create the output folder up front so a missing directory is not misreported
# below as an inaccurate extraction.
os.makedirs("test_tables", exist_ok=True)

for file_name, pmid, pmcid, inx_lst in test_papers_info:
    try:
        df_lst = extract_tables_lst_from_paper(pmcid, file_name)
    except Exception as err:  # not a bare except: Ctrl-C / kernel interrupt must still propagate
        print(f"Error while extracting tables from {file_name}")
        print(f"  -> {type(err).__name__}: {err}")
        # Skip the export step. Otherwise df_lst is either undefined (first paper)
        # or still holds the PREVIOUS paper's tables, which would be written out
        # under this paper's file name.
        continue

    # File name without directory and extension, used as the CSV name prefix.
    paper_stem = os.path.splitext(os.path.basename(file_name))[0]
    try:
        for i in inx_lst:
            # Table ids are 1-based, df_lst is indexed from 0.
            df_lst[i - 1].to_csv(f"test_tables/{paper_stem}_table_{i}.csv", index=False)
    except Exception as err:
        # Typically an IndexError: fewer tables were extracted than expected.
        # As before, the remaining tables of this paper are not exported.
        print(f"Not accurate extraction for {file_name}")
        print(f"  -> {type(err).__name__}: {err}")

In [None]:
# Reference columns for harmonization. "column_with_context" is the column name,
# followed by ": <description>" whenever a description is present.
referencing_col_df = pd.read_csv("Rules for harmonizing ADVP papers - Main cols.csv")
referencing_col_df["column_with_context"] = referencing_col_df.apply(
    lambda row: row["column"]
    if pd.isna(row["description"])
    else row["column"] + ": " + row["description"],
    axis=1,
)

gwas_column_matching_engine = GWASColumnMatchingEngine(referencing_col_df)

# Preview the column matching for every extracted (not yet harmonized) table.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# printed output reproducible between runs and machines.
for file in sorted(os.listdir("./test_tables")):
    # endswith() instead of a substring test, so names like "x.csv.bak" are skipped.
    if not (file.endswith(".csv") and "table" in file and "harmonized" not in file):
        continue
    print(file)
    df = pd.read_csv(f"./test_tables/{file}")
    # Blank out the "Unnamed: N" placeholders pandas assigns to empty header cells.
    df.columns = ['' if 'Unnamed:' in col else col for col in df.columns]
    col_to_ref_col = gwas_column_matching_engine.match_many_col_to_ref_col(df)
    for ref_col in col_to_ref_col:
        print(f"{ref_col}: {col_to_ref_col[ref_col]}")
    print()

In [None]:
# Harmonize every extracted table, save each result on its own, and also write
# one combined table with a "file_name" column recording the source CSV.
os.makedirs("./harmonized_test_tables", exist_ok=True)

harmonized_tables = []
# os.listdir() order is arbitrary; sorting keeps the row order of the combined
# table deterministic.
for file in sorted(os.listdir("./test_tables")):
    # endswith() instead of a substring test, so names like "x.csv.bak" are skipped.
    if not (file.endswith(".csv") and "table" in file and "harmonized" not in file):
        continue
    df = pd.read_csv(f"./test_tables/{file}")
    # Blank out the "Unnamed: N" placeholders pandas assigns to empty header cells.
    df.columns = ['' if 'Unnamed:' in col else col for col in df.columns]
    modified_df = format_original_table(df, gwas_column_matching_engine, remove_unique_col = False)
    modified_df["file_name"] = file
    harmonized_tables.append(modified_df)
    modified_df.to_csv(
        f"./harmonized_test_tables/{os.path.splitext(file)[0]}_harmonized.csv",
        index=False,
    )

if not harmonized_tables:
    # Previously this surfaced as "'NoneType' object has no attribute 'to_csv'".
    raise FileNotFoundError("No extracted table CSV files found in ./test_tables")

# Concatenate once instead of growing the frame inside the loop (which re-copies
# all accumulated rows on every iteration).
modified_df_all = pd.concat(harmonized_tables, ignore_index=True)
modified_df_all.to_csv("./harmonized_test_tables/harmonized_table.csv", index=False)