In [1]:
import pandas as pd
import numpy as np

# FAERS

In [2]:
df = pd.read_csv("../data/FAERS_2012Q4_2021Q1.csv", low_memory=False)

# df_val_sub = df_val[["drugname", "prod_ai", "nda_num"]]
# df_val_info_scores = df_val_sub.shape[1] - df_val_sub.count(axis=1)
# df_val["info_scores"] = df_val_info_scores
# df_val = df_val.sort_values(by=["info_scores"])
# df_val_unique_first = df_val.drop_duplicates("drugname", keep="first")

In [15]:
sub_df = df[["drugname", "prod_ai", "nda_num", "route", "dose_amt", "dose_unit", "dose_form"]]

In [18]:
sub_df = sub_df.replace("(?i)unknown", np.nan, regex=True)

In [19]:
sub_df_scores = sub_df.shape[1] - sub_df.count(axis=1)
sub_df["scores"] = sub_df_scores

In [21]:
sorted_sub = sub_df.sort_values(by=["scores"])

In [23]:
sorted_sub = sorted_sub.drop_duplicates("drugname", keep="first")

In [24]:
sorted_sub

Unnamed: 0,drugname,prod_ai,nda_num,route,dose_amt,dose_unit,dose_form,scores
5871187,TECFIDERA,DIMETHYL FUMARATE,204063,ORAL,240,MG,PROLONGED-RELEASE CAPSULE,0
8494106,ORENITRAM,TREPROSTINIL,203496.0,Oral,.5,MG,SLOW RELEASE TABLET,0
29865185,ATEZOLIZUMAB.,ATEZOLIZUMAB,761034.0,Intravenous drip,1200.0,MG,INFUSION,0
8494109,TYSABRI,NATALIZUMAB,125104.0,Intravenous (not otherwise specified),300,MG,INTRAVENOUS INFUSION,0
8494110,AVONEX,INTERFERON BETA-1A,103628.0,Intramuscular,30,UG,SOLUTION FOR INJECTION IN PRE-FILLED PEN,0
...,...,...,...,...,...,...,...,...
1973660,DULCOLAX SODIUM PICOSULFATE),,,,,,,6
1892955,Penicillin,,,,,,,6
4431031,OLYSIO/SIMEPREVIR,,,,,,,6
14785061,PECTOX LISINA,,,,,,,6


In [26]:
indexes_to_keep = sorted_sub.index

In [28]:
final_df = df.iloc[indexes_to_keep]

In [29]:
final_df.to_csv("../data/FAERS_OPT_2012Q4_2021Q1.csv", index=False)

In [37]:
def minify_faers_data(full_faers_filepath: str, minified_faers_filepath: str) -> None:
    """
    Minimizes the FAERS data entries by dropping duplicates following the rule where we only
    retain the unique rows containing the highest information quality, i.e. the entries where
    the most columns are filled.
    This is a compromise where we are willing to sometimes lose some rows that may have had objectively
    higher quality information, but we are assuming that rows that have had more columns filled are more
    likely to have been entered more carefully and so that generally we can assume that this is a safer approach.
    """
    df = pd.read_csv(full_faers_filepath, low_memory=False)
    # We only care about the following columns when it comes to info quality as we use these to create our RxNav queries
    sub_df = df[["drugname", "prod_ai", "nda_num", "route", "dose_amt", "dose_unit", "dose_form"]]
    
    # Some entries contain "UNKNOWN" or "unknown", and should not count towards the info quality score so we remove them
    print("Removing entries with 'unknown' or 'unk', this may take a while...")
    sub_df = sub_df.replace("(?i)(unknown|unk)", np.nan, regex=True)
    print("Done!")
    
    
    # Calculate scores [0-7] by missing rows, where 0 means a perfect entry and 7 means the entry is empty
    # We sort them afterwards from best to worst for the dupe removal step
    sub_df_scores = sub_df.shape[1] - sub_df.count(axis=1)
    sub_df["scores"] = sub_df_scores
    sub_df = sub_df.sort_values(by=["scores"])
    sub_df = sub_df.drop_duplicates("drugname", keep="first")  # Drop all duplicates except the ones with the best scores
    
    indexes_to_keep = sub_df.index
    df = df.iloc[indexes_to_keep]
    df.to_csv(minified_faers_filepath, index=False)

In [45]:
def minify_legacy_data(full_aers_filepath: str, minified_aers_filepath: str) -> None:
    """
    Minimizes the AERS (legacy) data entries by dropping duplicates following the rule where we only
    retain the unique rows containing the highest information quality, i.e. the entries where
    the most columns are filled.
    This is a compromise where we are willing to sometimes lose some rows that may have had objectively
    higher quality information, but we are assuming that rows that have had more columns filled are more
    likely to have been entered more carefully and so that generally we can assume that this is a safer approach.
    """
    df = pd.read_csv(full_aers_filepath, low_memory=False)
    # We only care about the following columns when it comes to info quality as we use these to create our RxNav queries
    sub_df = df[["DRUGNAME", "NDA_NUM", "ROUTE", "DOSE_VBM"]]
    
    # Some entries contain "UNKNOWN" or "UNK", and should not count towards the info quality score so we remove them
    print("Removing entries with 'unknown' or 'unk', this may take a while...")
    sub_df = sub_df.replace("(?i)(unknown|unk)", np.nan, regex=True)
    print("Done!")
    
    
    # Calculate scores [0-7] by missing rows, where 0 means a perfect entry and 7 means the entry is empty
    # We sort them afterwards from best to worst for the dupe removal step
    sub_df_scores = sub_df.shape[1] - sub_df.count(axis=1)
    sub_df["scores"] = sub_df_scores
    sub_df = sub_df.sort_values(by=["scores"])
    sub_df = sub_df.drop_duplicates("DRUGNAME", keep="first")  # Drop all duplicates except the ones with the best scores
    
    indexes_to_keep = sub_df.index
    df = df.iloc[indexes_to_keep]
    df.to_csv(minified_aers_filepath, index=False)

In [46]:
minify_legacy_data(full_aers_filepath="../data/AERS_2004Q1_2012Q3.csv", minified_aers_filepath="../data/AERS_MIN_2004Q1_2012Q3.csv")

Removing entries with 'unknown' or 'unk', this may take a while...
Done!
