Sometimes the OpenAI API returned errors due to high usage. We use this file to find the patents whose requests failed, re-query the API for them in 04-2_rerun_patent_to_sum_on_errors.py, and then merge the re-run results back in here.

In [1]:
from pathlib import Path
import pandas as pd
from ast import literal_eval

In [2]:
# Directory whose CSV files are loaded below.
results_path = Path("../results/surechembl_smiles_canon_chiral_randomized_patents_l10p_summarizations/100000")

# Read every CSV in results_path and stack them into one DataFrame.
# The files are visited in sorted order: glob yields paths in an arbitrary,
# filesystem-dependent order, which would otherwise make the row order (and
# which duplicate key wins when the per-row dictionaries are merged later)
# differ from run to run.
df = pd.concat(
    [pd.read_csv(file) for file in sorted(results_path.glob("*.csv"))],
    ignore_index=True,
)

In [3]:
# The column is read from CSV as text; turn each cell back into the Python
# literal it spells out.
raw_sources = df["summarization_sources"]
df["summarization_sources"] = raw_sources.apply(literal_eval)

In [4]:
# Flatten the per-row dictionaries into one mapping; when a key shows up in
# more than one row, the value from the later row is the one kept.
summarizations = {
    patent_id: summary
    for row_sources in df["summarization_sources"]
    for patent_id, summary in row_sources.items()
}

# Partition the entries by whether the stored value is the error marker.
success, errors = {}, {}
for patent_id, summary in summarizations.items():
    bucket = errors if summary == "API REQUEST ERROR" else success
    bucket[patent_id] = summary

In [5]:
# Save the keys of the failed requests so they can be re-queried.
# The frame is built first and written second: to_csv returns None when it is
# given a path, so chaining it onto the constructor left df_errors bound to
# None instead of the DataFrame.
df_errors = pd.DataFrame(list(errors), columns=["patent_ids"])
df_errors.to_csv(Path(results_path, "../..", "errors_v1.csv"), index=False)

In [6]:
# Load the re-run results and parse each stored cell back into a Python literal.
df_fixed_errors = pd.read_csv("../results/errors_reran_all.csv")
fixed_sources = df_fixed_errors["summarization_sources"].apply(literal_eval)
df_fixed_errors["summarization_sources"] = fixed_sources

# Row count of the re-run file, then the size of `success` before merging.
print(len(df_fixed_errors), len(success), sep="\n")

# Fold the re-run summaries into `success`; an existing key is overwritten.
for rerun_sources in fixed_sources:
    success.update(rerun_sources)

# Size of `success` after merging.
print(len(success))

1501
186648
188149


In [7]:
# For each entry whose value is still "API REQUEST ERROR", swap in the value stored under the same key in the reference dictionary (when it has one that is not "NA")

def replace_errors(curr_summs, ref_summs):
    """Fill in failed entries of ``curr_summs`` with values from ``ref_summs``.

    Every entry of ``curr_summs`` whose value is "API REQUEST ERROR" is
    overwritten with the value stored under the same key in ``ref_summs``.
    The entry is left as it is when ``ref_summs`` has no such key or when
    the stored value is "NA". Each replacement is reported with ``print``.

    Args:
        curr_summs: Dictionary to patch; it is modified in place.
        ref_summs: Mapping consulted for the replacement values.

    Returns:
        The same ``curr_summs`` object that was passed in.
    """
    # Only values are reassigned while iterating, never keys, so the
    # dictionary's size stays fixed throughout the loop.
    for key, current in curr_summs.items():
        if current != "API REQUEST ERROR":
            continue
        # Only the lookup is guarded, and only against a missing key, so that
        # any other failure surfaces instead of silently skipping the entry.
        try:
            replacement = ref_summs[key]
        except KeyError:
            continue
        if replacement != "NA":
            print(f"{key} replaced {current} for {replacement}")
            curr_summs[key] = replacement
    return curr_summs

# Patch every row's dictionary, handing `success` to replace_errors as its
# ref_summs argument through Series.apply's keyword forwarding.
df["summarization_sources"] = df["summarization_sources"].apply(replace_errors, ref_summs=success)

US-4454111-A replaced API REQUEST ERROR for flavor enhancer / aroma modifier / foodstuff additive
US-2017001947-A1 replaced API REQUEST ERROR for siRNA delivery / Biodegradable / Cationic lipids
EP-0102580-A1 replaced API REQUEST ERROR for 1. Substituted / 2. 1,3,4-benzotriazepines / 3. Medicaments
US-2005245563-A1 replaced API REQUEST ERROR for Chk-1 inhibitors / Cell cycle regulation / Cancer therapy
EP-1445249-B1 replaced API REQUEST ERROR for anti-arthritic / bone destruction inhibitor / AP-1 inhibitor
CA-2473162-A1 replaced API REQUEST ERROR for thyroid hormone receptor ligand / medicinal compositions / high affinity
WO-2022131757-A1 replaced API REQUEST ERROR for organic light emitting / novel compound / device
EP-3181619-A1 replaced API REQUEST ERROR for ion exchange / polymer film / production method
US-9730449-B2 replaced API REQUEST ERROR for pest control / thiazole compound / control activity
US-9818953-B2 replaced API REQUEST ERROR for Aromatic amine derivative / Organic el

In [11]:
# Write the patched frame out twice: once as CSV and once as a pickle.
csv_out = "../results/schembl_summs_v2_fixed_errors.csv"
pkl_out = "../results/schembl_summs_v2_fixed_errors.pkl"
df.to_csv(csv_out, index=False)
df.to_pickle(pkl_out)