In [1]:
import pandas as pd
from ast import literal_eval
import re

In [53]:
common_words = {
    "of",
    "and",
    "in",
    "to",
    "from",
    "the",
    "or",
    "for",
    "on",
    "with",
    "are",
    "n/a",
    "t",
    "t1",
    "t2",
    "t3",
    "thereof",
    "patent",
    "as"
    "&",
}

In [54]:
df = pd.read_csv("CheF_100K_final.csv")
df["summarizations"] = df["summarizations"].apply(literal_eval)

In [56]:
# remove entries in set if they contain more than 8 words
df["summarizations"] = df["summarizations"].apply(lambda x: set([y for y in x if len(y.split()) <= 8]))

# remove colons and semicolons
df["summarizations"] = df["summarizations"].apply(lambda x: set([y.replace(":", "").replace(";", "") for y in x]))

# convert to lowercase
df["summarizations"] = df["summarizations"].apply(lambda x: set([y.lower() for y in x]))

# # remove entries of set if they are substrings of other entries in the same set
# df["summarizations"] = df["summarizations"].apply(lambda x: set([y for y in x if not any([y in z for z in x if y != z])]))

# if label contains format "1. text 2. text 3. text", split into individual entries. Use re
df["summarizations"] = df["summarizations"].apply(lambda x: set([y for y in x for y in re.split(r"\d+\.\s", y) if y != ""]))

# if label contains format "1) text 2) text 3) text", split into individual entries. Use re
df["summarizations"] = df["summarizations"].apply(lambda x: set([y for y in x for y in re.split(r"\d+\)\s", y) if y != ""]))

# remove leading text of the format "1. ", "2. ", etc.
df["summarizations"] = df["summarizations"].apply(lambda x: set([re.sub(r"^\d+\.\s", "", y) for y in x]))

# remove leading text of the format "1) ", "2) ", etc.
df["summarizations"] = df["summarizations"].apply(lambda x: set([re.sub(r"^\d+\)\s", "", y) for y in x]))

# split entries with escape characters on the escape characters
df['summarizations'] = df['summarizations'].apply(lambda x: set([y for y in x for y in re.split(r"[\n\t\r]", y) if y != ""]))

# replace all escape characters with empty string
df["summarizations"] = df["summarizations"].apply(lambda x: set([re.sub(r"[\n\t\r]", "", y) for y in x]))


# NOTE SPLITS
# split summarizations into individual words separated by spaces, creating a set of words, not characters
df["summarizations"] = df["summarizations"].apply(lambda x: set([y for y in x for y in y.split()]))

# split on commas
df["summarizations"] = df["summarizations"].apply(lambda x: set([y for y in x for y in y.split(",") if y != ""]))

# split summarizations by "/" unless the character following the "/" is a number
df["summarizations"] = df["summarizations"].apply(lambda x: set([y for y in x for y in re.split(r"\/(?!\d)", y)]))



# NOTE String structure dependent cleaning
# remove leading text of the form "- "
df["summarizations"] = df["summarizations"].apply(lambda x: set([re.sub(r"^- ", "", y) for y in x]))

# remove leading punctuation
df["summarizations"] = df["summarizations"].apply(lambda x: set([y.lstrip(".,;-") for y in x]))

# remove trailing punctuation
df["summarizations"] = df["summarizations"].apply(lambda x: set([y.rstrip(".,;-") for y in x]))

# if entries in a set begin and end with parentheses, remove the parentheses
df["summarizations"] = df["summarizations"].apply(lambda x: set([y for y in x if not (y.startswith("(") and y.endswith(")"))]))

# remove empty strings
df["summarizations"] = df["summarizations"].apply(lambda x: set([y for y in x if y != ""]))

# remove common words
df["summarizations"] = df["summarizations"].apply(lambda x: set([word for word in x if word not in common_words]))

# remove rows with empty sets
print(len(df))
df = df[df["summarizations"].apply(len) > 0]
print(len(df))

100000
99454


In [57]:
from collections import Counter

# create Counter of terms
all_terms = Counter()
for i in df["summarizations"]:
    all_terms.update(i)


In [58]:
# create dictionary of all singular plural pairs
plural_to_singular = {}
for term in all_terms:
    if term + "s" in all_terms:
        plural_to_singular.update({term + "s": term})

# apply plural_to_singular dictionary to all terms in df["summarizations"]
df["summarizations"] = df["summarizations"].apply(lambda x: [plural_to_singular.get(term, term) for term in x])

# redo Counter of terms
all_terms = Counter()
for i in df["summarizations"]:
    all_terms.update(i)


In [59]:
# print each label in label_counts sorted by count
for label, count in sorted(all_terms.items(), key = lambda x: x[1], reverse = True):
    print(f"{label}: {count}")

inhibitor: 42131
treatment: 32482
disease: 17673
compound: 15334
derivative: 13821
cancer: 11427
receptor: 10294
disorder: 9813
modulator: 9362
antagonist: 8521
agent: 8105
kinase: 7524
therapeutic: 7400
composition: 6517
pharmaceutical: 5950
organic: 5089
agonist: 5027
protein: 4238
inflammatory: 4232
device: 3639
activity: 3538
acid: 3421
antiviral: 3116
cell: 2993
pain: 2833
diabetes: 2799
inhibition: 2778
therapy: 2754
anti-inflammatory: 2625
high: 2501
heterocyclic: 2495
prevention: 2370
antibacterial: 2298
medicament: 2243
material: 2152
cardiovascular: 2144
drug: 2079
inhibitory: 2018
synthesis: 1963
autoimmune: 1926
treating: 1882
control: 1825
inflammation: 1824
obesity: 1773
efficiency: 1756
electroluminescent: 1726
pharmacological: 1706
neurodegenerative: 1704
inhibiting: 1689
novel: 1676
metabolic: 1663
channel: 1655
protease: 1649
light-emitting: 1630
selective: 1628
ligand: 1548
alzheimer's: 1545
light: 1521
infection: 1500
neurological: 1448
substituted: 1446
modulation:

In [60]:
df[["smiles", "cid", "patent_ids", "summarization_sources", "summarizations"]].to_csv("CheF_100K_prefinal_v3_alg_cleaned_labels.csv", index=False)
df[["smiles", "cid", "patent_ids", "summarization_sources", "summarizations"]].to_pickle("CheF_100K_prefinal_v3_alg_cleaned_labels.pkl")

# save set of all terms to file
with open("CheF_100K_prefinal_v3_alg_cleaned_labels_vocab.txt", "w") as f:
    f.write("\n".join(all_terms))