In [None]:
import pandas as pd

# Read the adjuvant log CSV into a DataFrame.
file_path = "outputs/Vaxjo_PMIDs_adjuvant_log.csv"
df = pd.read_csv(file_path)

# Show the column labels next to the first rows as a quick look at the table.
column_names = list(df.columns)
column_names, df.head()


In [None]:
# Tally how many rows of the log carry each distinct 'Status' value.
status_counts = df.loc[:, 'Status'].value_counts()

# Print the tally; the total number of rows is the cell's displayed value.
print(status_counts)
df.shape[0]

In [None]:
# Phrase to look for, and the responses file to look in.
target_phrase = "No adjuvants mentioned."
file_path_txt = "outputs/Vaxjo_PMIDs_adjuvant_responses.txt"

# Read the whole responses file into memory.
with open(file_path_txt, mode="r", encoding="utf-8") as fh:
    content = fh.read()

# Number of (non-overlapping) occurrences of the phrase in the file.
frequency = content.count(target_phrase)
frequency


In [None]:
import re
import json
import pandas as pd

# Load the raw responses; each article's text follows a "=== <PMID> ===" header.
with open("outputs/Vaxjo_PMIDs_adjuvant_responses.txt", "r", encoding="utf-8") as file:
    content = file.read()

# re.split with a capturing group returns [preamble, pmid, text, pmid, text, ...];
# dropping the preamble leaves PMIDs at even offsets and their text at odd offsets.
entries = re.split(r'===\s+(\d+)\s+===', content)[1:]
pmid_ids = entries[::2]   # PMIDs
texts = entries[1::2]     # Corresponding text blocks

# Build one record per (PMID, adjuvant) pair found in the JSON-like lists.
data = []
skipped = []  # (PMID, reason) for every block/item that could not be used
for pmid_id, text in zip(pmid_ids, texts):
    # Responses carrying this phrase are treated as having nothing to extract.
    if "No adjuvants mentioned." in text:
        continue
    # Candidate JSON arrays: "[" followed by an object, up to a "}" that closes the array.
    json_blocks = re.findall(r"\[\s*{.*?}\s*]", text, re.DOTALL)
    for block in json_blocks:
        try:
            adjuvant_list = json.loads(block)
        except json.JSONDecodeError:
            # Keep going, but remember the failure instead of dropping it silently.
            skipped.append((pmid_id, "invalid JSON"))
            continue
        for item in adjuvant_list:
            if not isinstance(item, dict):
                # .get() below needs a mapping; anything else is reported, not crashed on.
                skipped.append((pmid_id, "list item is not an object"))
                continue
            data.append({
                "PMID": pmid_id,
                "adjuvant": item.get("adjuvant"),
                "immune_response_mechanism": item.get("immune_response_mechanism")
            })

# Save to CSV. The columns are pinned so the frame has the expected schema even
# when no record was extracted.
# NOTE: this rebinds `df`, which the first cell uses for the log table.
df = pd.DataFrame(data, columns=["PMID", "adjuvant", "immune_response_mechanism"])
df.to_csv("outputs/Vaxjo_PMIDs_adjuvant_extracted.csv", index=False)

print("Extraction complete. Saved to 'Vaxjo_PMIDs_adjuvant_extracted.csv'.")
if skipped:
    skipped_pmids = sorted({pmid for pmid, _ in skipped})
    print(f"Skipped {len(skipped)} unusable block(s)/item(s) from PMIDs: {skipped_pmids}")
df

In [None]:
# Number of distinct, non-missing adjuvant names in the extracted table.
unique_adjuvants = len(df["adjuvant"].dropna().unique())
print(f"Number of unique adjuvants: {unique_adjuvants}")


In [None]:
# List the unique immune response mechanisms whose text contains "not " or "none "
# (case-insensitive), to help spot placeholder answers.
mechanisms = df["immune_response_mechanism"].dropna().unique()

# The original condition read `"none " in m.lower() in m.lower()`, a chained
# comparison whose second half is always true; each marker is now tested once.
markers = ("not ", "none ")
mechanisms_with_not_none = [
    m for m in mechanisms if any(marker in m.lower() for marker in markers)
]

for mechanism in mechanisms_with_not_none:
    print(mechanism)



In [None]:
# Mechanism texts (or fragments) that mean the response gave no usable mechanism.
# They are matched below as case-insensitive substrings.
not_valid_keywords = [
    "None mentioned",
    "Not explicitly described",
    "None mentioned.",
    "Not specified",
    "None described",
    "Not explicitly described in the text",
    "Mechanism of action not yet completely known, but it is comprised of Quillaja saponins, cholesterol and phospholipid, and induces recruitment of leukocytes to draining lymph nodes (dLNs) and spleen, activation of central immune cells, and elevation of cytokines and co-stimulatory molecules.",
    "None specified",
    "Not mentioned",
    "Mechanism not explicitly described",
    "Not explicitly stated, but based on the context, it appears to stimulate a robust immune response by enhancing the production of various cytokines and antibody-secreting cells.",
    "Not specified in the text.",
    "Not explicitly described, but it is known that aluminum salts activate NLRP3 inflammasome and form a depot for slow antigen release.",
    "Not explicitly described, but it is known that saponins like QS21 can stimulate immune responses by interacting with TLR4 and TLR8.",
    "Not described",
    "Mechanism not described",
    "Not explicitly described, left blank.",
]

# The mechanism text is lowercased before matching, so the keywords must be
# lowercased too. Comparing lowercased text against the capitalised keywords
# above never matches, which left every row in place.
not_valid_keywords_lower = [keyword.lower() for keyword in not_valid_keywords]


def _has_invalid_keyword(text):
    """Return True when `text` (already lowercased) contains any invalid keyword."""
    return isinstance(text, str) and any(
        keyword in text for keyword in not_valid_keywords_lower
    )


# Drop rows with NaN or empty strings (after stripping whitespace)
df_cleaned = df[df["immune_response_mechanism"].notna()].copy()
df_cleaned = df_cleaned[df_cleaned["immune_response_mechanism"].str.strip() != ""]

# Remove rows containing any invalid keyword (case-insensitive)
is_invalid = (
    df_cleaned["immune_response_mechanism"]
    .str.lower()
    .apply(_has_invalid_keyword)
    .astype(bool)
)
df_cleaned = df_cleaned[~is_invalid].reset_index(drop=True)

# Save to CSV
df_cleaned.to_csv("outputs/Vaxjo_PMIDs_adjuvant_extracted_clean.csv", index=False)

# Preview the result
df_cleaned

In [None]:
# Number of distinct, non-missing adjuvant names left after cleaning.
unique_adjuvants = int(df_cleaned["adjuvant"].drop_duplicates().count())
print(f"Number of unique adjuvants: {unique_adjuvants}")


In [None]:
# Collect the distinct adjuvant names from the cleaned table, sort them,
# and print them one per line under a heading.
unique_adjuvant_names = df_cleaned["adjuvant"].unique().tolist()
unique_adjuvant_names.sort()
print("Unique adjuvants:")
for adjuvant_name in unique_adjuvant_names:
    print(adjuvant_name)


In [None]:
# Raw adjuvant name -> canonical name. Only exact naming variants are collapsed;
# distinct formulations and combinations are deliberately left unmapped.
# Every value must be a final canonical name (never another key of this dict),
# so the result does not depend on replacements being applied in a chain.
mapping = {
    # ---- Adjuplex / ADJ ----
    "ADJ": "Adjuplex (ADJ)",

    # ---- Alum / Aluminum hydroxide family (exact naming variants only) ----
    "Alum": "Alum (aluminum hydroxide)",
    "Al(OH)3": "Alum (aluminum hydroxide)",
    "Alhydrogel": "Alum (aluminum hydroxide)",  # commercial brand name for an aluminum hydroxide gel suspension
    "Aluminium Hydroxide (AH)": "Alum (aluminum hydroxide)",
    "Aluminium hydroxide": "Alum (aluminum hydroxide)",
    "Aluminum Hydroxide": "Alum (aluminum hydroxide)",
    "Aluminum Hydroxide (AH)": "Alum (aluminum hydroxide)",
    "Aluminum hydroxide": "Alum (aluminum hydroxide)",
    "Aluminum hydroxide (AH)": "Alum (aluminum hydroxide)",
    "Aluminum hydroxide (Alum)": "Alum (aluminum hydroxide)",
    # Aluminum oxyhydroxide is treated here as the same active form as common
    # alum formulations, so it is collapsed into the alum entry as well.
    "Aluminum oxyhydroxide": "Alum (aluminum hydroxide)",

    # NOTE: "Aluminum phosphate" is intentionally not mapped to Alum.
    # NOTE: not mapping "Nanoalum"/"nanoalum" to Alum since it's a distinct nanoformulation.
    # Also not mapping "Aluminium salts" or "aluminum salt-based adjuvants" (broader classes).

    # ---- Advax / delta inulin (brand vs generic, diacritics, ™/®) ----
    "Advax": "Advax (delta inulin)",
    "Advax delta inulin": "Advax (delta inulin)",
    "Advax® delta inulin": "Advax (delta inulin)",
    "Advax™": "Advax (delta inulin)",
    "Delta Inulin (DI)": "Advax (delta inulin)",
    "Delta inulin": "Advax (delta inulin)",
    "delta inulin": "Advax (delta inulin)",
    "delta inulin (DI)": "Advax (delta inulin)",
    "delta inulin adjuvant (Advax™)": "Advax (delta inulin)",
    "δ-inulin": "Advax (delta inulin)",
    "delta-inulin": "Advax (delta inulin)",

    # Keep specific Advax *formulations* distinct:
    # Advax-2 / -M / -P / -SM / CpG / CpG55.2 etc. are NOT collapsed.

    # ---- BECC spelling/spacing ----
    "BECC 438": "BECC438",
    "BECC 470": "BECC470",

    # ---- Calcium phosphate shorthand ----
    "CAP": "Calcium Phosphate (CAP)",

    # ---- CAF01 encoding variant ----
    "CAF\u00b01": "CAF01",

    # ---- Cholera toxin naming ----
    "Cholera toxin": "Cholera toxin (CT)",

    # ---- Compound 48/80 naming ----
    "C48/80": "Compound 48/80 (C48/80)",

    # ---- CoVaccine HT trademark ----
    "CoVaccine HT™": "CoVaccine HT",

    # ---- CpG ODN (shorthands & plurals used as the same thing here) ----
    "CpG": "CpG ODN",
    "CpG ODNs": "CpG ODN",
    "CpG oligodeoxynucleotide (CpG-ODN)": "CpG ODN",
    "CpG-ODN": "CpG ODN",

    # NOTE: "CpG motifs" is intentionally not mapped to CpG ODN.
    # Keep specific sequences distinct (not collapsed):
    # "CpG ODN 1826", "CpG M362", "ODN2006" remain separate.

    # ---- Complete Freund's adjuvant naming ----
    "Complete Freund's adjuvant": "Complete Freund's Adjuvant (CFA)",

    # ---- EM-005 aka GLA-SE ----
    "EM-005 (GLA-SE)": "GLA-SE",

    # ---- Flagellin (case only) ----
    "flagellin": "Flagellin",

    # (FliC, FlaB kept distinct—specific flagellins.)

    # ---- GLA naming variants ----
    "Glucopyranosyl Lipid Adjuvant": "GLA",
    "Glucopyranosyl Lipid Adjuvant (GLA)": "GLA",
    "Glucopyranosyl lipid A": "GLA",
    "Glucopyranosyl lipid adjuvant (GLA)": "GLA",
    "ID93/glucopyranosyl lipid adjuvant (GLA)": "GLA",
    "Glucopyranosyl lipid A (G100)": "GLA",

    # GLA-AF / GLA-SE / GLA-LSQ are formulations—kept distinct from plain GLA.
    "GLA-SE (components included: glucopyranosyl lipid A, squalene emulsion)": "GLA-SE",
    "GLA-squalene emulsion": "GLA-SE",

    # ---- IFN naming (alpha/beta symbol) ----
    "IFN-alpha": "IFN-α",
    "IFNβ": "IFN-β",

    # ---- Imiquimod alias ----
    "imiquimod (R837)": "Imiquimod (R837)",
    "Imiquimod": "Imiquimod (R837)",

    # ---- Matrix-M trademark ----
    "Matrix-M™": "Matrix-M",

    # ---- MCT trademark ----
    "MCT®": "MCT",

    # ---- MPL naming variants ----
    # "MPLA" is the canonical name and stays as is. Every variant points at it
    # directly. Previously the variants pointed at "MPL (monophosphoryl lipid A)",
    # which was itself mapped on to "MPLA", and a triple-quoted string used as a
    # "comment" inside this dict was concatenated onto the following "MPL" key.
    "MPL": "MPLA",
    "MPL®": "MPLA",
    "Monophosphoryl lipid": "MPLA",
    "Monophosphoryl lipid A (MPLA)": "MPLA",
    "MPL (monophosphoryl lipid A)": "MPLA",

    # Keep MPL-SE / MPL+TDM / MPL/DDA distinct (formulations/combinations).

    # ---- Mastoparan naming ----
    "M7": "Mastoparan-7 (M7)",
    "Mastoparan 7": "Mastoparan-7 (M7)",

    # ---- Montanide ISA 51 spacing ----
    "Montanide ISA-51": "Montanide ISA 51",
    "Montanide ISA51": "Montanide ISA 51",

    # ---- PCL abbreviation expansion ----
    "PCL/chitosan NPs": "poly-ϵ-caprolactone/chitosan NPs",

    # ---- Poly(I:C) case ----
    "poly(I:C)": "Poly(I:C)",

    # ---- Polyclonal Antibody Stimulator naming ----
    "polyclonal antibody stimulator-PAS": "Polyclonal Antibody Stimulator (PAS)",

    # ---- QS-21 dash/spacing ----
    "QS21": "QS-21",

    # ---- R848 (resiquimod) alias ----
    "R848": "Resiquimod (R848)",
    "resiquimod": "Resiquimod (R848)",

    # ---- SE (squalene emulsion) naming (keep brands separate like MF59/AS03) ----
    "Squalene-based oil-in-water emulsion system (SE)": "SE",
    "squalene oil-in-water emulsion (SE)": "SE",

    # ---- SWE (Sepivac SWE) naming ----
    "Sepivac SWE": "SWE",
    "SEPIVAC SWETM": "SWE",

    # ---- Squalene case only ----
    "squalene": "Squalene",

    # ---- TDM (trehalose-6,6'-dimycolate) spelling variants ----
    "6,6'-trehalose dimycolate (TDM)": "Trehalose-6,6'-dimycolate (TDM)",

    # ---- α-GalCer naming variants ----
    "α-Galactosylceramide (α-GC)": "α-GalCer",
    "α-Galactosylceramide (αGalCer)": "α-GalCer",
}

# Guard against re-introducing two-step mappings: a canonical name that is also a
# key would leave two different "canonical" spellings in the output.
chained_targets = sorted(set(mapping.values()) & set(mapping))
assert not chained_targets, f"canonical names that are also mapping keys: {chained_targets}"

# Names without an entry in `mapping` are kept unchanged.
df_cleaned["adjuvant_canonical"] = df_cleaned["adjuvant"].replace(mapping)
df_cleaned

In [None]:
# Compare the number of distinct adjuvant names before and after canonicalisation.
n_raw_names = df_cleaned["adjuvant"].nunique()
n_canonical_names = df_cleaned["adjuvant_canonical"].nunique()
print("Before:", n_raw_names)
print("After :", n_canonical_names)


In [None]:
#set(df_cleaned["adjuvant"]) - set(df_cleaned["adjuvant_canonical"])

In [None]:
# Distinct canonical adjuvant names in sorted order, printed one per line.
unique_adjuvant_names_canonical = sorted(
    df_cleaned["adjuvant_canonical"].drop_duplicates()
)
print("Unique adjuvants_canonical:")
for canonical_name in unique_adjuvant_names_canonical:
    print(canonical_name)


In [None]:
# Display the column labels currently present on df_cleaned.
df_cleaned.columns

In [None]:
def _join_unique(values):
    """Join the distinct values in sorted order, separated by ' | '."""
    return " | ".join(sorted(set(values)))


# Tag every mechanism text with its source PMID in square brackets.
mechanisms_tagged = df_cleaned.assign(
    immune_response_mechanism=lambda frame: (
        frame["immune_response_mechanism"] + " [" + frame["PMID"].astype(str) + "]"
    )
)

# Collapse to one row per canonical adjuvant, holding all of its tagged mechanisms.
mechanisms_per_adjuvant = (
    mechanisms_tagged
    .groupby("adjuvant_canonical")["immune_response_mechanism"]
    .agg(_join_unique)
    .reset_index()
)

# Order rows by canonical name, ignoring case.
df_grouped = mechanisms_per_adjuvant.sort_values(
    by="adjuvant_canonical", key=lambda s: s.str.lower()
)

df_grouped.to_csv("outputs/Vaxjo_PMIDs_adjuvant_mechanism_collapsed.csv", index=False)
df_grouped
