**CARD (Comprehensive Antibiotic Resistance Database) Program** is a bioinformatics platform designed for the identification, analysis, and interpretation of antibiotic resistance genes (ARGs) from DNA sequences.

**CARD is used for:**

- Identifying resistance genes in genomic, metagenomic, or read data (e.g., FASTA/FASTQ).

- Annotating antibiotic resistance mechanisms.

- Classifying identified genes by type, mechanism, and the antibiotics to which they confer resistance.

**Database:**

Contains meticulously curated information about antibiotic resistance genes, mutation variants, mechanisms of action, and their phenotypic consequences.

Each gene is associated with:

- Gene name

- Resistance mechanism

- Antibiotics to which it confers resistance

- Scientific references

**Analysis Tool — RGI (Resistance Gene Identifier):**

- Used to search for resistance genes in your data.

- Compares your sequences against the CARD database using alignment algorithms (e.g., BLAST, DIAMOND).

- Outputs a list of predicted ARGs with a confidence level (*perfect, strict, loose*).

**Metadata and Mechanisms:**

- Indicates the mechanism of action, such as β-lactamase, efflux, target alteration, etc.

- Helps interpret which antibiotics are potentially ineffective against bacteria carrying a specific gene.

In [None]:
# @title STEP 1: Frequency of antibiotic mentions in reports

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from glob import glob
import re

# Folder holding the tab-separated CARD/RGI report files (*.txt)
data_folder = "/content/card"  # <-- Specify the path to your folder
all_files = glob(os.path.join(data_folder, "*.txt"))

# One two-column (Strain, Antibiotic) frame per report that could be read
df_list = []

for file in all_files:
    try:
        df = pd.read_csv(file, sep="\t", engine='python')

        # Strain label = the "WHO_<id>" token of the filename (e.g. "WHO_A");
        # fall back to the full filename when no such token is present
        filename = os.path.basename(file)
        match = re.search(r'WHO_[A-Za-z0-9]+', filename)
        strain_name = match.group(0) if match else filename

        df['Strain'] = strain_name
        df_list.append(df[['Strain', 'Antibiotic']])
    except Exception as e:
        # Best effort: a report that cannot be parsed, or that has no
        # 'Antibiotic' column, is reported and skipped
        print(f"Error processing {file}: {e}")

# Combine all data. Start from an empty frame when nothing was read so that a
# missing/empty folder yields a clear message instead of a pd.concat ValueError
if df_list:
    all_data = pd.concat(df_list, ignore_index=True)
else:
    all_data = pd.DataFrame(columns=['Strain', 'Antibiotic'])

# Process the 'Antibiotic' column with multiple values.
# Split on a bare ';' and trim each piece (same rule as STEP 2), so that
# "a;b", "a; b" and "a ;b" all yield the two antibiotics "a" and "b"
all_data = all_data.dropna(subset=['Antibiotic'])
all_data['Antibiotic'] = all_data['Antibiotic'].astype(str).str.split(';')
all_data = all_data.explode('Antibiotic').reset_index(drop=True)
all_data['Antibiotic'] = all_data['Antibiotic'].str.strip()
all_data = all_data[all_data['Antibiotic'] != '']  # drop pieces left by stray ';'

if all_data.empty:
    print(f"No antibiotic annotations found in {data_folder}; nothing to plot.")
else:
    # Count the mentions of every antibiotic per strain
    pivot_table = all_data.pivot_table(index='Antibiotic', columns='Strain', aggfunc='size', fill_value=0)

    pivot_table = pivot_table.T  # Transpose: antibiotics on X-axis, strains on Y-axis

    plt.figure(figsize=(16, 8))
    sns.heatmap(
        pivot_table,
        cmap='YlGnBu',
        linewidths=0.5,
        linecolor='gray',
        cbar_kws={'label': 'Presence of Resistance'},
        yticklabels=True,
        xticklabels=True,
        annot=True,
        fmt=".0f",  # whole numbers; the default format prints 100 as "1e+02"
    )

    # Y-axis settings: right side and horizontal labels
    ax = plt.gca()
    ax.yaxis.tick_right()
    ax.yaxis.set_label_position("right")
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, ha='left')  # Horizontal labels

    plt.title('Antibiotic Mentions in CARD Reports')
    plt.ylabel('Strains')
    plt.xlabel('Antibiotics')
    plt.tight_layout()
    plt.show()

In [None]:
# @title STEP 2: Frequency of resistance mechanisms

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from collections import defaultdict

# Folder that holds the tab-separated report files
data_dir = "/content/card"

# Tally of how often each (antibiotic, mechanism) combination occurs
pair_counts = defaultdict(int)

# Walk over every .txt report in the folder
for report_name in os.listdir(data_dir):
    if not report_name.endswith(".txt"):
        continue
    try:
        report = pd.read_csv(os.path.join(data_dir, report_name), sep="\t", dtype=str)

        for _, record in report.iterrows():
            drugs = record.get("Antibiotic", "")
            mechanism_name = record.get("Resistance Mechanism", "")

            # Rows lacking either annotation contribute nothing
            if pd.isna(drugs) or pd.isna(mechanism_name):
                continue

            # A single cell may name several antibiotics separated by ';'
            for drug in (piece.strip() for piece in drugs.split(";")):
                if drug:
                    pair_counts[(drug, mechanism_name.strip())] += 1

    except Exception as err:
        # Report the failing file and carry on with the remaining ones
        print(f"Error processing file {report_name}: {err}")

# Turn the tally into a long table: Antibiotic | Mechanism | Count
pair_df = pd.DataFrame.from_dict(pair_counts, orient="index", columns=["Count"])
pair_df.index = pd.MultiIndex.from_tuples(
    pair_df.index, names=["Antibiotic", "Mechanism"]
)
pair_df = pair_df.reset_index()

# Wide layout for the heatmap: mechanisms as rows, antibiotics as columns;
# combinations that never occurred are shown as 0
pivot = pair_df.pivot(
    index="Mechanism", columns="Antibiotic", values="Count"
).fillna(0)

# Draw the heatmap
fig, ax = plt.subplots(figsize=(14, 5))
sns.heatmap(
    pivot,
    annot=True,
    fmt=".0f",
    cmap="YlGn",
    cbar_kws={"label": "Frequency"},
    ax=ax,
)
ax.set_title("Frequency of Resistance Mechanisms by Antibiotic")
ax.set_xlabel("Antibiotic")
ax.set_ylabel("Resistance Mechanism")
plt.xticks(rotation=45, ha="right")
fig.tight_layout()
plt.show()


| Mechanism                               | Explanation                                                                                                                                                                                                                         |
|-----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **antibiotic efflux** | The bacterium uses special proteins (efflux pumps) to actively expel the antibiotic before it can take effect. This is a universal and common mechanism.                                                   |
| **antibiotic target alteration** | The bacterium modifies the structure of the target (e.g., ribosomes or enzymes) to which the antibiotic was supposed to bind. As a result, the antibiotic loses its effectiveness.                 |
| **antibiotic target protection** | The bacterium produces proteins that bind to the antibiotic's target, preventing the antibiotic itself from interacting effectively with it.                                                       |
| **reduced permeability to antibiotic** | The bacterium alters the structure of its cell wall or membrane pores, making it more difficult for the antibiotic to penetrate inside.                                                                 |
| **antibiotic inactivation** | The bacterium synthesizes enzymes that destroy or modify the antibiotic, rendering it ineffective. Examples include β-lactamases, acetyltransferases, and phosphotransferases.                         |


In [None]:
# @title STEP 3: Searching for ARGs

import pandas as pd
import glob

# Every tab-separated report in the CARD folder
files = glob.glob("/content/card/*.txt")

# Load each report and tag its rows with the path it was read from
all_data = [
    pd.read_csv(report_path, sep='\t').assign(Source_File=report_path)
    for report_path in files
]

# Stack the individual reports into one table
combined_df = pd.concat(all_data, ignore_index=True)

# Write the combined table out both as CSV and as an Excel workbook
combined_df.to_csv("combined_data.csv", index=False)
combined_df.to_excel("combined_data.xlsx", index=False)


In [None]:
# @title STEP 4: Unique genes

import pandas as pd
import glob

# Every tab-separated report in the CARD folder
all_files = glob.glob("/content/card/*.txt")

# Distinct 'Best_Hit_ARO' values seen across all reports
all_genes = set()

for file in all_files:
    try:
        df = pd.read_csv(file, sep="\t", engine='python')
        # Drop missing cells first: pandas represents them as float NaN, and a
        # set that mixes NaN with strings cannot be ordered by sorted() below
        genes = df["Best_Hit_ARO"].dropna().unique()
        all_genes.update(genes)
    except Exception as e:
        # A report that cannot be parsed or lacks the column is reported and skipped
        print(f"Error in file {file}: {e}")

# Output all unique genes
for gene in sorted(all_genes):
    print(gene)


In [None]:
# @title STEP 5: Checking for alternative gene names and getting gene counts

import pandas as pd
import os
import glob
import re

# Path to the CARD files folder (relative to the current working directory)
# NOTE(review): STEPs 1-4 read from "/content/card" instead - confirm both refer to the same data
folder_path = "card"

# Collect all report files: both *.tsv and *.txt in that folder
all_files = glob.glob(os.path.join(folder_path, "*.tsv")) + glob.glob(os.path.join(folder_path, "*.txt"))

# Gene mapping: lower-case substring -> standardized gene name.
# extract_gene_name() lower-cases a 'Best_Hit_ARO' value and tests these keys
# as substrings, longest key first, so a longer key wins over a shorter one
# contained in it (e.g. "tem-135" over "tem-1").
gene_mapping = {
    # Existing mappings
    "pbp2": "penA",
    "pbp1": "ponA",
    "gyra": "gyrA",
    "parc": "parC",
    "mtrc": "mtrC",
    "mtrr": "mtrR",
    "mtra": "mtrA",
    "por": "porB",
    "porin pib": "porB",
    "rpsj": "rpsJ",
    "tem-1": "TEM-1",
    "tem-135": "TEM-135",
    "tet(m)": "tet(M)",
    "macb": "macB",
    "maca": "macA",
    "16s rrna": "16S rRNA",

    # Added synonyms and spelling variations

    # Neisseria gonorrhoeae 16S rRNA mutation conferring resistance to spectinomycin
    "neisseria gonorrhoeae 16s rrna": "16S rRNA", # More general match
    "16s ribosomal rna": "16S rRNA",
    "16s_rrna": "16S rRNA",

    # Neisseria gonorrhoeae PBP1 conferring resistance to beta-lactam antibiotics
    "neisseria gonorrhoeae pbp1": "ponA",
    "neisseria gonorrhoeae penicillin-binding protein 1": "ponA",
    "penicillin-binding protein 1": "ponA",
    "pona": "ponA", # Ensure 'pona' is also mapped

    # Neisseria gonorrhoeae PBP2 conferring resistance to beta-lactam antibiotics
    "neisseria gonorrhoeae pbp2": "penA",
    "neisseria gonorrhoeae penicillin-binding protein 2": "penA",
    "penicillin-binding protein 2": "penA",
    "pena": "penA", # Ensure 'pena' is also mapped

    # Neisseria gonorrhoeae gyrA with mutations conferring resistance to fluoroquinolones
    "neisseria gonorrhoeae gyra": "gyrA",
    "gyrase subunit a": "gyrA",
    "dna gyrase subunit a": "gyrA",

    # Neisseria gonorrhoeae mtrC with mutation conferring resistance to azithromycin
    "neisseria gonorrhoeae mtrc": "mtrC",
    "mtrc efflux pump": "mtrC",

    # Neisseria gonorrhoeae mtrR with mutation conferring resistance
    "neisseria gonorrhoeae mtrr": "mtrR",
    "mtrr transcriptional regulator": "mtrR",

    # Neisseria gonorrhoeae parC conferring resistance to fluoroquinolones
    "neisseria gonorrhoeae parc": "parC",
    "topoisomerase iv subunit a": "parC", # Alternative name
    "parc topoisomerase": "parC",

    # Neisseria gonorrhoeae porin PIB (por)
    "neisseria gonorrhoeae porin pib": "porB",
    "neisseria gonorrhoeae por": "porB",
    "porin b": "porB",
    "porb": "porB", # Ensure 'porb' is also mapped

    # TEM-1, TEM-135
    "beta-lactamase tem-1": "TEM-1",
    "tem 1": "TEM-1",
    "beta-lactamase tem-135": "TEM-135",
    "tem 135": "TEM-135",

    # macA, macB
    "maca efflux protein": "macA",
    "macrolide efflux protein a": "macA",
    "macb efflux protein": "macB",
    "macrolide efflux protein b": "macB",

    # mtrA, mtrR
    "mtra efflux regulator": "mtrA",
    "mtrr efflux regulator": "mtrR",

    # rpsJ
    "ribosomal protein sj": "rpsJ",
    "30s ribosomal protein sj": "rpsJ",

    # tet(M)
    "tetm": "tet(M)",
    "tetracycline resistance protein tetm": "tet(M)",
    "tetracycline resistance gene tet(m)": "tet(M)",
}

# Improved gene name extraction function
def extract_gene_name(entry):
    """Map a raw ``Best_Hit_ARO`` value to a standardized gene name.

    The value is lower-cased and searched for the keys of ``gene_mapping``
    (plain substring test), longest key first so that a specific key such as
    "tem-135" wins over a shorter key contained in it ("tem-1").

    Args:
        entry: Cell value from the ``Best_Hit_ARO`` column. Normally a string;
            missing cells arrive from pandas as float NaN.

    Returns:
        The standardized gene name, or a string starting with
        ``"UNKNOWN_GENE: "`` when nothing matches, so unmapped names can be
        found and reviewed manually.
    """
    # A missing / non-text cell has no .lower(); flag it instead of raising,
    # because the caller's broad ``except`` would otherwise drop the whole file
    if not isinstance(entry, str):
        return f"UNKNOWN_GENE: {entry}"

    entry_lower = entry.lower()

    # 1. Known aliases, most specific (longest) key first
    for key in sorted(gene_mapping, key=len, reverse=True):
        if key in entry_lower:
            return gene_mapping[key]

    # 2. Safety net in case an alias is removed from gene_mapping: look for the
    #    bare canonical names, again longest first (TEM-135 before TEM-1)
    known_ng_genes = [
        "penA", "ponA", "gyrA", "parC", "mtrC", "mtrR", "mtrA", "porB",
        "16S rRNA", "TEM-1", "TEM-135", "macA", "macB", "rpsJ", "tet(M)"
    ]
    for gene_core in sorted(known_ng_genes, key=len, reverse=True):
        if gene_core.lower() in entry_lower:
            return gene_core  # Return the standardized gene name

    # 3. Nothing matched: return a truncated copy of the original text with a
    #    prefix that makes unmapped entries easy to identify
    return f"UNKNOWN_GENE: {entry[:50].strip()}"

# Collect one (Strain, Cleaned_Gene) table per report
all_genes = []

for report_path in all_files:
    try:
        report = pd.read_csv(report_path, sep="\t", engine="python")

        if "Best_Hit_ARO" in report.columns:
            # Standardize the raw ARO hit names
            report["Cleaned_Gene"] = report["Best_Hit_ARO"].apply(extract_gene_name)

            # Strain label: "WHO_<id>" taken from the filename, else the filename itself
            base_name = os.path.basename(report_path)
            hit = re.search(r"WHO_[A-Za-z0-9]+", base_name)
            report["Strain"] = hit.group(0) if hit else base_name

            all_genes.append(report[["Strain", "Cleaned_Gene"]])
        else:
            print(f"Skipping file (missing 'Best_Hit_ARO' column): {report_path}")

    except Exception as err:
        print(f"Error in file {report_path}: {err}")

# Stack the per-report tables
all_data = pd.concat(all_genes)

# Summary: how many rows map to each standardized gene
gene_summary = (
    all_data["Cleaned_Gene"]
    .value_counts()
    .reset_index()
    .set_axis(["Gene", "Total_Count"], axis=1)
)
print(gene_summary)


In [None]:
# @title STEP 6: Visualization

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# One record per gene: (gene, resistance mechanism, number of strains)
gene_records = [
    ("mtrA", "Component of MtrCDE system (expression activation)", 29),
    ("penA", "Altered PBP2 (cephalosporin resistance)", 25),
    ("rpsJ", "Mutation in rpsJ (tetracycline target)", 25),
    ("mtrC", "MtrCDE efflux system (transporter)", 24),
    ("ponA", "Altered PBP1 (penicillin resistance)", 22),
    ("porB", "Porin (reduced permeability)", 21),
    ("gyrA", "Mutations in gyrA (fluoroquinolone resistance)", 17),
    ("mtrR", "Repressor of MtrCDE system", 11),
    ("parC", "Mutations in parC (fluoroquinolone resistance)", 10),
    ("TEM-1", "Class A beta-lactamase (TEM-1)", 8),
    ("tet(M)", "Ribosome protection protein (TetM)", 4),
    ("macB", "Efflux mechanism (macAB transporter)", 2),
    ("16S rRNA", "16S rRNA mutation (spectinomycin resistance)", 1),
    ("macA", "Efflux mechanism (macAB transporter)", 1),
]

# Column-wise view of the same records
data = {
    "Gene": [gene for gene, _, _ in gene_records],
    "Mechanism": [mechanism for _, mechanism, _ in gene_records],
    "Strain_Count": [count for _, _, count in gene_records],
}

df = pd.DataFrame(data)

# Horizontal bar chart: one bar per gene, coloured by mechanism
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=df,
    x="Strain_Count",
    y="Gene",
    hue="Mechanism",
    dodge=False,
    palette="tab20",
    ax=ax,
)

ax.set_xlabel("Number of Strains")
ax.set_ylabel("Gene")
ax.set_title("Distribution of Genes by Resistance Mechanisms")
# Keep the (long) legend outside the plotting area
ax.legend(title="Resistance Mechanism", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()


**Details:**

| Gene                   | Description                  | Resistance Mechanism                                                                     |
| ---------------------- | ---------------------------- | ---------------------------------------------------------------------------------------- |
| **penA** | Penicillin-binding protein 2 (PBP2) | Mutations in **penA** (PBP2) reduce the affinity for β-lactams, leading to cephalosporin resistance. |
| **ponA** | PBP1                         | Mutations reduce sensitivity to penicillin.                                              |
| **mtrA / mtrC / mtrR** | Components of **MtrCDE** system | Efflux of antibiotics: pumps out macrolides, β-lactams, and fluoroquinolones.          |
| **porB** | Porin PIB                    | Alterations lead to reduced permeability of the outer membrane.                         |
| **gyrA / parC** | Topoisomerases               | Mutations lead to resistance to fluoroquinolones.                                        |
| **rpsJ** | Ribosomal protein            | Mutations lead to resistance to tetracycline.                                            |
| **TEM-1** | β-lactamase                  | Degrades penicillins.                                                                    |
| **tet(M)** | Ribosome protection protein  | Protects the ribosome from the action of tetracycline.                                   |
| **macA / macB** | Efflux system                | Pumps out macrolides.                                               |
| **16S rRNA** | Ribosomal RNA                | Mutations lead to resistance to spectinomycin.                                           |


In [None]:
# @title STEP 7: Gene presence per strain and counts

import os
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from glob import glob

# 1. Define the genes of interest
# Lower-case substring looked for in a 'Best_Hit_ARO' value -> short gene label
target_genes = {
    "neisseria gonorrhoeae 16s rrna": "16S rRNA",
    "pbp1": "PBP1",
    "pbp2": "PBP2",
    "gyra": "gyrA",
    "mtrc": "mtrC",
    "mtrr": "mtrR",
    "parc": "parC",
    "por": "porB",      # refined name
    "tem-1": "TEM-1",
    "tem-135": "TEM-135",
    "maca": "macA",
    "macb": "macB",
    "mtra": "mtrA",
    "rpsj": "rpsJ",
    "tet(m)": "tet(M)",
    "pena": "penA",      # add penA (PBP2)
    "pona": "ponA"       # add ponA (PBP1)
}

# 2. Function to simplify the gene name
def simplify_gene(name):
    """Return the short label of the first target gene found in *name*.

    *name* is lower-cased and tested against the keys of ``target_genes``
    with a plain substring test. Keys are tried longest first, so that
    "tem-135" is recognised before its prefix "tem-1" (in dictionary order
    every TEM-135 hit was reported as TEM-1).

    Args:
        name: Value of a ``Best_Hit_ARO`` cell; anything that is not a string
            (for example the float NaN pandas uses for a missing cell) is
            treated as "no gene of interest".

    Returns:
        The label from ``target_genes``, or ``None`` when no key matches.
    """
    if not isinstance(name, str):
        return None
    name_lower = name.lower()
    # sorted() is stable: keys of equal length keep their dictionary order
    for key in sorted(target_genes, key=len, reverse=True):
        if key in name_lower:
            return target_genes[key]
    return None

# 3. Read all files from the folder
folder_path = "card"    # <- path to your folder
all_files = glob(os.path.join(folder_path, "*.txt"))

# One (Strain, gene_cleaned) frame per report, restricted to genes of interest
df_list = []

for file in all_files:
    try:
        df = pd.read_csv(file, sep="\t", engine='python')
        # Strain label: "WHO_<id>" from the filename, else the filename itself
        filename = os.path.basename(file)
        match = re.search(r'WHO_[A-Za-z0-9]+', filename)
        strain = match.group(0) if match else filename
        df["Strain"] = strain
        # simplify_gene returns None for hits that are not genes of interest;
        # those rows are dropped on the next line
        df["gene_cleaned"] = df["Best_Hit_ARO"].apply(simplify_gene)
        df_filtered = df.dropna(subset=["gene_cleaned"])
        df_list.append(df_filtered[["Strain", "gene_cleaned"]])
    except Exception as e:
        print(f"Error processing {file}: {e}")

# 4. Combine and create a pivot table
all_data = pd.concat(df_list, ignore_index=True)

# Group by strain and gene, then unstack to get columns for each gene
gene_presence = all_data.groupby(["Strain", "gene_cleaned"]).size().unstack(fill_value=0)

# Convert to a binary (0/1) table - we are only interested in gene presence.
# A vectorised comparison replaces DataFrame.applymap, which is deprecated in
# recent pandas releases
gene_presence_binary = (gene_presence > 0).astype(int)

# 5. Visualize with a heatmap
plt.figure(figsize=(10,6)) # Increase size for better readability
sns.heatmap(
    gene_presence_binary,
    cmap=["#ffffff", "#2a9d8f"],
    cbar=False,
    linewidths=0.5,
    linecolor='gray',
    # Pin the colour scale to 0..1: without it a matrix that contains only
    # ones has no value range and "present" cannot be told from "absent"
    vmin=0,
    vmax=1,
)

plt.title("Presence of Resistance Genes by Strain")
plt.xlabel("Genes")
plt.ylabel("Strains")
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# 6. Plot a bar chart of gene counts for each strain
# Calculate the sum of binary values for each strain (number of unique genes)
gene_counts_per_strain = gene_presence_binary.sum(axis=1)

# Sort strains by the number of genes in descending order
gene_counts_per_strain_sorted = gene_counts_per_strain.sort_values(ascending=False)

plt.figure(figsize=(8, 6))
sns.barplot(x=gene_counts_per_strain_sorted.values, y=gene_counts_per_strain_sorted.index, palette="viridis")

plt.title("Number of Resistance Genes per Strain (Descending)")
plt.xlabel("Number of Genes")
plt.ylabel("Strain")
plt.xticks(rotation=0) # No need to rotate X-axis labels now
plt.tight_layout()
plt.show()

In [None]:
# @title STEP 8: Antibiotic classes

import pandas as pd
import os
from glob import glob

# Location of the report files
folder_path = "card"  # Change path if necessary
all_files = glob(os.path.join(folder_path, "*.txt"))

# Candidate column names, in order of preference; only the first one that a
# report actually contains is read
antibiotic_columns = ["Drug Class", "Drug_Class", "Drug_Class_Name", "Antibiotic", "Resistance Mechanism", "AMR Gene Family"]

unique_antibiotics = set()

for report_path in all_files:
    try:
        report = pd.read_csv(report_path, sep="\t", engine='python')
        # First preferred column present in this report, if any
        chosen = next(
            (name for name in antibiotic_columns if name in report.columns),
            None,
        )
        if chosen is not None:
            unique_antibiotics.update(report[chosen].dropna().unique())
    except Exception as err:
        print(f"Error processing {report_path}: {err}")

# List every distinct value, alphabetically
print("Unique classes/antibiotics found in the data:")
for value in sorted(unique_antibiotics):
    print(value)


In [None]:
# @title STEP 9: Unique resistance mechanisms

import pandas as pd
import os
from glob import glob

folder_path = "card"  # Path to the files
all_files = glob(os.path.join(folder_path, "*.txt"))

# Distinct entries of the 'AMR Gene Family' column across all reports
amr_gene_family_values = set()

for report_path in all_files:
    try:
        report = pd.read_csv(report_path, sep="\t", engine='python')
        # Reports without the column contribute nothing
        if "AMR Gene Family" not in report.columns:
            continue
        families = report["AMR Gene Family"].dropna().unique()
        amr_gene_family_values.update(families)
    except Exception as err:
        print(f"Error processing {report_path}: {err}")

print("Unique values in the 'AMR Gene Family' column:")
for family in sorted(amr_gene_family_values):
    print(family)


In [None]:
# @title STEP 10: Creating a file with proteins

import pandas as pd
import os
from glob import glob

folder_path = "card"  # Path to the files
all_files = glob(os.path.join(folder_path, "*.txt"))

# Per-report tables holding the two protein sequence columns
protein_data = []
sequence_columns = ["Predicted_Protein", "CARD_Protein_Sequence"]

for report_path in all_files:
    try:
        report = pd.read_csv(report_path, sep="\t", engine='python')
        # Only reports that provide both sequence columns are used
        if not set(sequence_columns).issubset(report.columns):
            continue
        # Keep rows where both sequences are present
        protein_data.append(report[sequence_columns].dropna())
    except Exception as err:
        print(f"Error processing {report_path}: {err}")

# Merge everything and write it to disk
if not protein_data:
    print("No data found in the specified columns.")
else:
    all_proteins_df = pd.concat(protein_data, ignore_index=True)
    all_proteins_df.to_csv("extracted_proteins.csv", index=False)
    print("Data successfully saved to 'extracted_proteins.csv'")


In [None]:
# @title STEP 11: Creating a file of proteins with strains

import pandas as pd
import os
import re
from glob import glob

folder_path = "card"  # Path to the files
all_files = glob(os.path.join(folder_path, "*.txt"))

# Per-report tables: Strain + the two protein sequence columns
df_list = []
sequence_columns = ["Predicted_Protein", "CARD_Protein_Sequence"]

for report_path in all_files:
    try:
        report = pd.read_csv(report_path, sep="\t", engine='python')

        # Strain label: "WHO_<id>" from the filename (e.g. "WHO_A"),
        # otherwise the complete filename
        base_name = os.path.basename(report_path)
        hit = re.search(r'WHO_[A-Za-z0-9]+', base_name)
        strain_name = hit.group(0) if hit else base_name

        # Reports lacking either sequence column are ignored
        if set(sequence_columns).issubset(report.columns):
            tagged = report.assign(Strain=strain_name)
            df_list.append(tagged[['Strain'] + sequence_columns].dropna())
    except Exception as err:
        print(f"Error processing {report_path}: {err}")

if not df_list:
    print("No data found in the specified columns.")
else:
    all_proteins_df = pd.concat(df_list, ignore_index=True)
    all_proteins_df.to_csv("proteins_with_strain.csv", index=False)
    print("Data successfully saved to 'proteins_with_strain.csv'")


In [None]:
# @title STEP 12: Creating a file of predicted genes with strains

import pandas as pd
import os
import re
from glob import glob

folder_path = "card"  # Path to the files
all_files = glob(os.path.join(folder_path, "*.txt"))

# Per-report tables: ORF_ID, ARO, Predicted_DNA and the strain label
df_list = []
required_cols = ["ORF_ID", "ARO", "Predicted_DNA"]

for report_path in all_files:
    try:
        report = pd.read_csv(report_path, sep="\t", engine='python')

        # Strain label: "WHO_<id>" from the filename (e.g. "WHO_A"),
        # otherwise the complete filename
        base_name = os.path.basename(report_path)
        hit = re.search(r'WHO_[A-Za-z0-9]+', base_name)
        strain_name = hit.group(0) if hit else base_name

        # Reports lacking any of the required columns are ignored
        if set(required_cols).issubset(report.columns):
            tagged = report.assign(Strain=strain_name)
            # Rows without a predicted DNA sequence are not exported
            df_list.append(
                tagged[required_cols + ['Strain']].dropna(subset=["Predicted_DNA"])
            )
    except Exception as err:
        print(f"Error processing {report_path}: {err}")

if not df_list:
    print("No data found in the specified columns.")
else:
    all_genes_df = pd.concat(df_list, ignore_index=True)
    all_genes_df.to_csv("predicted_genes_with_strain.csv", index=False)
    print("Data successfully saved to 'predicted_genes_with_strain.csv'")


| Main class | Examples from the list | Mechanism of action | Use in Gonorrhea / Comments |
| :--- | :--- | :--- | :--- |
| **Beta-lactams** | cephalosporin; penicillin beta-lactam<br>monobactam; cephalosporin; penicillin beta-lactam<br>monobactam; carbapenem; cephalosporin; penicillin beta-lactam | Inhibition of cell wall synthesis (inhibition of penicillin-binding proteins, PBPs) | Widely used against *N. gonorrhoeae*. Resistance occurs via **PBP2 (penA)**, **TEM-1**, **ponA**. |
| **Fluoroquinolones** | fluoroquinolone antibiotic | Inhibition of DNA gyrase and topoisomerase IV | Used, but resistance via **gyrA** and **parC** is common. |
| **Macrolides** | macrolide antibiotic<br>macrolide antibiotic; antibacterial free fatty acids | Blockage of protein translation (50S ribosomal subunit) | For example, **azithromycin**. Resistance through **mtrCDE**, **macA/B**, **mtrR**. |
| **Tetracyclines** | tetracycline antibiotic<br>monobactam; carbapenem; cephalosporin; penicillin beta-lactam; tetracycline antibiotic | Blockage of protein translation (30S ribosomal subunit) | Previously used. Resistance through **rpsJ**, **tet(M)**. |
| **Aminoglycosides** | aminoglycoside antibiotic | Disruption of protein synthesis (30S ribosome, initiation impairment) | Used experimentally. Resistance mechanisms are diverse. |
| **Antibacterial fatty acids** | macrolide antibiotic; antibacterial free fatty acids | Various mechanisms: membrane disruption, growth inhibition | A less common approach; potential for cross-resistance exists. |

In [None]:
# @title STEP 13: Visualization by antibiotic class

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# One record per gene: (gene, antibiotic class, number of strains)
gene_records = [
    ("mtrA", "Macrolide", 29),
    ("mtrC", "Macrolide", 24),
    ("mtrR", "Macrolide", 11),
    ("macA", "Macrolide", 1),
    ("macB", "Macrolide", 2),
    ("TEM-1", "Beta-lactam", 8),
    ("TEM-135", "Beta-lactam", 0),
    ("penA", "Beta-lactam", 25),
    ("ponA", "Beta-lactam", 22),
    ("porB", "Beta-lactam", 21),
    ("gyrA", "Fluoroquinolone", 17),
    ("parC", "Fluoroquinolone", 10),
    ("rpsJ", "Tetracycline", 25),
    ("tet(M)", "Tetracycline", 4),
    ("16S rRNA", "Aminoglycoside", 1),
]

# Column-wise view of the same records
data = {
    "Gene": [gene for gene, _, _ in gene_records],
    "Class": [drug_class for _, drug_class, _ in gene_records],
    "Strain_Count": [count for _, _, count in gene_records],
}

df = pd.DataFrame(data)

# Total per class, largest first
class_summary = df.groupby("Class")["Strain_Count"].sum().reset_index()
class_summary = class_summary.sort_values("Strain_Count", ascending=False)

# Horizontal bar chart of the class totals
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=class_summary, x="Strain_Count", y="Class", palette="Set3", ax=ax)

ax.set_title("Distribution of Antibiotic Classes")
ax.set_xlabel("Number of Mentions in CARD Output File")
ax.set_ylabel("Antibiotic Class")
fig.tight_layout()
plt.show()

In [None]:
# @title STEP 14: Grouping strains by antibiotic class

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from glob import glob
import re

# Gene to antibiotic class mapping
gene_class_map = {
    "mtrA": "Macrolide", "mtrC": "Macrolide", "mtrR": "Macrolide",
    "macA": "Macrolide", "macB": "Macrolide",
    "TEM-1": "Beta-lactam", "TEM-135": "Beta-lactam", "penA": "Beta-lactam",
    "ponA": "Beta-lactam", "porB": "Beta-lactam",
    "gyrA": "Fluoroquinolone", "parC": "Fluoroquinolone",
    "rpsJ": "Tetracycline", "tet(M)": "Tetracycline",
    "16S rRNA": "Aminoglycoside"
}

# Names under which some of the genes above appear in 'Best_Hit_ARO' values
# (the same synonyms STEP 5 maps in gene_mapping): without them hits reported
# as "PBP1", "PBP2" or "porin PIB" never reach their Beta-lactam class
_CARD_NAME_ALIASES = {
    "pbp1": "ponA",
    "pbp2": "penA",
    "porin pib": "porB",
}

# Path to files
folder_path = "/content/card"  # specify path to your folder
all_files = glob(os.path.join(folder_path, "*.txt"))

# Function to simplify gene name
def simplify_gene(name):
    """Return the ``gene_class_map`` key that *name* refers to, or ``None``.

    *name* is lower-cased and searched (plain substring test) for the
    lower-cased gene names of ``gene_class_map`` and for the aliases in
    ``_CARD_NAME_ALIASES``. Patterns are tried longest first so that
    "tem-135" is recognised before its prefix "tem-1".

    Args:
        name: Value of a ``Best_Hit_ARO`` cell; anything that is not a string
            (for example the float NaN pandas uses for a missing cell) yields
            ``None`` instead of raising.

    Returns:
        A key of ``gene_class_map``, or ``None`` when nothing matches.
    """
    if not isinstance(name, str):
        return None
    name = name.lower()

    # Aliases first, then the canonical names, so a canonical name can never
    # be overridden by an alias with the same spelling
    patterns = dict(_CARD_NAME_ALIASES)
    patterns.update({key.lower(): key for key in gene_class_map})

    for pattern in sorted(patterns, key=len, reverse=True):
        if pattern in name:
            return patterns[pattern]
    return None

# One (Strain, Class) frame per report
df_list = []

# Read files and collect genes by strain
for file in all_files:
    try:
        df = pd.read_csv(file, sep="\t", engine='python')
        # Strain label: "WHO_<id>" from the filename, else the filename itself
        strain = re.search(r'WHO_[A-Za-z0-9]+', os.path.basename(file))
        strain = strain.group(0) if strain else os.path.basename(file)

        df["Strain"] = strain
        # simplify_gene gives None for hits that are not in gene_class_map
        df["Gene"] = df["Best_Hit_ARO"].apply(simplify_gene)
        df = df.dropna(subset=["Gene"])
        df["Class"] = df["Gene"].map(gene_class_map)
        df = df.dropna(subset=["Class"])
        df_list.append(df[["Strain", "Class"]])
    except Exception as e:
        print(f"Error processing {file}: {e}")

# Combine into a single table
all_data = pd.concat(df_list, ignore_index=True)
# One row per distinct (strain, class) pair, marked with 1 ...
class_matrix = all_data.drop_duplicates().assign(value=1)
# ... spread into a strain x class matrix; absent pairs become 0
class_matrix = class_matrix.pivot(index="Strain", columns="Class", values="value").fillna(0).astype(int)

# Visualization - heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(
    class_matrix,
    cmap=["#ffffff", "#264653"],
    linewidths=0.5,
    linecolor="gray",
    cbar=False,
    # Pin the colour scale to 0..1: when every strain carries every class the
    # matrix holds only ones and has no value range of its own, so "present"
    # could not be told apart from "absent"
    vmin=0,
    vmax=1,
)

plt.title("Presence of Resistance to Antibiotic Classes by Strain")
plt.xlabel("Antibiotic Class")
plt.ylabel("Strain")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# @title STEP 15: Hierarchical clustering (dendrogram)

import os
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from glob import glob
from scipy.cluster.hierarchy import linkage, dendrogram

# 1. Path to the CARD files folder
folder_path = "/content/card"
all_files = glob(os.path.join(folder_path, "*.txt"))

# 2. Collect the drug classes reported for each strain
df_list = []       # (strain, drug class) pairs
strain_names = []  # strain label of every report file, with or without classes

for file in all_files:
    # Get the strain name from the filename ("WHO_<id>", else the filename)
    filename = os.path.basename(file)
    match = re.search(r'WHO_[A-Za-z0-9]+', filename)
    strain = match.group(0) if match else filename
    strain_names.append(strain)

    try:
        df = pd.read_csv(file, sep="\t", engine='python')

        # Reports without a 'Drug Class' column keep an all-zero row
        if "Drug Class" not in df.columns:
            continue

        # Some rows contain multiple classes separated by ";"
        for entry in df["Drug Class"].dropna():
            for cls in str(entry).split(";"):
                cls = cls.strip()
                if cls:  # a trailing ';' must not create an empty class column
                    df_list.append((strain, cls))

    except Exception as e:
        print(f"Error in file {file}: {e}")

# 3. Create a DataFrame from the collected information
df_classes = pd.DataFrame(df_list, columns=["Strain", "Class"])
df_classes["Value"] = 1

all_strains = sorted(set(strain_names))

if df_classes.empty or len(all_strains) < 2:
    # Clustering needs at least two observations and at least one class column
    print("Need drug-class annotations and at least two strains to build a dendrogram.")
else:
    # 4. Create a presence matrix for antibiotic classes
    class_matrix = df_classes.pivot_table(index="Strain", columns="Class", values="Value", aggfunc="max", fill_value=0)

    # 5. Add strains that are not in the matrix (with zeros) and sort the rows
    class_matrix = class_matrix.reindex(all_strains, fill_value=0)

    # 6. Build the dendrogram
    linkage_matrix = linkage(class_matrix.values, method='ward')

    plt.figure(figsize=(8, 6))
    # Labels are handed over as a plain list rather than a pandas Index
    dendrogram(linkage_matrix, labels=class_matrix.index.tolist(), orientation='left')
    plt.title("Phylogenetic Tree of Strains Based on Antibiotic Classes")
    plt.xlabel("Distance")
    plt.ylabel("Strains")
    plt.tight_layout()
    plt.show()


In [None]:
# @title STEP 16: SNP Extraction

import os
import csv

# Folder with .txt files
folder_path = "card"
output_snps = set()  # a set drops duplicates across reports

# Values that stand for "no SNP" rather than an actual SNP; the same set is
# filtered out by the per-strain SNP table step further down this file.
missing_snp_markers = {"n/a", "-", ".", "none"}

# Iterate through all .txt files in the folder
for filename in os.listdir(folder_path):
    if not filename.endswith(".txt"):
        continue

    file_path = os.path.join(folder_path, filename)
    with open(file_path, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        # fieldnames is None for an empty file, hence the `or ()`
        if "SNPs_in_Best_Hit_ARO" not in (reader.fieldnames or ()):
            continue  # skip if the column does not exist

        for row in reader:
            # DictReader fills missing trailing fields with None
            snp_field = (row.get("SNPs_in_Best_Hit_ARO") or "").strip()
            if not snp_field or snp_field.lower() in missing_snp_markers:
                continue

            # SNPs may be separated by commas or semicolons
            snps = [s.strip() for s in snp_field.replace(';', ',').split(',') if s.strip()]
            output_snps.update(snps)

# Save to file, one SNP per line in sorted order
with open("best_snps_summary.txt", "w", encoding="utf-8") as out_file:
    for snp in sorted(output_snps):
        out_file.write(snp + "\n")

print(f"Extracted {len(output_snps)} unique SNPs into best_snps_summary.txt")


In [None]:
# @title STEP 17: SNPs in genes and with strains

import os
import csv
import re
from collections import defaultdict

# Map: keywords -> clean gene names
gene_name_map = {
    "neisseria gonorrhoeae 16s rrna": "16S rRNA",
    "porin pib": "porB",
    "pbp1": "PBP1",
    "pbp2": "PBP2",
    "gyra": "gyrA",
    "mtrc": "mtrC",
    "mtrr": "mtrR",
    "parc": "parC",
    "por": "porB",
    "tem-1": "TEM-1",
    "tem-135": "TEM-135",
    "maca": "macA",
    "macb": "macB",
    "mtra": "mtrA",
    "rpsj": "rpsJ",
    "tet(m)": "tet(M)",
    "pena": "penA",
    "pona": "ponA",
}


def normalize_gene(raw_gene: str) -> str:
    """Map a raw gene description to its short canonical name.

    The lower-cased input is searched for each keyword of ``gene_name_map``
    as a plain substring. Keywords are tried longest first, so a specific
    key wins over a shorter key it contains ("tem-135" before "tem-1",
    "porin pib" before "por"). Keys of equal length keep their dictionary
    order.

    Args:
        raw_gene: Gene description as read from the report.

    Returns:
        The mapped name, or ``raw_gene`` stripped of surrounding whitespace
        when no keyword matches. In that case the unmodified ``raw_gene`` is
        also added to the module-level ``unknown_genes`` set.
    """
    raw_gene_lc = raw_gene.lower()
    # Longest-first ordering prevents a short keyword from shadowing a longer,
    # more specific one that shares its prefix.
    for key in sorted(gene_name_map, key=len, reverse=True):
        if key in raw_gene_lc:
            return gene_name_map[key]
    unknown_genes.add(raw_gene)
    return raw_gene.strip()

folder_path = "card"
records_with_snp = []   # (strain, snp, gene) tuples
genes_without_snp = []  # (strain, gene) tuples for rows with no SNP value
unknown_genes = set()   # raw names normalize_gene() could not map

required_columns = {"SNPs_in_Best_Hit_ARO", "Best_Hit_ARO"}
# Values treated as "no SNP" rather than as an SNP name
missing_snp_markers = {"n/a", "-", ".", "none"}

for filename in os.listdir(folder_path):
    if not filename.endswith(".txt"):
        continue

    file_path = os.path.join(folder_path, filename)

    # Extract strain name, e.g., "WHO_F", from the filename
    match = re.search(r'WHO_[A-Za-z0-9]+', filename)
    strain_name = match.group(0) if match else filename

    with open(file_path, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        # fieldnames is None for an empty file, hence the `or ()`
        if not required_columns.issubset(reader.fieldnames or ()):
            continue

        for row in reader:
            # DictReader fills missing trailing fields with None
            snp_field = (row.get("SNPs_in_Best_Hit_ARO") or "").strip()
            raw_gene = (row.get("Best_Hit_ARO") or "").strip()
            gene = normalize_gene(raw_gene)

            if snp_field and snp_field.lower() not in missing_snp_markers:
                # One record per SNP; separators may be commas or semicolons
                snps = [s.strip() for s in snp_field.replace(';', ',').split(',') if s.strip()]
                for snp in snps:
                    records_with_snp.append((strain_name, snp, gene))
            else:
                # Genes without SNPs
                genes_without_snp.append((strain_name, gene))

# Writing results: SNP rows first, then the genes without SNPs marked "n/a"
with open("strain_snp_gene.tsv", "w", encoding="utf-8") as out_file:
    out_file.write("Strain\tSNP\tGene\n")
    for strain, snp, gene in sorted(records_with_snp):
        out_file.write(f"{strain}\t{snp}\t{gene}\n")
    for strain, gene in sorted(genes_without_snp):
        out_file.write(f"{strain}\tn/a\t{gene}\n")

print(f"\n‚úÖ Total records with SNPs: {len(records_with_snp)}")
print(f"‚úÖ Genes without SNPs: {len(genes_without_snp)}")
print("üìÑ Result saved to strain_snp_gene.tsv")

if unknown_genes:
    print("\n‚ö†Ô∏è Genes not found in the normalization dictionary:")
    for g in sorted(unknown_genes):
        print(f" - {g}")


In [None]:
# @title STEP 18: Top 15 "Antibiotic / Gene / Resistance mechanism" combinations

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# --- 1. Updated dictionary for abbreviating gene and mechanism names ---
ABBREVIATION_MAP = {
    # Complex names that appear together
    "major facilitator superfamily (MFS) antibiotic efflux pump; resistance-nodulation-cell division (RND) antibiotic efflux pump": "MFS/RND efflux pump",
    "General Bacterial Porin with reduced permeability to beta-lactams": "Bacterial Porin (red. perm.)",
    "tetracycline-resistant ribosomal protection protein": "Tet. Rib. Prot. Protein",
    "Penicillin-binding protein mutations conferring resistance to beta-lactam antibiotics": "PBP mutations",

    # Genes
    "fluoroquinolone resistant gyrA": "gyrA",
    "fluoroquinolone resistant parC": "parC",
    "TEM beta-lactamase": "TEM-1",
    "resistance-nodulation-cell division (RND) antibiotic efflux pump": "RND efflux pump",
    "major facilitator superfamily (MFS) antibiotic efflux pump": "MFS efflux pump",

    # Mechanisms
    "antibiotic efflux": "efflux",
    "antibiotic target alteration": "target alteration",
    "antibiotic target protection": "target protection",
    "reduced permeability to antibiotic": "reduced permeability",
    "antibiotic inactivation": "inactivation"
}


def apply_abbreviations(text):
    """Replace every long name from ``ABBREVIATION_MAP`` found in ``text``.

    Long forms are substituted longest first, so a key that contains a
    shorter key (e.g. "... antibiotic efflux pump" vs "antibiotic efflux")
    is abbreviated as a whole. Substitution runs on the complete text, which
    lets the combined "MFS ...; RND ..." key match even though it contains
    the "; " separator.

    Args:
        text: Label to abbreviate. Non-string values (e.g. NaN) are returned
            unchanged.

    Returns:
        The abbreviated text. If the input contained "; ", the result is
        re-split on ";" and re-joined with "; " after stripping each part.
    """
    if not isinstance(text, str):
        return text

    had_separator = '; ' in text
    for long_form in sorted(ABBREVIATION_MAP, key=len, reverse=True):
        text = text.replace(long_form, ABBREVIATION_MAP[long_form])

    if had_separator:
        # Normalise spacing around the separators that remain
        text = '; '.join(part.strip() for part in text.split(';'))
    return text

# --- General shortening function (if a label is still too long) ---
def shorten_label_fallback(label, max_length=75):
    """Return ``label`` cut to ``max_length`` characters, ending in '...' if cut.

    Labels that already fit are returned unchanged.
    """
    fits = len(label) <= max_length
    return label if fits else label[:max_length-3] + '...'

# --- 2. Load data from the specified .csv file ---
file_path = "/content/combined_data.csv"

# Start from an empty frame so that a failed load can neither leave `raw_df`
# undefined (NameError in the column check that follows) nor leave it pointing
# at data loaded by an earlier cell.
raw_df = pd.DataFrame()

try:
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found at path: {file_path}")

    raw_df = pd.read_csv(file_path)

except FileNotFoundError as e:
    print(f"‚ùå Error: {e}. Please upload `combined_data.csv` or provide the correct filename.")
except Exception as e:
    # Any other read/parse failure: report it and continue with the empty frame
    print(f"‚ùå An error occurred while loading the file: {e}")

# --- 3. Preprocessing: Preparing data for visualization ---
# Columns that must be present in raw_df for this cell to do anything
required_columns_for_viz = ['Antibiotic', 'AMR Gene Family', 'Resistance Mechanism', 'Source_File']

if all(col in raw_df.columns for col in required_columns_for_viz):
    # Strain label = source file name without directory and extension
    raw_df['Strain'] = raw_df['Source_File'].apply(lambda x: os.path.splitext(os.path.basename(x))[0])

    # Create a combined column with FULL names for console output.
    # 'Antibiotic' cells are split on '; ' and exploded to one antibiotic per row.
    df_combined_exploded_full = raw_df.assign(
        Antibiotic=raw_df['Antibiotic'].astype(str).str.split('; ')
    ).explode('Antibiotic')
    df_combined_exploded_full['Antibiotic'] = df_combined_exploded_full['Antibiotic'].str.strip()
    # Drop rows with a missing value in any of the three label columns.
    # astype(str) above turns missing antibiotics into the literal 'nan',
    # which is why that string is filtered explicitly alongside 'n/a'.
    df_combined_exploded_full = df_combined_exploded_full.dropna(subset=['Antibiotic', 'AMR Gene Family', 'Resistance Mechanism'])
    df_combined_exploded_full = df_combined_exploded_full[df_combined_exploded_full['Antibiotic'].notna() & (df_combined_exploded_full['Antibiotic'] != 'nan') & (df_combined_exploded_full['Antibiotic'] != 'n/a')]
    df_combined_exploded_full = df_combined_exploded_full[df_combined_exploded_full['AMR Gene Family'].notna() & (df_combined_exploded_full['AMR Gene Family'] != 'n/a')]
    df_combined_exploded_full = df_combined_exploded_full[df_combined_exploded_full['Resistance Mechanism'].notna() & (df_combined_exploded_full['Resistance Mechanism'] != 'n/a')]

    if not df_combined_exploded_full.empty:
        # explode() repeats index values; give every row a unique index again
        df_combined_exploded_full = df_combined_exploded_full.reset_index(drop=True)
        # "Antibiotic / Gene family / Mechanism" label built from the unabbreviated names
        df_combined_exploded_full['Antibiotic_Gene_Mechanism_full'] = \
            df_combined_exploded_full['Antibiotic'].astype(str) + " / " + \
            df_combined_exploded_full['AMR Gene Family'].astype(str) + " / " + \
            df_combined_exploded_full['Resistance Mechanism'].astype(str)

        # Create DataFrame for plotting with abbreviations
        df_combined_exploded_processed = df_combined_exploded_full.copy()
        df_combined_exploded_processed['AMR Gene Family_processed'] = df_combined_exploded_processed['AMR Gene Family'].apply(apply_abbreviations)
        df_combined_exploded_processed['Resistance Mechanism_processed'] = df_combined_exploded_processed['Resistance Mechanism'].apply(apply_abbreviations)
        df_combined_exploded_processed['Antibiotic_Gene_Mechanism_combined_for_plot'] = \
            df_combined_exploded_processed['Antibiotic'].astype(str) + " / " + \
            df_combined_exploded_processed['AMR Gene Family_processed'].astype(str) + " / " + \
            df_combined_exploded_processed['Resistance Mechanism_processed'].astype(str)

        # --- Visualization: Overview of the most frequent combinations ---
        # The two rankings are counted independently (full vs. abbreviated labels).
        top_combinations_full = df_combined_exploded_full['Antibiotic_Gene_Mechanism_full'].value_counts().head(15)
        top_combinations_plot = df_combined_exploded_processed['Antibiotic_Gene_Mechanism_combined_for_plot'].value_counts().head(15)

        # --- Final output and plotting ---
        print("--- Top 15 Most Frequent Combinations (Antibiotic / Gene / Mechanism) ---")
        print("For legend (full names):")
        print(top_combinations_full.to_string()) # Use .to_string() for full display

        if not top_combinations_plot.empty:
            # Labels still longer than 75 characters after abbreviation are truncated
            final_labels_for_plot = top_combinations_plot.index.map(lambda x: shorten_label_fallback(x, max_length=75))

            # Horizontal bar chart: count on x, combination label on y
            plt.figure(figsize=(12, 5))
            sns.barplot(x=top_combinations_plot.values, y=final_labels_for_plot, palette='viridis')
            plt.title('Top 15 Most Frequent Combinations (Antibiotic / Gene / Mechanism)', fontsize=16)
            plt.xlabel('Count', fontsize=12)
            plt.ylabel('Combination', fontsize=12)
            plt.xticks(fontsize=10)
            plt.yticks(fontsize=10)
            plt.grid(axis='x', linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.show()
        else:
            print("‚ùå No data available for plotting 'Top 15 Combinations'.")

    else:
        print("‚ùå WARNING: DataFrame is empty after cleaning! Check the source data.")
else:
    print(f"‚ùå Error: Not all required columns found in '{file_path}'.")


In [None]:
# @title STEP 19: Top 25 combinations

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# --- 1. Updated dictionary for abbreviating gene and mechanism names ---
ABBREVIATION_MAP = {
    # Complex names that appear together
    "major facilitator superfamily (MFS) antibiotic efflux pump; resistance-nodulation-cell division (RND) antibiotic efflux pump": "MFS/RND efflux pump",
    "General Bacterial Porin with reduced permeability to beta-lactams": "Bacterial Porin (red. perm.)",
    "tetracycline-resistant ribosomal protection protein": "Tet. Rib. Prot. Protein",
    "Penicillin-binding protein mutations conferring resistance to beta-lactam antibiotics": "PBP mutations",

    # Genes
    "fluoroquinolone resistant gyrA": "gyrA",
    "fluoroquinolone resistant parC": "parC",
    "TEM beta-lactamase": "TEM-1",
    "resistance-nodulation-cell division (RND) antibiotic efflux pump": "RND efflux pump",
    "major facilitator superfamily (MFS) antibiotic efflux pump": "MFS efflux pump",

    # Mechanisms
    "antibiotic efflux": "efflux",
    "antibiotic target alteration": "target alteration",
    "antibiotic target protection": "target protection",
    "reduced permeability to antibiotic": "reduced permeability",
    "antibiotic inactivation": "inactivation"
}


def apply_abbreviations(text):
    """Abbreviate the long names of ``ABBREVIATION_MAP`` that occur in ``text``.

    Keys are applied longest first so that a key containing a shorter key is
    replaced as a unit. The replacement runs over the whole text before any
    splitting, which is what allows the combined "MFS ...; RND ..." key
    (it contains "; ") to match.

    Args:
        text: Label to abbreviate. Anything that is not a ``str`` is
            returned unchanged.

    Returns:
        The abbreviated text. When the input contained "; ", the result is
        split on ";", each part stripped, and the parts re-joined with "; ".
    """
    if not isinstance(text, str):
        return text

    had_separator = '; ' in text
    longest_first = sorted(ABBREVIATION_MAP, key=len, reverse=True)
    for long_form in longest_first:
        text = text.replace(long_form, ABBREVIATION_MAP[long_form])

    if not had_separator:
        return text
    # Normalise spacing around the separators that remain
    return '; '.join(part.strip() for part in text.split(';'))

# --- General shortening function (if a label is still too long) ---
def shorten_label_fallback(label, max_length=75):
    """Truncate ``label`` to ``max_length`` characters, marking the cut with '...'.

    A label no longer than ``max_length`` is returned as is.
    """
    if len(label) <= max_length:
        return label
    kept = label[:max_length-3]
    return kept + '...'

# --- 2. Load data from the specified .csv file ---
file_path = "/content/combined_data.csv"

# Default to an empty frame: if loading fails, the column check that follows
# then reports missing columns instead of raising NameError or silently
# reusing a `raw_df` left over from a previously executed cell.
raw_df = pd.DataFrame()

try:
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found at path: {file_path}")

    raw_df = pd.read_csv(file_path)

except FileNotFoundError as e:
    print(f"‚ùå Error: {e}. Please upload `combined_data.csv` or provide the correct filename.")
except Exception as e:
    # Any other read/parse failure: report it and continue with the empty frame
    print(f"‚ùå An error occurred while loading the file: {e}")

# --- 3. Preprocessing: Preparing data for visualization ---
# Columns that must be present in raw_df for this cell to do anything
required_columns_for_viz = ['Antibiotic', 'AMR Gene Family', 'Resistance Mechanism', 'Source_File']

if all(col in raw_df.columns for col in required_columns_for_viz):
    # Strain label = source file name without directory and extension
    raw_df['Strain'] = raw_df['Source_File'].apply(lambda x: os.path.splitext(os.path.basename(x))[0])

    # Create a combined column with FULL names for console output.
    # 'Antibiotic' cells are split on '; ' and exploded to one antibiotic per row.
    df_combined_exploded_full = raw_df.assign(
        Antibiotic=raw_df['Antibiotic'].astype(str).str.split('; ')
    ).explode('Antibiotic')
    df_combined_exploded_full['Antibiotic'] = df_combined_exploded_full['Antibiotic'].str.strip()
    # Drop rows with a missing value in any of the three label columns.
    # astype(str) above turns missing antibiotics into the literal 'nan',
    # which is why that string is filtered explicitly alongside 'n/a'.
    df_combined_exploded_full = df_combined_exploded_full.dropna(subset=['Antibiotic', 'AMR Gene Family', 'Resistance Mechanism'])
    df_combined_exploded_full = df_combined_exploded_full[df_combined_exploded_full['Antibiotic'].notna() & (df_combined_exploded_full['Antibiotic'] != 'nan') & (df_combined_exploded_full['Antibiotic'] != 'n/a')]
    df_combined_exploded_full = df_combined_exploded_full[df_combined_exploded_full['AMR Gene Family'].notna() & (df_combined_exploded_full['AMR Gene Family'] != 'n/a')]
    df_combined_exploded_full = df_combined_exploded_full[df_combined_exploded_full['Resistance Mechanism'].notna() & (df_combined_exploded_full['Resistance Mechanism'] != 'n/a')]

    if not df_combined_exploded_full.empty:
        # explode() repeats index values; give every row a unique index again
        df_combined_exploded_full = df_combined_exploded_full.reset_index(drop=True)
        # "Antibiotic / Gene family / Mechanism" label built from the unabbreviated names
        df_combined_exploded_full['Antibiotic_Gene_Mechanism_full'] = \
            df_combined_exploded_full['Antibiotic'].astype(str) + " / " + \
            df_combined_exploded_full['AMR Gene Family'].astype(str) + " / " + \
            df_combined_exploded_full['Resistance Mechanism'].astype(str)

        # Create DataFrame for plotting with abbreviations
        df_combined_exploded_processed = df_combined_exploded_full.copy()
        df_combined_exploded_processed['AMR Gene Family_processed'] = df_combined_exploded_processed['AMR Gene Family'].apply(apply_abbreviations)
        df_combined_exploded_processed['Resistance Mechanism_processed'] = df_combined_exploded_processed['Resistance Mechanism'].apply(apply_abbreviations)
        df_combined_exploded_processed['Antibiotic_Gene_Mechanism_combined_for_plot'] = \
            df_combined_exploded_processed['Antibiotic'].astype(str) + " / " + \
            df_combined_exploded_processed['AMR Gene Family_processed'].astype(str) + " / " + \
            df_combined_exploded_processed['Resistance Mechanism_processed'].astype(str)

        # --- Visualization: Overview of the most frequent combinations ---
        # The two rankings are counted independently (full vs. abbreviated labels).
        top_combinations_full = df_combined_exploded_full['Antibiotic_Gene_Mechanism_full'].value_counts().head(25)
        top_combinations_plot = df_combined_exploded_processed['Antibiotic_Gene_Mechanism_combined_for_plot'].value_counts().head(25)

        # --- Final output and plotting ---
        print("--- Top 25 Most Frequent Combinations (Antibiotic / Gene / Mechanism) ---")
        print("For legend (full names):")
        print(top_combinations_full.to_string())

        if not top_combinations_plot.empty:
            # Labels still longer than 75 characters after abbreviation are truncated
            final_labels_for_plot = top_combinations_plot.index.map(lambda x: shorten_label_fallback(x, max_length=75))

            # Horizontal bar chart: count on x, combination label on y
            plt.figure(figsize=(13, 10)) # Increased height for 25 items
            sns.barplot(x=top_combinations_plot.values, y=final_labels_for_plot, palette='viridis')
            plt.title('Top 25 Most Frequent Combinations (Antibiotic / Gene / Mechanism)', fontsize=16)
            plt.xlabel('Count', fontsize=12)
            plt.ylabel('Combination', fontsize=12)
            plt.xticks(fontsize=10)
            plt.yticks(fontsize=9) # May need to slightly reduce y-label font size
            plt.grid(axis='x', linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.show()
        else:
            print("‚ùå No data available for plotting 'Top 25 Combinations'.")
    else:
        print("‚ùå WARNING: DataFrame is empty after cleaning! Check the source data.")
else:
    print(f"‚ùå Error: Not all required columns found in '{file_path}'.")
