In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.patches import Patch

In [None]:
# Display settings: render complete DataFrames (no truncation of rows,
# columns, line width or cell contents).
FULL_DISPLAY_OPTIONS = {
    'display.max_rows': None,       # show every row
    'display.max_columns': None,    # show every column
    'display.width': None,          # no fixed display width, so no line wrapping
    'display.max_colwidth': None,   # show the full content of each cell
}
for option_name, option_value in FULL_DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [None]:
# Read the JSON files downloaded from BioSurfDB.
# Each entry is a file stem: "<name>.json" is loaded into df_dict["<name>_df"].
name_list = [
    "organism",
    "gene",
    "protein",
    "pathway",
    "surfactant",
    "metagenome",
    "biosurfactant",
    "organism_pathway"
]

# Load every file into a DataFrame stored in a dict keyed "<name>_df".
df_dict = {}
for name in name_list:

    # Load json (explicit encoding so the result does not depend on the OS locale)
    with open(f'{name}.json', encoding='utf-8') as f:
        raw_data = json.load(f)

    try:
        # Flatten the records stored under the top-level 'success' key
        data_normalized = pd.json_normalize(raw_data['success'])
    except Exception as e:
        print(f"An error occurred during normalization: {e}")
        # Skip this table. Falling through would either raise NameError
        # (first file) or store the previous file's frame under this name.
        continue

    # Save to dict with proper name
    df_dict[f"{name}_df"] = data_normalized

    # Show the first rows of the loaded table
    display(df_dict[f"{name}_df"].head())

In [None]:
# Merge the per-table DataFrames into combined frames (stored back in df_dict).

# biosurfactant + surfactant
df_dict["biosurfactant_surfactant_df_combined"] = pd.merge(
    df_dict["biosurfactant_df"], df_dict["surfactant_df"], on='surfactant_id', how='inner'
)

# organism_pathway + additional columns
# 0. Rename the key columns so they match the names used as merge keys below:
#    gene_df:     gene_ncbi     -> Gene_GI
#    pathway_df:  name          -> Pathway
#    protein_df:  protein_ncbi  -> Protein_GI
#    organism_df: binomial_name -> Organism
# Re-assigning (instead of inplace=True) keeps the cell safe to re-run.
df_dict['gene_df'] = df_dict['gene_df'].rename(columns={'gene_ncbi': 'Gene_GI'})
df_dict['pathway_df'] = df_dict['pathway_df'].rename(columns={'name': 'Pathway'})
df_dict['protein_df'] = df_dict['protein_df'].rename(columns={'protein_ncbi': 'Protein_GI'})
df_dict['organism_df'] = df_dict['organism_df'].rename(columns={'binomial_name': 'Organism'})

# 1. Left-join every lookup table onto organism_pathway_df, accumulating the
#    result. Previously each merge started again from the raw
#    organism_pathway_df and overwrote the same key, so only the protein
#    columns survived.
#    The empty left suffix keeps the names of columns already present
#    (e.g. those of organism_pathway_df) unchanged; a colliding column coming
#    from a lookup table gets that table's suffix instead.
lookup_merges = [
    ("organism_df", "Organism", "_organism"),
    ("gene_df", "Gene_GI", "_gene"),
    ("pathway_df", "Pathway", "_pathway"),
    ("protein_df", "Protein_GI", "_protein"),
]
organism_pathway_combined = df_dict["organism_pathway_df"]
for lookup_name, key_column, right_suffix in lookup_merges:
    organism_pathway_combined = pd.merge(
        organism_pathway_combined,
        df_dict[lookup_name],
        on=key_column,
        how='left',
        suffixes=("", right_suffix),
    )
df_dict["organism_pathway_df_combined"] = organism_pathway_combined

# metagenome + organism_pathway by reference -> filter
# Keep only metagenomes that have a reference, and only the two columns needed.
df_dict["metagenome_df"] = (
    df_dict["metagenome_df"]
    .rename(columns={'reference': 'Reference'})
    .dropna(subset=['Reference'])
    .loc[:, ['environment', 'Reference']]
)
display(df_dict["metagenome_df"])
df_dict["metagenome_organism_pathway_df_combined"] = pd.merge(
    df_dict["metagenome_df"], df_dict["organism_pathway_df"], on='Reference', how='left'
)

# pathway + biosurfactant by reference
df_dict["pathway_biosurfactant_df_combined"] = pd.merge(
    df_dict["pathway_df"], df_dict["biosurfactant_df"], on='reference', how='inner'
)

In [None]:
# Show each combined DataFrame stored in df_dict
for combined_key in (
    'organism_pathway_df_combined',
    'metagenome_organism_pathway_df_combined',
    'pathway_biosurfactant_df_combined',
):
    display(df_dict[combined_key])

In [None]:
# Plot 1 (table) - Biosurfactant classification by class

# Harmonise spelling / plural variants of class names; values that are not
# listed here are left unchanged by Series.replace.
# (Named class_name_map rather than `map` so the builtin is not shadowed.)
class_name_map = {
    'Polymeric Surfactants': 'Polymeric biosurfactants',
    'Polymeric surfactants': 'Polymeric biosurfactants',
    'Glycolipid': 'Glycolipids',
    'Lipopeptide': 'Lipopeptides',
    ' lipopeptide': 'Lipopeptides',
    'Biosurfactant': 'Other biosurfactants',
    'Biosurfactants': 'Other biosurfactants'
}

# assign() builds a new frame, so the merged frame kept in df_dict is not
# modified through an alias and the cell can be re-run safely.
biosurfactant_surfactant_df_combined_cleaned = df_dict['biosurfactant_surfactant_df_combined'].assign(
    class_name=lambda frame: frame['class_name'].replace(class_name_map)
)

# Number of distinct surfactant names in each (cleaned) class
biosurf_by_class = biosurfactant_surfactant_df_combined_cleaned.groupby('class_name')['surfactant_name'].nunique()
display(biosurf_by_class)

biosurf_by_class_plot = plt.bar(biosurf_by_class.index, biosurf_by_class.values, color='orange')
plt.xticks(rotation=90)
plt.xlabel("Clase de biosurfactante")
plt.ylabel("Cantidad de biosurfactantes")
plt.title("Cantidad de biosurfactantes por clase")
plt.savefig("biosurfactants_by_class.png", dpi=300, bbox_inches='tight')
# plt.show() renders the current figure; it does not take an artist argument.
plt.show()

In [None]:
# Plot 2.1 - Distribution of biosurfactant production ability in bacterial genera (pathways)

# Pathways found in this many organisms or more are left out of the plot.
MAX_ORGANISMS_PER_PATHWAY = 100

# Number of distinct organisms per pathway (full table is displayed before filtering)
pathways_in_organism = df_dict['organism_pathway_df_combined'].groupby('Pathway')['Organism'].nunique()
display(pathways_in_organism)

# "Normalized" version: an upper-threshold filter, not a rescaling of the counts
pathways_in_organism = pathways_in_organism[pathways_in_organism < MAX_ORGANISMS_PER_PATHWAY]

pathways_in_organism_plot = plt.bar(pathways_in_organism.index, pathways_in_organism.values, color='orange')
plt.xticks(rotation=90, fontsize=6)
plt.xlabel("Pathway")
plt.ylabel("Number of organisms")
plt.title("Number of organisms by pathway")
plt.savefig("organisms_by_pathway_normalized.png", dpi=300, bbox_inches='tight')
# plt.show() renders the current figure; it does not take an artist argument.
plt.show()

In [None]:
# Plot 2.2 - Distribution of biosurfactant production ability in metagenomes (pathways)
# NOTE(review): the counts use the 'Reference' column of
# organism_pathway_df_combined, not metagenome_organism_pathway_df_combined;
# confirm that every Reference there stands for a metagenome.

# Thresholds used by the "normalized" (filtered) plots below.
MAX_METAGENOMES_PER_PATHWAY = 30     # keep pathways with fewer references than this
MIN_PATHWAYS_PER_METAGENOME = 3      # keep references with more pathways than this
MAX_PATHWAYS_PER_METAGENOME = 10     # ...and fewer pathways than this

# Number of distinct references per pathway
nr_pathways_in_metagenome = df_dict['organism_pathway_df_combined'].groupby('Pathway')['Reference'].nunique()
# "Normalized" version: an upper-threshold filter, not a rescaling of the counts
nr_pathways_in_metagenome = nr_pathways_in_metagenome[nr_pathways_in_metagenome < MAX_METAGENOMES_PER_PATHWAY]

display(nr_pathways_in_metagenome)
nr_pathways_in_metagenome_plot = plt.bar(nr_pathways_in_metagenome.index, nr_pathways_in_metagenome.values, color='orange')
plt.xticks(rotation=90, fontsize=6)
plt.xlabel("Pathway")
plt.ylabel("Number of metagenomes")
plt.title("Number of metagenomes by pathway")
plt.savefig("metagenomes_by_pathway_normalized.png", dpi=300, bbox_inches='tight')
# plt.show() renders the current figure; it does not take an artist argument.
plt.show()

# Reverse plot: number of distinct pathways per reference
nr_pathways_in_metagenome_reverse = df_dict['organism_pathway_df_combined'].groupby('Reference')['Pathway'].nunique()
# "Normalized" version: keep counts strictly between the two thresholds
nr_pathways_in_metagenome_reverse = nr_pathways_in_metagenome_reverse[
    (nr_pathways_in_metagenome_reverse > MIN_PATHWAYS_PER_METAGENOME)
    & (nr_pathways_in_metagenome_reverse < MAX_PATHWAYS_PER_METAGENOME)
]

display(nr_pathways_in_metagenome_reverse)
nr_pathways_in_metagenome_reverse_plot = plt.bar(nr_pathways_in_metagenome_reverse.index, nr_pathways_in_metagenome_reverse.values, color='orange')
plt.xticks(rotation=90, fontsize=6)
plt.xlabel("Metagenome")
plt.ylabel("Number of pathways")
plt.title("Number of pathways by metagenomes")
plt.savefig("metagenomes_by_pathway_reverse_metagenomelist.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Plot 3 - Number of genes included in each metabolic pathway

# Number of distinct gene identifiers per pathway (used by both plots below)
nr_genes_in_pathway = df_dict['organism_pathway_df_combined'].groupby('Pathway')['Gene_GI'].nunique()

# Plain version
nr_genes_in_pathway_plot = plt.bar(nr_genes_in_pathway.index, nr_genes_in_pathway.values, color='orange')
plt.xticks(rotation=90, fontsize=6)
plt.xlabel("Ruta metabólica")
plt.ylabel("Cantidad de genes")
plt.title("Cantidad de genes por ruta metabólica")
plt.savefig("genes_by_pathway_normalized.png", dpi=300, bbox_inches='tight')
# plt.show() renders the current figure; it does not take an artist argument.
plt.show()

# Plot 3, colored by family

# Map Pathway -> Family, taking the Family of the first row seen for each pathway
pathway_family_map = (
    df_dict['organism_pathway_df_combined']
    .drop_duplicates(subset='Pathway')[['Pathway', 'Family']]
    .set_index('Pathway')['Family']
)
# Align the families with the order of the bars
families = pathway_family_map.loc[nr_genes_in_pathway.index]

# Assign one colorblind-safe color to each distinct family
unique_families = families.unique()
palette = sns.color_palette("colorblind", n_colors=len(unique_families))
color_map = dict(zip(unique_families, palette))
# One color per bar, as a plain list in bar order
bar_colors = families.map(color_map).tolist()

plt.figure(figsize=(10, 6))
bars = plt.bar(nr_genes_in_pathway.index, nr_genes_in_pathway.values, color=bar_colors)

plt.xticks(rotation=90, fontsize=6)
plt.xlabel("Ruta metabólica")
plt.ylabel("Cantidad de genes")
plt.title("Cantidad de genes por ruta metabólica")

# Legend: one colored patch per family
legend_handles = [Patch(color=color_map[f], label=f) for f in unique_families]
plt.legend(handles=legend_handles, title='Familia')

plt.savefig("genes_by_pathway_colored.png", dpi=300, bbox_inches='tight')
# Previously this call was given the bar container of the first (plain) plot.
plt.show()

In [None]:
# Plot 4 - Distribution of biosurfactant-producing bacteria in metagenomes,
# with organisms collapsed to "<Genus> sp." when a genus has several entries.

def map_to_sp(org):
    """Return the genus-level label for an organism name.

    Looks up the first whitespace-separated word of `org` in the module-level
    `mapping_dict` and returns the mapped label; returns `org` unchanged when
    the word is not a key or when `org` contains no words.
    """
    words = org.split()
    if not words:
        return org
    return mapping_dict.get(words[0], org)

# Thresholds used by the filtered plots below.
MIN_METAGENOMES_PER_ORGANISM = 4     # keep organisms with more references than this
MIN_ORGANISMS_PER_METAGENOME = 3     # keep references with more organisms than this
MAX_ORGANISMS_PER_METAGENOME = 200   # ...and fewer organisms than this

organism_pathway_combined = df_dict['organism_pathway_df_combined']

# Distinct references per organism name, used to find genera with several entries
nr_metagenomes_by_organism = organism_pathway_combined.groupby('Organism')['Reference'].nunique()
df_metagenomes = nr_metagenomes_by_organism.reset_index()
df_metagenomes.columns = ['Organism', 'Reference_count']
df_metagenomes['first_word'] = df_metagenomes['Organism'].str.split().str[0]
first_word_counts = df_metagenomes['first_word'].value_counts()
# A capitalised first word shared by more than one organism name -> "<word> sp."
mapping_dict = {word: f"{word} sp." for word, count in first_word_counts.items() if count > 1 and word[0].isupper()}
df_metagenomes['Organism_mapped'] = df_metagenomes['Organism'].apply(map_to_sp)

# Distinct (organism, reference) pairs carrying the genus-level label.
# Counting distinct references on these pairs avoids the double counting that
# summing per-species counts produces when two species of one genus share a
# reference.
organism_reference_pairs = (
    organism_pathway_combined[['Organism', 'Reference']]
    .dropna(subset=['Organism'])
    .drop_duplicates()
    .assign(Organism_mapped=lambda pairs: pairs['Organism'].apply(map_to_sp))
)

nr_metagenomes_by_organism_cleaned = (
    organism_reference_pairs.groupby('Organism_mapped')['Reference'].nunique().rename('Reference_count')
)
display(nr_metagenomes_by_organism_cleaned)

# Without filtering
plt.figure(figsize=(50, 10))
nr_metagenomes_by_organism_cleaned_plot = plt.bar(nr_metagenomes_by_organism_cleaned.index, nr_metagenomes_by_organism_cleaned.values, color='orange')
plt.xticks(rotation=90, fontsize=4)
plt.xlabel("Organismo")
plt.ylabel("Cantidad de metagenomas")
plt.title("Cantidad de metagenomas por organismo")
plt.savefig("metagenomes_by_organism_cleaned.png", dpi=300, bbox_inches='tight')
# plt.show() renders the current figure; it does not take an artist argument.
plt.show()

# "Normalized" version: a lower-threshold filter, not a rescaling of the counts
nr_metagenomes_by_organism_normalized = nr_metagenomes_by_organism_cleaned[
    nr_metagenomes_by_organism_cleaned > MIN_METAGENOMES_PER_ORGANISM
]
display(nr_metagenomes_by_organism_normalized)

plt.figure(figsize=(30, 10))
nr_metagenomes_by_organism_normalized_plot = plt.bar(nr_metagenomes_by_organism_normalized.index, nr_metagenomes_by_organism_normalized.values, color='orange')
plt.xticks(rotation=90, fontsize=6)
plt.xlabel("Organismo")
plt.ylabel("Cantidad de metagenomas")
plt.title("Cantidad de metagenomas por organismo")
plt.savefig("metagenomes_by_organism_normalized.png", dpi=300, bbox_inches='tight')
plt.show()

# Reverse plot: number of distinct (genus-level) organisms per reference.
# This is computed from the pair table because the per-organism count Series
# has no 'Reference' level to group by (grouping it raised KeyError).
# The Series is named 'Organism' and indexed by 'Reference'.
nr_metagenomes_by_organism_cleaned_reversed = (
    organism_reference_pairs.groupby('Reference')['Organism_mapped'].nunique().rename('Organism')
)
nr_metagenomes_by_organism_cleaned_reversed = nr_metagenomes_by_organism_cleaned_reversed[
    (nr_metagenomes_by_organism_cleaned_reversed > MIN_ORGANISMS_PER_METAGENOME)
    & (nr_metagenomes_by_organism_cleaned_reversed < MAX_ORGANISMS_PER_METAGENOME)
]

nr_metagenomes_by_organism_cleaned_reversed_plot = plt.bar(nr_metagenomes_by_organism_cleaned_reversed.index, nr_metagenomes_by_organism_cleaned_reversed.values, color='orange')
plt.xticks(rotation=90, fontsize=6)
plt.xlabel("Metagenome")
plt.ylabel("Number of organisms")
plt.title("Number of organisms by metagenome")
plt.savefig("metagenomes_by_organism_cleaned_reversed.png", dpi=300, bbox_inches='tight')
plt.show()
display(nr_metagenomes_by_organism_cleaned_reversed)

In [None]:
# Compare the two per-reference lists: inner-join the pathway counts and the
# organism counts on 'Reference', keeping references present in both.
pathways_per_reference = nr_pathways_in_metagenome_reverse
organisms_per_reference = nr_metagenomes_by_organism_cleaned_reversed

metagenomes_list_comparison = pd.merge(
    left=pathways_per_reference,
    right=organisms_per_reference,
    how='inner',
    on='Reference',
)
display(metagenomes_list_comparison)

In [None]:
# Plot - Metagenome distribution by environment, from the PROMOTOR TSV file

df_environments = pd.read_csv("metagenomes.tsv", sep="\t")

# Number of distinct 'environment' values within each 'category'.
# NOTE(review): the axis labels say "metagenomes", but the value counted is the
# number of distinct environments per category; confirm this is the intended metric.
nr_metagenomes_by_environments = df_environments.groupby('category')['environment'].nunique()

display(nr_metagenomes_by_environments)

nr_metagenomes_by_environments_plot = plt.bar(nr_metagenomes_by_environments.index, nr_metagenomes_by_environments.values, color='orange')
plt.xticks(rotation=90, fontsize=6)
plt.xlabel("Medio ambiente")
plt.ylabel("Cantidad de metagenomas")
plt.title("Cantidad de metagenomas por medio ambiente")
plt.savefig("metagenomes_by_environment.png", dpi=300, bbox_inches='tight')
# plt.show() renders the current figure; it does not take an artist argument.
plt.show()

In [None]:
# Plots for species analysis
# Plot - genes by organism, with organisms collapsed to "<Genus> sp." when a
# genus has several entries.

def map_to_sp(org):
    """Return the genus-level label for an organism name.

    Looks up the first whitespace-separated word of `org` in the module-level
    `mapping_dict` and returns the mapped label; returns `org` unchanged when
    the word is not a key or when `org` contains no words.
    """
    words = org.split()
    if not words:
        return org
    return mapping_dict.get(words[0], org)

# Organisms with this many genes or fewer are left out of the filtered plot.
MIN_GENES_PER_ORGANISM = 10

organism_pathway_combined = df_dict['organism_pathway_df_combined']

# Distinct genes per organism name, used to find genera with several entries
nr_genes_in_organism = organism_pathway_combined.groupby('Organism')['Gene_GI'].nunique()
df_genes = nr_genes_in_organism.reset_index()
df_genes.columns = ['Organism', 'Gene_count']
df_genes['first_word'] = df_genes['Organism'].str.split().str[0]
first_word_counts = df_genes['first_word'].value_counts()
# A capitalised first word shared by more than one organism name -> "<word> sp."
mapping_dict = {word: f"{word} sp." for word, count in first_word_counts.items() if count > 1 and word[0].isupper()}
df_genes['Organism_mapped'] = df_genes['Organism'].apply(map_to_sp)

# Distinct (organism, gene) pairs carrying the genus-level label. Counting
# distinct genes on these pairs means a gene identifier listed for several
# species of one genus is counted once rather than once per species.
organism_gene_pairs = (
    organism_pathway_combined[['Organism', 'Gene_GI']]
    .dropna(subset=['Organism'])
    .drop_duplicates()
    .assign(Organism_mapped=lambda pairs: pairs['Organism'].apply(map_to_sp))
)
nr_genes_in_organism_cleaned = (
    organism_gene_pairs.groupby('Organism_mapped')['Gene_GI'].nunique().rename('Gene_count')
)
display(nr_genes_in_organism_cleaned)

# Without filtering
plt.figure(figsize=(30, 10))
nr_genes_in_organism_cleaned_plot = plt.bar(nr_genes_in_organism_cleaned.index, nr_genes_in_organism_cleaned.values, color='orange', width=0.4)
plt.xticks(rotation=90, fontsize=2)
plt.xlabel("Organismo")
plt.ylabel("Cantidad de genes")
plt.title("Cantidad de genes por organismo")
plt.savefig("genes_by_organism_cleaned.png", dpi=300, bbox_inches='tight')
# plt.show() renders the current figure; it does not take an artist argument.
plt.show()

# "Normalized" version: a lower-threshold filter, not a rescaling of the counts
nr_genes_in_organism_cleaned_normalized = nr_genes_in_organism_cleaned[
    nr_genes_in_organism_cleaned > MIN_GENES_PER_ORGANISM
]
plt.figure(figsize=(20, 10))
nr_genes_in_organism_cleaned_normalized_plot = plt.bar(nr_genes_in_organism_cleaned_normalized.index, nr_genes_in_organism_cleaned_normalized.values, color='orange')
plt.xticks(rotation=90, fontsize=6)
plt.xlabel("Organismo")
plt.ylabel("Cantidad de genes")
plt.title("Cantidad de genes por organismo")
plt.savefig("genes_by_organism_cleaned_normalized.png", dpi=300, bbox_inches='tight')
plt.show()
display(nr_genes_in_organism_cleaned_normalized)

In [None]:
# Attach the environment category to each selected reference and count the
# distinct references per category (uses df_environments loaded earlier).
# NOTE(review): read_csv consumes the first line of the TSV as a header before
# the column is renamed; confirm the file really starts with a header row.
df_ncbi_choose = pd.read_csv("unikalne_wartosci.tsv", sep="\t")
df_ncbi_choose.columns = ['reference']

# Inner join: keep only references present in both tables, then keep two columns
ncbi_environment_combined = df_ncbi_choose.merge(
    df_environments, how='inner', on='reference'
)[['reference', 'category']]
display(ncbi_environment_combined)

references_by_category = ncbi_environment_combined.groupby('category')
count_ncbi_environment = references_by_category['reference'].nunique()
display(count_ncbi_environment)