In [None]:
# Base imports
import os
import pickle

# Compute imports
import numpy as np
import pandas as pd
import scipy
from tqdm.notebook import tqdm, trange

# Plotting imports
import matplotlib
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
from plotly import express as px

# ML import
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error, median_absolute_error
from sklearn.metrics.pairwise import cosine_similarity


import multiprocessing
from multiprocessing import Pool


# Font settings for PDF / PS / SVG export and the default sans-serif family.
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'svg.fonttype': 'none',
    'font.sans-serif': 'Arial',
    'font.family': 'sans-serif',
})

# The seaborn style is applied after the font settings and before the colour
# overrides, so the colours set below are the ones that remain in effect.
sns.set_style('ticks')

# Text, axis labels and tick marks all use pure black.
matplotlib.rcParams.update({
    rc_key: '#000000'
    for rc_key in ('text.color', 'axes.labelcolor', 'xtick.color', 'ytick.color')
})

In [None]:
# Input file locations (relative paths), assembled from shared directory prefixes.
_DATA_ROOT = '../../data'
_PROCESSED_DIR = f'{_DATA_ROOT}/processed'
_CAR_GENOMES_DIR = f'{_PROCESSED_DIR}/CAR_genomes'

DF_GENES = f'{_PROCESSED_DIR}/cd-hit-results/sim80/Ebacter_strain_by_gene.pickle.gz'
ENRICHED_METADATA = f'{_DATA_ROOT}/metadata/enriched_metadata.csv'
DF_EGGNOG = f'{_PROCESSED_DIR}/df_eggnog.csv'

# Pickled accessory / rare / core tables for the complete genomes.
DF_ACC_COMPLETE = f'{_CAR_GENOMES_DIR}/df_acc_complete.pickle'
DF_RARE_COMPLETE = f'{_CAR_GENOMES_DIR}/df_rare_complete.pickle'
DF_CORE_COMPLETE = f'{_CAR_GENOMES_DIR}/df_core_complete.pickle'

In [None]:
# Unpickle the rare, accessory and core tables (read in that order).
df_rare, df_acc, df_core = (
    pd.read_pickle(pickle_path)
    for pickle_path in (DF_RARE_COMPLETE, DF_ACC_COMPLETE, DF_CORE_COMPLETE)
)

In [None]:
# Binarized A matrix; the first CSV column is used as the row index.
A_BINARIZED = '../../data/processed/nmf-outputs/A_binarized.csv'
A_binarized = pd.read_csv(
    A_BINARIZED,
    index_col=0,
)

In [None]:
# Every metadata column is read as a generic object (no numeric inference);
# the first CSV column becomes the index.
metadata = pd.read_csv(
    ENRICHED_METADATA,
    dtype='object',
    index_col=0,
)

# Show the table dimensions, then a preview of the first rows.
display(metadata.shape, metadata.head())

In [None]:
# Full P matrix (pickled strain-by-gene table).
df_genes = pd.read_pickle(DF_GENES)

# Metadata rows whose genome status is 'Complete'.
is_complete = metadata['genome_status'] == 'Complete'
metadata_complete = metadata[is_complete]

# Columns of the P matrix for those genomes: missing entries become 0, then the
# sparse columns are densified and stored as int8 to save space and compute.
df_genes_complete = (
    df_genes[metadata_complete.genome_id]
    .fillna(0)
    .sparse.to_dense()
    .astype('int8')
)

# Keep only the genes that occur in at least one complete genome.
inCompleteseqs = df_genes_complete.sum(axis=1) > 0
df_genes_complete = df_genes_complete[inCompleteseqs]

df_genes_complete.shape

In [None]:
# eggNOG annotations, with missing fields replaced by '-'.
df_eggnog = pd.read_csv(DF_EGGNOG, index_col=0).fillna('-')

# Dimensions followed by a preview of the first rows.
display(df_eggnog.shape, df_eggnog.head())

In [None]:
# Strain-by-allele table: missing entries become 0 and all values are cast to int.
P_ALLELE_PICKLE = '../../data/processed/cd-hit-results/sim80/Ebacter_strain_by_allele.pickle.gz'
P_allele = (
    pd.read_pickle(P_ALLELE_PICKLE)
    .fillna(0)
    .astype(int)
)

In [None]:
# Row labels of the core table, kept under a shorter name.
core_genes = df_core.index

In [None]:
# Row labels of the accessory and rare tables.
acc_genes, rare_genes = df_acc.index, df_rare.index

In [None]:
# Characterized phylons (row labels of A_binarized) in the order used for
# labelling: when a column belongs to several of them, the earliest one wins.
characterized_order = [
    'hormaechei-xiangfangensis',
    'hormaechei-oharae',
    'hormaechei-steigerwaltii-2',
    'hormaechei-steigerwaltii-1',
    'hormaechei-steigerwaltii-3',
    'hormaechei-hormaechei',
    'hormaechei-hoffmannii-1',
    'hormaechei-hoffmannii-2',
    'roggenkampii',
    'asburiae',
    'kobei',
    'bugandensis',
    'cancerogenous',
    'ludwigii',
    'cloacae',
]

# One colour per entry of characterized_order, matched by position.
custom_colors = [
    # Shades of red/orange/yellow
    "Red",
    "IndianRed",
    "DarkRed",
    "FireBrick",
    "Tomato",
    "Gold",
    "DarkGoldenrod",
    "Goldenrod",
    # Other species
    "Green",
    "Blue",
    "Purple",
    "Cyan",
    "Magenta",
    "Lime",
    "Pink",
]

# zip() stops at the shorter input, so pairing 16 labels (15 phylons + "None")
# with 15 colours silently dropped "None" and left NaN colours behind. Pair the
# two lists explicitly and give unlabelled columns their own colour; 'white'
# matches the fallback the plotting code applies to missing colours.
if len(characterized_order) != len(custom_colors):
    raise ValueError("characterized_order and custom_colors must have the same length")
clr = dict(zip(characterized_order, custom_colors))
clr["None"] = "white"

df = A_binarized.loc[characterized_order]

# For each column, the first characterized phylon whose entry equals 1;
# columns with no such entry are labelled 'None'.
is_member = df.eq(1)
first_phylon = is_member.idxmax().where(is_member.any(), 'None')

# Label table indexed by the column names of A_binarized.
output_df = pd.DataFrame({'Label': first_phylon})
output_df.index.name = 'Column'
output_df['color'] = output_df.Label.map(clr)

## Extract Allele Sequences

The approach below is based on an older method of mine and still needs to be reworked. To do:
1. Find software to build a phylogenetic tree

In [None]:
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align.Applications import MafftCommandline
from io import StringIO
from Bio import SeqIO
import tempfile
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio import Phylo

# Analysis for Single Copy Core Genes

In [None]:
# Gene IDs used for the single-copy core gene analysis (105 entries, listed in
# lexicographic order).
scc_genes = [
    'Ebacter_C100472', 'Ebacter_C101558', 'Ebacter_C103292', 'Ebacter_C10426', 'Ebacter_C107942',
    'Ebacter_C108573', 'Ebacter_C110315', 'Ebacter_C112797', 'Ebacter_C113365', 'Ebacter_C114601',
    'Ebacter_C11866', 'Ebacter_C12400', 'Ebacter_C13432', 'Ebacter_C13437', 'Ebacter_C13742',
    'Ebacter_C13804', 'Ebacter_C13806', 'Ebacter_C14408', 'Ebacter_C16067', 'Ebacter_C16181',
    'Ebacter_C17612', 'Ebacter_C18127', 'Ebacter_C19902', 'Ebacter_C21434', 'Ebacter_C2440',
    'Ebacter_C24576', 'Ebacter_C25412', 'Ebacter_C25641', 'Ebacter_C26119', 'Ebacter_C27162',
    'Ebacter_C27467', 'Ebacter_C28628', 'Ebacter_C28733', 'Ebacter_C29374', 'Ebacter_C30093',
    'Ebacter_C34271', 'Ebacter_C34511', 'Ebacter_C3501', 'Ebacter_C35293', 'Ebacter_C35487',
    'Ebacter_C37335', 'Ebacter_C37771', 'Ebacter_C38778', 'Ebacter_C38917', 'Ebacter_C39435',
    'Ebacter_C42042', 'Ebacter_C4210', 'Ebacter_C42587', 'Ebacter_C43046', 'Ebacter_C43535',
    'Ebacter_C43888', 'Ebacter_C44386', 'Ebacter_C44907', 'Ebacter_C48305', 'Ebacter_C49588',
    'Ebacter_C49697', 'Ebacter_C49722', 'Ebacter_C50924', 'Ebacter_C51224', 'Ebacter_C51497',
    'Ebacter_C51498', 'Ebacter_C51904', 'Ebacter_C52224', 'Ebacter_C58596', 'Ebacter_C59704',
    'Ebacter_C59913', 'Ebacter_C61078', 'Ebacter_C61459', 'Ebacter_C61836', 'Ebacter_C6228',
    'Ebacter_C62596', 'Ebacter_C62952', 'Ebacter_C63161', 'Ebacter_C65930', 'Ebacter_C67022',
    'Ebacter_C67092', 'Ebacter_C67503', 'Ebacter_C69060', 'Ebacter_C70584', 'Ebacter_C7096',
    'Ebacter_C71129', 'Ebacter_C71264', 'Ebacter_C72288', 'Ebacter_C73125', 'Ebacter_C73519',
    'Ebacter_C74659', 'Ebacter_C74690', 'Ebacter_C75003', 'Ebacter_C76258', 'Ebacter_C78689',
    'Ebacter_C80576', 'Ebacter_C81013', 'Ebacter_C81014', 'Ebacter_C82241', 'Ebacter_C85970',
    'Ebacter_C88246', 'Ebacter_C91317', 'Ebacter_C91821', 'Ebacter_C92829', 'Ebacter_C9312',
    'Ebacter_C93270', 'Ebacter_C94777', 'Ebacter_C95738', 'Ebacter_C96274', 'Ebacter_C97800',
]


In [None]:
# Sanity check: number of gene IDs in scc_genes.
len(scc_genes)

In [None]:
# Genes whose alleles go into the multiple alignment.
gene = scc_genes

# Membership is tested once per allele in P_allele, so use a set (constant-time
# lookup) instead of scanning the gene list each time.
target_genes = set(gene)

# An allele belongs to the gene named by the part of its ID before the first 'A'.
alleles = [
    allele_id
    for allele_id in P_allele.index
    if allele_id.split('A', 1)[0] in target_genes
]

# Allele-by-strain table restricted to those alleles and to the columns of df_core.
allele_P = P_allele.loc[alleles, df_core.columns]

In [None]:
# FASTA file that the sequences are read from.
fasta_file = "../../data/processed/cd-hit-results/sim80/Ebacter_nr.faa"

# Record ID -> sequence string, kept only for records whose ID is a row label
# of allele_P.
sequences = {
    record.id: str(record.seq)
    for record in SeqIO.parse(fasta_file, "fasta")
    if record.id in allele_P.index
}

In [None]:
# Strain -> concatenation of the sequences of every allele that strain carries
# (entry > 0), in the row order of allele_P.
strain_seqeunces = {}
for strain in tqdm(allele_P.columns):
    allele_counts = allele_P[strain]
    carried_alleles = allele_counts[allele_counts > 0].index
    strain_seqeunces[strain] = "".join(sequences[allele_id] for allele_id in carried_alleles)

In [None]:
# Write one FASTA record per strain: a header line carrying the strain ID,
# followed by the concatenated sequence on a single line.
#
# The header is written as ">ID" with no space after the marker. With
# "> ID", parsers that take the identifier as the text directly after ">"
# see an empty ID.
#
# The handle is called `fasta_out` so that it does not rebind `fasta_file`,
# which holds the path of the input FASTA file.
with open("strains_sequences.fasta", "w") as fasta_out:
    for strain, sequence in strain_seqeunces.items():
        fasta_out.write(f">{strain}\n{sequence}\n")

I ran MAFFT from the command line on the FASTA file, using the following command:
```
 # old mafft --thread 60 --algorithm FFT-NS-2 strains_sequences.fasta > aligned_sequences.fasta
"/home/jtburrows/private/phylon_analysis/bin/mafft"  --retree 2 --inputorder "strains_sequences.fasta" > "aligned_sequences.fasta"
```


In [None]:
# Read the aligned sequences as a FASTA-formatted alignment.
ALIGNED_FASTA = 'aligned_sequences.fasta'
alignment = AlignIO.read(ALIGNED_FASTA, "fasta")

# # Save the alignment in Clustal format (optional)
# with open("alignment.aln", "w") as output_file:
#     AlignIO.write(alignment, output_file, "clustal")

In [None]:
# Pairwise distances between the aligned sequences under the 'identity' model.
distance_matrix = DistanceCalculator('identity').get_distance(alignment)

# Neighbor-joining tree built from that distance matrix.
tree = DistanceTreeConstructor().nj(distance_matrix)

In [None]:
from Bio import Phylo
from io import StringIO

# Serialise `tree` to a Newick string through an in-memory buffer.
newick_buffer = StringIO()
Phylo.write(tree, newick_buffer, "newick")
newick_str = newick_buffer.getvalue()

# Persist the Newick string; the Circos plot later reads this file.
with open("tree.newick", "w") as newick_file:
    newick_file.write(newick_str)


# Plot as a circular tree

In [None]:
# Genome ID / genome name for the complete genomes. The explicit .copy() makes
# this an independent frame, so adding columns below does not act on a slice of
# metadata_complete (which triggers pandas' chained-assignment warning).
df_species = (
    metadata_complete
    .loc[metadata_complete.genome_status == 'Complete', ["genome_id", "genome_name"]]
    .copy()
)

# Species = first two whitespace-separated words of the genome name. The .str
# accessors leave missing names as NaN and keep a single-word name as is,
# where indexing x.split()[1] would raise.
df_species["species"] = df_species["genome_name"].str.split().str[:2].str.join(" ")
df_species = df_species.set_index('genome_id')

# Species name -> plotting colour; species not listed here map to NaN.
custom_colors = {
    'Enterobacter hormaechei': 'FireBrick',
    'Enterobacter cloacae': 'Pink',
    'Enterobacter sp.': 'SlateGray',
    'Enterobacter roggenkampii': 'Green',
    'Enterobacter kobei': 'Purple',
    'Enterobacter cancerogenus': 'Magenta',
    'Enterobacter bugandensis': 'Cyan',
    'Enterobacter asburiae': 'Blue',
    'Enterobacter ludwigii': 'Lime',
    'Enterobacter mori': '#F5F5DC',
    'Enterobacter xiangfangensis': 'Red',
}

df_species['color'] = df_species.species.map(custom_colors)

# If a 'ref_seq' entry was concatenated, add a black 'ref' row.
# The values follow the column order: genome_name, species, color.
# NOTE(review): the key checked is 'ref_seq' but the row added is 'ref' — confirm this is intended.
if 'ref_seq' in strain_seqeunces:
    df_species.loc['ref'] = ['ref', 'ref', 'black']

In [None]:
def get_strains(phylon, A_binarized=None):
    """Return the column labels that are members of a phylon.

    Parameters
    ----------
    phylon
        Row label of the binarized A matrix to look up.
    A_binarized : pandas.DataFrame, optional
        Binarized membership matrix with phylons as row labels. When omitted,
        the notebook-level ``A_binarized`` is looked up at call time. A default
        bound at definition time would keep pointing at the old object after
        the matrix is reloaded in a later cell.

    Returns
    -------
    pandas.Index
        Labels of the columns whose entry in row ``phylon`` equals 1.

    Raises
    ------
    KeyError
        If ``phylon`` is not a row label of the matrix, or if no matrix is
        passed and no notebook-level ``A_binarized`` exists.
    """
    if A_binarized is None:
        A_binarized = globals()['A_binarized']
    phylon_membership = A_binarized.loc[phylon]
    return (phylon_membership[phylon_membership == 1]).index

In [None]:
# Alternate colors for each strain from phylons
# Characterized phylons (row labels of A_binarized) in the order used for
# labelling: when a column belongs to several of them, the earliest one wins.
characterized_order = [
    'hormaechei-xiangfangensis',
    'hormaechei-oharae',
    'hormaechei-steigerwaltii-2',
    'hormaechei-steigerwaltii-1',
    'hormaechei-steigerwaltii-3',
    'hormaechei-hormaechei',
    'hormaechei-hoffmannii-1',
    'hormaechei-hoffmannii-2',
    'roggenkampii',
    'asburiae',
    'kobei',
    'bugandensis',
    'cancerogenous',
    'ludwigii',
    'cloacae',
]

# One colour per entry of characterized_order, matched by position.
custom_colors = [
    # Shades of red/orange/yellow
    "#FF0000",      # Red
    "#CD5C5C",      # IndianRed
    "#8B0000",      # DarkRed
    "#B22222",      # FireBrick
    "#FF6347",      # Tomato
    "#FFD700",      # Gold
    "#B8860B",      # DarkGoldenrod
    "#DAA520",      # Goldenrod
    # Other species
    "#008000",      # Green
    "#0000FF",      # Blue
    "#800080",      # Purple
    "#00FFFF",      # Cyan
    "#FF00FF",      # Magenta
    "#00FF00",      # Lime
    "#FFC0CB",      # Pink
]

# zip() stops at the shorter input, so pairing 16 labels (15 phylons + "None")
# with 15 colours silently dropped "None" and left NaN colours behind. Pair the
# two lists explicitly and give unlabelled columns their own colour; 'white'
# matches the fallback the plotting code applies to missing colours.
if len(characterized_order) != len(custom_colors):
    raise ValueError("characterized_order and custom_colors must have the same length")
clr = dict(zip(characterized_order, custom_colors))
clr["None"] = "white"
custom_colors = clr

df = A_binarized.loc[characterized_order]

# For each column, the first characterized phylon whose entry equals 1;
# columns with no such entry are labelled 'None'.
is_member = df.eq(1)
first_phylon = is_member.idxmax().where(is_member.any(), 'None')

# Label table indexed by the column names of A_binarized.
output_df = pd.DataFrame({'Label': first_phylon})
output_df.index.name = 'Column'
output_df['color'] = output_df.Label.map(clr)

# From here on, df_species refers to this phylon-based label/colour table.
df_species = output_df

In [None]:
L_BINARIZED = '../../data/processed/nmf-outputs/L_binarized.csv'
A_BINARIZED = '../../data/processed/nmf-outputs/A_binarized.csv'

# Load both binarized matrices; the first CSV column is the row index.
L_binarized = pd.read_csv(L_BINARIZED, index_col=0)
A_binarized = pd.read_csv(A_BINARIZED, index_col=0)

# Rows of L_binarized that are 1 in both steigerwaltii-3 and steigerwaltii-1
# and whose row sum is exactly 2.
in_steigerwaltii_3 = L_binarized['hormaechei-steigerwaltii-3'] == 1
in_steigerwaltii_1 = L_binarized['hormaechei-steigerwaltii-1'] == 1
row_sum_is_two = L_binarized.sum(axis=1) == 2
L_binarized[in_steigerwaltii_3 & in_steigerwaltii_1 & row_sum_is_two]

In [None]:
import matplotlib.pyplot as plt
from pycirclize import Circos
from matplotlib.colors import LinearSegmentedColormap
from sklearn.preprocessing import LabelEncoder

# Tree layout options for the Circos plot.
tree_layout = dict(
    leaf_label_size=0,
    start=10,
    end=350,
    r_lim=(20, 55),
    line_kws=dict(color="black", lw=2),
    align_line_kws=dict(ls="dashdot", lw=.3, alpha=.5),
    align_leaf_label=True,
    ladderize=True,
)

# Build the Circos plot from the Newick file written earlier.
circos, tv = Circos.initialize_from_tree("tree.newick", **tree_layout)

# All annotation tracks are added to the first sector; outline the band
# between radius 55 and 65 in grey.
sector = circos.sectors[0]
sector.rect(r_lim=(55, 65), ec="grey", lw=1)

# Integer code per phylon label, used as the heatmap value of each leaf.
#
# The encoder is fitted on the tree's leaves only. The palette below is built
# from the colours present among the leaves, so the codes have to cover exactly
# the same set of labels. Fitting on every row of df_species would leave gaps
# in the codes whenever a label has no leaf in the tree, and the evenly spaced
# colours would then no longer line up with the labels.
label_encoder = LabelEncoder()
test_df = df_species.loc[tv.leaf_labels].copy()
test_df['num_labels'] = label_encoder.fit_transform(test_df.Label)

heatmap_track1 = sector.add_track((55, 60))

# Colours ordered by label code; labels without a colour are drawn white.
leaf_palette = test_df.sort_values('num_labels').color.fillna('white').unique()
phylon_colors = LinearSegmentedColormap.from_list("phylon_colors", leaf_palette)

# test_df is already in leaf order, so its codes can be passed as they are.
heatmap_track1.heatmap(test_df.num_labels.values, cmap=phylon_colors, rect_kws=dict(ec="lightgrey", lw=0))
circos.text("Phylon", r=heatmap_track1.r_center, size=8, color="black")

# Colours for the mash clusters (21 entries), grouped here by hue family.
color_list = [
    # yellows
    "#FFFF00", "#FFEA00", "#FFD700",
    # oranges and reds
    "#FF5500", "#FF9900", "#FF8000", "#FF6600", "#FF3300", "#FF4444",
    # purple and greens
    "#9F7FBF", "#66A066", "#66D966",
    # blues
    "#6699FF", "#80BFFF", "#80A3C1", "#A3B8C0",
    # near-white, teal, light green
    "#F2F2F2", "#80B3B3", "#B3FFB3",
    # light purple and pink
    "#C2A8E0", "#FF80BF",
]

# Metadata of the complete genomes, indexed by genome ID so it can be looked
# up by leaf label.
data = metadata_complete.set_index('genome_id')

heatmap_track2 = sector.add_track((60, 65))
mash_cluster_colors = LinearSegmentedColormap.from_list("mash_cluster_colors", color_list)

# Mash cluster of every leaf, in leaf order, converted to float for the heatmap.
leaf_mash_clusters = data.loc[tv.leaf_labels, 'complete_mash_cluster'].values.astype(float)
heatmap_track2.heatmap(
    leaf_mash_clusters,
    cmap=mash_cluster_colors,
    rect_kws=dict(ec="lightgrey", lw=0),
)
circos.text("Mash Cluster", r=heatmap_track2.r_center, size=8, color="black")

### add row for traits of interest ###

## mobile phylon tracks
# One presence/absence ring per uncharacterized phylon. The four rings differ
# only in the values listed here, so they are drawn by a single loop:
# (row of A_binarized, ring label, radial limits, colour used for "present")
mobile_tracks = [
    ('unchar-1', "mobile-1", (66, 68), "#FF6961"),
    ('unchar-2', "mobile-2", (68, 70), "#AEC6CF"),
    ('unchar-3', "mobile-3", (70, 72), "#77DD77"),
    ('unchar-4', "mobile-4", (72, 74), "#B39EB5"),
]

for phylon_row, ring_label, ring_r_lim, present_color in mobile_tracks:
    # 1 where the leaf has a positive entry for this phylon, else 0 (leaf order).
    phylon_data = (A_binarized.loc[phylon_row, tv.leaf_labels] > 0).astype(int)
    mobile_track = sector.add_track(ring_r_lim)
    # Two-colour map: white for absent, the ring's own colour for present.
    trait_colors = LinearSegmentedColormap.from_list("trait_colors", ["#FFFFFF", present_color])
    mobile_track.heatmap(phylon_data.values, cmap=trait_colors, rect_kws=dict(ec="lightgrey", lw=0))
    circos.text(ring_label, r=mobile_track.r_center, size=8, color="black")


# Preferred gene name of every gene in the P matrix. Looked up once and shared
# by all trait rings below (it was previously recomputed twice per trait).
gene_names = df_eggnog.loc[df_genes.index].Preferred_name


def _add_gene_trait_track(label, name_token, presence, min_genes, r_lim, present_color):
    """Add a binary ring marking leaves that carry a set of related genes.

    Parameters
    ----------
    label : str
        Text drawn on the ring.
    name_token : str
        Substring searched for in each gene's preferred name.
    presence : pandas.DataFrame
        Gene-by-strain table the matching genes are counted in; genes missing
        from its index are ignored.
    min_genes : int
        A leaf is marked when the summed entries of the matching genes are
        strictly greater than this value.
    r_lim : tuple
        Radial limits of the ring.
    present_color : str
        Colour for marked leaves; unmarked leaves are white.

    Returns
    -------
    The track object created for the ring.
    """
    matching = gene_names[gene_names.apply(lambda name: name_token in name)].index
    tracked = [gene_id for gene_id in matching if gene_id in presence.index]
    # Sum over the tracked genes per leaf, then threshold to 0/1 (leaf order).
    has_trait = (presence.loc[tracked, tv.leaf_labels].sum() > min_genes).astype(int)
    track = sector.add_track(r_lim)
    trait_colors = LinearSegmentedColormap.from_list("trait_colors", ["#FFFFFF", present_color])
    track.heatmap(has_trait.values, cmap=trait_colors, rect_kws=dict(ec="lightgrey", lw=0))
    circos.text(label, r=track.r_center, size=8, color="black")
    return track


# trait 1
# pqq genes associated with pyrroloquinoline quinone
# only found in hormaechei cluster
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2245851/ - promotes plant growth, rhizosphere associated
# Counted in the accessory table; a leaf is marked when the sum exceeds 2.
heatmap_track3 = _add_gene_trait_track("pqq Genes", 'pqq', df_acc, 2, (75, 76), "#FF0000")

# trait 2
# genes associated with Salmochelin production
# Counted over all genes of the complete genomes; any positive entry marks the leaf.
heatmap_track4 = _add_gene_trait_track("Salmochelin VFs", 'iro', df_genes_complete, 0, (77, 78), "#000000")

# trait 3, arn operon associated with colistin resistance
# Counted in the accessory table; a leaf is marked when the sum exceeds 6.
heatmap_track5 = _add_gene_trait_track("Arn Operon", 'arn', df_acc, 6, (79, 80), "#3333FF")


### End of adding traits of interest ###
# Render all sectors and tracks into a matplotlib figure.
fig = circos.plotfig(figsize=(20, 10))

# Legend handles for the phylons: one coloured line per distinct label among
# the leaves. The legend itself is disabled in the commented block below.
leaf_rows = test_df.loc[tv.leaf_labels]
line_handles = [
    Line2D([], [], color=color, label=name, lw=4)
    for _, name, color in zip(
        leaf_rows.num_labels.unique(),
        leaf_rows.Label.unique(),
        leaf_rows.color.fillna('white').unique(),
    )
]

# line_legend = circos.ax.legend(
#     handles=line_handles,
#     bbox_to_anchor=(0.80, 1.02),
#     # loc="upper right",
#     fontsize=8,
#     title="Phylons",
#     handlelength=2,
#     ncols=1
# )
# circos.ax.add_artist(line_legend)

# # Plot legend for mash cluster
# line_handles = []

# for name, color in zip(list(range(int(data.loc[tv.leaf_labels].complete_mash_cluster.astype(float).max()))), color_list):
#     line_handles.append(Line2D([], [], color=color, label=name, lw=4))

# line_legend = circos.ax.legend(
#     handles=line_handles,
#     bbox_to_anchor=(0, 1.02),
#     loc="upper left",
#     fontsize=8,
#     title="Mash Clusters",
#     handlelength=2,
#     ncols=1
# )
# circos.ax.add_artist(line_legend)

# plt.title("Single-Copy Core Gene Phylogeny")
# Save the rendered figure as a 600-dpi PNG, then display it.
plt.savefig("circos_plot.png", format='png', dpi=600, bbox_inches='tight')
plt.show()