# Setup

In [None]:
# Base imports
import os
import pickle
import re

# Compute imports
import numpy as np
import pandas as pd
import scipy
import scipy.cluster.hierarchy  # explicit: the dendrogram cell calls scipy.cluster.hierarchy.* directly
from tqdm.notebook import tqdm, trange

# Plotting imports
import matplotlib
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from plotly import express as px

# ML import
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error, median_absolute_error
from sklearn.metrics.pairwise import cosine_similarity

# Font handling for exported figures and the default sans-serif face.
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'svg.fonttype': 'none',
    'font.sans-serif': 'Arial',
    'font.family': 'sans-serif',
})

# The seaborn style is applied between the two groups of rcParams, in the same
# position as before, so the colour overrides below are the last values written.
sns.set_style('ticks')

# Pure black for text, axis labels, and tick marks.
matplotlib.rcParams.update({
    'text.color': '#000000',
    'axes.labelcolor': '#000000',
    'xtick.color': '#000000',
    'ytick.color': '#000000',
})

In [None]:
# --- Input locations (relative paths) ---

# Metadata and annotation tables
ENRICHED_METADATA = '../../data/metadata/enriched_metadata.csv'
DF_EGGNOG = '../../data/processed/df_eggnog.csv'
BAKTA_ANNOTATIONS = '../../data/processed/bakta_gene_annotations.csv'

# Strain-by-gene matrix from the CD-HIT results (sim80)
DF_GENES = '../../data/processed/cd-hit-results/sim80/Ebacter_strain_by_gene.pickle.gz'

# Core / accessory / rare gene tables
DF_CORE_COMPLETE = '../../data/processed/CAR_genomes/df_core_complete.pickle'
DF_ACC_COMPLETE = '../../data/processed/CAR_genomes/df_acc_complete.pickle'
DF_RARE_COMPLETE = '../../data/processed/CAR_genomes/df_rare_complete.pickle'

# NMF outputs: raw and binarized L and A matrices
L_MATRIX = '../../data/processed/nmf-outputs/L.csv'
A_MATRIX = '../../data/processed/nmf-outputs/A.csv'
L_BINARIZED = '../../data/processed/nmf-outputs/L_binarized.csv'
A_BINARIZED = '../../data/processed/nmf-outputs/A_binarized.csv'

In [None]:
# Bakta gene annotations; the first CSV column becomes the index.
bakta_annotations = pd.read_csv(
    BAKTA_ANNOTATIONS,
    index_col=0,
)

In [None]:
# Gene-location tables, read from bare relative filenames
# (unlike the other inputs, these paths are not defined in the constants cell).
gene_locs = pd.read_csv('complete_gene_location.csv', index_col=0)
gene_locs_acc = pd.read_csv('acc_gene_location.csv', index_col=0)

In [None]:
# Core, accessory, and rare gene tables, each stored as a pickle.
df_core = pd.read_pickle(DF_CORE_COMPLETE)
df_acc = pd.read_pickle(DF_ACC_COMPLETE)
df_rare = pd.read_pickle(DF_RARE_COMPLETE)

In [None]:
# Strain metadata; every column is read with dtype 'object'.
metadata = pd.read_csv(
    ENRICHED_METADATA,
    dtype='object',
    index_col=0,
)

display(metadata.shape, metadata.head())

In [None]:
# Full P matrix (all genomes)
df_genes = pd.read_pickle(DF_GENES)

# Metadata rows whose genome_status is exactly 'Complete'
metadata_complete = metadata.loc[metadata['genome_status'] == 'Complete']

# Restrict P to the complete genomes, replace N/A with 0, then densify and
# typecast to int8 for space and compute reasons.
df_genes_complete = df_genes[metadata_complete['genome_id']].copy()
df_genes_complete = df_genes_complete.fillna(0)
df_genes_complete = df_genes_complete.sparse.to_dense().astype('int8')

# Keep only genes present in at least one complete genome
inCompleteseqs = df_genes_complete.sum(axis=1).gt(0)
df_genes_complete = df_genes_complete.loc[inCompleteseqs]

df_genes_complete.shape

In [None]:
# eggNOG annotations; missing fields are replaced by the placeholder '-'
df_eggnog = pd.read_csv(DF_EGGNOG, index_col=0).fillna('-')

display(df_eggnog.shape, df_eggnog.head())

In [None]:
# Binarized A matrix. Later cells index it as `A_binarized.loc[phylon, strains]`,
# i.e. phylons on the rows and strains on the columns.
A_binarized = pd.read_csv(
    A_BINARIZED,
    index_col=0,
)
A_binarized

In [None]:
# Binarized L matrix. Later cells index it as `L_binarized[phylon]` and sum over
# axis=1 per gene, i.e. genes on the rows and phylons on the columns.
L_binarized = pd.read_csv(
    L_BINARIZED,
    index_col=0,
)
L_binarized

In [None]:
# Display order of all phylons used for the sorted heatmaps.
phylon_order = [
    # hormaechei phylons
    'hormaechei-xiangfangensis',
    'hormaechei-oharae',
    'hormaechei-steigerwaltii-2',
    'hormaechei-steigerwaltii-1',
    'hormaechei-steigerwaltii-3',
    'hormaechei-hormaechei',
    'hormaechei-hoffmannii-1',
    'hormaechei-hoffmannii-2',
    # uncharacterized phylons
    'unchar-1',
    'unchar-2',
    'unchar-3',
    'unchar-4',
    # remaining named phylons
    'roggenkampii',
    'asburiae',
    'kobei',
    'bugandensis',
    'cancerogenous',
    'ludwigii',
    'cloacae',
]

In [None]:
# Row (gene) ordering for the sorted L heatmaps: zero-phylon genes first, then
# single-phylon genes following `phylon_order`, then genes active in 2, 3, ...
# phylons, each of those groups ordered by Ward clustering of its rows.
gene_order = []

# Number of phylons each gene is active in (computed once, reused by every mask)
phylons_per_gene = L_binarized.sum(axis=1)

# Add in zero-phylon genes
zero_cond = phylons_per_gene == 0
gene_order.extend(L_binarized[zero_cond].index)

# Add in single-phylon genes (the mask does not depend on the phylon, so it is
# built once outside the loop)
single_cond = phylons_per_gene == 1
for phylon in phylon_order:
    inPhylon = L_binarized[phylon] == 1
    gene_order.extend(L_binarized[inPhylon & single_cond].index)

# Add in poly-phylon genes
for num_active_phylons in trange(2, int(phylons_per_gene.max()) + 1):
    num_cond = phylons_per_gene == num_active_phylons
    poly_group = L_binarized[num_cond]

    if len(poly_group) < 2:
        # Hierarchical clustering needs at least two rows; with zero or one
        # gene there is nothing to reorder, so keep the rows as they are.
        gene_order.extend(poly_group.index)
        continue

    gg = sns.clustermap(poly_group, method='ward', metric='euclidean', col_cluster=False, yticklabels=False)
    gene_order.extend(gg.data2d.index)
    # The clustermap is only used for its row ordering; close the figure so the
    # loop does not accumulate one open figure per group.
    plt.close(gg.fig)

In [None]:
# Main sorted clustermap: rows are fixed to `gene_order` (row_cluster=False);
# the linkage settings therefore only affect the column dendrogram.
g = sns.clustermap(
    data=L_binarized.loc[gene_order],
    row_cluster=False,
    method='ward',
    metric='euclidean',
    cmap='Greys',
    yticklabels=False,
);

In [None]:
# Column (strain) ordering for the A heatmap: strains in no phylon first, then
# the strains of each named phylon following `phylon_order`, and finally the
# strains of the 'unchar' phylons that have not been placed yet.
strain_order = []
unchar_strain_order = []  # never filled in this cell; kept so the name stays defined

# Number of phylons each strain belongs to (computed once)
phylons_per_strain = A_binarized.sum()

# zero-phylon strains
noPhylon = phylons_per_strain == 0
strain_order.extend(phylons_per_strain[noPhylon].index.tolist())

# strain lists
single_phylon_strains = phylons_per_strain[phylons_per_strain == 1].index
multi_phylon_strains = phylons_per_strain[phylons_per_strain > 1].index

for phylon in phylon_order:
    if 'unchar' in phylon:
        continue

    phylon_aff_binarized_single = A_binarized.loc[phylon, single_phylon_strains]
    phylon_aff_binarized_multi = A_binarized.loc[phylon, multi_phylon_strains]

    inPhylon_single = phylon_aff_binarized_single == 1
    inPhylon_multi = phylon_aff_binarized_multi == 1

    list1 = phylon_aff_binarized_single[inPhylon_single].index.tolist()
    list2 = phylon_aff_binarized_multi[inPhylon_multi].index.tolist()

    # Ensures no double-counting. An order-preserving filter is used instead of
    # a set difference because iterating a set of strings gives an order that
    # can change between kernel sessions, which made the figure irreproducible.
    already_placed = set(strain_order)
    new_list2 = [strain for strain in list2 if strain not in already_placed]

    strain_order.extend(list1)
    strain_order.extend(new_list2)

for phylon in phylon_order:  # must be done after the first loop
    if 'unchar' in phylon:
        phylon_aff_binarized_single = A_binarized.loc[phylon, single_phylon_strains]
        phylon_aff_binarized_multi = A_binarized.loc[phylon, multi_phylon_strains]

        inPhylon_single = phylon_aff_binarized_single == 1
        inPhylon_multi = phylon_aff_binarized_multi == 1

        list1 = phylon_aff_binarized_single[inPhylon_single].index.tolist()
        list2 = phylon_aff_binarized_multi[inPhylon_multi].index.tolist()

        # Same order-preserving de-duplication as above
        already_placed = set(strain_order)
        new_list1 = [strain for strain in list1 if strain not in already_placed]
        new_list2 = [strain for strain in list2 if strain not in already_placed]

        strain_order.extend(new_list1)
        strain_order.extend(new_list2)

strain_order += unchar_strain_order

# A-binarized, rows and columns in the fixed orders built above (no clustering)
sns.clustermap(A_binarized.loc[phylon_order, strain_order], cmap='Greys', xticklabels=False, row_cluster=False, col_cluster=False);

In [None]:
# Characterized phylons, in display order. Derived from `phylon_order` (every
# entry whose name does not contain 'unchar') so the two lists cannot drift
# apart; this yields the same 15 names, in the same order, as the former
# hand-written copy.
characterized_order = [phylon for phylon in phylon_order if 'unchar' not in phylon]

In [None]:
df = A_binarized.loc[characterized_order]

# For every strain (column of df) record the first characterized phylon whose
# entry equals 1; strains with no such phylon get the literal string 'None'.
label_col = []
name_col = []
for col in df.columns:
    index = df.index[df[col] == 1].tolist()
    label_col.append(index[0] if index else 'None')
    name_col.append(col)

# Strain -> label table, indexed by strain name
output_df = pd.DataFrame({'Column': name_col, 'Label': label_col}).set_index('Column')

# One colour per characterized phylon, listed in the same order as
# `characterized_order`.
custom_colors = [
    # Shades of red/orange/yellow
    "Red", "IndianRed", "DarkRed", "FireBrick", "Tomato",
    "Gold", "DarkGoldenrod", "Goldenrod",
    # Other species
    "Green", "Blue", "Purple", "Cyan", "Magenta", "Lime", "Pink",
]

# Phylon name -> colour
clr = {phylon: color for phylon, color in zip(characterized_order, custom_colors)}

In [None]:
# Sorted L heatmap restricted to the characterized phylons, with one colour
# strip entry per phylon column; rows stay in `gene_order`.
g = sns.clustermap(
    L_binarized.loc[gene_order, characterized_order],
    row_cluster=False,
    method='ward',
    metric='euclidean',
    col_colors=list(clr.values()),
    cmap='Greys',
    yticklabels=False,
);

# Write the current figure to disk as a 300-dpi PNG
plt.rcParams['svg.fonttype'] = 'none'
plt.savefig(
    '../images/fig3/L.png',
    format='png',
    dpi=300,
    bbox_inches='tight',
)


In [None]:
# Strain (column) ordering and heatmap of A restricted to the characterized phylons.
strain_order = []
unchar_strain_order = []  # never filled in this cell; kept so the name stays defined
characterized = characterized_order
A_bin_char = A_binarized.loc[characterized]

# Number of characterized phylons each strain belongs to (computed once)
phylons_per_strain = A_bin_char.sum()

# zero-phylon strains
noPhylon = phylons_per_strain == 0
strain_order.extend(phylons_per_strain[noPhylon].index.tolist())

# strain lists
single_phylon_strains = phylons_per_strain[phylons_per_strain == 1].index
multi_phylon_strains = phylons_per_strain[phylons_per_strain > 1].index

for phylon in characterized:
    if 'unchar' in phylon:
        continue

    phylon_aff_binarized_single = A_bin_char.loc[phylon, single_phylon_strains]
    phylon_aff_binarized_multi = A_bin_char.loc[phylon, multi_phylon_strains]

    inPhylon_single = phylon_aff_binarized_single == 1
    inPhylon_multi = phylon_aff_binarized_multi == 1

    list1 = phylon_aff_binarized_single[inPhylon_single].index.tolist()
    list2 = phylon_aff_binarized_multi[inPhylon_multi].index.tolist()

    # Ensures no double-counting. An order-preserving filter is used instead of
    # a set difference because iterating a set of strings gives an order that
    # can change between kernel sessions, which made the saved figure
    # irreproducible.
    already_placed = set(strain_order)
    new_list2 = [strain for strain in list2 if strain not in already_placed]

    strain_order.extend(list1)
    strain_order.extend(new_list2)

for phylon in characterized:  # must be done after the first loop
    # Only entries whose name contains 'unchar' are handled here.
    if 'unchar' in phylon:
        phylon_aff_binarized_single = A_bin_char.loc[phylon, single_phylon_strains]
        phylon_aff_binarized_multi = A_bin_char.loc[phylon, multi_phylon_strains]

        inPhylon_single = phylon_aff_binarized_single == 1
        inPhylon_multi = phylon_aff_binarized_multi == 1

        list1 = phylon_aff_binarized_single[inPhylon_single].index.tolist()
        list2 = phylon_aff_binarized_multi[inPhylon_multi].index.tolist()

        # Same order-preserving de-duplication as above
        already_placed = set(strain_order)
        new_list1 = [strain for strain in list1 if strain not in already_placed]
        new_list2 = [strain for strain in list2 if strain not in already_placed]

        strain_order.extend(new_list1)
        strain_order.extend(new_list2)

strain_order += unchar_strain_order

# A-binarized, rows and columns in the fixed orders built above (no clustering),
# with one colour strip entry per phylon row
sns.clustermap(A_bin_char.loc[characterized, strain_order], cmap='Greys', xticklabels=False, row_cluster=False, col_cluster=False, row_colors=list(clr.values()))

# Write the current figure to disk as a 300-dpi PNG
plt.rcParams['svg.fonttype'] = 'none'
plt.savefig('../images/fig3/A.png', format='png', dpi=300, bbox_inches='tight')


In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 5))

# One row per characterized phylon: its membership vector across all genes.
# Built once and reused for both the linkage and the dendrogram labels.
phylon_profiles = L_binarized[characterized].T

# Ward linkage between phylons. This cell calls scipy.cluster.hierarchy
# directly, so that submodule must be imported in the imports cell.
links = scipy.cluster.hierarchy.linkage(phylon_profiles, method="ward")

# NOTE(review): `clust` is not used by any cell visible here, and t=16 is larger
# than the number of characterized phylons listed in this notebook - confirm intent.
clust = scipy.cluster.hierarchy.fcluster(links, t=16, criterion='maxclust')
den = scipy.cluster.hierarchy.dendrogram(links, labels=phylon_profiles.index, orientation='left', ax=ax)

## Useful functions for later analysis

In [None]:
def get_strains(phylon, A_binarized = A_binarized):
    """Return the labels of the strains assigned to `phylon`.

    Parameters
    ----------
    phylon : row label of `A_binarized` to look up.
    A_binarized : DataFrame indexed by phylon; defaults to the notebook-level
        `A_binarized` as it existed when this cell was run.

    Returns
    -------
    pandas.Index with the column labels whose entry in that row equals 1.
    """
    memberships = A_binarized.loc[phylon]
    return memberships.loc[memberships.eq(1)].index

def get_genes(phylon, L_binarized = L_binarized):
    """Return the genes with a positive entry for `phylon`.

    Parameters
    ----------
    phylon : column label of `L_binarized` to look up.
    L_binarized : DataFrame with genes on the rows; defaults to the
        notebook-level `L_binarized` as it existed when this cell was run.

    Returns
    -------
    list of row labels, in row order, whose value in that column is > 0.
    """
    # One vectorised comparison instead of a scalar `.loc` lookup per gene;
    # this also keeps working when a row label occurs more than once.
    weights = L_binarized[phylon]
    return weights.index[(weights > 0).to_numpy()].tolist()

In [None]:
# Workhorse function for the rest of this notebook
def find_exclusive_genes(L_binarized, phylon_list1, phylon_list2):
    """Find genes that are active in one group of phylons but in none of the other.

    Parameters
    ----------
    L_binarized : DataFrame with genes on the rows and phylons on the columns.
    phylon_list1 : list of column labels forming the first ("upper") group.
    phylon_list2 : list of column labels forming the second ("lower") group.

    Returns
    -------
    (upper_only_genes, lower_only_genes) : tuple of pandas.Index
        Row labels whose row sum is > 0 over one group and == 0 over the other.
        Genes active in both groups, or in neither, appear in neither result.
    """
    # Each group's row sum is needed for both masks, so compute it once.
    upper_counts = L_binarized[phylon_list1].sum(axis=1)
    lower_counts = L_binarized[phylon_list2].sum(axis=1)

    inUpper = upper_counts > 0
    inLower = lower_counts > 0
    notInUpper = upper_counts == 0
    notInLower = lower_counts == 0

    # Combine the masks before indexing: all four share L_binarized's index, so
    # no implicit re-alignment of a mask against a filtered Series is needed.
    upper_only_genes = L_binarized.index[(inUpper & notInLower).to_numpy()]
    lower_only_genes = L_binarized.index[(inLower & notInUpper).to_numpy()]

    return upper_only_genes, lower_only_genes



# Accuracy Metrics

In [None]:
# Reconstruct the presence/absence matrix from the binarized factors (L x A)
P_rec = L_binarized.astype(int).dot(A_binarized.astype(int))

# gene presence threshold of .5
P_rec = P_rec.gt(.5).astype(int)

# Observed matrix for the same genes, in L_binarized's row order
df_acc_complete = df_genes_complete.loc[L_binarized.index]
P_rec.index = df_acc_complete.index

# Share of (gene, strain) cells where observed and reconstructed values agree
accuracy = (df_acc_complete - P_rec).eq(0).sum().sum() / df_acc_complete.size
print("Reconstruction Accuracy: " + str(round(accuracy * 100, 1)) + "%")

In [None]:
# Line the observed and reconstructed matrices up by gene and strain label
# before flattening. Flattening is positional, so comparing the raw arrays
# would pair the wrong cells whenever the two frames list their strain columns
# in a different order.
aligned_original, aligned_reconstructed = df_acc_complete.align(P_rec, join='inner')
original_flat = aligned_original.to_numpy().ravel()
reconstructed_flat = aligned_reconstructed.to_numpy().ravel()

# Calculate True Positives, False Positives, False Negatives, and True Negatives
TP = np.sum((original_flat == 1) & (reconstructed_flat == 1))  # True Positives
FP = np.sum((original_flat == 0) & (reconstructed_flat == 1))  # False Positives
FN = np.sum((original_flat == 1) & (reconstructed_flat == 0))  # False Negatives
TN = np.sum((original_flat == 0) & (reconstructed_flat == 0))  # True Negatives

# Calculate False Positive Rate and False Negative Rate
FPR = FP / (FP + TN) if (FP + TN) != 0 else 0  # Avoid division by zero
FNR = FN / (FN + TP) if (FN + TP) != 0 else 0  # Avoid division by zero

# Print the results
print(f"False Positive Rate (FPR): {FPR:.4f}")
print(f"False Negative Rate (FNR): {FNR:.4f}")

# Split Analysis

## Split 1 - Hormaechei vs Rest

In [None]:
# Split 1: the eight hormaechei phylons ("upper") against the other named phylons ("lower")
split1_upper = [
    'hormaechei-xiangfangensis',
    'hormaechei-oharae',
    'hormaechei-steigerwaltii-2',
    'hormaechei-steigerwaltii-1',
    'hormaechei-steigerwaltii-3',
    'hormaechei-hormaechei',
    'hormaechei-hoffmannii-1',
    'hormaechei-hoffmannii-2',
]

split1_lower = [
    'roggenkampii',
    'asburiae',
    'kobei',
    'bugandensis',
    'cancerogenous',
    'ludwigii',
    'cloacae',
]

In [None]:
# Metabolic genes in the acc genome: first COG letter is one of C, E, F, G, H, I, P
# (original note: 1170 total metabolic genes).
# NOTE(review): this cell tests only the FIRST COG letter, whereas the per-split
# `isMetabolic` cells below test the whole COG_category string - confirm which is intended.
idx = df_eggnog.loc[L_binarized.index, 'COG_category'].str[0].str.contains('C|E|F|G|H|I|P')
total_metabolic = df_eggnog.loc[L_binarized.index].loc[idx]
total_metabolic.shape

In [None]:
# Motility genes in the acc genome: first COG letter 'N', or a Description that
# mentions 'pili' / 'pilus' (original note: 285 total motility genes).
cond1 = df_eggnog.loc[L_binarized.index, 'COG_category'].str[0].str.contains('N')
cond2 = df_eggnog.loc[L_binarized.index, 'Description'].str.contains('pili')
cond3 = df_eggnog.loc[L_binarized.index, 'Description'].str.contains('pilus')

total_motility = df_eggnog.loc[L_binarized.index].loc[cond1 | cond2 | cond3]

total_motility.shape

In [None]:
# Genes seen only in the upper (hormaechei) phylons vs only in the lower phylons
upper_only_genes, lower_only_genes = find_exclusive_genes(
    L_binarized,
    phylon_list1=split1_upper,
    phylon_list2=split1_lower,
)

display(len(upper_only_genes), len(lower_only_genes))

### Upper Phylons exclusive genes

In [None]:
# Tally of the first COG letter among the upper-exclusive genes
# (original note: 646 genes have no known function).
df_eggnog.loc[upper_only_genes, 'COG_category'].str[0].value_counts()

In [None]:
# Upper-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 184 genes are metabolic).
isMetabolic = df_eggnog.loc[upper_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
upper_metabolic = df_eggnog.loc[upper_only_genes].loc[isMetabolic]

display(upper_metabolic.shape, upper_metabolic.head())

In [None]:
# Upper-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 119 motility genes).
isMotility1 = df_eggnog.loc[upper_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[upper_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[upper_only_genes, 'Description'].str.contains('pili')

upper_motility = df_eggnog.loc[upper_only_genes].loc[isMotility1 | isMotility2 | isMotility3]

display(upper_motility.shape, upper_motility.head())

### Lower phylon exclusive genes

In [None]:
# Preview and dimensions of the annotations for the lower-exclusive genes
display(
    df_eggnog.loc[lower_only_genes].head(),
    df_eggnog.loc[lower_only_genes].shape,
)

In [None]:
# Tally of the first COG letter among the lower-exclusive genes
# (original note: 345 genes have no known function).
df_eggnog.loc[lower_only_genes, 'COG_category'].str[0].value_counts()

In [None]:
# Lower-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 279 genes are metabolic).
isMetabolic = df_eggnog.loc[lower_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
lower_metabolic = df_eggnog.loc[lower_only_genes].loc[isMetabolic]

display(lower_metabolic.shape, lower_metabolic.head())

In [None]:
# Lower-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 70 motility genes).
isMotility1 = df_eggnog.loc[lower_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[lower_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[lower_only_genes, 'Description'].str.contains('pili')

lower_motility = df_eggnog.loc[lower_only_genes].loc[isMotility1 | isMotility2 | isMotility3]

display(lower_motility.shape, lower_motility.head())

## Split 2 - Lower Phylons - ARKB vs CLC

In [None]:
# Split 2 (lower phylons): asburiae/roggenkampii/kobei/bugandensis vs cancerogenous/ludwigii/cloacae
split2_arkb = ['asburiae', 'roggenkampii', 'kobei', 'bugandensis']

split2_clc = ['cancerogenous', 'ludwigii', 'cloacae']

In [None]:
# Within the lower-exclusive genes: those seen only in ARKB vs only in CLC
arkb_only_genes, clc_only_genes = find_exclusive_genes(
    L_binarized.loc[lower_only_genes],
    phylon_list1=split2_arkb,
    phylon_list2=split2_clc,
)

display(f'arkb exclusive genes: {len(arkb_only_genes)}', f'clc exclusive genes: {len(clc_only_genes)}')

In [None]:
# Lower-exclusive genes that are exclusive to neither ARKB nor CLC
# (original note: 462 genes split/shared across).
len(lower_only_genes) - (len(arkb_only_genes) + len(clc_only_genes))

### ARKB Phylon Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the ARKB-exclusive genes
# (original note: 167 unknown genes).
display(
    df_eggnog.loc[arkb_only_genes].shape,
    df_eggnog.loc[arkb_only_genes].head(),
    df_eggnog.loc[arkb_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# ARKB-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 87 genes are metabolic).
isMetabolic = df_eggnog.loc[arkb_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
arkb_metabolic = df_eggnog.loc[arkb_only_genes].loc[isMetabolic]

display(arkb_metabolic)

In [None]:
# ARKB-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 33 motility genes).
isMotility1 = df_eggnog.loc[arkb_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[arkb_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[arkb_only_genes, 'Description'].str.contains('pili')

arkb_motility = df_eggnog.loc[arkb_only_genes].loc[isMotility1 | isMotility2 | isMotility3]
arkb_motility

### CLC Phylon Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the CLC-exclusive genes
# (original note: 43 unknown genes).
display(
    df_eggnog.loc[clc_only_genes].shape,
    df_eggnog.loc[clc_only_genes].head(),
    df_eggnog.loc[clc_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# CLC-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 44 genes are metabolic).
isMetabolic = df_eggnog.loc[clc_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
clc_metabolic = df_eggnog.loc[clc_only_genes].loc[isMetabolic]

display(clc_metabolic)

In [None]:
# CLC-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 7 motility genes).
isMotility1 = df_eggnog.loc[clc_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[clc_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[clc_only_genes, 'Description'].str.contains('pili')

clc_motility = df_eggnog.loc[clc_only_genes].loc[isMotility1 | isMotility2 | isMotility3]
clc_motility

## Split 3 - AR vs KB Phylon Genes

In [None]:
# Split 3 (within ARKB): asburiae/roggenkampii vs kobei/bugandensis
split3_ar = ['asburiae', 'roggenkampii']

split3_kb = ['kobei', 'bugandensis']


In [None]:
# Within the ARKB-exclusive genes: those seen only in AR vs only in KB
ar_only_genes, kb_only_genes = find_exclusive_genes(
    L_binarized.loc[arkb_only_genes],
    phylon_list1=split3_ar,
    phylon_list2=split3_kb,
)

display(f'ar exclusive genes: {len(ar_only_genes)}', f'kb exclusive genes: {len(kb_only_genes)}')

In [None]:
# ARKB-exclusive genes that are exclusive to neither AR nor KB
# (original note: 143 genes split/shared across).
len(arkb_only_genes) - (len(ar_only_genes) + len(kb_only_genes))

### AR Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the AR-exclusive genes
# (original note: 82 unknown genes).
display(
    df_eggnog.loc[ar_only_genes].shape,
    df_eggnog.loc[ar_only_genes].head(),
    df_eggnog.loc[ar_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# AR-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 39 genes are metabolic).
isMetabolic = df_eggnog.loc[ar_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
ar_metabolic = df_eggnog.loc[ar_only_genes].loc[isMetabolic]

display(ar_metabolic)

In [None]:
# AR-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 22 motility genes).
isMotility1 = df_eggnog.loc[ar_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[ar_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[ar_only_genes, 'Description'].str.contains('pili')

ar_motility = df_eggnog.loc[ar_only_genes].loc[isMotility1 | isMotility2 | isMotility3]
ar_motility

### KB Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the KB-exclusive genes
# (original note: 41 unknown genes).
display(
    df_eggnog.loc[kb_only_genes].shape,
    df_eggnog.loc[kb_only_genes].head(),
    df_eggnog.loc[kb_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# KB-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 8 genes are metabolic).
isMetabolic = df_eggnog.loc[kb_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
kb_metabolic = df_eggnog.loc[kb_only_genes].loc[isMetabolic]

display(kb_metabolic)

In [None]:
# KB-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 3 motility genes).
isMotility1 = df_eggnog.loc[kb_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[kb_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[kb_only_genes, 'Description'].str.contains('pili')

kb_motility = df_eggnog.loc[kb_only_genes].loc[isMotility1 | isMotility2 | isMotility3]
kb_motility

## Split 4 - Asburiae vs Roggenkampii

In [None]:
# Split 4 (within AR): asburiae vs roggenkampii, each as a one-element list
split4_a = ['asburiae']

split4_r = ['roggenkampii']



In [None]:
# Within the AR-exclusive genes: those seen only in asburiae vs only in roggenkampii
a_only_genes, r_only_genes = find_exclusive_genes(
    L_binarized.loc[ar_only_genes],
    phylon_list1=split4_a,
    phylon_list2=split4_r,
)

display(f'a exclusive genes: {len(a_only_genes)}', f'r exclusive genes: {len(r_only_genes)}')

In [None]:
# AR-exclusive genes that are exclusive to neither asburiae nor roggenkampii
# (original note: 34 genes split/shared across).
len(ar_only_genes) - (len(a_only_genes) + len(r_only_genes))

### Asburiae Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the asburiae-exclusive genes
# (original note: 30 unknown genes).
display(
    df_eggnog.loc[a_only_genes].shape,
    df_eggnog.loc[a_only_genes].head(),
    df_eggnog.loc[a_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# Asburiae-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 13 genes are metabolic).
isMetabolic = df_eggnog.loc[a_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
a_metabolic = df_eggnog.loc[a_only_genes].loc[isMetabolic]

display(a_metabolic)

In [None]:
# Asburiae-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 7 motility genes).
isMotility1 = df_eggnog.loc[a_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[a_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[a_only_genes, 'Description'].str.contains('pili')

a_motility = df_eggnog.loc[a_only_genes].loc[isMotility1 | isMotility2 | isMotility3]
a_motility

### Roggenkampii Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the roggenkampii-exclusive genes
# (original note: 44 unknown genes).
display(
    df_eggnog.loc[r_only_genes].shape,
    df_eggnog.loc[r_only_genes].head(),
    df_eggnog.loc[r_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# Roggenkampii-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 16 genes are metabolic).
isMetabolic = df_eggnog.loc[r_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
r_metabolic = df_eggnog.loc[r_only_genes].loc[isMetabolic]

display(r_metabolic)

In [None]:
# Roggenkampii-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 13 motility genes).
isMotility1 = df_eggnog.loc[r_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[r_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[r_only_genes, 'Description'].str.contains('pili')

r_motility = df_eggnog.loc[r_only_genes].loc[isMotility1 | isMotility2 | isMotility3]
r_motility

## Split 4 - Kobei vs Bugandensis

In [None]:
# Split 4 (within KB): kobei vs bugandensis, each as a one-element list
split4_k = ['kobei']

split4_b = ['bugandensis']

In [None]:
# Within the KB-exclusive genes: those seen only in kobei vs only in bugandensis
k_only_genes, b_only_genes = find_exclusive_genes(
    L_binarized.loc[kb_only_genes],
    phylon_list1=split4_k,
    phylon_list2=split4_b,
)

display(f'k exclusive genes: {len(k_only_genes)}', f'b exclusive genes: {len(b_only_genes)}')

In [None]:
# KB-exclusive genes that are exclusive to neither kobei nor bugandensis
# (original note: 15 genes split/shared across).
len(kb_only_genes) - (len(k_only_genes) + len(b_only_genes))

### Kobei Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the kobei-exclusive genes
# (original note: 14 unknown genes).
display(
    df_eggnog.loc[k_only_genes].shape,
    df_eggnog.loc[k_only_genes].head(),
    df_eggnog.loc[k_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# Kobei-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 1 gene is metabolic).
isMetabolic = df_eggnog.loc[k_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
k_metabolic = df_eggnog.loc[k_only_genes].loc[isMetabolic]

display(k_metabolic)

In [None]:
# Kobei-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 0 motility genes).
isMotility1 = df_eggnog.loc[k_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[k_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[k_only_genes, 'Description'].str.contains('pili')

k_motility = df_eggnog.loc[k_only_genes].loc[isMotility1 | isMotility2 | isMotility3]
k_motility

### Bugandensis Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the bugandensis-exclusive genes
# (original note: 21 unknown genes).
display(
    df_eggnog.loc[b_only_genes].shape,
    df_eggnog.loc[b_only_genes].head(),
    df_eggnog.loc[b_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# Bugandensis-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 4 genes are metabolic).
isMetabolic = df_eggnog.loc[b_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
b_metabolic = df_eggnog.loc[b_only_genes].loc[isMetabolic]

display(b_metabolic)

In [None]:
# Bugandensis-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 2 motility genes).
isMotility1 = df_eggnog.loc[b_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[b_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[b_only_genes, 'Description'].str.contains('pili')

b_motility = df_eggnog.loc[b_only_genes].loc[isMotility1 | isMotility2 | isMotility3]
b_motility

## Split 3 - Cancerogenous vs LC Phylon Genes

In [None]:
# Split 3 (within CLC): cancerogenous vs ludwigii/cloacae
split3_can = ['cancerogenous']

split3_lc = ['ludwigii', 'cloacae']


In [None]:
# Within the CLC-exclusive genes: those seen only in cancerogenous vs only in
# the ludwigii/cloacae (LC) pair.
can_only_genes, lc_only_genes = find_exclusive_genes(
    L_binarized.loc[clc_only_genes],
    split3_can,
    split3_lc
)

# The second label previously read 'lmc', which does not match the group being
# counted (`split3_lc` = ludwigii + cloacae); it now reads 'lc'.
display(
    f'Can exclusive genes: {len(can_only_genes)}',
    f'lc exclusive genes: {len(lc_only_genes)}',
)

In [None]:
# CLC-exclusive genes that are exclusive to neither cancerogenous nor LC
# (original note: 29 genes split/shared across).
len(clc_only_genes) - (len(can_only_genes) + len(lc_only_genes))

### Cancerogenous Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the cancerogenous-exclusive genes
# (original note: 4 unknown genes).
display(
    df_eggnog.loc[can_only_genes].shape,
    df_eggnog.loc[can_only_genes].head(),
    df_eggnog.loc[can_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# Cancerogenous-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 3 genes are metabolic).
isMetabolic = df_eggnog.loc[can_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
can_metabolic = df_eggnog.loc[can_only_genes].loc[isMetabolic]

display(can_metabolic)

In [None]:
# Cancerogenous-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 0 motility genes).
isMotility1 = df_eggnog.loc[can_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[can_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[can_only_genes, 'Description'].str.contains('pili')

can_motility = df_eggnog.loc[can_only_genes].loc[isMotility1 | isMotility2 | isMotility3]
can_motility

### LC Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the LC-exclusive genes
# (original note: 26 unknown genes).
display(
    df_eggnog.loc[lc_only_genes].shape,
    df_eggnog.loc[lc_only_genes].head(),
    df_eggnog.loc[lc_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# LC-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 35 genes are metabolic).
isMetabolic = df_eggnog.loc[lc_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
lc_metabolic = df_eggnog.loc[lc_only_genes].loc[isMetabolic]

display(lc_metabolic)

In [None]:
# LC-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 6 motility genes).
isMotility1 = df_eggnog.loc[lc_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[lc_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[lc_only_genes, 'Description'].str.contains('pili')

lc_motility = df_eggnog.loc[lc_only_genes].loc[isMotility1 | isMotility2 | isMotility3]
lc_motility

## Split 4 - Ludwigii vs Cloacae Phylon Genes

In [None]:
# Split 4 (within LC): ludwigii vs cloacae, each as a one-element list
split4_l = ['ludwigii']
split4_c = ['cloacae']


In [None]:
# Within the LC-exclusive genes: those seen only in ludwigii vs only in cloacae
l_only_genes, c_only_genes = find_exclusive_genes(
    L_binarized.loc[lc_only_genes],
    phylon_list1=split4_l,
    phylon_list2=split4_c,
)

display(f'l exclusive genes: {len(l_only_genes)}', f'c exclusive genes: {len(c_only_genes)}')

In [None]:
# LC-exclusive genes that are exclusive to neither ludwigii nor cloacae
# (original note: 28 genes split/shared across).
len(lc_only_genes) - (len(l_only_genes) + len(c_only_genes))

### Ludwigii Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the ludwigii-exclusive genes
# (original note: 17 unknown genes).
display(
    df_eggnog.loc[l_only_genes].shape,
    df_eggnog.loc[l_only_genes].head(),
    df_eggnog.loc[l_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# Ludwigii-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 7 genes are metabolic).
isMetabolic = df_eggnog.loc[l_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
l_metabolic = df_eggnog.loc[l_only_genes].loc[isMetabolic]

display(l_metabolic)

In [None]:
# Ludwigii-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 4 motility genes).
isMotility1 = df_eggnog.loc[l_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[l_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[l_only_genes, 'Description'].str.contains('pili')

l_motility = df_eggnog.loc[l_only_genes].loc[isMotility1 | isMotility2 | isMotility3]
l_motility

### Cloacae Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the cloacae-exclusive genes
# (original note: 8 unknown genes).
display(
    df_eggnog.loc[c_only_genes].shape,
    df_eggnog.loc[c_only_genes].head(),
    df_eggnog.loc[c_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# Cloacae-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 14 genes are metabolic).
isMetabolic = df_eggnog.loc[c_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
c_metabolic = df_eggnog.loc[c_only_genes].loc[isMetabolic]

display(c_metabolic)

In [None]:
# Cloacae-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 1 motility gene).
isMotility1 = df_eggnog.loc[c_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[c_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[c_only_genes, 'Description'].str.contains('pili')

c_motility = df_eggnog.loc[c_only_genes].loc[isMotility1 | isMotility2 | isMotility3]
c_motility

## Split 2 - hormaechei-hoffmannii-1/2 and horm vs SOX

In [None]:
# Split 2 (upper phylons): hoffmannii-1/2 + hormaechei vs steigerwaltii/oharae/xiangfangensis
split2_hoff_horm = [
    'hormaechei-hoffmannii-1',
    'hormaechei-hoffmannii-2',
    'hormaechei-hormaechei',
]

split2_hormaechei_sox = [
    'hormaechei-steigerwaltii-3',
    'hormaechei-steigerwaltii-1',
    'hormaechei-steigerwaltii-2',
    'hormaechei-oharae',
    'hormaechei-xiangfangensis',
]

In [None]:
# Within the upper-exclusive genes: those seen only in hoff/horm vs only in SOX
hoff_horm_only_genes, hormaechei_sox_only_genes = find_exclusive_genes(
    L_binarized.loc[upper_only_genes],
    phylon_list1=split2_hoff_horm,
    phylon_list2=split2_hormaechei_sox,
)

display(f'hoff_horm exclusive genes: {len(hoff_horm_only_genes)}', f'sox exclusive genes: {len(hormaechei_sox_only_genes)}')

In [None]:
# Upper-exclusive genes that are exclusive to neither hoff/horm nor SOX
# (original note: 410 genes split/shared across).
len(upper_only_genes) - (len(hoff_horm_only_genes) + len(hormaechei_sox_only_genes))

### Hoff/Horm (hoffmannii-1/2 + hormaechei) Phylon Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the hoff/horm-exclusive genes
# (original note: 201 unknown genes).
display(
    df_eggnog.loc[hoff_horm_only_genes].shape,
    df_eggnog.loc[hoff_horm_only_genes].head(),
    df_eggnog.loc[hoff_horm_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# Hoff/horm-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 38 genes are metabolic).
isMetabolic = df_eggnog.loc[hoff_horm_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
hoff_horm_metabolic = df_eggnog.loc[hoff_horm_only_genes].loc[isMetabolic]

display(hoff_horm_metabolic)

In [None]:
# Hoff/horm-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 26 motility genes).
isMotility1 = df_eggnog.loc[hoff_horm_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[hoff_horm_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[hoff_horm_only_genes, 'Description'].str.contains('pili')

hoff_horm_motility = df_eggnog.loc[hoff_horm_only_genes].loc[isMotility1 | isMotility2 | isMotility3]
hoff_horm_motility

### SOX Phylon Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the SOX-exclusive genes
# (original note: 265 unknown genes).
display(
    df_eggnog.loc[hormaechei_sox_only_genes].shape,
    df_eggnog.loc[hormaechei_sox_only_genes].head(),
    df_eggnog.loc[hormaechei_sox_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# SOX-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 63 genes are metabolic).
isMetabolic = df_eggnog.loc[hormaechei_sox_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
hormaechei_sox_metabolic = df_eggnog.loc[hormaechei_sox_only_genes].loc[isMetabolic]

display(hormaechei_sox_metabolic)

In [None]:
# SOX-exclusive genes with 'N' in the COG category or 'pilus' / 'pili' in the
# Description (original note: 73 motility genes).
isMotility1 = df_eggnog.loc[hormaechei_sox_only_genes, 'COG_category'].str.contains('N')
isMotility2 = df_eggnog.loc[hormaechei_sox_only_genes, 'Description'].str.contains('pilus')
isMotility3 = df_eggnog.loc[hormaechei_sox_only_genes, 'Description'].str.contains('pili')

hormaechei_sox_motility = df_eggnog.loc[hormaechei_sox_only_genes].loc[isMotility1 | isMotility2 | isMotility3]
hormaechei_sox_motility

## Split 3 - hormaechei-hoffmannii-1/2 vs horm

In [None]:
# Split 3 (within hoff/horm): hoffmannii-1/2 vs hormaechei
split3_hoff12 = ['hormaechei-hoffmannii-1', 'hormaechei-hoffmannii-2']

split3_horm = ['hormaechei-hormaechei']

In [None]:
# Within the hoff/horm-exclusive genes: those seen only in hoffmannii-1/2 vs only in hormaechei
hoff12_only_genes, horm_only_genes = find_exclusive_genes(
    L_binarized.loc[hoff_horm_only_genes],
    phylon_list1=split3_hoff12,
    phylon_list2=split3_horm,
)

display(f'hormaechei-hoffmannii-1/2 exclusive genes: {len(hoff12_only_genes)}', f'hormaechei-hormaechei exclusive genes: {len(horm_only_genes)}')

In [None]:
# Hoff/horm-exclusive genes that are exclusive to neither hoffmannii-1/2 nor hormaechei
# (original note: 26 genes split/shared across).
len(hoff_horm_only_genes) - (len(hoff12_only_genes) + len(horm_only_genes))

### hormaechei-hoffmannii-1/2 Phylon Genes

In [None]:
# Dimensions, preview, and first-COG-letter tally for the hoffmannii-1/2-exclusive genes
# (original note: 103 unknown genes).
display(
    df_eggnog.loc[hoff12_only_genes].shape,
    df_eggnog.loc[hoff12_only_genes].head(),
    df_eggnog.loc[hoff12_only_genes, 'COG_category'].str[0].value_counts(),
)

In [None]:
# Hoffmannii-1/2-exclusive genes whose COG category string contains any of C, E, F, G, H, I, P
# (original note: 21 genes are metabolic).
isMetabolic = df_eggnog.loc[hoff12_only_genes, 'COG_category'].str.contains('C|E|F|G|H|I|P')
hoff12_metabolic = df_eggnog.loc[hoff12_only_genes].loc[isMetabolic]

display(hoff12_metabolic)

In [None]:
# Motility genes among the hoffmannii-1/2-exclusive genes: COG category N, or a
# description mentioning pilus/pili. Descriptions are matched case-insensitively,
# as in the dataset-wide `total_motility` cell, so capitalised spellings are not
# missed. The earlier case-sensitive run recorded 25 genes; re-check after rerunning.
hoff12_annotations = df_eggnog.loc[hoff12_only_genes]
isMotility1 = hoff12_annotations['COG_category'].str.contains('N')
isMotility2 = hoff12_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = hoff12_annotations['Description'].str.contains('pili', case=False)

hoff12_motility = hoff12_annotations.loc[isMotility1 | isMotility2 | isMotility3]
hoff12_motility

### hormaechei-hormaechei Phylon Genes

In [None]:
# Annotation overview for the hormaechei-hormaechei-exclusive genes: table shape,
# first rows, and a tally of the first COG category letter per gene.
# Author's note from the original run: 86 unknown genes.
horm_annotations = df_eggnog.loc[horm_only_genes]
display(
    horm_annotations.shape,
    horm_annotations.head(),
    horm_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of the hormaechei-hormaechei-exclusive genes: COG category
# contains any of C, E, F, G, H, I or P. Author's recorded count: 11.
horm_annotations = df_eggnog.loc[horm_only_genes]
isMetabolic = horm_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
horm_metabolic = horm_annotations.loc[isMetabolic]

display(horm_metabolic)

In [None]:
# Motility genes among the hormaechei-hormaechei-exclusive genes: COG category N,
# or a description mentioning pilus/pili. Descriptions are matched
# case-insensitively, as in the dataset-wide `total_motility` cell, so capitalised
# spellings are not missed. The earlier case-sensitive run recorded 0 genes;
# re-check after rerunning.
horm_annotations = df_eggnog.loc[horm_only_genes]
isMotility1 = horm_annotations['COG_category'].str.contains('N')
isMotility2 = horm_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = horm_annotations['Description'].str.contains('pili', case=False)

horm_motility = horm_annotations.loc[isMotility1 | isMotility2 | isMotility3]
horm_motility

## Split 4 - Hoff1 vs Hoff2 Phylon Genes

In [None]:
# Split 4 groups: each hoffmannii phylon on its own.
split4_hoff1 = ['hormaechei-hoffmannii-1']
split4_hoff2 = ['hormaechei-hoffmannii-2']

In [None]:
# Split the hoffmannii-1/2-exclusive genes into those exclusive to each
# hoffmannii phylon, then report the size of each exclusive set.
split4_hoff_membership = L_binarized.loc[hoff12_only_genes]
hoff1_only_genes, hoff2_only_genes = find_exclusive_genes(split4_hoff_membership, split4_hoff1, split4_hoff2)

display(
    f'hormaechei-hoffmannii-1 exclusive genes: {len(hoff1_only_genes)}',
    f'hormaechei-hoffmannii-2 exclusive genes: {len(hoff2_only_genes)}',
)

In [None]:
# Parent-set genes that fall in neither exclusive subset (split/shared across
# both hoffmannii phylons). Author's recorded result: 74.
len(hoff12_only_genes) - len(hoff1_only_genes) - len(hoff2_only_genes)

### hormaechei-hoffmannii-1 Phylon Genes

In [None]:
# Annotation overview for the hoffmannii-1-exclusive genes: table shape, first
# rows, and a tally of the first COG category letter per gene.
# Author's note from the original run: 59 unknown genes.
hoff1_annotations = df_eggnog.loc[hoff1_only_genes]
display(
    hoff1_annotations.shape,
    hoff1_annotations.head(),
    hoff1_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of the hoffmannii-1-exclusive genes: COG category contains
# any of C, E, F, G, H, I or P. Author's recorded count: 16.
hoff1_annotations = df_eggnog.loc[hoff1_only_genes]
isMetabolic = hoff1_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
hoff1_metabolic = hoff1_annotations.loc[isMetabolic]

display(hoff1_metabolic)

In [None]:
# Motility genes among the hoffmannii-1-exclusive genes: COG category N, or a
# description mentioning pilus/pili. Descriptions are matched case-insensitively,
# as in the dataset-wide `total_motility` cell, so capitalised spellings are not
# missed. The earlier case-sensitive run recorded 5 genes; re-check after rerunning.
hoff1_annotations = df_eggnog.loc[hoff1_only_genes]
isMotility1 = hoff1_annotations['COG_category'].str.contains('N')
isMotility2 = hoff1_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = hoff1_annotations['Description'].str.contains('pili', case=False)

hoff1_motility = hoff1_annotations.loc[isMotility1 | isMotility2 | isMotility3]
hoff1_motility

### hormaechei-hoffmannii-2 Phylon Genes

In [None]:
# Annotation overview for the hoffmannii-2-exclusive genes: table shape, first
# rows, and a tally of the first COG category letter per gene.
# Author's note from the original run: 5 unknown genes.
hoff2_annotations = df_eggnog.loc[hoff2_only_genes]
display(
    hoff2_annotations.shape,
    hoff2_annotations.head(),
    hoff2_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of the hoffmannii-2-exclusive genes: COG category contains
# any of C, E, F, G, H, I or P. Author's recorded count: 2.
hoff2_annotations = df_eggnog.loc[hoff2_only_genes]
isMetabolic = hoff2_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
hoff2_metabolic = hoff2_annotations.loc[isMetabolic]

display(hoff2_metabolic)

In [None]:
# Motility genes among the hoffmannii-2-exclusive genes: COG category N, or a
# description mentioning pilus/pili. Descriptions are matched case-insensitively,
# as in the dataset-wide `total_motility` cell, so capitalised spellings are not
# missed. The earlier case-sensitive run recorded 0 genes; re-check after rerunning.
hoff2_annotations = df_eggnog.loc[hoff2_only_genes]
isMotility1 = hoff2_annotations['COG_category'].str.contains('N')
isMotility2 = hoff2_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = hoff2_annotations['Description'].str.contains('pili', case=False)

hoff2_motility = hoff2_annotations.loc[isMotility1 | isMotility2 | isMotility3]
hoff2_motility

## Split 3 - steigerwaltii (S) vs oharae/xiangfangensis (OX)

In [None]:
# Split 3 groups within SOX: the three steigerwaltii phylons versus oharae + xiangfangensis.
split3_steigerwaltii = [
    'hormaechei-steigerwaltii-2',
    'hormaechei-steigerwaltii-1',
    'hormaechei-steigerwaltii-3',
]
split3_hormaechei_ox = [
    'hormaechei-oharae',
    'hormaechei-xiangfangensis',
]

In [None]:
# Split the SOX-exclusive genes into those exclusive to the steigerwaltii side
# and those exclusive to the OX side, then report the size of each set.
split3_sox_membership = L_binarized.loc[hormaechei_sox_only_genes]
steigerwaltii_only_genes, hormaechei_ox_only_genes = find_exclusive_genes(
    split3_sox_membership, split3_steigerwaltii, split3_hormaechei_ox
)

display(
    f'hormaechei-steigerwaltii exclusive genes: {len(steigerwaltii_only_genes)}',
    f'hormaechei-OX exclusive genes: {len(hormaechei_ox_only_genes)}',
)

In [None]:
# Parent-set genes that fall in neither exclusive subset (split/shared across
# the steigerwaltii and OX sides). Author's recorded result: 89.
len(hormaechei_sox_only_genes) - len(steigerwaltii_only_genes) - len(hormaechei_ox_only_genes)

### hormaechei-steigerwaltii Phylon Genes

In [None]:
# Annotation overview for the steigerwaltii-exclusive genes: table shape, first
# rows, and a tally of the first COG category letter per gene.
# Author's note from the original run: 178 unknown genes.
steigerwaltii_annotations = df_eggnog.loc[steigerwaltii_only_genes]
display(
    steigerwaltii_annotations.shape,
    steigerwaltii_annotations.head(),
    steigerwaltii_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of the steigerwaltii-exclusive genes: COG category contains
# any of C, E, F, G, H, I or P. Author's recorded count: 45.
steigerwaltii_annotations = df_eggnog.loc[steigerwaltii_only_genes]
isMetabolic = steigerwaltii_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
steigerwaltii_metabolic = steigerwaltii_annotations.loc[isMetabolic]

display(steigerwaltii_metabolic)

In [None]:
# Motility genes among the steigerwaltii-exclusive genes: COG category N, or a
# description mentioning pilus/pili. Descriptions are matched case-insensitively,
# as in the dataset-wide `total_motility` cell, so capitalised spellings are not
# missed. The earlier case-sensitive run recorded 53 genes; re-check after rerunning.
steigerwaltii_annotations = df_eggnog.loc[steigerwaltii_only_genes]
isMotility1 = steigerwaltii_annotations['COG_category'].str.contains('N')
isMotility2 = steigerwaltii_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = steigerwaltii_annotations['Description'].str.contains('pili', case=False)

steigerwaltii_motility = steigerwaltii_annotations.loc[isMotility1 | isMotility2 | isMotility3]
steigerwaltii_motility

### hormaechei-OX Phylon Genes

In [None]:
# Annotation overview for the OX-exclusive genes: table shape, first rows, and
# a tally of the first COG category letter per gene.
# Author's note from the original run: 36 unknown genes.
ox_annotations = df_eggnog.loc[hormaechei_ox_only_genes]
display(
    ox_annotations.shape,
    ox_annotations.head(),
    ox_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of the OX-exclusive genes: COG category contains any of
# C, E, F, G, H, I or P. Author's recorded count: 11.
ox_annotations = df_eggnog.loc[hormaechei_ox_only_genes]
isMetabolic = ox_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
hormaechei_ox_metabolic = ox_annotations.loc[isMetabolic]

display(hormaechei_ox_metabolic)

In [None]:
# Motility genes among the OX-exclusive genes: COG category N, or a description
# mentioning pilus/pili. Descriptions are matched case-insensitively, as in the
# dataset-wide `total_motility` cell, so capitalised spellings are not missed.
# The earlier case-sensitive run recorded 11 genes; re-check after rerunning.
ox_annotations = df_eggnog.loc[hormaechei_ox_only_genes]
isMotility1 = ox_annotations['COG_category'].str.contains('N')
isMotility2 = ox_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = ox_annotations['Description'].str.contains('pili', case=False)

hormaechei_ox_motility = ox_annotations.loc[isMotility1 | isMotility2 | isMotility3]
hormaechei_ox_motility

## Split 4 - oharae vs xiangfangensis

In [None]:
# Split 4 groups within OX: oharae versus xiangfangensis.
split4_oharae = ['hormaechei-oharae']
split4_xiangfangensis = ['hormaechei-xiangfangensis']

In [None]:
# Split the OX-exclusive genes into those exclusive to oharae and those
# exclusive to xiangfangensis, then report the size of each set.
split4_ox_membership = L_binarized.loc[hormaechei_ox_only_genes]
oharae_only_genes, xiangfangensis_only_genes = find_exclusive_genes(
    split4_ox_membership, split4_oharae, split4_xiangfangensis
)

display(
    f'hormaechei-oharae exclusive genes: {len(oharae_only_genes)}',
    f'hormaechei-xiangfangensis exclusive genes: {len(xiangfangensis_only_genes)}',
)

In [None]:
# Parent-set genes that fall in neither exclusive subset (split/shared across
# oharae and xiangfangensis). Author's recorded result: 14.
len(hormaechei_ox_only_genes) - len(oharae_only_genes) - len(xiangfangensis_only_genes)

### hormaechei-oharae Phylon Genes

In [None]:
# Annotation overview for the oharae-exclusive genes: table shape, first rows,
# and a tally of the first COG category letter per gene.
# Author's note from the original run: 16 unknown genes.
oharae_annotations = df_eggnog.loc[oharae_only_genes]
display(
    oharae_annotations.shape,
    oharae_annotations.head(),
    oharae_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of the oharae-exclusive genes: COG category contains any of
# C, E, F, G, H, I or P. Author's recorded count: 6.
oharae_annotations = df_eggnog.loc[oharae_only_genes]
isMetabolic = oharae_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
oharae_metabolic = oharae_annotations.loc[isMetabolic]

display(oharae_metabolic)

In [None]:
# Motility genes among the oharae-exclusive genes: COG category N, or a
# description mentioning pilus/pili. Descriptions are matched case-insensitively,
# as in the dataset-wide `total_motility` cell, so capitalised spellings are not
# missed. The earlier case-sensitive run recorded 5 genes; re-check after rerunning.
oharae_annotations = df_eggnog.loc[oharae_only_genes]
isMotility1 = oharae_annotations['COG_category'].str.contains('N')
isMotility2 = oharae_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = oharae_annotations['Description'].str.contains('pili', case=False)

oharae_motility = oharae_annotations.loc[isMotility1 | isMotility2 | isMotility3]
oharae_motility

### hormaechei-xiangfangensis Phylon Genes

In [None]:
# Annotation overview for the xiangfangensis-exclusive genes: table shape, first
# rows, and a tally of the first COG category letter per gene.
# Author's note from the original run: 16 unknown genes.
xiangfangensis_annotations = df_eggnog.loc[xiangfangensis_only_genes]
display(
    xiangfangensis_annotations.shape,
    xiangfangensis_annotations.head(),
    xiangfangensis_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of the xiangfangensis-exclusive genes: COG category contains
# any of C, E, F, G, H, I or P. Author's recorded count: 2.
xiangfangensis_annotations = df_eggnog.loc[xiangfangensis_only_genes]
isMetabolic = xiangfangensis_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
xiangfangensis_metabolic = xiangfangensis_annotations.loc[isMetabolic]

display(xiangfangensis_metabolic)

In [None]:
# Motility genes among the xiangfangensis-exclusive genes: COG category N, or a
# description mentioning pilus/pili. Descriptions are matched case-insensitively,
# as in the dataset-wide `total_motility` cell, so capitalised spellings are not
# missed. The earlier case-sensitive run recorded 6 genes; re-check after rerunning.
xiangfangensis_annotations = df_eggnog.loc[xiangfangensis_only_genes]
isMotility1 = xiangfangensis_annotations['COG_category'].str.contains('N')
isMotility2 = xiangfangensis_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = xiangfangensis_annotations['Description'].str.contains('pili', case=False)

xiangfangensis_motility = xiangfangensis_annotations.loc[isMotility1 | isMotility2 | isMotility3]
xiangfangensis_motility

## Split 4 - hormaechei-steigerwaltii-2 vs hormaechei-steigerwaltii-13

In [None]:
# Split 4 groups within steigerwaltii: phylon 2 versus phylons 1 and 3.
split4_steigerwaltii2 = ['hormaechei-steigerwaltii-2']
split4_steigerwaltii13 = [
    'hormaechei-steigerwaltii-1',
    'hormaechei-steigerwaltii-3',
]

In [None]:
# Split the steigerwaltii-exclusive genes into those exclusive to
# steigerwaltii-2 and those exclusive to the steigerwaltii-1/3 pair, then
# report the size of each set.
steigerwaltii2_only_genes, steigerwaltii13_only_genes = find_exclusive_genes(
    L_binarized.loc[steigerwaltii_only_genes],
    split4_steigerwaltii2,
    split4_steigerwaltii13
)

# The second label previously read "hormaechei-steigerwaltii-18", which is not
# one of the phylons in this split; it reports the steigerwaltii-1/3 group.
display(
    f'hormaechei-steigerwaltii-2 exclusive genes: {len(steigerwaltii2_only_genes)}',
    f'hormaechei-steigerwaltii-1/3 exclusive genes: {len(steigerwaltii13_only_genes)}',
)

In [None]:
# Parent-set genes that fall in neither exclusive subset (split/shared across
# steigerwaltii-2 and steigerwaltii-1/3). Author's recorded result: 76.
len(steigerwaltii_only_genes) - len(steigerwaltii2_only_genes) - len(steigerwaltii13_only_genes)

### hormaechei-steigerwaltii-2 Phylon Genes

In [None]:
# Annotation overview for the steigerwaltii-2-exclusive genes: table shape, first
# rows, and a tally of the first COG category letter per gene.
# Author's note from the original run: 78 unknown genes.
steigerwaltii2_annotations = df_eggnog.loc[steigerwaltii2_only_genes]
display(
    steigerwaltii2_annotations.shape,
    steigerwaltii2_annotations.head(),
    steigerwaltii2_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of the steigerwaltii-2-exclusive genes: COG category contains
# any of C, E, F, G, H, I or P. Author's recorded count: 2.
steigerwaltii2_annotations = df_eggnog.loc[steigerwaltii2_only_genes]
isMetabolic = steigerwaltii2_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
steigerwaltii2_metabolic = steigerwaltii2_annotations.loc[isMetabolic]

display(steigerwaltii2_metabolic)

In [None]:
# Motility genes among the steigerwaltii-2-exclusive genes: COG category N, or a
# description mentioning pilus/pili. Descriptions are matched case-insensitively,
# as in the dataset-wide `total_motility` cell, so capitalised spellings are not
# missed. The earlier case-sensitive run recorded 0 genes; re-check after rerunning.
steigerwaltii2_annotations = df_eggnog.loc[steigerwaltii2_only_genes]
isMotility1 = steigerwaltii2_annotations['COG_category'].str.contains('N')
isMotility2 = steigerwaltii2_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = steigerwaltii2_annotations['Description'].str.contains('pili', case=False)

steigerwaltii2_motility = steigerwaltii2_annotations.loc[isMotility1 | isMotility2 | isMotility3]
steigerwaltii2_motility

In [None]:
# Use this check to better name HC phylons based on strain.
# Select the A_binarized columns with a positive entry in the phylon's row,
# sum df_acc over those columns for each steigerwaltii-2-exclusive gene, and
# plot the histogram of the per-gene sums.
phylon = 'hormaechei-steigerwaltii-2'
in_phylon = A_binarized.loc[phylon] > 0
strains = A_binarized.columns[in_phylon].tolist()
df_acc.loc[steigerwaltii2_only_genes, strains].sum(axis=1).hist()

### hormaechei-steigerwaltii-13 Phylon Genes

In [None]:
# Annotation overview for the steigerwaltii-1/3-exclusive genes: table shape,
# first rows, and a tally of the first COG category letter per gene.
# Author's note from the original run: 77 unknown genes.
steigerwaltii13_annotations = df_eggnog.loc[steigerwaltii13_only_genes]
display(
    steigerwaltii13_annotations.shape,
    steigerwaltii13_annotations.head(),
    steigerwaltii13_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of the steigerwaltii-1/3-exclusive genes: COG category
# contains any of C, E, F, G, H, I or P. Author's recorded count: 11.
steigerwaltii13_annotations = df_eggnog.loc[steigerwaltii13_only_genes]
isMetabolic = steigerwaltii13_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
steigerwaltii13_metabolic = steigerwaltii13_annotations.loc[isMetabolic]

display(steigerwaltii13_metabolic)

In [None]:
# Motility genes among the steigerwaltii-1/3-exclusive genes: COG category N, or
# a description mentioning pilus/pili. Descriptions are matched
# case-insensitively, as in the dataset-wide `total_motility` cell, so capitalised
# spellings are not missed. The earlier case-sensitive run recorded 45 genes;
# re-check after rerunning.
steigerwaltii13_annotations = df_eggnog.loc[steigerwaltii13_only_genes]
isMotility1 = steigerwaltii13_annotations['COG_category'].str.contains('N')
isMotility2 = steigerwaltii13_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = steigerwaltii13_annotations['Description'].str.contains('pili', case=False)

steigerwaltii13_motility = steigerwaltii13_annotations.loc[isMotility1 | isMotility2 | isMotility3]
steigerwaltii13_motility

## Split 5 - hormaechei-steigerwaltii-1 vs hormaechei-steigerwaltii-3 

In [None]:
# Split 5 groups: steigerwaltii-1 versus steigerwaltii-3.
# (Variable names are kept as-is because the next cell refers to them.)
split5steigerwaltii1 = [
    'hormaechei-steigerwaltii-1',
]
split5steigerwaltii_3 = [
    'hormaechei-steigerwaltii-3',
]

In [None]:
# Split the steigerwaltii-1/3-exclusive genes into those exclusive to each of
# the two phylons, then report the size of each set.
split5_membership = L_binarized.loc[steigerwaltii13_only_genes]
steigerwaltii1_only_genes, steigerwaltii3_only_genes = find_exclusive_genes(
    split5_membership, split5steigerwaltii1, split5steigerwaltii_3
)

display(
    f'hormaechei-steigerwaltii-1 exclusive genes: {len(steigerwaltii1_only_genes)}',
    f'hormaechei-steigerwaltii-3 exclusive genes: {len(steigerwaltii3_only_genes)}',
)

In [None]:
# Parent-set genes that fall in neither exclusive subset (split/shared across
# steigerwaltii-1 and steigerwaltii-3). Author's recorded result: 61.
len(steigerwaltii13_only_genes) - len(steigerwaltii1_only_genes) - len(steigerwaltii3_only_genes)

### hormaechei-steigerwaltii-1 Phylon Genes

In [None]:
# Annotation overview for the steigerwaltii-1-exclusive genes: table shape, first
# rows, and a tally of the first COG category letter per gene.
# Author's note from the original run: 1 unknown gene.
steigerwaltii1_annotations = df_eggnog.loc[steigerwaltii1_only_genes]
display(
    steigerwaltii1_annotations.shape,
    steigerwaltii1_annotations.head(),
    steigerwaltii1_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of the steigerwaltii-1-exclusive genes: COG category contains
# any of C, E, F, G, H, I or P. Author's recorded count: 0.
steigerwaltii1_annotations = df_eggnog.loc[steigerwaltii1_only_genes]
isMetabolic = steigerwaltii1_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
steigerwaltii1_metabolic = steigerwaltii1_annotations.loc[isMetabolic]

display(steigerwaltii1_metabolic)

In [None]:
# Motility genes among the steigerwaltii-1-exclusive genes: COG category N, or a
# description mentioning pilus/pili. Descriptions are matched case-insensitively,
# as in the dataset-wide `total_motility` cell, so capitalised spellings are not
# missed. The earlier case-sensitive run recorded 0 genes; re-check after rerunning.
steigerwaltii1_annotations = df_eggnog.loc[steigerwaltii1_only_genes]
isMotility1 = steigerwaltii1_annotations['COG_category'].str.contains('N')
isMotility2 = steigerwaltii1_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = steigerwaltii1_annotations['Description'].str.contains('pili', case=False)

steigerwaltii1_motility = steigerwaltii1_annotations.loc[isMotility1 | isMotility2 | isMotility3]
steigerwaltii1_motility

### hormaechei-steigerwaltii-3 Phylon Genes

In [None]:
# Annotation overview for the steigerwaltii-3-exclusive genes: table shape, first
# rows, and a tally of the first COG category letter per gene.
# Author's note from the original run: 66 unknown genes.
steigerwaltii3_annotations = df_eggnog.loc[steigerwaltii3_only_genes]
display(
    steigerwaltii3_annotations.shape,
    steigerwaltii3_annotations.head(),
    steigerwaltii3_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Ad-hoc lookup: show the eggNOG annotation row for a single gene ID.
df_eggnog.loc['Ebacter_C27374']

In [None]:
# The 100 most frequent KEGG_Pathway values across the whole eggNOG table.
df_eggnog.KEGG_Pathway.value_counts().head(100)

In [None]:
# Metabolic subset of the steigerwaltii-3-exclusive genes: COG category contains
# any of C, E, F, G, H, I or P. Author's recorded count: 9.
steigerwaltii3_annotations = df_eggnog.loc[steigerwaltii3_only_genes]
isMetabolic = steigerwaltii3_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
steigerwaltii3_metabolic = steigerwaltii3_annotations.loc[isMetabolic]

display(steigerwaltii3_metabolic)

In [None]:
# Motility genes among the steigerwaltii-3-exclusive genes: COG category N, or a
# description mentioning pilus/pili. Descriptions are matched case-insensitively,
# as in the dataset-wide `total_motility` cell, so capitalised spellings are not
# missed. The earlier case-sensitive run recorded 2 genes; re-check after rerunning.
steigerwaltii3_annotations = df_eggnog.loc[steigerwaltii3_only_genes]
isMotility1 = steigerwaltii3_annotations['COG_category'].str.contains('N')
isMotility2 = steigerwaltii3_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = steigerwaltii3_annotations['Description'].str.contains('pili', case=False)

steigerwaltii3_motility = steigerwaltii3_annotations.loc[isMotility1 | isMotility2 | isMotility3]
steigerwaltii3_motility

## Unchar Phylons

In [None]:
# Names of the four uncharacterized phylon columns in L_binarized.
unchar_phylons = [
    'unchar-1',
    'unchar-2',
    'unchar-3',
    'unchar-4',
]

In [None]:
# Main sorted clustermap: binarized gene membership across the uncharacterized
# phylons. Rows keep the precomputed `gene_order` (row clustering is off); the
# Ward/Euclidean settings therefore only affect the column dendrogram.
unchar_membership = L_binarized.loc[gene_order, unchar_phylons]
unchar_clustermap_options = dict(
    method='ward',
    metric='euclidean',
    row_cluster=False,
    yticklabels=False,
    cmap='Greys',
)
g = sns.clustermap(unchar_membership, **unchar_clustermap_options);

In [None]:
# Genes belonging to at least one uncharacterized phylon (row sum over the
# unchar columns is positive).
in_any_unchar = L_binarized[unchar_phylons].sum(axis=1) > 0
unchar_genes = L_binarized.loc[in_any_unchar].index

In [None]:
# Annotation overview for the genes currently bound to `unchar_genes` (all
# uncharacterized phylons in this section): table shape, first rows, and a tally
# of the first COG category letter per gene.
# Author's note from the original run: 354 unknown genes.
unchar_annotations = df_eggnog.loc[unchar_genes]
display(
    unchar_annotations.shape,
    unchar_annotations.head(),
    unchar_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of `unchar_genes` (all uncharacterized phylons in this
# section): COG category contains any of C, E, F, G, H, I or P.
# Author's recorded count: 37.
unchar_annotations = df_eggnog.loc[unchar_genes]
isMetabolic = unchar_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
unchar_metabolic = unchar_annotations.loc[isMetabolic]

display(unchar_metabolic)

In [None]:
# Motility genes among `unchar_genes` (all uncharacterized phylons in this
# section): COG category N, or a description mentioning pilus/pili. Descriptions
# are matched case-insensitively, as in the dataset-wide `total_motility` cell,
# so capitalised spellings are not missed. The earlier case-sensitive run
# recorded 0 genes; re-check after rerunning.
unchar_annotations = df_eggnog.loc[unchar_genes]
isMotility1 = unchar_annotations['COG_category'].str.contains('N')
isMotility2 = unchar_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = unchar_annotations['Description'].str.contains('pili', case=False)

unchar_motility = unchar_annotations.loc[isMotility1 | isMotility2 | isMotility3]
unchar_motility

In [None]:
# Genes in `unchar_genes` whose COG category contains the letter L.
unchar_annotations = df_eggnog.loc[unchar_genes]
is_L = unchar_annotations['COG_category'].str.contains('L')
unchar_annotations.loc[is_L]

## Unchar-1

In [None]:
# Rebind `unchar_genes` to the genes with a positive entry in the unchar-1
# column only; the following cells in this section read this value.
in_unchar1 = L_binarized['unchar-1'] > 0
unchar_genes = L_binarized.loc[in_unchar1].index

In [None]:
# Annotation overview for the genes currently bound to `unchar_genes` (unchar-1
# in this section): table shape, first rows, and a tally of the first COG
# category letter per gene. Author's note from the original run: 161 unknown genes.
unchar_annotations = df_eggnog.loc[unchar_genes]
display(
    unchar_annotations.shape,
    unchar_annotations.head(),
    unchar_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of `unchar_genes` (unchar-1 in this section): COG category
# contains any of C, E, F, G, H, I or P. Author's recorded count: 12.
unchar_annotations = df_eggnog.loc[unchar_genes]
isMetabolic = unchar_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
unchar1_metabolic = unchar_annotations.loc[isMetabolic]

display(unchar1_metabolic)

In [None]:
# Motility genes among `unchar_genes` (unchar-1 in this section): COG category
# N, or a description mentioning pilus/pili. Descriptions are matched
# case-insensitively, as in the dataset-wide `total_motility` cell, so capitalised
# spellings are not missed. The earlier case-sensitive run recorded 2 genes;
# re-check after rerunning.
unchar_annotations = df_eggnog.loc[unchar_genes]
isMotility1 = unchar_annotations['COG_category'].str.contains('N')
isMotility2 = unchar_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = unchar_annotations['Description'].str.contains('pili', case=False)

unchar1_motility = unchar_annotations.loc[isMotility1 | isMotility2 | isMotility3]
unchar1_motility

In [None]:
# Genes in `unchar_genes` (unchar-1 in this section) whose COG category contains
# any of L, O, T or K.
unchar_annotations = df_eggnog.loc[unchar_genes]
check_cogs = unchar_annotations['COG_category'].str.contains('L|O|T|K')
unchar_annotations.loc[check_cogs]

## Unchar-2

In [None]:
# Rebind `unchar_genes` to the genes with a positive entry in the unchar-2
# column only; the following cells in this section read this value.
in_unchar2 = L_binarized['unchar-2'] > 0
unchar_genes = L_binarized.loc[in_unchar2].index

In [None]:
# Annotation overview for the genes currently bound to `unchar_genes` (unchar-2
# in this section): table shape, first rows, and a tally of the first COG
# category letter per gene. Author's note from the original run: 82 unknown genes.
unchar_annotations = df_eggnog.loc[unchar_genes]
display(
    unchar_annotations.shape,
    unchar_annotations.head(),
    unchar_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of `unchar_genes` (unchar-2 in this section): COG category
# contains any of C, E, F, G, H, I or P. Author's recorded count: 2.
unchar_annotations = df_eggnog.loc[unchar_genes]
isMetabolic = unchar_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
unchar2_metabolic = unchar_annotations.loc[isMetabolic]

display(unchar2_metabolic)

In [None]:
# Motility genes among `unchar_genes` (unchar-2 in this section): COG category
# N, or a description mentioning pilus/pili. Descriptions are matched
# case-insensitively, as in the dataset-wide `total_motility` cell, so capitalised
# spellings are not missed. The earlier case-sensitive run recorded 1 gene;
# re-check after rerunning.
unchar_annotations = df_eggnog.loc[unchar_genes]
isMotility1 = unchar_annotations['COG_category'].str.contains('N')
isMotility2 = unchar_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = unchar_annotations['Description'].str.contains('pili', case=False)

unchar2_motility = unchar_annotations.loc[isMotility1 | isMotility2 | isMotility3]
unchar2_motility

In [None]:
# Genes in `unchar_genes` (unchar-2 in this section) whose COG category contains
# any of L, K or D.
unchar_annotations = df_eggnog.loc[unchar_genes]
check_cogs = unchar_annotations['COG_category'].str.contains('L|K|D')
unchar_annotations.loc[check_cogs]

## Unchar-3

In [None]:
# Rebind `unchar_genes` to the genes with a positive entry in the unchar-3
# column only (same selection form as the other unchar-N sections); the
# following cells in this section read this value.
in_unchar3 = L_binarized['unchar-3'] > 0
unchar_genes = L_binarized.loc[in_unchar3].index

In [None]:
# Annotation overview for the genes currently bound to `unchar_genes` (unchar-3
# in this section): table shape, first rows, and a tally of the first COG
# category letter per gene. Author's note from the original run: 53 unknown genes.
unchar_annotations = df_eggnog.loc[unchar_genes]
display(
    unchar_annotations.shape,
    unchar_annotations.head(),
    unchar_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of `unchar_genes` (unchar-3 in this section): COG category
# contains any of C, E, F, G, H, I or P. Author's recorded count: 7.
unchar_annotations = df_eggnog.loc[unchar_genes]
isMetabolic = unchar_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
unchar3_metabolic = unchar_annotations.loc[isMetabolic]

display(unchar3_metabolic)

In [None]:
# Motility genes among `unchar_genes` (unchar-3 in this section): COG category
# N, or a description mentioning pilus/pili. Descriptions are matched
# case-insensitively, as in the dataset-wide `total_motility` cell, so capitalised
# spellings are not missed. The earlier case-sensitive run recorded 3 genes;
# re-check after rerunning.
unchar_annotations = df_eggnog.loc[unchar_genes]
isMotility1 = unchar_annotations['COG_category'].str.contains('N')
isMotility2 = unchar_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = unchar_annotations['Description'].str.contains('pili', case=False)

unchar3_motility = unchar_annotations.loc[isMotility1 | isMotility2 | isMotility3]
unchar3_motility

In [None]:
# Genes in `unchar_genes` (unchar-3 in this section) whose COG category contains
# any of L, O, T, K or D.
unchar_annotations = df_eggnog.loc[unchar_genes]
check_cogs = unchar_annotations['COG_category'].str.contains('L|O|T|K|D')
unchar_annotations.loc[check_cogs]

In [None]:
# Genes in `unchar_genes` whose description contains 'opper' (matches both
# "Copper" and "copper" without needing a case-insensitive search).
unchar_annotations = df_eggnog.loc[unchar_genes]
mentions_opper = unchar_annotations['Description'].str.contains('opper')
unchar_annotations.loc[mentions_opper]

## Unchar-4

In [None]:
# Rebind `unchar_genes` to the genes with a positive entry in the unchar-4
# column only; the following cells in this section read this value.
in_unchar4 = L_binarized['unchar-4'] > 0
unchar_genes = L_binarized.loc[in_unchar4].index

In [None]:
# Annotation overview for the genes currently bound to `unchar_genes` (unchar-4
# in this section): table shape, first rows, and a tally of the first COG
# category letter per gene. Author's note from the original run: 67 unknown genes.
unchar_annotations = df_eggnog.loc[unchar_genes]
display(
    unchar_annotations.shape,
    unchar_annotations.head(),
    unchar_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of `unchar_genes` (unchar-4 in this section): COG category
# contains any of C, E, F, G, H, I or P. Author's recorded count: 19.
unchar_annotations = df_eggnog.loc[unchar_genes]
isMetabolic = unchar_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
unchar4_metabolic = unchar_annotations.loc[isMetabolic]

display(unchar4_metabolic)

In [None]:
# Motility genes among `unchar_genes` (unchar-4 in this section): COG category
# N, or a description mentioning pilus/pili. Descriptions are matched
# case-insensitively, as in the dataset-wide `total_motility` cell, so capitalised
# spellings are not missed. The earlier case-sensitive run recorded 0 genes;
# re-check after rerunning.
unchar_annotations = df_eggnog.loc[unchar_genes]
isMotility1 = unchar_annotations['COG_category'].str.contains('N')
isMotility2 = unchar_annotations['Description'].str.contains('pilus', case=False)
isMotility3 = unchar_annotations['Description'].str.contains('pili', case=False)

unchar4_motility = unchar_annotations.loc[isMotility1 | isMotility2 | isMotility3]
unchar4_motility

In [None]:
# Genes in `unchar_genes` (unchar-4 in this section) whose COG category contains
# any of L, T or K.
unchar_annotations = df_eggnog.loc[unchar_genes]
check_cogs = unchar_annotations['COG_category'].str.contains('L|T|K')
unchar_annotations.loc[check_cogs]

# Genetic split testing

## Conditions

In [None]:
# Frequency of each EC value in `c_metabolic` (defined outside this section).
c_metabolic.EC.value_counts()

In [None]:
# Dataset-wide motility genes: COG category N, or a description mentioning
# pilus/pili (case-insensitive), over the whole eggNOG table.
eggnog_descriptions = df_eggnog['Description']
isMotility1 = df_eggnog['COG_category'].str.contains('N')
isMotility2 = eggnog_descriptions.str.contains('pilus', case=False)
isMotility3 = eggnog_descriptions.str.contains('pili', case=False)

total_motility = df_eggnog.loc[isMotility1 | isMotility2 | isMotility3]
total_motility

In [None]:
# Dataset-wide pilus/pili genes: description match only (case-insensitive).
# The COG-category-N criterion (isMotility1) is intentionally not applied here.
eggnog_descriptions = df_eggnog['Description']
isMotility2 = eggnog_descriptions.str.contains('pilus', case=False)
isMotility3 = eggnog_descriptions.str.contains('pili', case=False)

total_pili = df_eggnog.loc[isMotility2 | isMotility3]
total_pili

In [None]:
# # Fimbrial proteins
# # 16 in accessory genome, 2 in core and 1 in near-core (fimZ)
# # Pretty clear splits across 2 major groups
# # Roggenkampii has 2 unique genes of FimC and FimF 
# # One version of fimF in all horm except S1, one in all other, roggenkampii with extra one (see above)
# # fimI in core, but extra version found in steigerwaltii phylons, extra fimD also found in all S phylons
# # fimY 1 version in hormaechei, another in all but cancerogenous, cancerogenous has its own version in the rare-genome
# # fimH has 2 versions: cancerogenous has none, cloacae has 1, rest have another
# # Few other dispersed ones to note/analyze individually
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'fim' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index


# # check for location of hof genes
# # notes: hof genes hofOMN appear to  have function related to DNA as a carbon source while hofB appears to be related to fimbrial assembly
# #        hofB has 2 versions, one is found in all phylons but kobei and roggenkampii while the other is only found in those 2 phylons
# #        hofM has several versions, one is found in all horm phylons and the others throughout the other phylons in both the acc. and rare
# #        hofO has several versions, one is found in all horm phylons and the others throughout the other phylons in both the acc. and rare
# #        hofN has several versions, one is found in all horm phylons and the others throughout the other phylons in both the acc. and rare
# # conserved patterns of inheritance in asburiae, roggenkampii and ludwigii, especially, unique sets of genes
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'hof' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index


# Active condition: locate the che genes (chemotaxis).
# Author's notes:
#   - 5 core genes
#   - cheM shared between all members of the lower phylons, near core
#   - cheA has one in hormaechei and one in all other phylons
#   - cheV has one copy in all but S1, bugandensis and cancerogenous, but
#     cancerogenous has another copy of cheV
#   - steigerwaltii-1/3 has a number of gene copies shared between them in the acc genome
# `cond` flags annotated genes whose Preferred_name contains 'che' (substring
# match); `cond2` holds their gene IDs and is read by the cells further below.
genes_annotations = df_eggnog.loc[df_genes.index]
cond = genes_annotations['Preferred_name'].map(lambda name: 'che' in name)
cond2 = genes_annotations.loc[cond].index


# # check for puu genes (putrescine)
# # notes: several copies of puuR-like genes are present and are in several phylons each, no clear theme
# #        most of the puu pathway genes are found in some of the non-horm phylons, very few in horm-phylons
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'puu' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index


# # related to dapA and yagE genes, an enzyme related to lysine biosynthesis
# # core gene of dapA
# # near-core version of yagE everywhere but asburiae and cancerogenous
# # 1 unique to cloacae, 1 shared between cloacae and asburiae
# # few versions with no major pattern of inheritance
# cond = df_eggnog.loc[df_genes.index].EC.apply(lambda x : '4.3.3.7' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index


# # pehN, an endo-polygalacturonase, potentially related to breaking down plant cell walls
# # only found in the hormaechei (all but S2)
# cond = df_eggnog.loc[df_genes.index].EC.apply(lambda x : '3.2.1.15' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index


# # pqq genes associated with pyrroloquinoline quinone
# # only found in hormaechei cluster
# # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2245851/ - promotes plant growth, rhizosphere associated
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'pqq' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index


# # thiamine kinase, used for recycling thiamine
# # one thiK found in all hormaechei
# # one enzyme found only in xian and oharae
# # one thiK found in asb, rogg, kobei, and bugan
# # one thiK found in ludwigii and cloacae
# # one in rare genome of cancerogenous
# EC = '2.7.1.89'
# cond = df_eggnog.loc[df_genes.index].EC.apply(lambda x : EC in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index


# # protein-disulfide reductase, appears to be called dsbD or trxC
# # trxC like gene is core
# # dsbD has one core version
# # one version of dsbD is found in all hormaechei
# # another in all non-hormaechei but cancerogenous
# # cancerogenous has a few in rare-genome
# EC = '1.8.1.8'
# cond = df_eggnog.loc[df_genes.index].EC.apply(lambda x : EC in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index


# # motility genes with fli name
# # 14 core, 1 near core, some found scattered throughout a few phylons
# # several genes unique to asburiae
# # !!! Many genes associated only with steigerwaltii-1/3 !!!
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'fli' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index


# # check for cellulose producing genes
# # a number of genes found throughout phylons related to cellulose synthesis
# # 2 found in near core, bcsB and bcsA
# cond = df_eggnog.loc[df_genes.index].Description.apply(lambda x : 'ellulose' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index


# # als genes related to allose metabolism
# # mostly present in steigerwaltii
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'als' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# # nan genes related to sialic acid metabolism
# # associated mostly with steigerwaltii and cancerogenous
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'nan' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# # ebg operon, beta-galactoside related
# # related to steigerwaltii
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'ebg' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# # Amino acid permease 
# # not found in hormaechei, but found in all other species in genus
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'aap' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# # related to sugar transport (similar to E coli sulfoquinovose transporter)
# # not found in asburiae, found everywhere else
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'yih' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# # rhamnose utilization
# # not found in roggenkampii, found everywhere else
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'rha' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# # malonate utilization
# # not found in roggenkampii, asburiae, or bugandensis, found everywhere else
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'mdc' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# # galactitol  pts system
# # only in horm-horm, kobei, and bugandensis
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'gat' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# # related to pts, appears to be N-acetylgalactosamine utilization
# # everywhere but horm-horm, asburiae, roggenkampii, kobei, and bugandensis
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'aga' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# # iro genes, related to Salmochelin siderophores
# # found in steigerwaltii and cancerogenous
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'iro' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# # ter genes, tellurite resistance genes
# # only in unchar phylon 1
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'ter' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# # related to ribitol usage (just rbtD)
# # only in horm-steigerwaltii
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'rbt' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# # related to sorbitol usage
# # missing in cancerogenous and horm-horm
# # based on literature, oharae should not be able to metabolize sorbitol, but it appears it can
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'srl' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index


# # puuR operon
# # metabolic genes not found in hormaechei, asburiae, nor ludwigii, found everywhere else
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'puu' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# # arn operon
# # 
# cond = df_eggnog.loc[df_genes.index].Preferred_name.apply(lambda x : 'arn' in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# # arsenite efflux pump, arsA
# # gene found only in horm-horm and kobei, 1 copy in both and an extra in horm-horm
# EC = '3.6.3.16'
# cond = df_eggnog.loc[df_genes.index].EC.apply(lambda x : EC in x)
# cond2 = df_eggnog.loc[df_genes.index][cond].index

# EC = '1.1.1.262'
# cond = df_eggnog.loc[df_genes.index].EC.apply(lambda x : EC in x)
# cond2 = list(df_eggnog.loc[df_genes.index][cond].index) + ['Ebacter_C26996'] # this is the highest identity sequence for pdxA according to blast



# # Acc AMR Genes - see section below

# # Analysis of distribution of pili across dataset
# # see diagram for better idea
# cond2 = total_pili.index


In [None]:
# Show `characterized` (defined outside this section); the next cell uses it as
# the row index of `operon_presence`.
characterized

In [None]:
# Manually curated presence (1) / absence (0) of selected operons and pathways.
# Each list is positional against `characterized` and must hold one value per
# entry of it (15 values per row here).
# NOTE(review): confirm the order of `characterized` before editing any row.
# The 'N-acetylgalactosamine' label was previously misspelled
# ('N-acetylgalatosamine') and showed up that way on the figure.
operon_presence = pd.DataFrame(
    {
        'pqq':                   [1,1,1,1,1,1,1,1,0,0,0,0,0,0,0],
        'allose':                [0,0,1,1,1,0,0,0,0,0,0,0,0,0,0],
        'sialic acid':           [0,0,1,1,1,0,0,0,0,0,0,0,1,0,0],
        'ebg':                   [0,0,1,1,1,0,0,0,0,0,0,0,0,0,0],
        'rhamnose':              [1,1,1,1,1,1,1,1,0,1,1,1,1,1,1],
        'malonate':              [1,1,1,1,1,1,1,1,0,0,1,0,1,1,1],
        'galactitol':            [0,0,0,0,0,1,0,0,0,0,1,1,0,0,0],
        'N-acetylgalactosamine': [1,1,1,1,1,0,1,1,0,0,0,0,1,1,1],
        'sorbitol':              [1,1,1,1,1,0,1,1,1,1,1,1,0,1,1],
    },
    index=characterized,
)

# Operons as rows; columns keep the order of `characterized` (no column
# clustering). The trailing semicolon suppresses the ClusterGrid repr.
sns.clustermap(operon_presence.T, cmap='Greys', col_cluster=False, figsize=(12, 8));

In [None]:
# The eight hormaechei phylon columns of L_binarized.
h_phylons = [
    'hormaechei-hoffmannii-1',
    'hormaechei-hoffmannii-2',
    'hormaechei-steigerwaltii-2',
    'hormaechei-steigerwaltii-1',
    'hormaechei-steigerwaltii-3',
    'hormaechei-hormaechei',
    'hormaechei-oharae',
    'hormaechei-xiangfangensis',
]

# The seven non-hormaechei phylon columns of L_binarized.
other_phylons = [
    'asburiae',
    'roggenkampii',
    'kobei',
    'bugandensis',
    'cancerogenous',
    'ludwigii',
    'cloacae',
]

In [None]:
# Per-gene number of phylons containing the gene, within each phylon group.
# Only `h_phylons` and `other_phylons` are considered; any other columns of
# L_binarized are ignored by every set below.
h_membership = L_binarized[h_phylons].sum(axis=1)
other_membership = L_binarized[other_phylons].sum(axis=1)
all_phylons = other_phylons + h_phylons
all_membership = L_binarized[all_phylons].sum(axis=1)

# Genes found in all hormaechei phylons and in none of `other_phylons`
in_all_h_only = (h_membership == len(h_phylons)) & (other_membership == 0)
h_shared_genes = L_binarized.loc[in_all_h_only].index

# Genes found in all non-hormaechei phylons and in none of `h_phylons`
in_all_others_only = (other_membership == len(other_phylons)) & (h_membership == 0)
others_shared_genes = L_binarized.loc[in_all_others_only].index

# Near core genes: present in every phylon of both groups
near_core_genes = L_binarized.loc[all_membership == len(all_phylons)].index

# Zero genes: present in no phylon of either group
zero_genes = L_binarized.loc[all_membership == 0].index

# Non-discriminating first split: present on both sides at least once
on_both_sides = (h_membership > 0) & (other_membership > 0)
non_discriminating_genes = L_binarized.loc[on_both_sides].index

## Looking at genes in large categories

In [None]:
# Gene set inspected by the following cells of this section; rebind it to another
# set (e.g. `others_shared_genes`) and rerun them to repeat the overview.
observing = h_shared_genes

In [None]:
# Annotation overview for the genes currently bound to `observing`: table shape,
# first rows, and a tally of the first COG category letter per gene.
# Author's note from the original run: 86 unknown genes.
observing_annotations = df_eggnog.loc[observing]
display(
    observing_annotations.shape,
    observing_annotations.head(),
    observing_annotations['COG_category'].map(lambda cog: cog[0]).value_counts(),
)

In [None]:
# Metabolic subset of `observing`: COG category contains any of C, E, F, G, H,
# I or P. The result name `others_shared_metabolic` is kept because later cells
# read it, but it holds the metabolic genes of whatever `observing` is bound to.
observing_annotations = df_eggnog.loc[observing]
isMetabolic = observing_annotations['COG_category'].str.contains('C|E|F|G|H|I|P')
others_shared_metabolic = observing_annotations.loc[isMetabolic]
display(others_shared_metabolic.shape, others_shared_metabolic)

In [None]:
# Motility-related genes within the observed set: COG category containing 'N',
# or an eggNOG description mentioning 'pilus' / 'pili'.
motility_candidates = df_eggnog.loc[observing]

# na=False: a missing COG category or description counts as "no match". Without
# it the mask would contain NaN, which boolean indexing refuses.
isMotility1 = motility_candidates.COG_category.str.contains('N', na=False)
isMotility2 = motility_candidates.Description.str.contains('pilus', na=False)
isMotility3 = motility_candidates.Description.str.contains('pili', na=False)

nonhc_clust_shared_motility = motility_candidates[isMotility1 | isMotility2 | isMotility3]
nonhc_clust_shared_motility

In [None]:
others_shared_metabolic.EC.value_counts()

In [None]:
others_shared_metabolic.Preferred_name.value_counts()

## Core

In [None]:
# check in core
df_eggnog.loc[[x for x in cond2 if x in df_core.index]]

In [None]:
# check in near core
df_eggnog.loc[[x for x in cond2 if x in near_core_genes]]

## Accessory Overall

In [None]:
# all genes in acc
df_eggnog.loc[[x for x in cond2 if x in df_acc.index]]

In [None]:
df_eggnog.loc[[x for x in cond2 if x in zero_genes]]

In [None]:
df_eggnog.loc[[x for x in cond2 if x in non_discriminating_genes]]

## Hormaechei

In [None]:
# shared across all hormaechei phylons
df_eggnog.loc[[x for x in cond2 if x in h_shared_genes]]

In [None]:
# check in any hormaechei phylons
df_eggnog.loc[[x for x in cond2 if x in upper_only_genes]]

## Non-hormaechei

In [None]:
# genes in all non-hc phylons
df_eggnog.loc[[x for x in cond2 if x in others_shared_genes]]

In [None]:
# genes in at least one non-hc phylon
df_eggnog.loc[[x for x in cond2 if x in lower_only_genes]]

## Check phylon distribution

In [None]:
display(df_eggnog.loc[[x for x in cond2 if x in df_acc.index]][['COG_category','Description', 'Preferred_name', 'EC', 'GOs']])

In [None]:
# Accessory-genome members of cond2 and their eggNOG annotations, looked up once
# and reused below (the original recomputed this subset for every statement).
cond2_acc_genes = [x for x in cond2 if x in df_acc.index]
cond2_acc_annotations = df_eggnog.loc[cond2_acc_genes]

# Row labels for the plot: each gene ID replaced by its eggNOG Preferred_name
inds = cond2_acc_annotations.index.map(cond2_acc_annotations['Preferred_name'])

# Binarized phylon membership of those genes, restricted to characterized phylons
L_temp = L_binarized.loc[cond2_acc_annotations.index, characterized].copy()
L_temp.index = inds


colors = ["grey", "green"]

# Create the custom colormap
cmap = sns.blend_palette(colors, as_cmap=True)

try:
    g = sns.clustermap(L_temp, cmap=cmap, col_cluster=False, figsize=(14, 14))
except Exception:
    # Fallback when the row-clustered plot cannot be produced (presumably when
    # there are too few rows to cluster): draw the rows in their given order.
    # Catching Exception rather than using a bare except lets KeyboardInterrupt
    # and SystemExit propagate.
    g = sns.clustermap(L_temp, cmap=cmap, col_cluster=False, row_cluster=False)

In [None]:
inds = L_binarized.loc[df_eggnog.loc[[x for x in cond2 if x in df_acc.index]].index, h_phylons+other_phylons].index
sns.heatmap(df_acc.loc[inds, strain_order])

In [None]:
df_genes.loc[df_eggnog[df_eggnog.Preferred_name == 'ptsN'].index, metadata_complete.genome_id].sum(axis=1)

In [None]:
hyper_core_genes = [x[0] for x in df_core.sum(axis=1).items() if x[1] == df_core.shape[1]]
df_eggnog.loc[hyper_core_genes]

In [None]:
df_genes.loc['Ebacter_C98738']

# AMR Check

In [None]:
df = A_binarized.loc[characterized_order]

# List to store labels and column names
label_col = []
name_col = []

# Iterate over columns
for col in df.columns:
    # Find index where value is 1
    index = df.index[df[col] == 1].tolist()
    if index:
        label_col.append(index[0])  # Append the first index where value is 1
    else:
        label_col.append('None')  # If no 1 is found, append None
    name_col.append(col)


clr = dict(zip(characterized + ["None"], custom_colors))

In [None]:
amr = pd.read_csv('../../data/processed/amrfinder/output', sep = '\t')
amr['Protein identifier'] = amr['Protein identifier'].apply(lambda x: x.split('A')[0])

In [None]:
# AMRFinder protein identifiers that are rows of the L matrix
amr_ids_in_L = [gene for gene in amr['Protein identifier'] if gene in L_binarized.index]

# Number of those AMR genes assigned to each characterized phylon
amr_counts_by_phylon = L_binarized.loc[amr_ids_in_L, characterized].sum()

phylon_amr = amr_counts_by_phylon.reset_index()
# Bar colour of each phylon, taken from the clr lookup
phylon_amr['color'] = phylon_amr['index'].apply(lambda name: clr[name])
phylon_amr.columns = ['phylon', 'num_amr', 'color']

In [None]:
amr.set_index('Protein identifier').loc[[x for x in amr['Protein identifier'] if x in L_binarized.index]].Class.value_counts()

In [None]:
sns.barplot(data = phylon_amr, x = 'phylon', y = 'num_amr', palette=phylon_amr.color.to_list());

# Rotate x-axis labels
plt.xticks([], rotation=90);

# Hide x-axis label
plt.xlabel('');

# Set y-axis tick labels as integers
plt.yticks(range(int(phylon_amr['num_amr'].min()-1), int(phylon_amr['num_amr'].max())+1))
plt.ylabel('Number of AMR Genes');

# Set title
plt.title('Number of AMR Genes by Phylon')

plt.rcParams['svg.fonttype'] = 'none'
plt.savefig('../images/fig3/phylon_amr.svg', format='svg', dpi=300, bbox_inches='tight')

# Show the plot
plt.show()

In [None]:
## Calculate correlation between plasmid number and number of AMR genes
plasmids = metadata_complete.set_index('genome_id').plasmids.fillna(0).astype(int)

inds = L_binarized.loc[[x for x in amr['Protein identifier'] if x in df_acc.index]].index
number_amr = df_genes_complete.loc[inds].sum()

df = pd.DataFrame({'plasmids': plasmids, 'number_amr': number_amr}).fillna(0)

sns.scatterplot(data=df, x = 'plasmids', y = 'number_amr')

In [None]:
# Calculate plasmid distribution per phylon

# Plasmid count per genome; built once because it does not depend on the phylon
plasmids_by_genome = metadata_complete.set_index('genome_id')['plasmids']

plasmid_dict = {}
for phylon in characterized_order:
    # Strains whose binarized affinity for this phylon is positive
    strains = list(A_binarized.loc[:,A_binarized.loc[phylon] > 0].columns)
    # Missing plasmid counts are treated as 0
    number_plamids = plasmids_by_genome.loc[strains].fillna(0).astype(int)
    plasmid_dict[phylon] = number_plamids.values

# Extract the list of distributions for the box plot (one array per phylon)
data = plasmid_dict
data_list = [data[key] for key in data]

# Create the box plot
plt.figure(figsize=(10, 6))
plt.boxplot(data_list, patch_artist=True)

# Label each box with its phylon name
plt.xticks(range(1, len(data) + 1), data.keys(), rotation=90)

# Descriptive title and axis labels (the original used generic placeholders)
plt.title('Plasmids per Strain by Phylon')
plt.xlabel('Phylon')
plt.ylabel('Number of Plasmids')

# Show the plot
plt.show()


In [None]:
df = A_binarized.loc[characterized_order]

# For every strain (column of A_binarized) record the first characterized phylon
# whose binarized affinity equals 1, or the string 'None' when there is none.
name_col = []
label_col = []
for strain in df.columns:
    matching_phylons = df.index[df[strain] == 1].tolist()
    label_col.append(matching_phylons[0] if matching_phylons else 'None')
    name_col.append(strain)

# Strain -> label table
output_df = pd.DataFrame({'Column': name_col, 'Label': label_col}).set_index('Column')

# One colour per entry of characterized_order, plus a final grey for "Total"
hormaechei_shades = [
    "Red",
    "IndianRed",
    "DarkRed",
    "FireBrick",
    "Tomato",
    "Gold",
    "DarkGoldenrod",
    "Goldenrod",
]
other_species_shades = [
    "Green",
    "Blue",
    "Purple",
    "Cyan",
    "Magenta",
    "Lime",
    "Pink",
]
custom_colors = hormaechei_shades + other_species_shades + ['Grey']

# Pair names with colours positionally; zip stops at the shorter sequence
clr = dict(zip(characterized_order + ["Total"], custom_colors))

In [None]:
# Rows of L_binarized for every AMRFinder hit that is part of the L matrix
amr_genes = L_binarized.loc[[x for x in amr['Protein identifier'] if x in L_binarized.index]]

# Keep only hits assigned to at least one phylon. The row sums are computed once
# here instead of once per gene inside the comprehension.
amr_phylon_counts = amr_genes.sum(axis=1)
inds = [x for x in amr_genes.index if amr_phylon_counts.loc[x]]

# Gene -> AMRFinder 'Class'. Named gene_to_class rather than `map` so the builtin
# map() is not shadowed for the rest of the notebook.
amr_by_protein = amr.set_index('Protein identifier')
gene_to_class = {x: amr_by_protein.loc[x, 'Class'] for x in inds}

# Count table (AMR class x phylon), initialised directly with integer zeros
DF_gene_sums = pd.DataFrame(0, index=sorted(set(gene_to_class.values())), columns=characterized)
for phylon in DF_gene_sums.columns:
    phylon_genes = get_genes(phylon)
    for gene in inds:
        if gene in phylon_genes:
            amr_phenotype = gene_to_class[gene]
            DF_gene_sums.loc[amr_phenotype, phylon] += 1

# DF_gene_sums['Total'] = [0] * len(DF_gene_sums)
# for gene in inds:
#     amr_phenotype = gene_to_class[gene]
#     DF_gene_sums.loc[amr_phenotype, 'Total'] += 1


# Long format: one row per (AMR class, phylon) pair
df_melt = DF_gene_sums.reset_index().melt(id_vars='index', var_name='phylon', value_name='Presence')

fig, ax = plt.subplots(figsize=(7, 6))
ax.set_facecolor('#f4f4f4')

# Scatter plot: one dot per non-zero count, with area proportional to the count
sizes = set()
for i, row in df_melt.iterrows():
    if row['Presence'] >= 1:
        ax.scatter(
            row['phylon'], row['index'],
            color=clr[row['phylon']],
            s=100 * row['Presence'],         # Adjust the size (larger points)
            edgecolor='black',  # Thin black border
            linewidth=0.5   # Border thickness
        )
        sizes.add(100 * row['Presence'])

# Remove the top, left, and right axes
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)

# Add labels
ax.set_xlabel('Phylons')
ax.set_ylabel('AMR Genes by Antibiotic Category')
ax.set_title('Dot Plot of Accessory Genome AMR Gene Presence')

# Customize the grid
ax.grid(True, which='both', linestyle='--', linewidth=0.5)

# Phylons are identified by dot colour, so the x tick marks and labels are hidden
ax.set_xticks([])
ax.set_xticklabels([])

plt.xticks(rotation=45)

# Legend: one grey marker per distinct dot size, labelled with the gene count
handles = []
legend_labels = []

# sorted() gives the legend a stable ascending order; iterating the set directly
# leaves the order unspecified.
for size in sorted(sizes):
    # markersize is a diameter in points while scatter's `s` is an area, hence the square root
    size_handles = [plt.Line2D([0], [0], color='grey', lw=0, marker = 'o',markersize=size**.5, alpha=.7, markeredgecolor='black', linewidth=.5)]
    handles.extend(size_handles)
    legend_labels.append(str(int(size/100)))

# Create a single legend
plt.legend(handles, legend_labels, title='Num Genes', loc='upper left', bbox_to_anchor=(0, 1), ncol=1)


plt.rcParams['svg.fonttype'] = 'none'
plt.savefig('../images/fig3/AMR_Dot.svg', format='svg', dpi=300, bbox_inches='tight')


plt.show()


# Chrom vs plasmid L

In [None]:
# Align the gene-location table with the plotted gene order
gene_locs_acc = gene_locs_acc.loc[gene_order]

# True for genes whose second column of gene_locs_acc reads 'chrom'
is_chrom_row = (gene_locs_acc.iloc[:, 1] == 'chrom').to_numpy()

# Boolean matrix shaped like the plotted L matrix; a gene's whole row is True
# when the gene is flagged 'chrom' and False otherwise.
mask_values = np.zeros(L_binarized.loc[gene_order].shape, dtype='bool')
mask_values[is_chrom_row, :] = True

highlight_mask = pd.DataFrame(mask_values, index = gene_locs_acc.index, columns = L_binarized.columns)

In [None]:
df = A_binarized.loc[phylon_order]

# List to store labels and column names
label_col = []
name_col = []

# Iterate over columns
for col in df.columns:
    # Find index where value is 1
    index = df.index[df[col] == 1].tolist()
    if index:
        label_col.append(index[0])  # Append the first index where value is 1
    else:
        label_col.append('None')  # If no 1 is found, append None
    name_col.append(col)

# Create a new DataFrame
output_df = pd.DataFrame({'Column': name_col, 'Label': label_col}).set_index('Column')

custom_colors = [
    # Shades of red/orange/yellow
    "Red",
    "IndianRed",
    "DarkRed",
    "FireBrick",
    "Tomato",
    "Gold",
    "DarkGoldenrod",
    "Goldenrod",
    # Unchar
    'white',
    'white',
    'white',
    'white',
    # Other species
    "Green",
    "Blue",
    "Purple",
    "Cyan",
    "Magenta",
    "Lime",
    "Pink",
]


clr = dict(zip(phylon_order, custom_colors))

In [None]:
from scipy.cluster.hierarchy import linkage, leaves_list
from matplotlib.patches import Patch


# Base clustermap drawn with an all-white colormap. It only provides the figure
# layout, the column dendrogram and the phylon colour bar; the visible cells are
# painted by the two overlay heatmaps below.
custom_cmap = matplotlib.colors.LinearSegmentedColormap.from_list('custom_cmap', ['white', 'white'])

g = sns.clustermap(
    L_binarized.loc[gene_order, phylon_order],
    method='ward',
    metric='euclidean',
    row_cluster=False,
    yticklabels=False,
    cmap=custom_cmap,
    col_colors=list(clr.values()),
    cbar_pos=None,
    figsize=(10,12)
);

# Hierarchical clustering of the genes. NOTE(review): the result is not used
# anywhere in this cell (rows are plotted in gene_order); kept in case later
# cells read linkage_matrix / row_order.
linkage_matrix = linkage(L_binarized.loc[gene_order], method='ward', metric='euclidean')

# Row order implied by that clustering
row_order = leaves_list(linkage_matrix)

# Column order actually drawn by the clustermap. Columns are clustered by default,
# which reorders both the matrix and the col_colors bar, so the overlays must use
# the same order or they will not line up with the dendrogram and colour bar.
plotted_columns = list(g.data2d.columns)
data_ordered = L_binarized.loc[gene_order, plotted_columns]

# seaborn hides cells where the mask is True:
# mask1 hides non-chromosomal genes, mask2 hides chromosomal genes.
mask1 = ~highlight_mask.loc[gene_order, plotted_columns]
mask2 = highlight_mask.loc[gene_order, plotted_columns]

# Draw on the clustermap's own heatmap axes
ax = g.ax_heatmap

custom_cmap = matplotlib.colors.LinearSegmentedColormap.from_list('custom_cmap', ['white', 'black'])
# First overlay: chromosomal genes in black
sns.heatmap(data_ordered, ax=ax, cmap=custom_cmap, mask=mask1, cbar=False, yticklabels=False)

custom_cmap = matplotlib.colors.LinearSegmentedColormap.from_list('custom_cmap', ['white', 'red'])
# Second overlay: the remaining (plasmid) genes in red
sns.heatmap(data_ordered, ax=ax, cmap=custom_cmap, mask=mask2, cbar=False, yticklabels=False)


legend_elements = [
    Patch(facecolor='black', edgecolor='black', label='Chromosomal'),
    Patch(facecolor='red', edgecolor='red', label='Plasmid')
]

# Adjust the legend size here
ax.legend(handles=legend_elements, loc='upper right', fontsize=20, title_fontsize='13', markerscale=1.5)
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

plt.rcParams['svg.fonttype'] = 'none'
plt.savefig('../images/fig3/L.png', format='png', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
L_binarized.sum(axis=1).loc[gene_order]

In [None]:
import matplotlib.pyplot as plt

# Narrow, tall canvas for the area plot
fig, ax = plt.subplots(figsize=(3, 12))

# Number of phylons each gene belongs to, in plotting order
row_sums = L_binarized.sum(axis=1).loc[gene_order].values

# The values are reversed so that the first gene in gene_order ends up at the top
reversed_sums = row_sums[::-1]
y_positions = range(len(row_sums))

# Filled horizontal area plus its outline
ax.fill_betweenx(y_positions, reversed_sums, color="black", alpha=1)
ax.plot(reversed_sums, y_positions, color="black")

# The x axis runs from the maximum on the left down to 0 on the right
ax.set_xlabel("Number of Phylons")
ax.set_xlim(left=max(row_sums), right=0)
ax.set_ylim(bottom=0)
plt.tick_params(left=False, labelleft=False)

# Hide all four axis borders
for _, border in ax.spines.items():
    border.set_visible(False)

# Save and display the plot
plt.savefig('../images/fig3/L_genes_hist.png', format='png', dpi=600)
plt.show()


In [None]:
strain_order = []
unchar_strain_order = []  # stays empty in this cell; appended at the end
A_bin_char = A_binarized.loc[phylon_order]

# Number of phylons each strain (column) is assigned to
phylons_per_strain = A_bin_char.sum()

# zero-phylon strains come first
noPhylon = phylons_per_strain == 0
strain_order.extend(phylons_per_strain[noPhylon].index.tolist())

# strain lists
single_phylon_strains = phylons_per_strain[phylons_per_strain == 1].index
multi_phylon_strains = phylons_per_strain[phylons_per_strain > 1].index


def _phylon_members(phylon, candidates):
    """Return the strains in `candidates` whose binarized affinity for `phylon` is 1, in column order."""
    affinity = A_bin_char.loc[phylon, candidates]
    return affinity[affinity == 1].index.tolist()


def _not_yet_placed(strains):
    """Drop strains already in `strain_order` (and repeats), keeping the incoming order.

    The original used `set(a) - set(b)`, whose iteration order for strings can
    change between kernel sessions; this keeps the result reproducible.
    """
    placed = set(strain_order)
    return [s for s in dict.fromkeys(strains) if s not in placed]


# Pass 1: every phylon whose name does not contain 'unchar', in phylon_order.
# Single-phylon strains first, then multi-phylon strains not placed earlier.
for phylon in phylon_order:
    if 'unchar' in phylon:
        continue
    list1 = _phylon_members(phylon, single_phylon_strains)
    list2 = _phylon_members(phylon, multi_phylon_strains)
    new_list2 = _not_yet_placed(list2)  # ensures no double-counting

    strain_order.extend(list1)
    strain_order.extend(new_list2)

# Pass 2: phylons whose name contains 'unchar'; must be done after the first loop.
# NOTE(review): this iterates `characterized`, not `phylon_order`. If
# `characterized` holds no 'unchar' names the loop adds nothing and strains that
# belong only to such phylons are left out of strain_order. Confirm that is intended.
for phylon in characterized:
    if 'unchar' in phylon:
        list1 = _phylon_members(phylon, single_phylon_strains)
        list2 = _phylon_members(phylon, multi_phylon_strains)
        new_list1 = _not_yet_placed(list1)  # ensures no double-counting
        new_list2 = _not_yet_placed(list2)  # ensures no double-counting

        strain_order.extend(new_list1)
        strain_order.extend(new_list2)

strain_order += unchar_strain_order

A_bin_plotting = A_bin_char.loc[phylon_order, strain_order]

In [None]:
custom_colors = [
    '#FFFFFF',
    # Shades of red/orange/yellow
    "Red",
    "IndianRed",
    "DarkRed",
    "FireBrick",
    "Tomato",
    "Gold",
    "DarkGoldenrod",
    "Goldenrod",
    # Other species
    "Green",
    "Blue",
    "Purple",
    "Cyan",
    "Magenta",
    "Lime",
    "Pink",
]

# custom_colors = {phylon:color for phylon, color in zip(range(len(characterized)+1), custom_colors)}

In [None]:
A_bin_plotting = A_bin_char.loc[phylon_order, strain_order]

# Recode each strain's column so its non-zero cells carry an integer identifying
# one characterized phylon: 1 + the position in `characterized` of the phylon
# returned by idxmax. Strains with no positive characterized entry are left as-is.
for strain in A_bin_plotting.columns:
    characterized_affinity = A_bin_plotting.loc[characterized, strain]
    if not (characterized_affinity.max() > 0):
        continue
    phylon = characterized_affinity.idxmax()
    mult = characterized.index(phylon) + 1
    A_bin_plotting[strain] *= mult

# A-binarized: the integer codes are drawn with the custom_colors list as colormap
A_bin_int = A_bin_plotting.loc[phylon_order, strain_order].astype(int)
sns.clustermap(A_bin_int, xticklabels=False, row_cluster=False, col_cluster=False,
               cmap = custom_colors, figsize=(6,10), cbar_pos=None)
plt.rcParams['svg.fonttype'] = 'none'
plt.savefig('../images/fig3/A.png', format='png', dpi=600, bbox_inches='tight')


# BGCFlow

In [None]:
bgc_genes = pd.read_pickle('../../data/processed/bgcflow_outputs/bgcFlow_CDS_Output.pickle')
bgc_genes['genome_id'] = bgc_genes.genome_id.apply(lambda x: x.split('_')[1])

In [None]:
bgc_clusters = pd.read_pickle('../../data/processed/bgcflow_outputs/bgcFlow_clusters.pickle')
bgc_clusters['genome_id'] = bgc_clusters.genome_id.apply(lambda x: x.split('_')[1])

In [None]:
import gzip
with gzip.open(f'../../data/processed/cd-hit-results/header_to_allele_80.pickle.gz', 'rb') as f:
    header_to_allele = pickle.load(f)

In [None]:
def h2a(x):
    """Translate a header/locus tag into a gene ID via `header_to_allele`.

    Looks `x` up in the notebook-level `header_to_allele` mapping and returns
    the part of the mapped string that precedes its first 'A' (presumably the
    allele suffix separator -- TODO confirm naming scheme).

    Returns None when `x` has no usable entry: it is not a key, it is
    unhashable, or the mapped value is not a string. Other errors, such as
    `header_to_allele` not being loaded, are no longer swallowed, so a missing
    mapping cannot silently turn every gene into None.
    """
    try:
        return header_to_allele[x].split('A')[0]
    except (KeyError, TypeError, AttributeError):
        return None

def find_cat(gene):
    """Classify a gene ID as 'core', 'acc' or 'rare'.

    Membership is tested against the index of `df_core` first, then `df_acc`.
    Anything found in neither (including None) is reported as 'rare'.
    """
    if gene in df_core.index:
        return 'core'
    if gene in df_acc.index:
        return 'acc'
    return 'rare'

In [None]:
# Annotate every CDS row with its gene ID (via h2a on the row label) and the
# pangenome category of that gene (via find_cat). Both columns are built as whole
# lists and assigned once, which avoids one label-based .loc write per row.
# Rows whose label has no mapping get Gene = None and are therefore categorised 'rare'.
bgc_row_genes = [h2a(locus) for locus in bgc_genes.index]
bgc_genes['Gene'] = bgc_row_genes
bgc_genes['Gene_cat'] = [find_cat(gene) for gene in bgc_row_genes]

In [None]:
# For every BGC record: number of CDS rows categorised 'core', total number of CDS
# rows, and their ratio (0 when the cluster has no CDS rows in bgc_genes).
# Rows are collected in a list and turned into a frame once, instead of growing
# an empty DataFrame cell by cell with .loc.
cluster_rows = []
for genome_id, region in zip(bgc_clusters['genome_id'], bgc_clusters['bgc_id']):
    # CDS rows of this region in this genome
    cluster_genes = bgc_genes[(bgc_genes.genome_id == genome_id) & (bgc_genes.region_id == region)]
    n_total = len(cluster_genes)
    n_core = int((cluster_genes.Gene_cat == 'core').sum())
    cluster_rows.append({
        'core_genes': n_core,
        'total_genes': n_total,
        'ratio': n_core / n_total if n_total > 0 else 0,
    })

clusters_core = pd.DataFrame(
    cluster_rows,
    index=bgc_clusters.index,
    columns=['core_genes', 'total_genes', 'ratio'],
    dtype=float,
)

### Region to type plots

In [None]:
def region_to_type(region):
    """Return the 'fam_id_0.30' value that `bgc_clusters` records for the given bgc_id."""
    clusters_by_bgc_id = bgc_clusters.set_index('bgc_id')
    family = clusters_by_bgc_id.loc[region, 'fam_id_0.30']
    return family

def get_phylon_genes(phylon, L_binarized = L_binarized):
    """Return the index labels (genes) whose entry in column `phylon` equals 1.

    `L_binarized` defaults to the notebook-level matrix as it existed when this
    function was defined.
    """
    membership = L_binarized.loc[:,phylon]
    is_member = membership == 1
    return membership[is_member].index


In [None]:
phylon_bgc_classifications = {}

## The commented-out line checks the structure for all BGC genes
# bgc_genes_acc = bgc_genes
# Restrict to CDS rows whose gene is a row of the L matrix
bgc_genes_acc = bgc_genes[bgc_genes.Gene.isin(L_binarized.index)]

for phylon in characterized:
    strains = get_strains(phylon)
    ## The commented-out line checks the structure for all BGC genes
    # bgc_phylon_genes = bgc_genes_acc[bgc_genes_acc.genome_id.isin(strains)]
    # CDS rows from this phylon's strains whose gene also belongs to the phylon
    bgc_phylon_genes = bgc_genes_acc[bgc_genes_acc.genome_id.isin(strains) & bgc_genes_acc.Gene.isin(get_phylon_genes(phylon))]

    # Gene -> list of regions it was seen in
    genes_and_regions = bgc_phylon_genes.groupby('Gene')['region_id'].apply(list)

    df_bgc_classifications = pd.DataFrame(genes_and_regions)
    # Each gene is labelled with the family of the FIRST region in its list. Only
    # that region is resolved; the original resolved every region and discarded
    # all but the first.
    df_bgc_classifications['categories'] = df_bgc_classifications['region_id'].apply(lambda regions: region_to_type(regions[0]))

    phylon_bgc_classifications[phylon] = df_bgc_classifications['categories'].to_dict()

In [None]:
df = A_binarized.loc[characterized_order]

# List to store labels and column names
label_col = []
name_col = []

# Iterate over columns
for col in df.columns:
    # Find index where value is 1
    index = df.index[df[col] == 1].tolist()
    if index:
        label_col.append(index[0])  # Append the first index where value is 1
    else:
        label_col.append('None')  # If no 1 is found, append None
    name_col.append(col)

# Create a new DataFrame
output_df = pd.DataFrame({'Column': name_col, 'Label': label_col}).set_index('Column')

custom_colors = [
    # Shades of red/orange/yellow
    "Red",
    "IndianRed",
    "DarkRed",
    "FireBrick",
    "Tomato",
    "Gold",
    "DarkGoldenrod",
    "Goldenrod",
    # Other species
    "Green",
    "Blue",
    "Purple",
    "Cyan",
    "Magenta",
    "Lime",
    "Pink",
]


clr = dict(zip(characterized_order, custom_colors))

In [None]:
# For every characterized phylon: fraction of its strains that carry at least one
# BGC of each 'product' type.
phylon_bgc_classifications = {}
for phylon in characterized:
    strains = get_strains(phylon)
    phylon_clusters = bgc_clusters[bgc_clusters.genome_id.isin(strains)]
    # Remove duplicate genome_id and product pairs so each strain counts once per product
    phylon_clusters = phylon_clusters.drop_duplicates(subset=['genome_id', 'product'])

    phylon_bgc_classifications[phylon] = (phylon_clusters['product'].value_counts() / len(strains)).to_dict()

from collections import Counter
# Per phylon: how many products share each exact fraction value
phylon_genes_bgc_counts = {}
for phylon in phylon_bgc_classifications.keys():
    genes_and_cats = phylon_bgc_classifications[phylon]
    counts = Counter(genes_and_cats.values())
    phylon_genes_bgc_counts[phylon] = counts



from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

blue = "#323ec1"
white = '#FFFFFF'
red = "#7a0600"

# Product x phylon table of fractions (0 where a product is absent), keeping only
# products that reach at least 0.5 in some phylon.
heatmap_df = pd.DataFrame(phylon_bgc_classifications).fillna(0)
heatmap_df = heatmap_df.drop(heatmap_df.max(axis=1)[heatmap_df.max(axis=1) < .5].index)

# Colormap running from blue through white to dark red
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("grey_red", [blue, white, red])

# clustermap creates its own figure, so no separate plt.figure() call is made
# (it only produced an extra empty figure). Column colours are looked up per
# column so they follow the actual column order of heatmap_df.
g = sns.clustermap(heatmap_df, cmap=cmap, col_colors=[clr[phylon] for phylon in heatmap_df.columns],
                   col_cluster=False, row_cluster=False,
                   cbar_pos=(0, .35, .03, .4))

heatmap_ax = g.ax_heatmap
pos = heatmap_ax.get_position()
heatmap_ax.set_position([pos.x0, pos.y0, pos.width, pos.height * 0.98])  # Slightly reduce the heatmap height

# Remove x-ticks and x-labels
g.ax_heatmap.set_xticks([])
g.ax_heatmap.set_xticklabels([])
g.ax_heatmap.set_xlabel('')  # Optional, to clear any x-axis label

### Making plots for genes and presence of bgcs in strains

In [None]:
fam_column = bgc_clusters['fam_id_0.30']

# Families observed in more than 10 distinct genome_id values
fam_list = [
    fam
    for fam in fam_column.unique()
    if len(bgc_clusters.loc[fam_column == fam, 'genome_id'].unique()) > 10
]
bgc_clusters_temp = bgc_clusters[fam_column.isin(fam_list)]

# fam_list is then replaced by every family ID, so the >10-genome filter above
# only affects bgc_clusters_temp.
fam_list = fam_column.unique()

In [None]:
# Family -> set of gene IDs found in any BGC region of that family.
# All regions of a family are matched in a single isin() pass over bgc_genes,
# rather than one full scan per region. The loop also no longer overwrites the
# bgc_clusters_temp frame with an array of IDs.
genes_per_fam = {}
for fam in tqdm(fam_list):
    fam_bgc_ids = bgc_clusters.loc[bgc_clusters['fam_id_0.30'] == fam, 'bgc_id'].values
    in_family = bgc_genes.region_id.isin(fam_bgc_ids)
    genes_per_fam[fam] = set(bgc_genes.loc[in_family, 'Gene'].values)

In [None]:
# phylon -> {family -> genes of that phylon that also occur in the family's BGCs}
phylon_genes_by_fam = {}
for phylon in characterized_order:
    phylon_genes = set(get_phylon_genes(phylon))
    phylon_genes_by_fam[phylon] = {
        fam: phylon_genes.intersection(genes_per_fam[fam]) for fam in fam_list
    }

In [None]:
phylon_bgc_classifications = {}
for phylon in characterized:
    strains = get_strains(phylon)
    phylon_clusters = bgc_clusters[bgc_clusters.genome_id.isin(strains)]
    # Remove duplicate genome_id and product pairs
    phylon_clusters = phylon_clusters.drop_duplicates(subset=['genome_id', 'fam_id_0.30'])
    
    phylon_bgc_classifications[phylon] = (phylon_clusters['fam_id_0.30'].value_counts() / len(strains)).to_dict()

In [None]:
## Purpose of this cell:
# Make lists of acc genes per phylon
# Make list of core genes in BGCs
# Make list of genes in each BGC

# Phylon -> genes of that phylon that occur in a BGC of one of the phylon's strains
accessory_genes_by_phylon = {}
for phylon in tqdm(characterized_order):
    strains = get_strains(phylon)
    bgc_genes_from_strains = bgc_genes[bgc_genes.genome_id.isin(strains)].Gene.unique()
    # A set gives constant-time membership tests; testing against the array
    # compares with every element each time.
    bgc_gene_lookup = set(bgc_genes_from_strains)
    phylon_genes = get_phylon_genes(phylon)
    accessory_genes_by_phylon[phylon] = [x for x in phylon_genes if x in bgc_gene_lookup]


# Every gene ID seen in any BGC, computed once. The original called .unique()
# again for each candidate gene.
all_bgc_gene_ids = set(bgc_genes.Gene.unique())

core_genes = [x for x in df_core.index if x in all_bgc_gene_ids]

rare_genes = [x for x in df_rare.index if x in all_bgc_gene_ids]


# Family -> array of gene IDs found in any region of that family
bgc_genes_list = {}
for bgc in tqdm(bgc_clusters['fam_id_0.30'].unique()):
    bgc_ids = bgc_clusters[bgc_clusters['fam_id_0.30'] == bgc]['bgc_id'].unique()
    bgc_genes_from_bgc = bgc_genes[bgc_genes.region_id.isin(bgc_ids)].Gene.unique()
    bgc_genes_list[bgc] = bgc_genes_from_bgc

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import to_hex, Normalize
from matplotlib.cm import ScalarMappable

# For every characterized phylon: fraction of its strains that carry at least one
# BGC of each family ('fam_id_0.30').
phylon_bgc_classifications = {}
for phylon in characterized:
    strains = get_strains(phylon)
    phylon_clusters = bgc_clusters[bgc_clusters.genome_id.isin(strains)]
    # Remove duplicate genome_id and family pairs so each strain counts once per family
    phylon_clusters = phylon_clusters.drop_duplicates(subset=['genome_id', 'fam_id_0.30'])

    phylon_bgc_classifications[phylon] = (phylon_clusters['fam_id_0.30'].value_counts() / len(strains)).to_dict()

# The same fractions as a family x phylon table (0 where a family is absent).
# Built once here; the original rebuilt it for every annotated heatmap cell.
bgc_fraction_table = pd.DataFrame(phylon_bgc_classifications).fillna(0)

# Lowest fraction among the phylons in which a family occurs, or 0 when it occurs
# in none (min(..., default=0) replaces a bare try/except around min()).
bgc_min_percentage = {}
for fam in bgc_clusters['fam_id_0.30'].unique():
    bgc_min_percentage[fam] = min(
        (d[fam] for d in phylon_bgc_classifications.values() if fam in d.keys()),
        default=0,
    )

# Families with at least one gene whose minimum fraction exceeds 0.25
filtered_bgcs = [bgc for bgc in genes_per_fam.keys() if len(genes_per_fam[bgc]) > 0]
filtered_bgcs = [bgc for bgc in filtered_bgcs if bgc_min_percentage[bgc] > .25]

# Initialize matrices
core_bgc_matrix = pd.DataFrame(0, index=['Core Genome', 'Rare Genome'], columns=filtered_bgcs)
phylon_bgc_matrix = pd.DataFrame(0, index=accessory_genes_by_phylon.keys(), columns=filtered_bgcs)

# Fill matrices with counts: number of core / rare / phylon genes found in each family
for bgc in filtered_bgcs:
    fam_genes = genes_per_fam[bgc]
    core_bgc_matrix.loc['Core Genome', bgc] = sum(gene in fam_genes for gene in core_genes)
    core_bgc_matrix.loc['Rare Genome', bgc] = sum(gene in fam_genes for gene in rare_genes)
    for phylon, phylon_genes in accessory_genes_by_phylon.items():
        phylon_bgc_matrix.loc[phylon, bgc] = sum(gene in fam_genes for gene in phylon_genes)

# Combine matrices for a comprehensive heatmap (families as rows)
heatmap_data = pd.concat([core_bgc_matrix, phylon_bgc_matrix]).T

# Separate the data into two parts
core_rare_data = heatmap_data[['Core Genome', 'Rare Genome']]
phylon_data = heatmap_data.drop(columns=['Core Genome', 'Rare Genome'])

# Annotation colour per cell: grey by default, black where the family was observed
# in the phylon. Built directly, so no DataFrame.map (pandas >= 2.1) is required.
text_colors = pd.DataFrame('grey', index=phylon_data.index, columns=phylon_data.columns)
for phylon, bgc_counts in phylon_bgc_classifications.items():
    for bgc, percentage in bgc_counts.items():
        if bgc in heatmap_data.index and phylon in heatmap_data.columns:
            text_colors.at[bgc, phylon] = 'black'


# Plot heatmap with different colormaps
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(14, 8), gridspec_kw={'width_ratios': [1, len(phylon_data.columns) / 2]})

# Core and rare genome heatmap
sns.heatmap(core_rare_data, cmap="YlGnBu", annot=False, fmt="d", linewidths=.5, annot_kws={"size": 10}, cbar_kws={"label": "Gene Count Across Pangenome"}, ax=ax1)
ax1.set_title('Core & Rare Genome')
ax1.set_xlabel('Gene Categories')
ax1.set_ylabel('BGCs')

# Phylon heatmap
sns.heatmap(phylon_data, cmap="RdYlBu_r", annot=False, fmt="d", linewidths=.5, annot_kws={"size": 10}, cbar_kws={"label": "Gene Count Across Pangenome"}, ax=ax2)
ax2.set_title('Phylons')
ax2.set_xlabel('Phylon')
ax2.set_ylabel('')

# Annotate each phylon cell with "<gene count>\n(<fraction of strains>)"
for i in range(len(phylon_data.index)):
    for j in range(len(phylon_data.columns)):
        ind1 = phylon_data.index[i]
        ind2 = phylon_data.columns[j]
        text = str(phylon_data.iloc[i, j]) + '\n(' + str(round(bgc_fraction_table.loc[ind1,ind2],2)) + ')'
        color = text_colors.iloc[i, j]
        ax2.text(j + 0.5, i + 0.5, text, ha='center', va='center', color=color, fontsize=10, weight='bold')

# Annotate the core/rare cells with the plain gene count
for i in range(len(core_rare_data.index)):
    for j in range(len(core_rare_data.columns)):
        color = 'grey'
        text = str(core_rare_data.iloc[i, j])
        ax1.text(j + 0.5, i + 0.5, text, ha='center', va='center', color=color, fontsize=10, weight='bold')


# Align y-axis of both heatmaps
ax2.yaxis.set_tick_params(length=0)
ax2.set_yticks(ax1.get_yticks())
ax2.set_yticklabels([])

plt.tight_layout()


# NOTE(review): a later cell in this notebook saves a different figure to this
# same path, so this file gets overwritten on Run All.
plt.rcParams['svg.fonttype'] = 'none'
plt.savefig('../images/fig3/BGC_heatmap.svg', format='svg', dpi=600, bbox_inches='tight')


plt.show()


In [None]:
# One row per (phylon, family): number of the phylon's genes found in the family's
# BGCs, and the fraction recorded for that family in phylon_bgc_classifications
# (0 when absent). Rows are collected first and the frame is built once, instead
# of appending with .loc[len(df)] inside the loop.
plotting_records = [
    (bgc, key, len(phylon_genes_by_fam[key][bgc]), phylon_bgc_classifications[key].get(bgc, 0))
    for key in phylon_genes_by_fam.keys()
    for bgc in genes_per_fam.keys()
]
plotting_data = pd.DataFrame(plotting_records, columns=['BGC', 'Phylon', 'Number_Genes', 'Pct'])
plotting_data['BGC'] = plotting_data['BGC'].astype('category')
plotting_data['Phylon'] = plotting_data['Phylon'].astype('category')

In [None]:
# Families that have at least one gene
filtered_bgcs = [bgc for bgc in genes_per_fam.keys() if len(genes_per_fam[bgc]) > 0]

# Lowest fraction among the phylons in which a family occurs, or 0 when it occurs
# in none. min(..., default=0) covers the empty case that the original handled
# with a bare try/except.
bgc_min_percentage = {}
for fam in bgc_clusters['fam_id_0.30'].unique():
    bgc_min_percentage[fam] = min(
        (d[fam] for d in phylon_bgc_classifications.values() if fam in d.keys()),
        default=0,
    )

# Keep families whose minimum fraction exceeds 0.25
filtered_bgcs = [bgc for bgc in filtered_bgcs if bgc_min_percentage[bgc] > .25]

In [None]:
custom_colors = [
    # Shades of red/orange/yellow
    "Red",
    "IndianRed",
    "DarkRed",
    "FireBrick",
    "Tomato",
    "Gold",
    "DarkGoldenrod",
    "Goldenrod",
    # Other species
    "Green",
    "Blue",
    "Purple",
    "Cyan",
    "Magenta",
    "Lime",
    "Pink",
]

In [None]:
plotting_data_filt = plotting_data[plotting_data.BGC.isin(filtered_bgcs)]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors

# Create a custom colormap that starts with white and transitions to the desired colors
cmap = mcolors.LinearSegmentedColormap.from_list("custom_white_to_orange", ["white", "#550044", "#BB0000"])

# Generate the clustermap with a colorbar
clustermap = sns.clustermap(
    plotting_data_filt.pivot(index='BGC', columns='Phylon', values='Pct')[characterized_order], 
    cmap=cmap, col_colors=list(clr.values()), row_cluster=False, col_cluster=False, figsize = (6,7)
)

# Add the colorbar alongside the heatmap
clustermap.cax.set_position([.95, 0.25, 0.02, .578])  # Adjust these values as needed for positioning

# Move y-axis labels to the left and remove x-axis labels and ticks
clustermap.ax_heatmap.yaxis.set_ticks_position('left')
clustermap.ax_heatmap.yaxis.set_label_position('left')
clustermap.ax_heatmap.set_xticks([])  # Remove x-axis ticks
clustermap.ax_heatmap.set_xticklabels([])  # Remove x-axis labels

# Save and display
plt.rcParams['svg.fonttype'] = 'none'
plt.savefig('../images/fig3/BGC_heatmap.svg', format='svg', dpi=600, bbox_inches='tight')
plt.show()


# VFDB Analysis

In [None]:
# Import the tab-separated BLAST results (no header row) and name the 12 columns
blast_results = pd.read_csv('../../data/blastdbs/vfdb.txt', sep='\t', header=None)

blast_results.columns = ['query', 'target', 'identity', 'len', 'mismatch', 'gapopen', 'qstart', 'qend', 'tstart', 'tend',
                                     'eval', 'bitscore']
# Keep hits above 80 identity. .copy() makes the filtered frame independent, so
# the column assignments below do not act on a slice of the unfiltered frame
# (which triggers SettingWithCopyWarning).
blast_results = blast_results[blast_results.identity > 80].copy()
# Query IDs: keep the text before the first 'A'
blast_results['query'] = blast_results['query'].apply(lambda x: x.split('A')[0])
# Target IDs: keep the text before the first '('
blast_results['target'] = blast_results['target'].apply(lambda x: x.split('(')[0])
# One row per query: the hit with the smallest 'eval' value
blast_results = blast_results.sort_values(by=['query', 'eval'], ascending=[True, True])
blast_results  = blast_results.drop_duplicates(subset='query', keep='first')

def parse_fasta(file_path):
    """Collect the VF / VFC labels attached to each VFG identifier in a FASTA file.

    Only header lines (those starting with '>') are inspected. A header that
    contains no ``VFG`` + 6 digits identifier is ignored. If the same VFG
    identifier occurs in several headers, the last one wins.

    Parameters
    ----------
    file_path : str
        Path of the FASTA file to read.

    Returns
    -------
    dict
        ``{vfg_id: (vf_id, vfc_id)}``. ``vf_id`` is the first ``VF`` + 4-6
        digits match in the header, or ``"Unknown VF"`` when there is none;
        ``vfc_id`` is the first ``VFC`` + 4-6 digits match, or
        ``"Unknown VFC"``.
    """
    vfg_regex = re.compile(r"VFG\d{6}")
    vf_regex = re.compile(r"VF\d{4,6}")
    vfc_regex = re.compile(r"VFC\d{4,6}")

    labels_by_vfg = {}
    with open(file_path, "r") as handle:
        for header in handle:
            # Sequence lines carry no labels.
            if not header.startswith(">"):
                continue

            vfg_hit = vfg_regex.search(header)
            if vfg_hit is None:
                continue

            vf_hit = vf_regex.search(header)
            vfc_hit = vfc_regex.search(header)
            labels_by_vfg[vfg_hit.group(0)] = (
                vf_hit.group(0) if vf_hit else "Unknown VF",
                vfc_hit.group(0) if vfc_hit else "Unknown VFC",
            )

    return labels_by_vfg

# Build the VFG -> (VF, VFC) lookup from the headers of the VFDB protein FASTA.
file_path = "../../data/blastdbs/vfdb/VFDB_setA_pro.fas"
mapped_data = parse_fasta(file_path)

# VF annotation table: the first row of the CSV is skipped and the first
# remaining column becomes the index.
vf_annots = pd.read_csv('../../data/blastdbs/vfdb/VFs.csv', skiprows=1, index_col=0)

# VF identifier of every hit (first element of the lookup tuple). A target that
# is missing from mapped_data raises KeyError.
blast_results['vfid'] = blast_results['target'].apply(lambda vfg_id: mapped_data[vfg_id][0])

# Pull the category and name of each VF out of the annotation table. A vfid
# that is absent from vf_annots' index raises KeyError.
for hit_column, annot_column in (('category', 'VFcategory'), ('vfname', 'VF_Name')):
    blast_results[hit_column] = blast_results['vfid'].apply(
        lambda vf_id, column=annot_column: vf_annots.loc[vf_id, column]
    )

# Bakta 'Name' of every query gene, in row order.
blast_results['gene_name'] = [
    bakta_annotations.loc[gene_id, 'Name'] for gene_id in blast_results['query']
]

In [None]:
# Accessory-genome genes whose eggNOG Description contains 'pil'
# (case-insensitive substring match).
# NOTE(review): a union with genes whose Bakta `Product` contains 'pil' used to
# be computed here and was then immediately overwritten by this eggNOG-only
# selection, so it never reached the plot. The dead computation was removed;
# reinstate the union at this point if Bakta matches were meant to count.
eggnog_acc = df_eggnog.loc[df_acc.index]
pili_genes_acc = eggnog_acc[eggnog_acc.Description.str.contains('pil', case=False)].index

# Binarized L matrix restricted to those genes and the characterized phylons,
# dropping genes that are absent from every one of those phylons.
L_temp = L_binarized.loc[pili_genes_acc, characterized_order].copy()
L_temp = L_temp[L_temp.sum(axis=1) > 0]
# Relabel the rows with the Bakta gene name. This only changes row labels; the
# bar plot below uses column sums and is unaffected.
L_temp.index = bakta_annotations.loc[L_temp.index, 'Name']

# One bar per phylon: the number of selected genes present in that phylon.
pili_counts = L_temp.sum().reset_index()
pili_ax = sns.barplot(data=pili_counts, x='index', y=0, hue='index', palette=custom_colors)
pili_ax.set_xticks([])
pili_ax.set_xlabel("Phylon")
pili_ax.set_ylabel('Number of Pili-Associated Genes')
pili_ax.set_title('Pili-Associated Genes Across Phylons');


In [None]:
# VFDB hits keyed by gene ID; built once and reused for every lookup below.
vf_hits = blast_results.set_index('query')

# Genes with a VFDB hit that are in df_acc's index, ordered by VF category.
inds = vf_hits.loc[vf_hits.index.isin(df_acc.index)].sort_values('category').index

# Binarized L matrix for those genes over the characterized phylons, dropping
# genes that are absent from every one of those phylons.
L_temp = L_binarized.loc[inds, characterized_order].copy()
L_temp = L_temp[L_temp.sum(axis=1) > 0]

# Annotations in exactly L_temp's row order. They are deliberately NOT
# re-sorted: the row colours and row labels below are attached by position, and
# sort_values' default algorithm is not stable, so sorting the already
# category-ordered rows again could permute genes within a category and pin a
# virulence-factor name on the wrong row.
row_annots = vf_hits.loc[L_temp.index]

# One colour per category, in order of first appearance down the rows.
# NOTE(review): "tab10" has 10 distinct colours; seaborn cycles them when more
# are requested -- confirm the number of categories stays within that.
unique_categories = row_annots.category.unique()
palette = sns.color_palette("tab10", len(unique_categories))
category_color_map = dict(zip(unique_categories, palette))

# Colour of every row, then relabel the rows with the virulence-factor name.
# The colours must be computed before the relabelling because the lookup above
# is keyed by gene ID.
row_colors = row_annots.category.map(category_color_map)
L_temp.index = row_annots.vfname

# Rows are clustered, columns keep the characterized_order; no colorbar is drawn.
# row_colors is passed as a plain array so seaborn matches it to rows by
# position rather than by index label.
g = sns.clustermap(
    L_temp,
    cmap='Greys',
    col_cluster=False,
    figsize=(14, 14),
    row_cluster=True,
    row_colors=row_colors.values,
    cbar_pos=None,
    col_colors=custom_colors
)

# Label the heatmap axes explicitly instead of relying on the implicit
# "current axes" of the pyplot state machine.
g.ax_heatmap.set_ylabel("Genes By Virulence Factor Name", fontsize=15)
g.ax_heatmap.set_xlabel("Phylons", fontsize=15)
g.ax_heatmap.set_title("Virulence Factors by Phylon", y=1.05, fontsize=20)

# Remove x-ticks
g.ax_heatmap.set_xticks([])

# Legend for the row colours (VF categories), drawn in its own invisible axes.
row_legend_patches = [
    mpatches.Patch(color=color, label=category)
    for category, color in category_color_map.items()
]
row_legend_ax = g.fig.add_axes([0.02, 0.3, 0.1, 0.1])  # Adjust position as needed
row_legend_ax.axis("off")
row_legend_ax.legend(
    handles=row_legend_patches,
    title="Row Categories",
    loc='center',
    fontsize=12
)

# Legend for the column colours: phylon names paired with custom_colors by position.
column_unique_categories = characterized_order
column_palette = custom_colors
column_color_map = dict(zip(column_unique_categories, column_palette))
column_legend_patches = [
    mpatches.Patch(color=color, label=category)
    for category, color in column_color_map.items()
]
column_legend_ax = g.fig.add_axes([0.02, 0.5, 0.1, 0.1])  # Adjust position as needed
column_legend_ax.axis("off")
column_legend_ax.legend(
    handles=column_legend_patches,
    title="Phylons",
    loc='center',
    fontsize=12
)


plt.show()

# Hormaechei core gene analysis and visualization

In [None]:
# Genes found in every phylon whose name contains 'hormaechei' and in none of
# the other characterized phylons. The 'hormaechei-hormaechei' gene set seeds
# the candidates; every phylon then either narrows or prunes that set.
hormaechei_core_genes = set(get_phylon_genes('hormaechei-hormaechei'))
for phylon in characterized:
    phylon_genes = set(get_phylon_genes(phylon))
    if 'hormaechei' not in phylon:
        # Any gene seen in a non-hormaechei phylon is disqualified.
        hormaechei_core_genes -= phylon_genes
    else:
        # A candidate must be present in each hormaechei phylon.
        hormaechei_core_genes &= phylon_genes

In [None]:
# Per characterized phylon, count the genes whose eggNOG COG_category starts
# with 'N' (the bar chart that follows labels these as motility-associated).
# NOTE(review): only the first letter of COG_category is checked, so a
# multi-letter category with 'N' in a later position is not counted -- confirm
# that is intended.
motility_counts = {}
for phylon in characterized:
    cog_first_letter = df_eggnog.loc[get_phylon_genes(phylon)].COG_category.str[0]
    # Comparing and summing yields 0 for a phylon with no 'N' gene, where
    # indexing value_counts() with 'N' would raise KeyError.
    motility_counts[phylon] = int((cog_first_letter == 'N').sum())

# Build the Series in one go with an explicit integer dtype instead of growing
# a dtype-less empty Series one element at a time.
n_genes = pd.Series(motility_counts, dtype='int64')

In [None]:
# Bar chart of the per-phylon counts in n_genes, one bar per phylon, coloured
# with custom_colors; x ticks are hidden.
motility_ax = plt.gca()
motility_ax.bar(n_genes.index, n_genes.values, color=custom_colors)
motility_ax.set_xticks([])
motility_ax.set_title('Number of Motility Associated Genes')