# Setup

In [None]:
# Base imports
import os
import pickle

# Compute imports
import numpy as np
import pandas as pd

import scipy
from scipy import spatial as sp
from scipy.spatial.distance import squareform
from scipy.sparse import csr_matrix
from scipy.cluster import hierarchy as hc
from scipy.cluster.hierarchy import cophenet


from tqdm.notebook import tqdm, trange

# Plotting imports
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
from matplotlib import pyplot as plt
import seaborn as sns
from plotly import express as px

# ML import
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, median_absolute_error

In [None]:
# Load the pickled gene table, treat missing entries as 0, then convert the
# sparse frame to a dense one stored as int8.
df_genes = (
    pd.read_pickle('../../data/processed/cd-hit-results/sim80/Ebacter_strain_by_gene.pickle.gz')
    .fillna(0)
    .sparse.to_dense()
    .astype('int8')
)

df_genes.shape

In [None]:
# Load the full metadata table; every column is read as dtype 'object' and
# the first CSV column becomes the index.
metadata = pd.read_csv(
    '../../data/metadata/mash_scrubbed_species_metadata.csv',
    index_col=0,
    dtype='object',
)

# Keep only the genomes whose status is 'Complete'
metadata_complete = metadata.loc[metadata['genome_status'] == 'Complete']

# Restrict the P matrix to those genomes (columns) ...
df_genes_complete = df_genes[metadata_complete['genome_id']]
# ... and drop the genes (rows) that occur in none of them
inCompleteseqs = df_genes_complete.sum(axis=1) > 0
df_genes_complete = df_genes_complete.loc[inCompleteseqs]

df_genes_complete.shape

In [None]:
# Select genomes that have too many plasmids (more than 20) and remove them
# from both the metadata table and the gene matrix.
# NOTE(review): df_genes_complete and metadata_complete are derived before
# this cell runs, so this filter does not reach them — confirm that this is
# intended.
bad_genomes = metadata.loc[(metadata.plasmids.apply(float) > 20)]
metadata = metadata.drop(bad_genomes.index)
df_genes = df_genes.drop(bad_genomes.genome_id, axis=1)
df_genes = df_genes[df_genes.sum(axis=1) > 0]  # drop genes whose row sum is now 0

In [None]:
# Sparse views of the P matrix: a pandas sparse frame (zeros are implicit),
# its COO export, and a CSR matrix built from that export
df_genes_complete_sparse = df_genes_complete.astype(
    pd.SparseDtype("int8", 0)
)

coo_genes = df_genes_complete_sparse.sparse.to_coo()
csr_genes = csr_matrix(coo_genes)
csr_genes

In [None]:
# Row sums of the sparse P matrix: one value per gene, i.e. a count over the
# complete genomes (not a percentage)
gene_row_sums = csr_genes.sum(axis=1)
df_genes_freq = pd.DataFrame(
    data=gene_row_sums,
    index=df_genes_complete_sparse.index,
    columns=['freq'],
)['freq']
df_genes_freq.sort_values()

## Full accessory genome

In [None]:
# Import (full) accessory genome
df_acc_complete = pd.read_pickle('../../data/processed/CAR_genomes/df_acc_complete.pickle')
df_acc_complete

In [None]:
# Find cutoff frequency between accessory & rare genomes
acc_min_freq = 100 * df_acc_complete.sum(axis=1).min() / df_genes_complete.shape[1]
acc_min_freq

In [None]:
# Find cutoff frequency between accessory & core genomes
acc_max_freq = 100 * df_acc_complete.sum(axis=1).max() / df_genes_complete.shape[1]
acc_max_freq

## Reduced accessory genome (min to 75%)

In [None]:
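######## NOTE FROM JOSH: is this reducing the min to 75% or the max to 75%??
######## ANSWER: the next cell keeps the accessory minimum as the lower bound (cond1) and caps the
######## upper bound at 75% of the columns of df_acc_complete (cond2), i.e. it lowers the MAX to 75%.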

In [None]:
# Frequency window for the reduced accessory genome.
# df_genes_freq holds per-gene row sums (genome counts), whereas acc_min_freq
# is a percentage, so comparing the two directly made the lower bound too
# permissive. Both bounds are now expressed as counts.
# assumes the row sums of df_acc_complete are genome counts — TODO confirm
acc_min_count = df_acc_complete.sum(axis=1).min()

cond1 = df_genes_freq >= acc_min_count                    # at least as frequent as the rarest accessory gene
cond2 = df_genes_freq <= df_acc_complete.shape[1] * 0.75  # present in at most 75% of the genomes

df_acc_75 = df_genes_complete.loc[df_genes_freq[cond1 & cond2].sort_values().index]
df_acc_75

## Infrequent accessory genome (min to 50%)

In [None]:
# Frequency window for the infrequent accessory genome.
# df_genes_freq holds per-gene row sums (genome counts), whereas acc_min_freq
# is a percentage, so comparing the two directly made the lower bound too
# permissive. Both bounds are now expressed as counts.
# assumes the row sums of df_acc_complete are genome counts — TODO confirm
acc_min_count = df_acc_complete.sum(axis=1).min()

cond1 = df_genes_freq >= acc_min_count                   # at least as frequent as the rarest accessory gene
cond2 = df_genes_freq <= df_acc_complete.shape[1] * 0.5  # present in at most 50% of the genomes

df_acc_50 = df_genes_complete.loc[df_genes_freq[cond1 & cond2].sort_values().index]
df_acc_50

## Sparse accessory genome (min to 25%)

In [None]:
# Frequency window for the sparse accessory genome.
# df_genes_freq holds per-gene row sums (genome counts), whereas acc_min_freq
# is a percentage, so comparing the two directly made the lower bound too
# permissive. Both bounds are now expressed as counts.
# assumes the row sums of df_acc_complete are genome counts — TODO confirm
acc_min_count = df_acc_complete.sum(axis=1).min()

cond1 = df_genes_freq >= acc_min_count                    # at least as frequent as the rarest accessory gene
cond2 = df_genes_freq <= df_acc_complete.shape[1] * 0.25  # present in at most 25% of the genomes

df_acc_25 = df_genes_complete.loc[df_genes_freq[cond1 & cond2].sort_values().index]
df_acc_25

# NMF decomposition

In [None]:
RANK = 19  # Enter your rank here (from Mash clustering, notebook 2b)

## Useful functions

In [None]:
# Util functions for consensus clustering (from Nimfa)
from operator import mul, eq, ne, add, ge, le, itemgetter
from operator import truediv as div

def argmax(X, axis=None):
    """Return the maximal element(s) of ``X`` together with their position(s).

    :param X: 2-D array-like that supports ``X[row, col]`` indexing and ``.shape``.
    :param axis: ``None`` for the overall maximum, ``0`` for one maximum per
        column, any other value for one maximum per row.
    :returns: ``(values, indices)``. ``indices`` is whatever
        ``np.asmatrix(X).argmax(axis)`` returns (a flat index when ``axis`` is
        ``None``); ``values`` is a single element in that case and a list otherwise.
    """
    idxX = np.asmatrix(X).argmax(axis)

    if axis is None:
        # Turn the flat index back into (row, column) coordinates
        row, col = divmod(idxX, X.shape[1])
        return X[row, col], idxX

    if axis == 0:
        per_column = [X[idxX[0, col], col] for col in range(X.shape[1])]
        return per_column, idxX

    per_row = [X[row, idxX[row, 0]] for row in range(X.shape[0])]
    return per_row, idxX


def repmat(X, m, n):
    """Tile ``X`` (viewed as a 2-D matrix) ``m`` times vertically and ``n`` times horizontally."""
    as_matrix = np.asmatrix(X)
    return np.tile(as_matrix, reps=(m, n))


def elop(X, Y, op):
    """Apply the binary operator ``op`` to ``X`` and ``Y`` viewed as matrices.

    For floating-point inputs, entries that are exactly zero are first replaced
    **in place** by the machine epsilon of the array's dtype (presumably so that
    division-like operators never see an exact zero). Inputs whose dtype
    ``np.finfo`` rejects (e.g. integers) are used unchanged.

    :param X: left operand (NumPy array or matrix); may be modified in place.
    :param Y: right operand (NumPy array or matrix); may be modified in place.
    :param op: binary callable, e.g. a function from the ``operator`` module.
    :returns: ``op(np.asmatrix(X), np.asmatrix(Y))``.
    """
    try:
        X[X == 0] = np.finfo(X.dtype).eps
        Y[Y == 0] = np.finfo(Y.dtype).eps
    except ValueError:
        # np.finfo raises ValueError for non-float dtypes: keep such operands as is
        pass

    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 no longer provides
    return op(np.asmatrix(X), np.asmatrix(Y))

In [None]:
def connectivity(X, H):
    """
    Compute the connectivity matrix for the samples based on their mixture coefficients.

    The connectivity matrix C is a symmetric matrix which shows the shared membership of the samples:
    entry C_ij is 1 iff sample i and sample j belong to the same cluster, 0 otherwise. A sample is
    assigned to the component (row of ``H``) in which it has its largest coefficient; ties go to the
    first such row.

    :param X: Matrix that was factorized. Only ``X.shape[1]`` (the number of samples) is read.
    :param H: Mixture-coefficient matrix with one row per component and one column per sample.
    :returns: ``numpy.matrix`` of dtype float64 and shape ``(n_samples, n_samples)``.
    :raises ValueError: if ``X`` and ``H`` do not have the same number of columns.
    """
    # Cluster label of every sample = row index of its largest coefficient
    labels = np.asarray(H).argmax(axis=0)

    if X.shape[1] != labels.shape[0]:
        raise ValueError(
            f'X has {X.shape[1]} columns but H has {labels.shape[0]}; '
            'both must describe the same samples'
        )

    # Pairwise comparison of the labels: True where two samples share a cluster
    conn = labels[:, np.newaxis] == labels[np.newaxis, :]

    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 no longer provides
    return np.asmatrix(conn, dtype='d')


## Consensus model 1: Full accessory genome

__Main model we will be working with__

In [None]:
# Input matrix
X = df_acc_complete

# Rank (determined by Mash)
rank = RANK

# Number of NMF runs. The old comment said "50 times" while the loop was
# hard-coded to 3; the count now lives in one place.
n_runs = 3

# Error metrics of every run (list of dicts, turned into a DataFrame below)
nmf_errors = []

# Basis matrices W (phylons) of every run, keyed by the 0-based run number
W_dict = {}

# Coefficient matrices H (affinities) of every run, keyed the same way
H_dict = {}

# Component labels are identical for every run, so build them once
# (a separate index name avoids shadowing the run counter `i`)
init_names = [f'phylon{k}' for k in range(rank)]

# NOTE(review): with init='nndsvd' the initialisation is deterministic, and
# scikit-learn documents random_state as affecting only the 'nndsvdar'/'random'
# initialisations and coordinate descent. The runs may therefore be identical,
# which would make the consensus step uninformative — confirm.
for i in trange(n_runs):
    # Initialize NMF object
    model = NMF(
        n_components=rank,
        init='nndsvd',
        max_iter=5_000,
        random_state=i
    )

    # Fit and transform the model
    W = model.fit_transform(X)
    H = model.components_

    # Typecast as DataFrames and store them
    W = pd.DataFrame(W, index=X.index, columns=init_names)
    H = pd.DataFrame(H, index=init_names, columns=X.columns)
    W_dict[i] = W
    H_dict[i] = H

    # Reconstruct the input and take the residual
    X_reconstructed = np.dot(W, H)
    X_diff = X - X_reconstructed

    # Calculate errors
    ssr = np.sum(X_diff**2, axis=0).sum(axis=0)
    frobenius = np.linalg.norm(X_diff, 'fro')
    mae = median_absolute_error(X, X_reconstructed)  # note: *median* absolute error
    rmse = np.sqrt(mean_squared_error(X, X_reconstructed))

    nmf_errors.append({
        'Run': i + 1,  # run labels are 1-based, dictionary keys are 0-based
        'SSR': ssr,
        'Frobenius': frobenius,
        'MAE': mae,
        'RMSE': rmse,
    })

# Typecast to DataFrame
df_nmf_errors = pd.DataFrame(nmf_errors).set_index('Run')

In [None]:
# One connectivity matrix per run, stored under the same keys as H_dict
conn_dict = {
    run: connectivity(X.values, coef.values)
    for run, coef in H_dict.items()
}

In [None]:
# Consensus matrix = element-wise average of the connectivity matrices of all runs
consensus_matrix = np.zeros(shape=conn_dict[0].shape)
for conn_matrix in conn_dict.values():
    consensus_matrix += conn_matrix
consensus_matrix /= len(conn_dict)

# Label rows and columns with the columns of the input matrix
df_consensus_matrix = pd.DataFrame(
    consensus_matrix,
    index=X.columns,
    columns=X.columns,
)
df_consensus_matrix

In [None]:
# Cut-off as a fraction of the largest pairwise distance; change it to get a
# different number of clusters (minimum acceptable value for robust clusters = 50%)
thresh = 0.5

# Consensus values are similarities, so 1 - consensus is the distance
df_consensus_dist = 1 - df_consensus_matrix

# Condensed distance vector, computed once and reused below
dist = scipy.spatial.distance.squareform(df_consensus_dist)

# Change `method` to get a different linkage
link = hc.linkage(dist, method='ward')

# Flat clusters from cutting the tree at thresh * (maximum distance)
consensus_clst = pd.DataFrame(index=X.columns)
consensus_clst['cluster'] = hc.fcluster(link, thresh * dist.max(), 'distance')

In [None]:
# # Bar plot showing sizes of each consensus strain cluster (at thresh = 0.5)
# sns.barplot(
#     x=consensus_clst.cluster.value_counts().sort_index().index,
#     y=consensus_clst.cluster.value_counts().sort_index().values
# )

In [None]:
# Bar plot showing sizes of each consensus strain cluster (at thresh = 0.5)
cluster_sizes = consensus_clst.cluster.value_counts().sort_index()
px.bar(x=cluster_sizes.index, y=cluster_sizes.values)

In [None]:
# Give every NMF cluster a color from the concatenated tab20b + tab20c palettes.
# zip() stops at the shorter input, so clusters beyond the palette length
# would be left without a color; the two printed counts reveal that case.

#cm = matplotlib.colormaps.get_cmap('tab20')
cmb = matplotlib.colormaps.get_cmap('tab20b')
cmc = matplotlib.colormaps.get_cmap('tab20c')
cm_colors = cmb.colors + cmc.colors

cluster_ids = sorted(consensus_clst.cluster.unique())
consensus_clr = dict(zip(cluster_ids, cm_colors))
consensus_clst['color'] = consensus_clst.cluster.map(consensus_clr)

print('Number of colors: ', len(consensus_clr))
print('Number of clusters', len(cluster_ids))

In [None]:
size = 9

#legend_TN = [patches.Patch(color=c, label=l) for l,c in mash_color_dict_31.items()] # Mash cluster for legend

sns.set(rc={'figure.facecolor':'white'})
g = sns.clustermap(
    df_consensus_matrix,
    figsize=(size,size),
    row_linkage=link,
    #row_colors=phylogroup_clst.color, # Phylogroup colors on left
    col_linkage=link,
    #col_colors=clst.color, # Mash cluster on top
    yticklabels=False,
    xticklabels=False,
    cmap='hot_r'
)

#l2=g.ax_heatmap.legend(loc='upper left', bbox_to_anchor=(1.01,0.75), handles=legend_TN, frameon=True)
#l2.set_title(title='Mash cluster',prop={'size':10})

In [None]:
# Strict upper triangle of the consensus matrix, row by row. Fancy indexing
# with np.triu_indices yields the same values in the same order as the former
# nested Python loops, without the O(n^2) interpreter overhead.
upper = np.triu_indices(consensus_matrix.shape[0], k=1)
avec = np.asarray(consensus_matrix)[upper]

# consensus entries are similarities, conversion to distances
Y = 1 - avec
Z = hc.linkage(Y, method='ward')

# cophenetic correlation coefficient of the hierarchical clustering defined
# by the linkage matrix Z and the condensed distances Y it was built from
coph_cor, _ = cophenet(Z, Y)

coph_cor # Cophenetic correlation of consensus matrix (ideally 0.7 or higher)

In [None]:
# Dispersion: mean over all entries of 4 * (consensus - 0.5)^2
centered = consensus_matrix - 0.5
dispersion = np.sum(4 * np.multiply(centered, centered)) / consensus_matrix.size

dispersion # Dispersion coefficient of consensus matrix

## Consensus model 2: Reduced accessory

In [None]:
# Input matrix
X = df_acc_75.copy()

X.shape

In [None]:
# Number of NMF runs (~3 min for 50 runs)
n_runs = 3

# Rank of NMF (Mash rank for complete strains)
rank = RANK

# Max iterations per run
max_iter = 5_000

# Per-run W and H matrices, fitted models, and error records (keyed by run number)
W_dict_red, H_dict_red = {}, {}
nmf_red_dict = {}
nmf_red_errors = []  # becomes a pandas DataFrame after the loop

# Run NMF n_runs times, keeping the factors, the model and the errors of each run
for i in trange(n_runs):
    nmf_red = NMF(
        n_components=rank,
        init='nndsvd', # gives sparser basis matrix
        max_iter=max_iter,
        random_state=i + 731,
    )
    W = nmf_red.fit_transform(X) # basis matrix (gene groupings)
    H = nmf_red.components_ # coefficients matrix (strain groupings)
    W_dict_red[i], H_dict_red[i] = W, H
    nmf_red_dict[i] = nmf_red

    # Reconstruction with the same labels as X, and its residual
    X_approx = pd.DataFrame(np.dot(W, H), index=X.index, columns=X.columns)
    residual = X - X_approx

    nmf_red_errors.append({
        'run': i,
        'rmse': np.sqrt(mean_squared_error(X, X_approx)),
        'mae': median_absolute_error(X, X_approx),
        'fro': np.linalg.norm(residual),
        'ssr': np.square(residual).values.flatten().sum(),
    })

nmf_red_errors = pd.DataFrame(nmf_red_errors)

In [None]:
# One connectivity matrix per run, stored under the same keys as H_dict_red
conn_dict_red = {
    run: connectivity(X, coef)
    for run, coef in H_dict_red.items()
}

In [None]:
# Consensus matrix = element-wise average of the connectivity matrices of all runs
consensus_matrix_red = np.zeros(shape=conn_dict_red[0].shape)
for conn_matrix in conn_dict_red.values():
    consensus_matrix_red += conn_matrix
consensus_matrix_red /= len(conn_dict_red)

# Label rows and columns with the columns of the input matrix
df_consensus_matrix_red = pd.DataFrame(
    consensus_matrix_red,
    index=X.columns,
    columns=X.columns,
)
df_consensus_matrix_red

In [None]:
# Cut-off as a fraction of the largest pairwise distance; change it to get a
# different number of clusters (minimum acceptable value for robust clusters = 50%)
thresh = 0.5

# Consensus values are similarities, so 1 - consensus is the distance
df_consensus_dist_red = 1 - df_consensus_matrix_red

# Condensed distance vector, computed once and reused below
dist = scipy.spatial.distance.squareform(df_consensus_dist_red)

# Change `method` to get a different linkage
link = hc.linkage(dist, method='ward')

# Flat clusters from cutting the tree at thresh * (maximum distance)
consensus_clst_red = pd.DataFrame(index=X.columns)
consensus_clst_red['cluster'] = hc.fcluster(link, thresh * dist.max(), 'distance')

In [None]:
# # Bar plot showing sizes of each consensus NMF cluster
# sns.barplot(
#     x=consensus_clst_red.cluster.value_counts().sort_index().index,
#     y=consensus_clst_red.cluster.value_counts().sort_index().values
# )

In [None]:
# Bar plot showing sizes of each consensus NMF cluster
cluster_sizes_red = consensus_clst_red.cluster.value_counts().sort_index()
px.bar(x=cluster_sizes_red.index, y=cluster_sizes_red.values)

In [None]:
# Give every NMF cluster a color from the concatenated tab20b + tab20c palettes.
# zip() stops at the shorter input, so clusters beyond the palette length
# would be left without a color; the two printed counts reveal that case.

#cm = matplotlib.colormaps.get_cmap('tab20')
cmb = matplotlib.colormaps.get_cmap('tab20b')
cmc = matplotlib.colormaps.get_cmap('tab20c')
cm_colors = cmb.colors + cmc.colors

cluster_ids_red = sorted(consensus_clst_red.cluster.unique())
consensus_clr_red = dict(zip(cluster_ids_red, cm_colors))
consensus_clst_red['color'] = consensus_clst_red.cluster.map(consensus_clr_red)

print('Number of colors: ', len(consensus_clr_red))
print('Number of clusters', len(cluster_ids_red))

In [None]:
size = 9

#legend_TN = [patches.Patch(color=c, label=l) for l,c in mash_color_dict_31.items()] # Mash cluster for legend

sns.set(rc={'figure.facecolor':'white'})
g = sns.clustermap(
    df_consensus_matrix_red,
    figsize=(size,size),
    row_linkage=link,
    #row_colors=phylogroup_clst.color, # Phylogroup colors on left
    col_linkage=link,
    #col_colors=clst.color, # Mash cluster on top
    yticklabels=False,
    xticklabels=False,
    cmap='hot_r'
)

#l2=g.ax_heatmap.legend(loc='upper left', bbox_to_anchor=(1.01,0.85), handles=legend_TN,frameon=True)
#l2.set_title(title='Mash cluster',prop={'size':10})

In [None]:
# Strict upper triangle of the consensus matrix, row by row. Fancy indexing
# with np.triu_indices yields the same values in the same order as the former
# nested Python loops, without the O(n^2) interpreter overhead.
upper = np.triu_indices(df_consensus_matrix_red.shape[0], k=1)
avec = df_consensus_matrix_red.values[upper]

# consensus entries are similarities, conversion to distances
Y = 1 - avec
Z = hc.linkage(Y, method='ward')

# cophenetic correlation coefficient of the hierarchical clustering defined
# by the linkage matrix Z and the condensed distances Y it was built from
coph_cor_red, _ = cophenet(Z, Y)

coph_cor_red # Cophenetic correlation of reduced consensus matrix

In [None]:
# Dispersion: mean over all entries of 4 * (consensus - 0.5)^2
centered_red = df_consensus_matrix_red.values - 0.5
dispersion_red = (
    np.sum(4 * np.multiply(centered_red, centered_red))
    / consensus_matrix_red.size  # same size as df_consensus_matrix_red
)

dispersion_red # Dispersion coefficient of consensus matrix

## Consensus model 3: Infrequent accessory

In [None]:
# Input matrix
X = df_acc_50.copy()

X.shape

In [None]:
# Number of NMF runs (~1 min for 50 runs)
n_runs = 3

# Rank of NMF (Mash rank for complete strains)
rank = RANK

# Max iterations per run
max_iter = 5_000

# Per-run W and H matrices, fitted models, and error records (keyed by run number)
W_dict_min, H_dict_min = {}, {}
nmf_min_dict = {}
nmf_min_errors = []  # becomes a pandas DataFrame after the loop

# Run NMF n_runs times, keeping the factors, the model and the errors of each run
for i in trange(n_runs):
    nmf_min = NMF(
        n_components=rank,
        init='nndsvd', # gives sparser basis matrix
        max_iter=max_iter,
        random_state=i + 15,
    )
    W = nmf_min.fit_transform(X) # basis matrix (gene groupings)
    H = nmf_min.components_ # coefficients matrix (strain groupings)
    W_dict_min[i], H_dict_min[i] = W, H
    nmf_min_dict[i] = nmf_min

    # Reconstruction with the same labels as X, and its residual
    X_approx = pd.DataFrame(np.dot(W, H), index=X.index, columns=X.columns)
    residual = X - X_approx

    nmf_min_errors.append({
        'run': i,
        'rmse': np.sqrt(mean_squared_error(X, X_approx)),
        'mae': median_absolute_error(X, X_approx),
        'fro': np.linalg.norm(residual),
        'ssr': np.square(residual).values.flatten().sum(),
    })

nmf_min_errors = pd.DataFrame(nmf_min_errors)

In [None]:
# One connectivity matrix per run, stored under the same keys as H_dict_min
conn_dict_min = {
    run: connectivity(X, coef)
    for run, coef in H_dict_min.items()
}

In [None]:
# Consensus matrix = element-wise average of the connectivity matrices of all runs
consensus_matrix_min = np.zeros(shape=conn_dict_min[0].shape)
for conn_matrix in conn_dict_min.values():
    consensus_matrix_min += conn_matrix

# Average over THIS model's runs. The divisor used to be len(conn_dict), the
# run count of the main model, which is only right while both run counts match.
consensus_matrix_min /= len(conn_dict_min)

# Label rows and columns with the columns of the input matrix
df_consensus_matrix_min = pd.DataFrame(consensus_matrix_min, index=X.columns, columns=X.columns)
df_consensus_matrix_min

In [None]:
# Cut-off as a fraction of the largest pairwise distance; change it to get a
# different number of clusters (minimum acceptable value for robust clusters = 50%)
thresh = 0.5

# Consensus values are similarities, so 1 - consensus is the distance
df_consensus_dist_min = 1 - df_consensus_matrix_min

# Condensed distance vector, computed once and reused below
dist = scipy.spatial.distance.squareform(df_consensus_dist_min)

# Change `method` to get a different linkage
link = hc.linkage(dist, method='ward')

# Flat clusters from cutting the tree at thresh * (maximum distance)
consensus_clst_min = pd.DataFrame(index=X.columns)
consensus_clst_min['cluster'] = hc.fcluster(link, thresh * dist.max(), 'distance')

In [None]:
# # Bar plot showing sizes of each consensus NMF cluster
# sns.barplot(
#     x=consensus_clst_min.cluster.value_counts().sort_index().index,
#     y=consensus_clst_min.cluster.value_counts().sort_index().values
# )

In [None]:
# Bar plot showing sizes of each consensus NMF cluster
cluster_sizes_min = consensus_clst_min.cluster.value_counts().sort_index()
px.bar(x=cluster_sizes_min.index, y=cluster_sizes_min.values)

In [None]:
# Give every NMF cluster a color from the concatenated tab20b + tab20c palettes.
# zip() stops at the shorter input, so clusters beyond the palette length
# would be left without a color; the two printed counts reveal that case.

#cm = matplotlib.colormaps.get_cmap('tab20')
cmb = matplotlib.colormaps.get_cmap('tab20b')
cmc = matplotlib.colormaps.get_cmap('tab20c')
cm_colors = cmb.colors + cmc.colors

cluster_ids_min = sorted(consensus_clst_min.cluster.unique())
consensus_clr_min = dict(zip(cluster_ids_min, cm_colors))
consensus_clst_min['color'] = consensus_clst_min.cluster.map(consensus_clr_min)

print('Number of colors: ', len(consensus_clr_min))
print('Number of clusters', len(cluster_ids_min))

In [None]:
size = 9

#legend_TN = [patches.Patch(color=c, label=l) for l,c in mash_color_dict_31.items()] # Mash cluster for legend

sns.set(rc={'figure.facecolor':'white'})
g = sns.clustermap(
    df_consensus_matrix_min,
    figsize=(size,size),
    row_linkage=link,
    #row_colors=phylogroup_clst.color, # Phylogroup colors on left
    col_linkage=link,
    #col_colors=clst.color, # Mash cluster on top
    yticklabels=False,
    xticklabels=False,
    cmap='hot_r'
)

#l2=g.ax_heatmap.legend(loc='upper left', bbox_to_anchor=(1.01,0.85), handles=legend_TN,frameon=True)
#l2.set_title(title='Mash cluster',prop={'size':10})

In [None]:
# Strict upper triangle of the consensus matrix, row by row. Fancy indexing
# with np.triu_indices yields the same values in the same order as the former
# nested Python loops, without the O(n^2) interpreter overhead.
upper = np.triu_indices(df_consensus_matrix_min.shape[0], k=1)
avec = df_consensus_matrix_min.values[upper]

# consensus entries are similarities, conversion to distances
Y = 1 - avec
Z = hc.linkage(Y, method='ward')

# cophenetic correlation coefficient of the hierarchical clustering defined
# by the linkage matrix Z and the condensed distances Y it was built from
coph_cor_min, _ = cophenet(Z, Y)

coph_cor_min # Cophenetic correlation of infrequent-accessory consensus matrix

In [None]:
# Dispersion: mean over all entries of 4 * (consensus - 0.5)^2
centered_min = df_consensus_matrix_min.values - 0.5
dispersion_min = (
    np.sum(4 * np.multiply(centered_min, centered_min))
    / consensus_matrix_min.size
)

dispersion_min # Dispersion coefficient of consensus matrix

## Consensus model 4: Sparse accessory

In [None]:
# Input matrix
X = df_acc_25.copy()

X

In [None]:
# Number of NMF runs (~1 min for 50 runs)
n_runs = 3

# Rank of NMF (Mash rank for complete strains).
# This was a hard-coded 31 while every other model uses RANK; it now follows
# RANK so that all four consensus models share one rank.
# NOTE(review): restore a literal here if a different rank was intended.
rank = RANK

# Max iterations per run
max_iter = 5_000

# Per-run W and H matrices, fitted models, and error records (keyed by run number)
W_dict_sparse = {}
H_dict_sparse = {}
nmf_sparse_dict = {}
nmf_sparse_errors = []  # becomes a pandas DataFrame after the loop

# Run NMF n_runs times, keeping the factors, the model and the errors of each run
for i in trange(n_runs):
    nmf_sparse = NMF(
        n_components=rank,
        init='nndsvd', # gives sparser basis matrix
        max_iter=max_iter,
        random_state=i+971
    )
    W = nmf_sparse.fit_transform(X) # basis matrix (gene groupings)
    H = nmf_sparse.components_ # coefficients matrix (strain groupings)
    W_dict_sparse[i] = W
    H_dict_sparse[i] = H

    X_approx = pd.DataFrame(
        np.dot(W, H),
        index=X.index,
        columns=X.columns
    )

    # Store the model fitted in THIS run (used to store nmf_min, a leftover
    # object from the previous model)
    nmf_sparse_dict[i] = nmf_sparse

    # Store error metrics
    residual = X - X_approx
    nmf_sparse_errors.append({
        'run': i,
        'rmse': np.sqrt(mean_squared_error(X, X_approx)),
        'mae': median_absolute_error(X, X_approx),
        'fro': np.linalg.norm(residual),
        'ssr': np.square(residual).values.flatten().sum(),
    })

# Build the table from THIS model's records (used to be built from
# nmf_min_errors, which silently discarded the errors collected above)
nmf_sparse_errors = pd.DataFrame(nmf_sparse_errors)

In [None]:
# One connectivity matrix per run, stored under the same keys as H_dict_sparse
conn_dict_sparse = {
    run: connectivity(X, coef)
    for run, coef in H_dict_sparse.items()
}

In [None]:
# Consensus matrix = element-wise average of the connectivity matrices of all runs
consensus_matrix_sparse = np.zeros(shape=conn_dict_sparse[0].shape)
for conn_matrix in conn_dict_sparse.values():
    consensus_matrix_sparse += conn_matrix

# Average over THIS model's runs. The divisor used to be len(conn_dict), the
# run count of the main model, which is only right while both run counts match.
consensus_matrix_sparse /= len(conn_dict_sparse)

# Label rows and columns with the columns of the input matrix
df_consensus_matrix_sparse = pd.DataFrame(consensus_matrix_sparse, index=X.columns, columns=X.columns)
df_consensus_matrix_sparse

In [None]:
# Cut-off as a fraction of the largest pairwise distance
# (minimum acceptable value for robust clusters = 50%)
thresh = 0.5

# Consensus values are similarities, so 1 - consensus is the distance
df_consensus_dist_sparse = 1 - df_consensus_matrix_sparse

# Condensed distance vector, computed once and reused below
dist = scipy.spatial.distance.squareform(df_consensus_dist_sparse)

# Change `method` to get a different linkage
link = hc.linkage(dist, method='ward')

# Flat clusters from cutting the tree at thresh * (maximum distance)
consensus_clst_sparse = pd.DataFrame(index=X.columns)
consensus_clst_sparse['cluster'] = hc.fcluster(link, thresh * dist.max(), 'distance')

In [None]:
# # Bar plot showing sizes of each consensus NMF cluster
# sns.barplot(
#     x=consensus_clst_sparse.cluster.value_counts().sort_index().index,
#     y=consensus_clst_sparse.cluster.value_counts().sort_index().values
# )

In [None]:
# Bar plot showing sizes of each consensus NMF cluster
cluster_sizes_sparse = consensus_clst_sparse.cluster.value_counts().sort_index()
px.bar(x=cluster_sizes_sparse.index, y=cluster_sizes_sparse.values)

In [None]:
# Give every NMF cluster a color from the concatenated tab20b + tab20c palettes.
# zip() stops at the shorter input, so clusters beyond the palette length
# would be left without a color; the two printed counts reveal that case.

#cm = matplotlib.colormaps.get_cmap('tab20')
cmb = matplotlib.colormaps.get_cmap('tab20b')
cmc = matplotlib.colormaps.get_cmap('tab20c')
cm_colors = cmb.colors + cmc.colors

cluster_ids_sparse = sorted(consensus_clst_sparse.cluster.unique())
consensus_clr_sparse = dict(zip(cluster_ids_sparse, cm_colors))
consensus_clst_sparse['color'] = consensus_clst_sparse.cluster.map(consensus_clr_sparse)

print('Number of colors: ', len(consensus_clr_sparse))
print('Number of clusters', len(cluster_ids_sparse))

In [None]:
size = 9

#legend_TN = [patches.Patch(color=c, label=l) for l,c in mash_color_dict_31.items()] # Mash cluster for legend

sns.set(rc={'figure.facecolor':'white'})
g = sns.clustermap(
    df_consensus_matrix_sparse,
    figsize=(size,size),
    row_linkage=link,
    #row_colors=phylogroup_clst.color, # Phylogroup colors on left
    col_linkage=link,
    #col_colors=clst.color, # Mash cluster on top
    yticklabels=False,
    xticklabels=False,
    cmap='hot_r'
)

#l2=g.ax_heatmap.legend(loc='upper left', bbox_to_anchor=(1.01,0.85), handles=legend_TN,frameon=True)
#l2.set_title(title='Mash cluster',prop={'size':10})

In [None]:
# Strict upper triangle of the consensus matrix, row by row. Fancy indexing
# with np.triu_indices yields the same values in the same order as the former
# nested Python loops, without the O(n^2) interpreter overhead.
upper = np.triu_indices(df_consensus_matrix_sparse.shape[0], k=1)
avec = df_consensus_matrix_sparse.values[upper]

# consensus entries are similarities, conversion to distances
Y = 1 - avec
Z = hc.linkage(Y, method='ward')

# cophenetic correlation coefficient of the hierarchical clustering defined
# by the linkage matrix Z and the condensed distances Y it was built from
coph_cor_sparse, _ = cophenet(Z, Y)

coph_cor_sparse # Cophenetic correlation of sparse-accessory consensus matrix

In [None]:
# Dispersion: mean over all entries of 4 * (consensus - 0.5)^2
centered_sparse = df_consensus_matrix_sparse.values - 0.5
dispersion_sparse = (
    np.sum(4 * np.multiply(centered_sparse, centered_sparse))
    / consensus_matrix_sparse.size
)

dispersion_sparse # Dispersion coefficient of consensus matrix

## Meta-consensus model

In [None]:
assert df_consensus_matrix.shape == df_consensus_matrix_red.shape == \
    df_consensus_matrix_min.shape == df_consensus_matrix_sparse.shape

df_consensus_matrix.shape

In [None]:
# Meta-consensus = unweighted mean of the four consensus matrices.
# (The zero-filled DataFrame that used to be created first was overwritten
# immediately and has been dropped.) DataFrame addition aligns on row and
# column labels, so the four matrices must carry identical labels.
df_meta_consensus_matrix = (
    df_consensus_matrix
    + df_consensus_matrix_red
    + df_consensus_matrix_min
    + df_consensus_matrix_sparse
) / 4

df_meta_consensus_matrix

In [None]:
# Cut-off as a fraction of the largest pairwise distance
# (minimum acceptable value for robust clusters = 75%)
thresh = 0.75

# Consensus values are similarities, so 1 - consensus is the distance
df_meta_consensus_dist = 1 - df_meta_consensus_matrix

# Condensed distance vector, computed once and reused below
dist = scipy.spatial.distance.squareform(df_meta_consensus_dist)

# Change `method` to get a different linkage
link = hc.linkage(dist, method='ward')

# Flat clusters from cutting the tree at thresh * (maximum distance). The index
# comes from the meta-consensus matrix itself instead of `X`, which at this
# point is merely whatever input the last model happened to leave behind.
meta_consensus_clst = pd.DataFrame(index=df_meta_consensus_matrix.columns)
meta_consensus_clst['cluster'] = hc.fcluster(link, thresh * dist.max(), 'distance')

In [None]:
# # Bar plot showing sizes of each meta-consensus NMF cluster
# sns.barplot(
#     x=meta_consensus_clst.cluster.value_counts().sort_index().index,
#     y=meta_consensus_clst.cluster.value_counts().sort_index().values
# )

In [None]:
# Bar plot showing sizes of each meta-consensus NMF cluster
meta_cluster_sizes = meta_consensus_clst.cluster.value_counts().sort_index()
px.bar(x=meta_cluster_sizes.index, y=meta_cluster_sizes.values)

In [None]:
# Give every meta-consensus cluster a color from the concatenated
# Pastel1 + Pastel2 + tab20b + tab20c palettes. zip() stops at the shorter
# input, so clusters beyond the palette length would be left without a color;
# the two printed counts reveal that case.

cm1 = matplotlib.colormaps.get_cmap('Pastel1')
cm2 = matplotlib.colormaps.get_cmap('Pastel2')
cmb = matplotlib.colormaps.get_cmap('tab20b')
cmc = matplotlib.colormaps.get_cmap('tab20c')
cm_colors = cm1.colors + cm2.colors + cmb.colors + cmc.colors

meta_cluster_ids = sorted(meta_consensus_clst.cluster.unique())
meta_consensus_clr = dict(zip(meta_cluster_ids, cm_colors))
meta_consensus_clst['color'] = meta_consensus_clst.cluster.map(meta_consensus_clr)

print('Number of colors: ', len(meta_consensus_clr))
print('Number of clusters', len(meta_cluster_ids))

In [None]:
size = 9

#legend_TN = [patches.Patch(color=c, label=l) for l,c in mash_color_dict_31.items()] # Mash cluster for legend

sns.set(rc={'figure.facecolor':'white'})
g = sns.clustermap(
    df_meta_consensus_matrix,
    figsize=(size,size),
    row_linkage=link,
    #row_colors=phylogroup_clst.color, # Phylogroup colors on left
    col_linkage=link,
    #col_colors=clst.color, # Mash cluster on top
    yticklabels=False,
    xticklabels=False,
    cmap='hot_r'
)

#l2=g.ax_heatmap.legend(loc='upper left', bbox_to_anchor=(1.01,0.85), handles=legend_TN,frameon=True)
#l2.set_title(title='Mash cluster',prop={'size':10})

In [None]:
# Strict upper triangle of the meta-consensus matrix, row by row. Fancy
# indexing with np.triu_indices yields the same values in the same order as
# the former nested Python loops, without the O(n^2) interpreter overhead.
upper = np.triu_indices(df_meta_consensus_matrix.shape[0], k=1)
avec = df_meta_consensus_matrix.values[upper]

# consensus entries are similarities, conversion to distances
Y = 1 - avec
Z = hc.linkage(Y, method='ward')

# cophenetic correlation coefficient of the hierarchical clustering defined
# by the linkage matrix Z and the condensed distances Y it was built from
coph_cor_meta, _ = cophenet(Z, Y)

coph_cor_meta # Cophenetic correlation of meta-consensus matrix

In [None]:
# Dispersion: mean over all entries of 4 * (consensus - 0.5)^2
centered_meta = df_meta_consensus_matrix.values - 0.5
dispersion_meta = (
    np.sum(4 * np.multiply(centered_meta, centered_meta))
    / df_meta_consensus_matrix.size
)

dispersion_meta # Dispersion coefficient of consensus matrix

## Find best run for main model

In [None]:
df_nmf_errors.sort_values(by='Frobenius')

In [None]:
# Run with the smallest Frobenius error. The 'Run' labels of df_nmf_errors
# are 1-based, whereas W_dict / H_dict are keyed from 0, hence the shift.
best_run = df_nmf_errors['Frobenius'].idxmin()
best_key = best_run - 1

L = W_dict[best_key]
A = H_dict[best_key]

## Plotting with species and mash clusters

### Species

In [None]:
# Species label = first two whitespace-separated words of the genome name.
# Joining a slice (instead of indexing words 0 and 1) no longer raises
# IndexError on one-word names and splits each name only once. The explicit
# copy keeps the column assignment from acting on a view of `metadata`.
df_species = metadata.loc[:, ["genome_id", "genome_name"]].copy()
df_species["species"] = df_species["genome_name"].apply(lambda name: " ".join(name.split()[:2]))
df_species.set_index('genome_id', inplace=True)

# Fold "uncultured Enterobacter" and every species with fewer than 5 genomes
# into the catch-all label "Enterobacter sp."
df_species.loc[df_species.species == "uncultured Enterobacter", 'species'] = "Enterobacter sp."
species_counts = df_species.species.value_counts()
small_species = species_counts[species_counts < 5].index
df_species.loc[df_species.species.isin(small_species), 'species'] = "Enterobacter sp."

# One tab20 color per remaining species (zip() stops at the shorter input,
# so species beyond the palette length would be left without a color)
cm = matplotlib.colormaps.get_cmap('tab20')
clr = dict(zip(sorted(df_species.species.unique()), cm.colors))
df_species['color'] = df_species.species.map(clr)

In [None]:
size = 9
import matplotlib.patches as patches

#legend_TN = [patches.Patch(color=c, label=l) for l,c in mash_color_dict_31.items()] # Mash cluster for legend

legend_TN = [patches.Patch(color=c, label=l) for l,c in clr.items()]


sns.set(rc={'figure.facecolor':'white'})
g = sns.clustermap(
    df_meta_consensus_matrix,
    figsize=(size,size),
    row_linkage=link,
    #row_colors=phylogroup_clst.color, # Phylogroup colors on left
    col_linkage=link,
    #col_colors=clst.color, # Mash cluster on top
    yticklabels=False,
    xticklabels=False,
    cmap='hot_r',
    col_colors=df_species.color
)

l2=g.ax_heatmap.legend(loc='upper left', bbox_to_anchor=(1.01,0.85), handles=legend_TN,frameon=True)
#l2.set_title(title='Mash cluster',prop={'size':10})

### Mash Cluster

In [None]:
metadata

In [None]:
# Mash cluster of every complete genome (missing values become cluster 0),
# indexed by genome id, with the cluster label converted to float
complete_rows = metadata.genome_status == 'Complete'
mash = metadata.loc[complete_rows, ['genome_id', 'complete_mash_cluster']].fillna(0)
mash = mash.set_index('genome_id')
mash['complete_mash_cluster'] = mash['complete_mash_cluster'].apply(float)

# The tab20 palette repeated twice, paired with the sorted cluster labels
cm = matplotlib.colormaps.get_cmap('tab20')
palette = cm.colors + cm.colors
clr = dict(zip(sorted(mash.complete_mash_cluster.unique()), palette))
mash['color'] = mash.complete_mash_cluster.map(clr)

In [None]:
size = 9
import matplotlib.patches as patches

#legend_TN = [patches.Patch(color=c, label=l) for l,c in mash_color_dict_31.items()] # Mash cluster for legend

legend_TN = [patches.Patch(color=c, label=l) for l,c in clr.items()]


sns.set(rc={'figure.facecolor':'white'})
g = sns.clustermap(
    df_meta_consensus_matrix,
    figsize=(size,size),
    row_linkage=link,
    #row_colors=phylogroup_clst.color, # Phylogroup colors on left
    col_linkage=link,
    #col_colors=clst.color, # Mash cluster on top
    yticklabels=False,
    xticklabels=False,
    cmap='hot_r',
    col_colors=mash.color
)

l2=g.ax_heatmap.legend(loc='upper left', bbox_to_anchor=(1.01,0.85), handles=legend_TN,frameon=True)
#l2.set_title(title='Mash cluster',prop={'size':10})

In [None]:
df_species

In [None]:
mash

In [None]:
mash_cluster = 16

ind = mash[mash.complete_mash_cluster==mash_cluster].index
df_species.loc[ind].species.value_counts()

# Save NMF outputs

In [None]:
# Write the W (L) and H (A) matrices of the best main-model run to CSV.
# NOTE(review): to_csv does not create missing folders — presumably the
# nmf-outputs directory already exists; verify before a long run.
L.to_csv('../../data/processed/nmf-outputs/L.csv')
A.to_csv('../../data/processed/nmf-outputs/A.csv')