<a href="https://colab.research.google.com/github/joheras/Chromatograms/blob/main/DendogramGenerationSpectra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Upload file
#@markdown Press the play button on the left. A button to upload your file will appear.
#@markdown The file must be a csv file whose first column has the header kDa and contains the
#@markdown molecular weights; every other column holds the spectrum of one sample, with the sample name as its header.

import pandas as pd
from scipy.spatial.distance import pdist, jaccard
from scipy.stats import pearsonr,spearmanr
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import correlation,cosine
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from matplotlib import pyplot as plt
import numpy as np
from seaborn import heatmap
%matplotlib inline

def generate_spectra(df):
    """
    Split a dataframe into per-sample spectra and sample names

    Parameters
    ----------
    df : dataframe
         A dataframe whose first column is skipped and whose remaining
         columns each hold the spectrum of one sample, the column header
         being the name of the sample

    Returns
    ----------
    matrix : narray
             The values of every column but the first, transposed so that
             each row is the spectrum of one sample
    names  : pandas Index
             The headers of every column but the first, in column order
    """

    sample_names = df.columns[1:]

    # Drop the first column, then flip to one row per sample.
    column_values = df.values[:, 1:]

    return column_values.T, sample_names

def compute_distance(vector1,vector2,metric='correlation'):
    """
    Compute the distance between two spectra

    Parameters
    ----------
    vector1 : array-like
        First 1-D vector
    vector2 : array-like
        Second 1-D vector
    metric : string
        'correlation' or 'cosine'; selects the scipy.spatial.distance
        function of the same name

    Returns
    ----------
    The value returned by scipy's correlation or cosine distance.

    Raises
    ----------
    ValueError
        If metric is neither 'correlation' nor 'cosine'.
    """
    if metric == 'correlation':
        return correlation(vector1, vector2)
    if metric == 'cosine':
        return cosine(vector1, vector2)
    # Fail loudly instead of silently returning None for an unknown metric.
    raise ValueError(
        "Unsupported metric %r: only 'correlation' and 'cosine' are supported" % (metric,)
    )

def generate_similarity_matrix(df,metric='correlation'):
    """
    Build the symmetric matrix of pairwise distances between samples

    Parameters
    ----------
    df : dataframe
        Dataframe handed to generate_spectra to obtain one spectrum per sample
    metric : string
        Metric name forwarded to compute_distance

    Returns
    ----------
    matrix : narray
        Square float32 matrix; entry (i, j) is the distance between samples
        i and j, and the diagonal is left at zero
    names :
        The sample names returned by generate_spectra
    """
    spectra, names = generate_spectra(df)
    n_samples = len(names)

    matrix = np.zeros((n_samples, n_samples), dtype='float32')
    # Only the upper triangle is computed; each value is mirrored below the diagonal.
    for row in range(n_samples):
        for col in range(row + 1, n_samples):
            distance = compute_distance(spectra[row], spectra[col], metric)
            matrix[row, col] = distance
            matrix[col, row] = distance
    return matrix, names

def plot_dendrogram(model, **kwargs):
    """
    Draw the dendrogram of a fitted clustering model

    Parameters
    ----------
    model : object
        Fitted model exposing children_, distances_ and labels_
    **kwargs
        Forwarded unchanged to scipy's dendrogram
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples under each merge node.
    sizes = np.zeros(merges.shape[0])
    for step, pair in enumerate(merges):
        # An index below n_leaves is a single sample; anything else refers
        # to an earlier merge whose size has already been stored.
        sizes[step] = sum(
            1 if node < n_leaves else sizes[node - n_leaves]
            for node in pair
        )

    # Columns: the two merged nodes, the merge distance, the subtree size.
    linkage_matrix = np.column_stack([merges, model.distances_, sizes]).astype(float)

    dendrogram(linkage_matrix, **kwargs)

def generate_dendogram(df,linkage='average',metric='correlation',threshold=0.35,name='dendogram.jpg'):
    """
    Cluster the samples hierarchically, plot the dendrogram and save it

    Parameters
    ----------
    df : dataframe
        A dataframe whose first column is skipped and whose remaining
        columns each hold the spectrum of one sample, the column header
        being the name of the sample
    linkage: string
        The linkage method for building the dendrogram:
        'complete', 'average' or 'single'. scikit-learn documents 'ward'
        as Euclidean-only, so it is not expected to work with the
        precomputed distance matrix used here.
    metric: string
        One of the following metrics to compute the distance matrix:
        'correlation' or 'cosine'
    threshold: float
        Passed as distance_threshold: clusters whose linkage distance is at
        or above this value are not merged.
    name: string
        Path where the dendrogram will be saved

    Returns
    ----------
    Nothing. This method draws the associated dendrogram and saves the image.

    Raises
    ----------
    ValueError
        If metric is neither 'correlation' nor 'cosine'.
    """

    # Validate before doing any work so a bad metric fails with a clear message.
    if metric not in ('correlation', 'cosine'):
        raise ValueError(
            "Unsupported metric %r: only 'correlation' and 'cosine' are supported" % (metric,)
        )

    m, names = generate_similarity_matrix(df, metric)

    cluster_args = dict(n_clusters=None, linkage=linkage,
                        distance_threshold=threshold)
    try:
        # Newer scikit-learn releases name this argument `metric`.
        clustering = AgglomerativeClustering(metric='precomputed', **cluster_args)
    except TypeError:
        # Older scikit-learn releases only know the former name `affinity`.
        clustering = AgglomerativeClustering(affinity='precomputed', **cluster_args)
    clustering.fit(m)

    # Figure height and label size grow with the number of samples.
    plt.figure(figsize=(50, 2*len(names)))
    plot_dendrogram(clustering, leaf_font_size=1.25*len(names), orientation='left', labels=names)
    plt.savefig(name)
    

def generate_heatmap(df,tolerance=0,metric='correlation',name='heatmap.jpg'):
    """
    Plot and save a heatmap of the pairwise similarity between samples

    Parameters
    ----------
    df : dataframe
        A dataframe whose first column is skipped and whose remaining
        columns each hold the spectrum of one sample, the column header
        being the name of the sample
    tolerance : number
        Unused. Kept in the signature so existing calls that pass it keep
        working.
    metric : string
        'correlation' or 'cosine'
    name : string
        Path where the heatmap will be saved

    Raises
    ----------
    ValueError
        If metric is neither 'correlation' nor 'cosine'.
    """
    if metric not in ('correlation', 'cosine'):
        raise ValueError(
            "Unsupported metric %r: only 'correlation' and 'cosine' are supported" % (metric,)
        )

    # generate_similarity_matrix takes (df, metric); tolerance is not one of its parameters.
    m, names = generate_similarity_matrix(df, metric)
    plt.figure(figsize=(len(names)+1, len(names)))
    # The matrix holds distances, so 1 - distance is displayed as the similarity.
    heatmap(1-m, xticklabels=names, yticklabels=names, annot=True)
    plt.savefig(name)



from google.colab import files

# Show the Colab upload widget and load the first uploaded file as csv.
uploaded = files.upload()
df = pd.read_csv(list(uploaded)[0])

# Keep only the rows whose 'kDa' value lies between 2 and 40, both included.
df = df[df['kDa'].between(2, 40)]


In [None]:
#@title Dendrogram Generation
#@markdown Set the following values, and press the play button on the left. 
#@markdown You can change the values as many times as you want, but after
#@markdown changing them, you have to press the play button on the left again to recompute
#@markdown the dendrogram. 


#@markdown Distance: 
distance = 'correlation' #@param ["correlation", "cosine"]

#@markdown Linkage: 
linkage = 'average' #@param ["average", "single","complete","ward"]

#@markdown Similarity threshold: 
similarity = 80 #@param {type:"slider", min:0, max:100, step:1}

# The slider is a similarity percentage; the clustering takes the complementary distance.
threshold = 1-(similarity*1.0/100)
# generate_dendogram has no tolerance parameter, so none is passed.
generate_dendogram(df,linkage=linkage,metric=distance,threshold=threshold)


In [None]:
#@title Heatmap Generation
#@markdown Set the following values, and press the play button on the left. 
#@markdown You can change the values as many times as you want, but after
#@markdown changing them, you have to press the play button on the left again to recompute
#@markdown the heatmap. 

#@markdown Distance: 
distance = 'correlation' #@param ["correlation", "cosine"]

# No tolerance value is defined in the cells above, so only the metric is passed.
generate_heatmap(df,metric=distance)
