## Centroid, medoid and pairwise distance measurements

In [6]:
import pandas as pd
import numpy as np

def calculateCentroid(matrix):
    """
    calculates the position of the centroid, i.e. the column-wise mean of the rows

    expects a non-empty set of observations

    :param matrix: 2-D array-like with one observation per row;
                   a single 1-D sequence is treated as one observation
    :return: numpy array holding the mean of every column with singleton
             dimensions squeezed out (a 0-d array when there is a single column)
    """

    # a plain ndarray is used on purpose: the np.matrix class is no longer
    # recommended by NumPy; atleast_2d reproduces its "1-D input is one row" rule
    observations = np.atleast_2d(np.asarray(matrix))

    return np.squeeze(observations.mean(axis=0))

def calculateMedoid(points):
    """
    locates the medoid of a set of points;
    the medoid is the member of the set with the lowest summed dissimilarity
    to all the other members, so unlike a centroid it is always one of the
    points that were passed in

    dissimilarities come from sklearn's pairwise_distances called with its
    default arguments

    :param points: indexable collection of points, one point per entry
    :return: tuple (points[i], i), where i is the position of the medoid in points
    """

    from sklearn.metrics.pairwise import pairwise_distances

    # summed distance from every point to all the points of the set
    totals = pairwise_distances(points).sum(axis = 0)
    best = np.argmin(totals)

    return points[best], best

def pairwiseCentralityDistances(matrix, cohorts, membership, annotationIndex, centroids = True, verbose = False):
    """
    calculates the pairwise distances between the central points of the cohorts;
    the observations of every cohort are rows of the matrix and are selected
    through the membership vector

    the central point of a cohort is its centroid (default) or, when
    centroids is False, its medoid

    the resulting table is printed before it is returned

    :param matrix: 2-D array-like, one observation per row
    :param cohorts: labels of the cohorts to compare; also used as the row and
                    column labels of the result
    :param membership: cohort label of every row of the matrix, in row order
    :param annotationIndex: name of every row of the matrix, in row order;
                            only used to report the medoids when verbose is True
    :param centroids: True to compare centroids, False to compare medoids
    :param verbose: when True (and centroids is False) prints which case is
                    the medoid of each cohort
    :return: pandas DataFrame with the distances between the central points
    :raises ValueError: if a cohort has no rows in the membership vector
    """

    from sklearn.metrics.pairwise import pairwise_distances

    data = np.asarray(matrix)
    centralityMatrix = list()

    for cohort in cohorts:

        memberIndex = [i for i, m in enumerate(membership) if m == cohort]

        if not memberIndex:
            # fail early with a readable message; an empty cohort would otherwise
            # surface as an obscure shape error further down
            raise ValueError('cohort {!r} has no members in the membership vector'.format(cohort))

        # select all the rows of the cohort at once instead of scanning the
        # index list for every row of the matrix
        subset = data[memberIndex]

        if centroids:
            tmpCentrality = calculateCentroid(subset)
        else:
            tmpCentrality, m = calculateMedoid(subset)

            if verbose:
                # m is a position inside the subset, memberIndex maps it back to the matrix row
                print('medoid of the cluster {} is case {}'.format(cohort, annotationIndex[memberIndex[m]]))

        centralityMatrix.append(tmpCentrality)

    finalPairwise = pairwise_distances(centralityMatrix)

    dists = pd.DataFrame(finalPairwise, index=cohorts)
    dists.columns = cohorts
    print(dists)
    return dists

# update the docstring

In [3]:
def calculateCentroidsMedoids(matrix, cohorts, membership, annotationIndex):
    """
    calculates the centroid and the medoid of every cohort;
    the observations of every cohort are rows of the matrix and are selected
    through the membership vector

    the centroid is the column-wise mean of the rows of a cohort; the medoid is
    the row of the cohort with the lowest summed distance to all the rows of
    that cohort (distances from sklearn's pairwise_distances with its defaults)

    :param matrix: 2-D array-like, one observation per row
    :param cohorts: labels of the cohorts to process
    :param membership: cohort label of every row of the matrix, in row order
    :param annotationIndex: not used by this function; kept so that the
                            signature matches the other cohort helpers
    :return: tuple (centroids, medoids) of two dicts keyed by cohort label
    :raises ValueError: if a cohort has no rows in the membership vector
    """

    from sklearn.metrics.pairwise import pairwise_distances

    data = np.asarray(matrix)

    centroids = dict()
    medoids = dict()

    for cohort in cohorts:

        memberIndex = [i for i, m in enumerate(membership) if m == cohort]

        if not memberIndex:
            # neither a centroid nor a medoid is defined for an empty cohort
            raise ValueError('cohort {!r} has no members in the membership vector'.format(cohort))

        # select all the rows of the cohort at once instead of scanning the
        # index list for every row of the matrix
        subset = data[memberIndex]

        # centroid: mean of every column, squeezed as in calculateCentroid
        centroids[cohort] = np.squeeze(subset.mean(axis=0))

        # medoid: the row with the smallest summed distance to the whole cohort
        distMatrix = pairwise_distances(subset)
        medoids[cohort] = subset[np.argmin(distMatrix.sum(axis = 0))]

    return (centroids, medoids)

In [4]:
def queryOriginalSpace(space, q, geneMembership, k = 8):
    """
    A k-neighbourhood finder for a query q in matrix space;
    the geneMembership vector is used to report the labels of the neighbours

    Only the first row of q is looked up.

    :param space: 2-D array-like of points, one point per row
    :param q: 2-D array-like of query points with the dimensionality of space
    :param geneMembership: label of every row of space, in row order
    :param k: number of nearest neighbours to retrieve; when it exceeds the
              number of points in space only the available neighbours are reported
    :return: tuple (most_common, ranking); ranking lists the labels of the
             neighbours from the nearest to the farthest and most_common holds
             up to five (label, count) pairs computed from that ranking
    """

    import numpy as np
    from scipy.spatial import KDTree
    from collections import Counter

    assert len(q[0]) == len(space[0]), 'must have the same dimensionality for the space and the query'

    # a KDTree on the space
    kdt = KDTree(space)

    _, index = kdt.query(q, k = k)

    # for k == 1 scipy drops the neighbour axis, so index[0] is a scalar
    neighbours = np.atleast_1d(index[0])
    # neighbours that do not exist (k larger than the number of points) are
    # reported by scipy with the index kdt.n, which is not a valid row
    neighbours = neighbours[neighbours < kdt.n]

    ranking = [geneMembership[i] for i in neighbours]
    c = Counter(ranking)

    return (c.most_common(5), ranking)

In [2]:
def clusterMap(dists, name = ''):
    """
    draws a hierarchically clustered, annotated heatmap of a distance table

    the colour scale runs from 0 to the largest value found in dists

    :param dists: pandas DataFrame of distances (its .max() method is used to
                  find the upper end of the colour scale)
    :param name: optional file name; when given the figure is saved there
                 before it is shown
    :return: None, the figure is displayed with plt.show()
    """

    import seaborn as sns
    import matplotlib.pyplot as plt

    sns.set()

    # no plt.figure() here: clustermap creates its own figure, so opening one
    # beforehand only leaves an empty extra figure behind
    vMax = np.max(dists.max()) # note that this assumes that the input is a pandas dataFrame with a method .max()
    grid = sns.clustermap(dists, annot=True, fmt='.2f', linewidths=.5,
                          cmap=sns.color_palette("GnBu_d"), vmin=0, vmax=vMax,
                          )

    if name:
        # save through the grid so the clustermap figure is the one written out
        grid.savefig(name)

    plt.show()

In [3]:
def calcDist(v1, v2):
    """
    Returns the Euclidean distance between two vectors v1 and v2, i.e. the
    square root of the sum of their squared element-wise differences;
    both may be either lists or np arrays
    """

    delta = np.subtract(v1, v2)
    squaredTotal = np.sum(np.square(delta))

    return np.sqrt(squaredTotal)

In [4]:
def getGene(name):
    """
    From a name in format GENE_ID returns the gene, i.e. everything in front
    of the last underscore; a name without an underscore is returned unchanged

    NOTE(review): getId cuts at the first underscore while this cuts at the
    last one, so the two disagree on names with several underscores - confirm
    which convention is intended
    """
    gene, separator, _ = name.rpartition('_')

    return gene if separator else name

In [5]:
def getId(name):
    """
    From a name in format GENE_ID returns the ID, i.e. everything behind the
    first underscore; a name without an underscore is returned unchanged

    NOTE(review): getGene cuts at the last underscore while this cuts at the
    first one, so the two disagree on names with several underscores - confirm
    which convention is intended
    """

    _, separator, identifier = name.partition('_')

    return identifier if separator else name

In [6]:
def getOptimalThreshold(tpr, fpr, thresholds):
    """
    Picks the threshold at which the true positive rate exceeds the false
    positive rate by the largest margin; tpr, fpr and thresholds are supplied
    as lists whose entries correspond by position

    Returns a tuple (index, value): the position of the optimal threshold in
    the threshold list followed by the threshold itself
    """
    import numpy as np

    margins = []
    for truePositive, falsePositive in zip(tpr, fpr):
        margins.append(truePositive - falsePositive)

    best = np.argmax(margins)

    return (best, thresholds[best])

In [7]:
def getROC(tpr, fpr, thresholds):
    """
    Plots the ROC curve given the true positive and false positive rate lists and the thresholds;
    the optimal threshold (as chosen by getOptimalThreshold) is marked and annotated on the plot

    The area under the curve is obtained with the trapezoidal rule, integrating
    tpr over fpr; it is printed and returned.

    :param tpr: true positive rates
    :param fpr: false positive rates, matching tpr by position
    :param thresholds: thresholds, matching tpr and fpr by position
    :return: the AUC value
    """
    from matplotlib import pyplot as plt

    optimalThrIndex, optimalThrVal = getOptimalThreshold(tpr, fpr, thresholds)

    valFprOptimal, valTprOptimal = fpr[optimalThrIndex], tpr[optimalThrIndex]

    plt.plot(fpr,tpr)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.plot(valFprOptimal, valTprOptimal, 'o', c='black')

    plt.annotate(f'Optimal threshold:{round(optimalThrVal, 3)}\nFPR:{valFprOptimal}, TPR:{valTprOptimal}',
                 xy=(valFprOptimal, valTprOptimal), xytext=(valFprOptimal + 0.1, valTprOptimal - 0.1),
                 arrowprops=dict(facecolor='black', shrink=0.05),
                )

    plt.title('ROC curve')
    plt.show()

    # This is the AUC; np.trapz was renamed to np.trapezoid in NumPy 2.0,
    # so use whichever of the two this NumPy provides
    integrate = getattr(np, 'trapezoid', None) or np.trapz
    auc = integrate(tpr, fpr)

    print(f'the Area under the curve is: {round(auc, 3)}')

    # the docstring always promised the AUC, but it used to be only printed
    return auc

In [3]:
def distConsistancy(matrix, groups, membership):
    """
    measures how consistently the points of every group lie closest to the
    centroid of their own group

    For every group the centroid (column-wise mean of its rows) is computed;
    the consistency of a group is the fraction of its points whose nearest
    centroid, by euclidean distance, is the centroid of that same group,
    rounded to 4 decimals. On ties the first centroid in group order wins.

    :param matrix: 2-D array-like, one observation per row
    :param groups: labels of the groups to evaluate
    :param membership: group label of every row of the matrix, in row order
    :return: tuple (closer, averageConsistency); closer maps every group to its
             consistency and averageConsistency is the mean of those values
    :raises ValueError: if a group has no rows in the membership vector

    NOTE(review): groups with a consistency of exactly 0 are turned into NaN
    and therefore left out of the average, as in the original code - confirm
    that this is intended, since it raises the reported average
    """

    import numpy as np
    from scipy.spatial import distance

    def calculateDistance(pointA, pointB):
        """
        calculates the euclidean distance between points a and b;
        expects the points to be defined in the same number of dimensions
        """

        assert len(pointA) == len(pointB), "points must be defined in a same number of dimension"

        return distance.euclidean(pointA, pointB)

    data = np.asarray(matrix)

    splits = dict()

    for group in groups:

        memberIndex = [i for i, label in enumerate(membership) if label == group]

        if not memberIndex:
            # an empty group has no centroid and would otherwise fail with an
            # unrelated dimension assertion later on
            raise ValueError('group {!r} has no members in the membership vector'.format(group))

        # select all the rows of the group at once instead of scanning the
        # index list for every row of the matrix
        splits[group] = data[memberIndex]

    # one centroid per group, in the same order as splits; the mean is not
    # squeezed so that single-feature data keeps a length and can be compared
    centroids = [points.mean(axis=0) for points in splits.values()]

    closer = dict()

    for position, (group, points) in enumerate(splits.items()):

        hits = 0

        for point in points:

            tmpDists = [calculateDistance(point, centroid) for centroid in centroids]

            # index() returns the first minimum, i.e. ties go to the earlier group
            if tmpDists.index(min(tmpDists)) == position:
                hits += 1

        closer[group] = round(hits / len(points), 4)

    vals = [v if v else np.nan for v in closer.values()]
    averageConsistency = np.nanmean(vals)

    return (closer, averageConsistency)