# Clustering Samples

Script to cluster and label all the samples of all the studies (given by their geo id)


@authors: Luis, Arun, Claire, and Karsten

April 01 2019--April 12 2019

## Import modules and dependencies

In [2]:
import pandas as pd
import numpy as np
import sklearn.cluster
import distance # first, >>> pip install Distance
import time
import itertools

## Define functions

### clustering function
aim: to cluster the sample titles into clusters of samples with similar names

In [3]:
def cluster_terms(input_words):
    """
    Cluster a list of word strings by edit distance.

    The pairwise similarity is the negated Levenshtein distance, and the
    clusters are found by affinity propagation on that precomputed matrix.

    Parameters
    ----------
    input_words : list of str
        The strings to cluster. Duplicates are allowed; they are collapsed
        inside each returned cluster.

    Returns
    -------
    list of dict
        One single-entry dictionary per cluster: the key is the cluster's
        exemplar word and the value is the sorted list of the distinct words
        in that cluster. An empty input gives an empty list. Words that
        affinity propagation labels -1 (no exemplar assigned, e.g. when the
        fit does not converge) are not placed in any cluster.

    (list --> list[dictionary{string: list}])

    example
    >>> text = ['apple', 'oranges', 'ornnges', 'melons', 'aaple', 'apples',
                 'melon', 'apale', 'meeons', 'orange', 'orage', 'meeon', 'melan']
    >>> print(cluster_terms(text))
    [{'apple': ['aaple', 'apale', 'apple', 'apples']},
    {'melon': ['meeon', 'meeons', 'melan', 'melon', 'melons']},
    {'orange': ['orage', 'orange', 'oranges', 'ornnges']}]
    """
    words = np.asarray(input_words)  # so that indexing with an index array works
    if words.size == 0:
        # Nothing to cluster; fitting an empty similarity matrix would raise.
        return []

    # Levenshtein distance is symmetric, so compute each pair once and mirror it.
    # The diagonal stays 0 (a word is at distance 0 from itself).
    n_words = len(words)
    lev_similarity = np.zeros((n_words, n_words), dtype=int)
    for row in range(n_words):
        for col in range(row + 1, n_words):
            similarity = -distance.levenshtein(words[row], words[col])
            lev_similarity[row, col] = similarity
            lev_similarity[col, row] = similarity

    affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
    affprop.fit(lev_similarity)

    labels = affprop.labels_
    exemplar_indices = affprop.cluster_centers_indices_

    clustered_terms = []
    for cluster_id in np.unique(labels):
        if cluster_id < 0:
            # Label -1 has no exemplar; indexing exemplar_indices with it
            # would fail or pick an unrelated exemplar.
            continue
        exemplar = words[exemplar_indices[cluster_id]]
        nth_cluster = np.unique(words[labels == cluster_id])
        clustered_terms.append({exemplar: list(nth_cluster)})
    return clustered_terms

### labeling functions
aim: to label the clustered sample titles into the correct names

note: the implementation labels each cluster with the *longest common substring* of its sample titles. It does not work perfectly and needs minor fixes.

In [4]:
def is_substr(find, data):
    """
    Report whether ``find`` occurs inside every element of ``data``.

    Returns False when ``data`` and ``find`` are both empty; otherwise returns
    True only if each element of ``data`` contains ``find``.
    """
    nothing_to_check = not len(data) and not len(find)
    if nothing_to_check:
        return False
    # all() stops at the first element that does not contain `find`.
    return all(find in data[idx] for idx in range(len(data)))

def long_substr(data):
    """
    Return the longest substring shared by every string in ``data``.

    Candidates are taken from the first string. When several common
    substrings have the same maximal length, the one starting earliest in the
    first string is returned.

    Parameters
    ----------
    data : list of str
        The strings to compare.

    Returns
    -------
    str
        The longest common substring. A single-element list returns that
        element itself. An empty list, an empty first string, or strings with
        nothing in common return ''.
    """
    substr = ''
    if len(data) < 1 or len(data[0]) < 1:
        return substr

    reference = data[0]
    for start in range(len(reference)):
        remaining = len(reference) - start
        if remaining <= len(substr):
            # Nothing starting here (or later) can beat the current best.
            break
        # Only lengths that would beat the current best are worth testing.
        for length in range(len(substr) + 1, remaining + 1):
            candidate = reference[start:start + length]
            if not all(candidate in text for text in data):
                # Any longer candidate from this start contains this one,
                # so it cannot be common either.
                break
            substr = candidate
    return substr

## Input given data

In [5]:
# Read samples data (one row per sample; the cells below use the
# 'geo_id' and 'title' columns).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
SAMPLES_PICKLE = '../data/interim/samples.pkl'
samples_df = pd.read_pickle(SAMPLES_PICKLE)

## Run data
### Example 1 (samples <= 10)

In [6]:
# Select every sample that belongs to this study.
study_mask = samples_df['geo_id'] == '200003505'
geo_id_samples = samples_df.loc[study_mask]
print(geo_id_samples, '\n')
sample_titles = list(geo_id_samples.title)

start_time = time.time()  # to monitor time taken
clusters = cluster_terms(sample_titles)
print('\nNo. of clusters: ', len(clusters), '\n')
for item in clusters:
    print(item, '\n')
    # Each cluster dict maps one exemplar to the list of its member titles.
    clustered_sample_titles = next(iter(item.values()))
    label = long_substr(clustered_sample_titles)
    print('label: ', label, '\n')
end_time = time.time()  # to monitor time taken
elapsed = end_time - start_time
print('time taken: ', elapsed, 'seconds')

          geo_id  nsamples        date accession  \
36227  200003505         9  2005/10/27  GSM79992   
36228  200003505         9  2005/10/27  GSM80027   
36229  200003505         9  2005/10/27  GSM80024   
36230  200003505         9  2005/10/27  GSM79993   
36231  200003505         9  2005/10/27  GSM80025   
36232  200003505         9  2005/10/27  GSM79783   
36233  200003505         9  2005/10/27  GSM80026   
36234  200003505         9  2005/10/27  GSM79782   
36235  200003505         9  2005/10/27  GSM80023   

                                                 title  
36227      23353, GATA2 overexpresed, FLT3 non-mutated  
36228          20531, GATA2 overexpresed, FLT3 mutated  
36229  26611, GATA2 non-overexpresed, FLT3 non-mutated  
36230  16320, GATA2 non-overexpresed, FLT3 non-mutated  
36231          16583, GATA2 overexpresed, FLT3 mutated  
36232      20505, GATA2 overexpresed, FLT3 non-mutated  
36233          18788, GATA2 overexpresed, FLT3 mutated  
36234      14413, GATA2

### Example 2 (10 < samples <= 100)

In [7]:
# Select every sample that belongs to this study.
study_mask = samples_df['geo_id'] == '200003484'
geo_id_samples = samples_df.loc[study_mask]
print(geo_id_samples)
sample_titles = list(geo_id_samples.title)

start_time = time.time()  # to monitor time taken
clusters = cluster_terms(sample_titles)
print('\nNo. of clusters: ', len(clusters), '\n')
for item in clusters:
    print(item, '\n')
    # Each cluster dict maps one exemplar to the list of its member titles.
    clustered_sample_titles = next(iter(item.values()))
    label = long_substr(clustered_sample_titles)
    print('label: ', label, '\n')
end_time = time.time()  # to monitor time taken
elapsed = end_time - start_time
print('time taken: ', elapsed, 'seconds')

          geo_id  nsamples        date accession                   title
35191  200003484        30  2006/01/20  GSM78851           LSPM2 Control
35192  200003484        30  2006/01/20  GSM78864           MM455 Control
35193  200003484        30  2006/01/20  GSM78839         D23 24 h PEP005
35194  200003484        30  2006/01/20  GSM78842         D24 24 h PEP005
35195  200003484        30  2006/01/20  GSM78859          MM253 24 h TPA
35196  200003484        30  2006/01/20  GSM78848   LSPM2 24 h TPA + 24 h
35197  200003484        30  2006/01/20  GSM78857           MM127 Control
35198  200003484        30  2006/01/20  GSM78854   MM127 24 h TPA + 24 h
35199  200003484        30  2006/01/20  GSM78837            D04 24 h TPA
35200  200003484        30  2006/01/20  GSM78840            D23 24 h TPA
35201  200003484        30  2006/01/20  GSM78860           MM253 Control
35202  200003484        30  2006/01/20  GSM78843            D24 24 h TPA
35203  200003484        30  2006/01/20  GSM78846   

### Example 3 (100 < samples <= 500)

In [8]:
# Select every sample that belongs to this study.
study_mask = samples_df['geo_id'] == '200128147'
geo_id_samples = samples_df.loc[study_mask]
print(geo_id_samples)
sample_titles = list(geo_id_samples.title)

start_time = time.time()  # to monitor time taken
clusters = cluster_terms(sample_titles)
print('\nNo. of clusters: ', len(clusters), '\n')
for item in clusters:
    print(item, '\n')
    # Each cluster dict maps one exemplar to the list of its member titles.
    clustered_sample_titles = next(iter(item.values()))
    label = long_substr(clustered_sample_titles)
    print('label: ', label, '\n')
end_time = time.time()  # to monitor time taken
elapsed = end_time - start_time
print('time taken: ', elapsed, 'seconds')  # time taken is typically 32.016 seconds

            geo_id  nsamples        date   accession  \
2048837  200128147       192  2019/03/12  GSM3665739   
2048838  200128147       192  2019/03/12  GSM3665716   
2048839  200128147       192  2019/03/12  GSM3665618   
2048840  200128147       192  2019/03/12  GSM3665741   
2048841  200128147       192  2019/03/12  GSM3665724   
2048842  200128147       192  2019/03/12  GSM3665641   
2048843  200128147       192  2019/03/12  GSM3665610   
2048844  200128147       192  2019/03/12  GSM3665747   
2048845  200128147       192  2019/03/12  GSM3665658   
2048846  200128147       192  2019/03/12  GSM3665633   
2048847  200128147       192  2019/03/12  GSM3665619   
2048848  200128147       192  2019/03/12  GSM3665620   
2048849  200128147       192  2019/03/12  GSM3665698   
2048850  200128147       192  2019/03/12  GSM3665701   
2048851  200128147       192  2019/03/12  GSM3665626   
2048852  200128147       192  2019/03/12  GSM3665686   
2048853  200128147       192  2019/03/12  GSM366

### Example 4 (500 < samples <= 1000)

In [None]:
# Select every sample that belongs to this study.
study_mask = samples_df['geo_id'] == '200000014'
geo_id_samples = samples_df.loc[study_mask]
print(geo_id_samples)
sample_titles = list(geo_id_samples.title)

start_time = time.time()  # to monitor time taken
clusters = cluster_terms(sample_titles)
print('\nNo. of clusters: ', len(clusters), '\n')
for item in clusters:
    print(item, '\n')
    # Each cluster dict maps one exemplar to the list of its member titles.
    clustered_sample_titles = next(iter(item.values()))
    label = long_substr(clustered_sample_titles)
    print('label: ', label, '\n')
end_time = time.time()  # to monitor time taken
elapsed = end_time - start_time
print('time taken: ', elapsed, 'seconds')  # time taken is typically 414.407 seconds

         geo_id  nsamples        date  accession  \
384   200000014       765  2001/11/29  GSM383758   
385   200000014       765  2001/11/29  GSM383993   
386   200000014       765  2001/11/29  GSM384279   
387   200000014       765  2001/11/29  GSM384196   
388   200000014       765  2001/11/29  GSM383910   
389   200000014       765  2001/11/29  GSM383827   
390   200000014       765  2001/11/29  GSM383721   
391   200000014       765  2001/11/29  GSM384007   
392   200000014       765  2001/11/29  GSM384133   
393   200000014       765  2001/11/29  GSM384362   
394   200000014       765  2001/11/29  GSM384076   
395   200000014       765  2001/11/29  GSM383970   
396   200000014       765  2001/11/29  GSM383847   
397   200000014       765  2001/11/29  GSM383741   
398   200000014       765  2001/11/29  GSM384325   
399   200000014       765  2001/11/29  GSM384219   
400   200000014       765  2001/11/29  GSM384113   
401   200000014       765  2001/11/29  GSM384382   
402   200000

### Example 5 (only 1 cluster)

In [None]:
# Select every sample that belongs to this study.
study_mask = samples_df['geo_id'] == '200002487'
geo_id_samples = samples_df.loc[study_mask]
print(geo_id_samples)
sample_titles = list(geo_id_samples.title)

start_time = time.time()  # to monitor time taken
clusters = cluster_terms(sample_titles)
print('\nNo. of clusters: ', len(clusters), '\n')
for item in clusters:
    print(item, '\n')
    # Each cluster dict maps one exemplar to the list of its member titles.
    clustered_sample_titles = next(iter(item.values()))
    label = long_substr(clustered_sample_titles)
    print('label: ', label, '\n')
end_time = time.time()  # to monitor time taken
elapsed = end_time - start_time
print('time taken: ', elapsed, 'seconds')

In [1]:
# Select every sample that belongs to this study.
# Requires the "Input given data" cell to have run first (defines samples_df).
study_mask = samples_df['geo_id'] == '200002193'
geo_id_samples = samples_df.loc[study_mask]
print(geo_id_samples)
sample_titles = list(geo_id_samples.title)

start_time = time.time()  # to monitor time taken
clusters = cluster_terms(sample_titles)
print('\nNo. of clusters: ', len(clusters), '\n')
for item in clusters:
    print(item, '\n')
    # Each cluster dict maps one exemplar to the list of its member titles.
    clustered_sample_titles = next(iter(item.values()))
    label = long_substr(clustered_sample_titles)
    print('label: ', label, '\n')
end_time = time.time()  # to monitor time taken
elapsed = end_time - start_time
print('time taken: ', elapsed, 'seconds')

NameError: name 'samples_df' is not defined