In [1]:
import pandas as pd
import numpy as np
import sklearn.cluster
import distance # first, >>> pip install Distance
import time
import itertools

In [2]:
# Seed vocabularies, one term per line; the variable names indicate one list
# for disease samples and one for control samples.
disease_samples_vocab_original = [
    'KO',
    'knock-out',
    'OE',
    'overexpressor',
    'infected',
]
control_samples_vocab_original = [
    'WT ',  # the trailing space is part of the term
    'wild type',
    'healthy',
    'control',
    'ctrl',
    'not infected',
    'normal',
]
control_samples_vocab_original

['WT ', 'wild type', 'healthy', 'control', 'ctrl', 'not infected', 'normal']

In [6]:
# Load the samples table from the interim pickle (relative path); later cells
# filter it by its `geo_id` column and read its `title` column.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# come from a trusted source.
samples_df = pd.read_pickle('../data/interim/samples.pkl')

In [7]:
def cluster_terms(input_words, damping=0.5):
    """
    Cluster strings by edit distance and print one line per cluster.

    The similarity between two words is their negated Levenshtein distance;
    the resulting matrix is handed to affinity propagation as a precomputed
    affinity. For every cluster found, one line of the form
    `` - *<exemplar>:* <member>, <member>, ...`` is printed, where the members
    are the unique words of the cluster in sorted order.

    Parameters
    ----------
    input_words : sequence of str
        Words (e.g. sample titles) to cluster. Duplicates are allowed.
    damping : float, optional
        Damping factor passed to ``sklearn.cluster.AffinityPropagation``.
        Defaults to 0.5.

    Returns
    -------
    None
        Results are printed, not returned. Nothing is printed for empty
        input; a single notice is printed when no exemplars were found.
    """
    words = np.asarray(input_words)  # so that indexing with a mask will work
    if words.size == 0:
        # There is nothing to cluster, and fitting an empty matrix would fail.
        return

    # Levenshtein distance is symmetric and zero on the diagonal, so compute
    # only the upper triangle and mirror it instead of evaluating every pair twice.
    n_words = len(words)
    lev_similarity = np.zeros((n_words, n_words))
    for row in range(n_words):
        for col in range(row + 1, n_words):
            similarity = -distance.levenshtein(words[row], words[col])
            lev_similarity[row, col] = similarity
            lev_similarity[col, row] = similarity

    affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=damping)
    affprop.fit(lev_similarity)

    exemplar_indices = affprop.cluster_centers_indices_
    if len(exemplar_indices) == 0:
        # When affinity propagation fails to converge, scikit-learn can report
        # no exemplars and label every sample -1; indexing the empty exemplar
        # array below would raise IndexError, so report the situation instead.
        print(" - no clusters found (affinity propagation did not converge)")
        return

    for cluster_id in np.unique(affprop.labels_):
        exemplar = words[exemplar_indices[cluster_id]]
        # np.unique both de-duplicates and sorts the cluster members.
        cluster = np.unique(words[affprop.labels_ == cluster_id])
        cluster_str = ", ".join(cluster)
        print(" - *%s:* %s" % (exemplar, cluster_str))

In [8]:
# Keep only the rows whose geo_id equals '200003505', then display them.
geo_id_samples = samples_df[samples_df['geo_id'].eq('200003505')]
geo_id_samples

Unnamed: 0,geo_id,nsamples,date,accession,title
36227,200003505,9,2005/10/27,GSM79992,"23353, GATA2 overexpresed, FLT3 non-mutated"
36228,200003505,9,2005/10/27,GSM80027,"20531, GATA2 overexpresed, FLT3 mutated"
36229,200003505,9,2005/10/27,GSM80024,"26611, GATA2 non-overexpresed, FLT3 non-mutated"
36230,200003505,9,2005/10/27,GSM79993,"16320, GATA2 non-overexpresed, FLT3 non-mutated"
36231,200003505,9,2005/10/27,GSM80025,"16583, GATA2 overexpresed, FLT3 mutated"
36232,200003505,9,2005/10/27,GSM79783,"20505, GATA2 overexpresed, FLT3 non-mutated"
36233,200003505,9,2005/10/27,GSM80026,"18788, GATA2 overexpresed, FLT3 mutated"
36234,200003505,9,2005/10/27,GSM79782,"14413, GATA2 overexpresed, FLT3 non-mutated"
36235,200003505,9,2005/10/27,GSM80023,"18861, GATA2 non-overexpresed, FLT3 non-mutated"


In [9]:
# Extract the `title` column of the selected rows as a plain Python list.
sample_titles = geo_id_samples['title'].tolist()
sample_titles

['23353, GATA2 overexpresed, FLT3 non-mutated',
 '20531, GATA2 overexpresed, FLT3 mutated',
 '26611, GATA2 non-overexpresed, FLT3 non-mutated',
 '16320, GATA2 non-overexpresed, FLT3 non-mutated',
 '16583, GATA2 overexpresed, FLT3 mutated',
 '20505, GATA2 overexpresed, FLT3 non-mutated',
 '18788, GATA2 overexpresed, FLT3 mutated',
 '14413, GATA2 overexpresed, FLT3 non-mutated',
 '18861, GATA2 non-overexpresed, FLT3 non-mutated']

In [13]:
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() is wall-clock time and can jump if the
# system clock is adjusted.
start_time = time.perf_counter()
cluster_terms(sample_titles)
end_time = time.perf_counter()
print('\ntime taken: ', end_time - start_time, 'seconds')

 - *D24 24 h PEP005:* A07RM 24 h PEP005, D04 24 h PEP005, D23 24 h PEP005, D24 24 h PEP005, LSPM2 24 h PEP005, MM127 24 h PEP005, MM253 24 h PEP005, MM455 24 h PEP005
 - *LSPM2 24 h TPA + 24 h:* LSPM2 24 h TPA + 24 h, MM127 24 h TPA + 24 h
 - *D24 24 h TPA:* A07RM 24 h TPA, D04 24 h TPA, D23 24 h TPA, D24 24 h TPA, LSPM2 24 h TPA, LSPM2 6 h TPA, MM127 24 h TPA, MM127 6 h TPA, MM253 24 h TPA, MM455 24 h TPA
 - *D24 Control:* A07RM Control, D04 Control, D23 Control, D24 Control, LSPM2 Control, MM127 Control, MM253 Control, MM455 Control
 - *LSPM2 Control Recovery:* LSPM2 Control Recovery, MM127 Control Recovery

time taken:  0.11421895027160645 seconds


In [14]:
# Select the rows for geo_id '200003484', print them, and cluster their titles.
geo_id_samples = samples_df.loc[samples_df['geo_id'] == '200003484']
print(geo_id_samples)
sample_titles = list(geo_id_samples.title)
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() is wall-clock time and can jump.
start_time = time.perf_counter()
cluster_terms(sample_titles)
end_time = time.perf_counter()
print('\ntime taken: ', end_time - start_time, 'seconds')

          geo_id  nsamples        date accession                   title
35191  200003484        30  2006/01/20  GSM78851           LSPM2 Control
35192  200003484        30  2006/01/20  GSM78864           MM455 Control
35193  200003484        30  2006/01/20  GSM78839         D23 24 h PEP005
35194  200003484        30  2006/01/20  GSM78842         D24 24 h PEP005
35195  200003484        30  2006/01/20  GSM78859          MM253 24 h TPA
35196  200003484        30  2006/01/20  GSM78848   LSPM2 24 h TPA + 24 h
35197  200003484        30  2006/01/20  GSM78857           MM127 Control
35198  200003484        30  2006/01/20  GSM78854   MM127 24 h TPA + 24 h
35199  200003484        30  2006/01/20  GSM78837            D04 24 h TPA
35200  200003484        30  2006/01/20  GSM78840            D23 24 h TPA
35201  200003484        30  2006/01/20  GSM78860           MM253 Control
35202  200003484        30  2006/01/20  GSM78843            D24 24 h TPA
35203  200003484        30  2006/01/20  GSM78846   

In [20]:
# Rows whose geo_id equals '200128147', printed as plain text, followed by
# their titles collected into a Python list for the clustering cell.
geo_id_samples = samples_df[samples_df['geo_id'].eq('200128147')]
print(geo_id_samples)
sample_titles = geo_id_samples['title'].tolist()

            geo_id  nsamples        date   accession  \
2048837  200128147       192  2019/03/12  GSM3665739   
2048838  200128147       192  2019/03/12  GSM3665716   
2048839  200128147       192  2019/03/12  GSM3665618   
2048840  200128147       192  2019/03/12  GSM3665741   
2048841  200128147       192  2019/03/12  GSM3665724   
2048842  200128147       192  2019/03/12  GSM3665641   
2048843  200128147       192  2019/03/12  GSM3665610   
2048844  200128147       192  2019/03/12  GSM3665747   
2048845  200128147       192  2019/03/12  GSM3665658   
2048846  200128147       192  2019/03/12  GSM3665633   
2048847  200128147       192  2019/03/12  GSM3665619   
2048848  200128147       192  2019/03/12  GSM3665620   
2048849  200128147       192  2019/03/12  GSM3665698   
2048850  200128147       192  2019/03/12  GSM3665701   
2048851  200128147       192  2019/03/12  GSM3665626   
2048852  200128147       192  2019/03/12  GSM3665686   
2048853  200128147       192  2019/03/12  GSM366

In [21]:
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() is wall-clock time and can jump if the
# system clock is adjusted.
start_time = time.perf_counter()
cluster_terms(sample_titles)
end_time = time.perf_counter()
print('\ntime taken: ', end_time - start_time, 'seconds')

 - *WTCHG_380869_204251: M38+ Gut single cell:* WTCHG_380869_201250: M38+ Gut single cell, WTCHG_380869_201251: M38+ Gut single cell, WTCHG_380869_202250: M38+ Gut single cell, WTCHG_380869_202251: M38+ Gut single cell, WTCHG_380869_203251: M38+ Gut single cell bulk, WTCHG_380869_204250: M38+ Gut single cell, WTCHG_380869_204251: M38+ Gut single cell, WTCHG_380869_204254: M38+ Gut single cell, WTCHG_380869_204255: M38+ Gut single cell, WTCHG_380869_205251: M38+ Gut single cell, WTCHG_380869_206251: M38+ Gut single cell, WTCHG_380869_207251: M38+ Gut single cell, WTCHG_380869_207255: M38+ Gut single cell, WTCHG_380869_208251: M38+ Gut single cell, WTCHG_380869_208255: M38+ Gut single cell
 - *WTCHG_380869_230276: M38+ Gut single cell:* WTCHG_380869_230273: M38+ Gut single cell, WTCHG_380869_230275: M38+ Gut single cell, WTCHG_380869_230276: M38+ Gut single cell, WTCHG_380869_230277: M38+ Gut single cell, WTCHG_380869_230279: M38+ Gut single cell, WTCHG_380869_230280: M38+ Gut single cel

In [23]:
# Select the rows for geo_id '200000014', print them, and cluster their titles.
geo_id_samples = samples_df.loc[samples_df['geo_id'] == '200000014']
print(geo_id_samples)
sample_titles = list(geo_id_samples.title)
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() is wall-clock time and can jump.
start_time = time.perf_counter()
cluster_terms(sample_titles)
end_time = time.perf_counter()
print('\ntime taken: ', end_time - start_time, 'seconds')

         geo_id  nsamples        date  accession  \
384   200000014       765  2001/11/29  GSM383758   
385   200000014       765  2001/11/29  GSM383993   
386   200000014       765  2001/11/29  GSM384279   
387   200000014       765  2001/11/29  GSM384196   
388   200000014       765  2001/11/29  GSM383910   
389   200000014       765  2001/11/29  GSM383827   
390   200000014       765  2001/11/29  GSM383721   
391   200000014       765  2001/11/29  GSM384007   
392   200000014       765  2001/11/29  GSM384133   
393   200000014       765  2001/11/29  GSM384362   
394   200000014       765  2001/11/29  GSM384076   
395   200000014       765  2001/11/29  GSM383970   
396   200000014       765  2001/11/29  GSM383847   
397   200000014       765  2001/11/29  GSM383741   
398   200000014       765  2001/11/29  GSM384325   
399   200000014       765  2001/11/29  GSM384219   
400   200000014       765  2001/11/29  GSM384113   
401   200000014       765  2001/11/29  GSM384382   
402   200000

In [24]:
# Select the rows for geo_id '200002487', print them, and cluster their titles.
geo_id_samples = samples_df.loc[samples_df['geo_id'] == '200002487']
print(geo_id_samples)
sample_titles = list(geo_id_samples.title)
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() is wall-clock time and can jump.
start_time = time.perf_counter()
cluster_terms(sample_titles)
end_time = time.perf_counter()
print('\ntime taken: ', end_time - start_time, 'seconds')

          geo_id  nsamples        date accession        title
24316  200002487        10  2005/05/15  GSM42172      ITMST_0
24317  200002487        10  2005/05/15  GSM42178  ITME6E7ST_0
24318  200002487        10  2005/05/15  GSM42104        ITM_0
24319  200002487        10  2005/05/15  GSM42170        ITM_1
24320  200002487        10  2005/05/15  GSM42176    ITME6E7_0
24321  200002487        10  2005/05/15  GSM42179  ITME6E7ST_1
24322  200002487        10  2005/05/15  GSM42103        ITV_1
24323  200002487        10  2005/05/15  GSM42177    ITME6E7_1
24324  200002487        10  2005/05/15  GSM42175      ITMST_1
24325  200002487        10  2005/05/15  GSM42080        ITV_0
 - *ITMST_0:* ITME6E7ST_0, ITME6E7ST_1, ITME6E7_0, ITME6E7_1, ITMST_0, ITMST_1, ITM_0, ITM_1, ITV_0, ITV_1

time taken:  0.009755849838256836 seconds
