In [1]:
############################################################################
# HER2 Overexpression, clustering of P and 24H data by ensemble clustering  #
# This produces the ensemble clusters and the mixture model solution        #
############################################################################
%matplotlib inline 
import numpy as np 
import pandas as pd 
import sklearn.cluster  as skc
import matplotlib.pyplot as plt
from sklearn import datasets
import sklearn as sk
import scipy.cluster.hierarchy as sch
#from cooccurrence import coMat
import openensembles as oe


fileName = 'data/experiment_1599_EGF_24H.csv'
# DataFrame.from_csv has been removed from pandas. read_csv with index_col=0
# and parse_dates=True is the documented equivalent of its defaults.
df = pd.read_csv(fileName, index_col=0, parse_dates=True)

# Create two dataframes, 24H_EGF and P_EGF, by selecting columns on their
# name prefix.
filter_col_24H = [col for col in df.columns if col.startswith('24H_EGF:data')]
HER2_EGF_df = df[filter_col_24H]

filter_col_P = [col for col in df.columns if col.startswith('P_EGF:data')]
P_EGF_df = df[filter_col_P]

# Axis values handed to oe.data alongside each frame.
# NOTE(review): presumably the measurement time points, one per selected
# data column -- confirm against the CSV header.
time = [0, 5, 10, 30]

dataObj_P = oe.data(P_EGF_df, time)
dataObj_24H = oe.data(HER2_EGF_df, time)





In [2]:
# Rule-of-thumb cluster count: the square root of the length of the parent
# data matrix, rounded to the nearest integer.
K_ROT = round(pow(len(dataObj_P.D['parent']), 0.5))
K_ROT

8

## Transformations

In [4]:
# Apply the same four transformations of the 'parent' matrix to both datasets.
# NOTE(review): the second and third arguments are identical strings here;
# presumably one selects the transform and the other names the result stored
# in .D -- confirm against the openensembles documentation.
for data_obj in (dataObj_P, dataObj_24H):
    data_obj.transform('parent', 'zscore', 'zscore')
    data_obj.transform('parent', 'log', 'log', base=2, Keep_NaN=0, Keep_Inf=0)
    data_obj.transform('parent', 'PCA', 'PCA')
    data_obj.transform('parent', 'minmax', 'minmax', minValue=0, maxValue=1)

In [5]:
# Show the names of every data matrix now held by the P dataset.
list(dataObj_P.D)

['PCA', 'minmax', 'zscore', 'parent', 'log']

## Cluster

In [6]:
def cluster_ens(c, K_vals):
    """Fill a clustering object with an ensemble of clustering solutions.

    For every data matrix in ``c.dataObj.D`` this requests:

    * agglomerative clustering, once per (distance name, K) pair;
    * DBSCAN and AffinityPropagation, once per distance name, without K;
    * kmeans and spectral clustering, once per K.

    Each solution is requested through ``c.cluster`` under a name built from
    the algorithm, the source matrix, the distance name (distance-based
    algorithms only) and K (algorithms that take K only).

    Parameters
    ----------
    c : object
        Clustering object exposing ``dataObj.D`` (iterable of data-matrix
        names) and ``cluster(source, algorithm, name, K=...)``.
    K_vals : iterable of int
        Cluster counts to try for the algorithms that accept K.

    Returns
    -------
    object
        The same ``c`` that was passed in, after all ``cluster`` calls.
    """
    # K_vals is walked once per (transform, algorithm, distance) combination;
    # materialise it so a one-shot iterator is not exhausted on first use.
    K_vals = list(K_vals)

    # Names of the pairwise metrics scikit-learn exposes, minus 'precomputed',
    # which is a placeholder rather than a metric.
    distances = [
        metric
        for metric in sk.metrics.pairwise.distance_metrics()
        if metric != 'precomputed'
    ]

    algorithms_wDist = ['agglomerative', 'DBSCAN', 'AffinityPropagation']
    algorithms_noDist = ['kmeans', 'spectral']
    algorithms_noK = ('DBSCAN', 'AffinityPropagation')  # do not accept K

    for transform in c.dataObj.D:  # for all transforms

        for algorithm in algorithms_wDist:
            for distance in distances:
                # NOTE(review): `distance` only appears in the solution name;
                # it is not forwarded to c.cluster, so presumably every
                # distance variant is computed with the algorithm's default
                # metric. Confirm whether it should be passed through.
                if algorithm in algorithms_noK:
                    name = "%s_%s_%s" % (algorithm, transform, distance)
                    c.cluster(transform, algorithm, name)
                else:
                    for k in K_vals:
                        name = "%s_%s_%s_%d" % (algorithm, transform, distance, k)
                        c.cluster(transform, algorithm, name, K=k)

        for algorithm in algorithms_noDist:
            for k in K_vals:
                # These algorithms are not run per distance, so the name must
                # not borrow the stale `distance` left over from the loop above.
                name = "%s_%s_%d" % (algorithm, transform, k)
                c.cluster(transform, algorithm, name, K=k)

    return c

In [7]:


# Seed NumPy's global RNG so that Restart & Run All rebuilds the same
# ensembles. scikit-learn estimators draw from this RNG when no random_state
# is supplied.
# NOTE(review): assumes openensembles does not pass its own random_state --
# confirm.
SEED = 0
np.random.seed(SEED)

# Cluster counts tried by the algorithms that take K (6, 7, 8, 9).
K_list = range(6, 10)

# Build one ensemble per dataset; cluster_ens returns the object it was given.
c_P = oe.cluster(dataObj_P)
c_P = cluster_ens(c_P, K_list)

c_24H = oe.cluster(dataObj_24H)
c_24H = cluster_ens(c_24H, K_list)


In [8]:
len(c_P.labels)

220

## Finishing

In [16]:
# Cut level for the co-occurrence linkage, determined by coMat.plot(), which
# visualizes the cut that will be made.
threshold = 0.45
c_link_P, c_link_24H = (
    ensemble.finish_co_occ_linkage(threshold) for ensemble in (c_P, c_24H)
)

In [17]:
# Graph-closure finishing with the same settings for both ensembles.
# Note that `threshold` is rebound here to a different value than the one
# used for the co-occurrence linkage.
threshold = 0.7
clique_size = 4
c_g_P, c_g_24H = (
    ensemble.finish_graph_closure(threshold, clique_size=clique_size)
    for ensemble in (c_P, c_24H)
)

In [18]:
import pickle

# Persist the data objects, the ensembles and the finished solutions in one
# list. Whoever loads the file must unpack the objects in this same order.
workspace = [
    dataObj_P, dataObj_24H,
    c_P, c_24H,
    c_link_P, c_link_24H,
    c_g_P, c_g_24H,
]
# The context manager closes (and therefore flushes) the file even if
# pickling raises part-way through.
with open("P_24H_ensembleClusterworkspace.p", "wb") as workspace_file:
    pickle.dump(workspace, workspace_file)
