## Clustering  

Exploratory clustering of T2D and lipids data. The goal of clustering is to determine whether there is a certain subset of points that are similar to each other - whether these splits are made between, across or within the trait labels. Clustering is highly dependent on the parameters chosen for distance measure and the number of desired clusters. Moreover, clustering is not guaranteed to produce useful results, depending on the structure of the data and on the definition of what a 'useful' cluster is. 

In [63]:
import pandas as pd
import numpy as np
import boto3
import s3fs
import os
import sys
import warnings
import random
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from scipy import stats
from matplotlib import pyplot as plt
plt.style.use('ggplot')
warnings.filterwarnings(action='ignore')

import jupyterthemes
from jupyterthemes import jtplot
jtplot.style(theme='oceans16')
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from Evaluator import Evaluator
from auth import access_key, secret_key

In [64]:
filepath = "s3://voightlab-data/"
lipids_df = pd.read_table(filepath + "lipids/lipids_groups.txt")
t2d_df = pd.read_table(filepath + "t2d/t2d_groups.txt")
both_df = pd.read_table(filepath + "grouped/lipids_T2D_overlapping_groups.txt")

In [65]:
lipids_df.head()

Unnamed: 0,Chr1_Group_1001
0,Chr1_Group_1008
1,Chr1_Group_1009
2,Chr1_Group_1037
3,Chr1_Group_1053
4,Chr1_Group_107617707


In [66]:
grouped_df = pd.read_table(filepath + "grouped/ML_table_grouped_snpcount.txt")
grouped_df.head()

Unnamed: 0,snp,MCF-7_ChIP-seq_CTCF_ENCSR000AHD_ENCFF001UML_ENCFF001UMN_intersect.bed,MCF-7_ChIP-seq_TAF1_ENCSR000AHF_ENCFF001UNU_ENCFF001UNT_intersect.bed,GM12878_ChIP-seq_CTCF_ENCFF002CDP.bed,K562_ChIP-seq_CTCF_ENCFF002CEL.bed,K562_ChIP-seq_POLR2A_ENCFF002CET.bed,endothelial_cell_of_umbilical_vein_ChIP-seq_CTCF_ENCFF002CEH.bed,endothelial_cell_of_umbilical_vein_ChIP-seq_POLR2A_ENCFF002CEJ.bed,keratinocyte_ChIP-seq_CTCF_ENCFF002CFA.bed,keratinocyte_ChIP-seq_POLR2A_ENCFF002CFC.bed,...,Hepatocyte_PPARA_GW7647_2hr.bed,Hepatocyte_PPARA_GW7647_24hr.bed,liver_USF1_ctrl_peaks.narrowPeak,liver_USF1_ASH_peaks.narrowPeak,islet_pooled_H3K4me1_final.bed,islet_CTCF_intersectall.bed,islet_H3K27ac.bed,islet_pooled_H3K27ac.bed,islet_pooled_H3K4me3_peaks.broadPeak,snpcount
0,Chr1_Group_1,0,0,0,1,1,1,0,1,0,...,0,0,0,0,0,0,1,1,1,5
1,Chr1_Group_10,1,0,1,1,0,1,0,1,0,...,0,0,0,0,0,1,0,1,0,52
2,Chr1_Group_100,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,1,1,57
3,Chr1_Group_1000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,25
4,Chr1_Group_100046246,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,5


In [67]:
# Normalize the snpcount column which is continuous, to fall between 0 and 1
if 'snpcount' in grouped_df.columns:
    grouped_df['snpcount'] = (grouped_df['snpcount'] - grouped_df['snpcount'].min())/ (grouped_df['snpcount'].max() - grouped_df['snpcount'].min())

In [68]:
# Remove snp since its a non numerical column - can be reattached later if we preserve order
X_train = grouped_df.loc[:, grouped_df.columns!='snp']

In [69]:
ev = Evaluator()

### Subsampling

2749 features are far too many to cluster efficiently given that we would like to evaluate pairwise distances between samples to measure the relative effectiveness of clustering. In order to speed up runtime I'm loading in the features selected in the Feature Selection notebook based on the chi squared test for both t2d and lipids and randomly sampling from this featureset. 

In [70]:
t2d_mask = pd.read_csv(filepath + "t2d/FeatureSelected/chi2/X_train.csv", index_col=0)
lipids_mask = pd.read_csv(filepath + "lipids/FeatureSelected/chi2/X_train.csv", index_col=0)

In [73]:
num_features = 25
feature_list = list(t2d_mask.columns) + (list(lipids_mask.columns))
random.shuffle(feature_list)
features = feature_list[:num_features]

['E118-H3K79me2.gappedPeak', 'GSM1208788_batch2_chrom1_LoVo_RARG_PassedQC_peaks_hg19.bed', 'E004-H3K79me1.gappedPeak', 'HepG2_ChIP-seq_POLR2A_ENCFF002CUP.bed', 'MAFB', 'hsap_HNF4A_hg19.bed', 'PANC.ISLT-EnhA', 'HepG2_ChIP-seq_MYBL2_ENCFF002CKR.bed', 'GSM1208731_batch2_chrom1_LoVo_E2F8_PassedQC_peaks_hg19.bed', 'FAT.ADIP.NUC-EnhA', 'cardiac_mesoderm_ChIP-seq_H3K36me3_ENCSR000DSH_gappedpeak.bed', 'E008-H3K79me2.gappedPeak', 'PE_Active_Enhancers_hg19.bed', 'PDX1', 'FAT.ADIP.NUC-TssFlnkU', 'HepG2_ChIP-seq_POLR2AphosphoS5_ENCFF002CKW.bed', 'GSE64233_p65_V_final.bed', 'HepG2_ChIP-seq_TEAD4_ENCFF002CLG.bed', 'snpcount', 'islet_pooled_H3K4me3_peaks.broadPeak', 'H1-hESC_ChIP-seq_BACH1_ENCFF002CQP.bed', 'LIV.ADLT-EnhG', 'HepG2_ChIP-seq_CTCF_ENCFF002CDZ.bed', 'E004-H3K79me2.gappedPeak', 'hepatocyte_p300_Rif.bed']


In [74]:
X_train = X_train.loc[:, features[:25]]
X_train.head()

Unnamed: 0,E118-H3K79me2.gappedPeak,GSM1208788_batch2_chrom1_LoVo_RARG_PassedQC_peaks_hg19.bed,E004-H3K79me1.gappedPeak,HepG2_ChIP-seq_POLR2A_ENCFF002CUP.bed,MAFB,hsap_HNF4A_hg19.bed,PANC.ISLT-EnhA,HepG2_ChIP-seq_MYBL2_ENCFF002CKR.bed,GSM1208731_batch2_chrom1_LoVo_E2F8_PassedQC_peaks_hg19.bed,FAT.ADIP.NUC-EnhA,...,HepG2_ChIP-seq_POLR2AphosphoS5_ENCFF002CKW.bed,GSE64233_p65_V_final.bed,HepG2_ChIP-seq_TEAD4_ENCFF002CLG.bed,snpcount,islet_pooled_H3K4me3_peaks.broadPeak,H1-hESC_ChIP-seq_BACH1_ENCFF002CQP.bed,LIV.ADLT-EnhG,HepG2_ChIP-seq_CTCF_ENCFF002CDZ.bed,E004-H3K79me2.gappedPeak,hepatocyte_p300_Rif.bed
0,1,1,1,1,0,0,0,0,1,0,...,1,0,0,0.001173,1,0,0,1,1,0
1,1,0,0,0,0,0,1,0,0,1,...,0,0,0,0.01496,0,0,1,1,0,0
2,1,1,1,1,0,1,0,1,1,0,...,1,0,0,0.016427,1,1,1,0,1,1
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0.00704,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0.001173,0,0,0,0,0,0


In [95]:
# Some utility functions to help us compare clustering with a few different parameters
def concat_labels(df, labels):
    labeled_df = pd.concat([df, labels], axis=1)
    return labeled_df


def cluster_compare(n_clusters, model, data, metric):
    results = []
    labels = []
    
    for n in n_clusters:
        model.fit(data)
        predictions = pd.Series(model.labels_, name='label')
        unique, counts = np.unique(predictions, return_counts=True)
        data = concat_labels(data, predictions)
        
        for index,val in enumerate(unique):
            print("val is "+ str(val))
            avg_dist = pairwise_distances(data.loc[data['label'] == val], metric=metric, n_jobs=-1).mean()
            results.append(avg_dist)
            labels.append("label: " + str(val))
        
    titles = [(str(n) + " Clusters\n" ) for n in n_clusters]
    ev.plot_compare(results, labels, 'cluster number', 'avg  {} distance'.format(metric), titles)

### Agglomerative Clustering

A hierarchical clustering method with a bottom-up approach. Each observation starts in its own cluster and clusters are iteratively merged in a way to minimize some linkage criterion. 

In [83]:
agg_cluster = AgglomerativeClustering(n_clusters=3,
                                      affinity='l1',
                                      linkage='average')

In [84]:
agg_cluster.fit(X_train)

AgglomerativeClustering(affinity='l1', compute_full_tree='auto',
            connectivity=None, linkage='average', memory=None,
            n_clusters=3, pooling_func='deprecated')

In [85]:
labels = agg_cluster.labels_
unique, counts = np.unique(labels, return_counts=True)
print (np.asarray((unique, counts)).T)

[[   0 9833]
 [   1  683]
 [   2    5]]


In [86]:
print("Number of components: {}".format(agg_cluster.n_components_))
print("Number of leaves: {}".format(agg_cluster.n_leaves_))

Number of components: 1
Number of leaves: 10521


In [96]:
cluster_compare([2, 3, 4, 5], agg_cluster, X_train, 'hamming')

val is 0
val is 1
val is 2
val is 0


ValueError: Cannot index with multidimensional key