In [1]:
import os
import warnings
from collections import Counter

import h5py
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.decomposition import PCA

In [2]:
def load_model_genes(path):
    """Return the gene identifiers listed one per line in *path*.

    Surrounding whitespace is stripped and blank lines are skipped.  The
    result is a 1-D NumPy array of native ``str`` so it can be used both to
    select DataFrame columns and for boolean-mask indexing.
    """
    # Text mode on purpose: reading with 'rb' yields bytes under Python 3,
    # which never match the str column labels of the expression table.
    with open(path, 'r') as reader:
        genes = [line.strip() for line in reader]
    return np.array([g for g in genes if g], dtype=str)


def select_threshold(largest_by_threshold, max_cluster_size=30):
    """Pick the clustering cut-off from a threshold scan.

    Parameters
    ----------
    largest_by_threshold : sequence of (threshold, largest_cluster_size)
        Scan results, in the order the thresholds were tried.
    max_cluster_size : int
        A threshold qualifies when its largest cluster is strictly smaller
        than this value.

    Returns
    -------
    The last qualifying threshold in scan order.  When no threshold
    qualifies, the last scanned threshold is returned and a warning is
    issued.

    Raises
    ------
    ValueError
        If the scan is empty.
    """
    scan = list(largest_by_threshold)
    if not scan:
        raise ValueError('threshold scan is empty')

    chosen = None
    for thres, largest in scan:
        if largest < max_cluster_size:
            chosen = thres

    if chosen is None:
        # Make the fallback explicit instead of relying on a leaked loop
        # variable: same value as before, but no longer silent.
        chosen = scan[-1][0]
        warnings.warn(
            'no scanned threshold keeps the largest cluster below %d genes; '
            'falling back to the last scanned threshold %s'
            % (max_cluster_size, chosen)
        )
    return chosen


def cluster_genes(expr, thresholds=None, max_cluster_size=30):
    """Cluster the columns of *expr* hierarchically and return flat labels.

    *expr* is a 2-D array whose columns are genes (rows are presumably
    samples -- confirm against the input files).  Ward linkage is built on
    the transposed matrix, the 'inconsistent' criterion is evaluated for
    every value in *thresholds* (default ``np.arange(.5, 2.0, 0.1)``), and
    the cut-off chosen by :func:`select_threshold` is applied.

    Returns a 1-D array with one cluster label per column of *expr*.
    """
    if thresholds is None:
        thresholds = np.arange(.5, 2.0, 0.1)

    Z = linkage(expr.T, method='ward', metric='euclidean')

    scan = []
    for thres in thresholds:
        labels = fcluster(Z, thres, criterion='inconsistent')
        largest = Counter(labels).most_common(1)[0][1]
        scan.append((thres, largest))

    cutoff = select_threshold(scan, max_cluster_size)
    return fcluster(Z, cutoff, criterion='inconsistent')


def fit_cluster_pcas(expr, gene_ids, labels, min_cluster_size=5):
    """Fit a one-component PCA to every sufficiently large gene cluster.

    Clusters with fewer than *min_cluster_size* genes are skipped.

    Returns a list of ``[cluster_label, genes, weights, sig]`` entries where
    *genes* are the member gene identifiers, *weights* is the first principal
    axis (one loading per member gene) and *sig* is the standard deviation of
    the projected scores.
    """
    pca = PCA(n_components=1, whiten=False)
    info = []
    for c in np.unique(labels):
        members = labels == c
        if np.sum(members) < min_cluster_size:
            continue
        block = expr[:, members]
        scores = pca.fit(block).transform(block)
        info.append([c, gene_ids[members], np.squeeze(pca.components_),
                     np.std(scores)])
    return info


def write_eigengene_model(path, gene_ids, labels, pca_info):
    """Write the gene list, cluster labels and per-cluster PCA model to HDF5.

    Layout: ``model_genes``, ``gene_clusters`` and, for each entry of
    *pca_info*, ``weights/cluster%04d/{genes,pca_weights,pca_sig}``.
    """
    with h5py.File(path, 'w') as writer:
        # h5py cannot store NumPy 'U' (unicode) arrays, so identifiers are
        # written as fixed-width byte strings.  Assumes ASCII identifiers.
        writer.create_dataset('model_genes', data=np.asarray(gene_ids).astype('S'))
        writer.create_dataset('gene_clusters', data=labels)
        for c, genes, weight, sig in pca_info:
            group = 'weights/cluster%04d' % c
            writer.create_dataset(group + '/genes', data=np.asarray(genes).astype('S'))
            writer.create_dataset(group + '/pca_weights', data=weight)
            writer.create_dataset(group + '/pca_sig', data=sig)


# `__name__` is '__main__' both inside a notebook kernel and when the file is
# run as a script, so the pipeline still executes there; the guard only keeps
# it from running when the helpers above are imported.
if __name__ == '__main__':
    fns = sorted(os.listdir('../processed_data/adj_expression/'))

    # Not used in this section; kept in case later code relies on it.
    id2tissue = []

    for f in fns:
        tissue = f.split('-')[0]
        id2gene = load_model_genes(
            '../processed_data/model_genes/%s-model_genes.txt' % tissue)

        df_exp = pd.read_csv('../processed_data/adj_expression/' + f, index_col=0)
        # Drop the '.<suffix>' part of each column label so the labels match
        # the identifiers in the model-gene list.
        df_exp.columns = [g.split('.')[0] for g in df_exp.columns]

        # Restrict to (and order by) the model genes.
        df_exp = df_exp[id2gene]
        data = df_exp.values

        ## hierarchical clustering with an automatically selected cut-off
        ind_c = cluster_genes(data)

        ## PCA per cluster
        pca_info = fit_cluster_pcas(data, id2gene, ind_c)

        ## writing model weights
        write_eigengene_model(
            '../processed_data/model_weights/%s-eigengene_model.h5' % tissue,
            id2gene, ind_c, pca_info)

        # Release the large per-tissue arrays before loading the next file.
        # Only names that are always bound are deleted here.
        del df_exp, data, ind_c
