# Generate Summary Datasets
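This notebook averages the rows of each small CSV in the labeled and unlabeled datasets (e.g. `pdc_1.csv` from labeled), so that every CSV becomes a single row of mean values. The rows are then combined into two summary datasets: `../data_mats/labeled_data.csv` and `../data_mats/unlabeled_data.csv`.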

In [1]:
import glob
import os

import numpy as np
import pandas as pd
from clustergrammer_widget import *
# Single Network instance that the visualization cells later in this notebook
# load data into, cluster, and render.
# NOTE(review): `Network` and `clustergrammer_widget` are presumably provided by
# the star import above -- confirm against the clustergrammer_widget package.
net = Network(clustergrammer_widget)

In [2]:
# Build one summary CSV per dataset type.
#
# Every '../data_mats/<inst_type>/*.csv' file is reduced to a single row holding
# the column-wise mean of its measurement columns; the rows are stacked and the
# result is written to '../data_mats/<inst_type>.csv'.

# Number of leading (non-index) columns dropped from each input CSV before
# averaging; the notebook treats them as row meta-data rather than measurements.
N_META_COLS = 5

for inst_type in ['labeled_data', 'unlabeled_data']:
    # get all csvs -- sorted, because glob returns matches in arbitrary order
    # and that order becomes the row order of the summary written below
    all_files = sorted(glob.glob('../data_mats/' + inst_type + '/*.csv'))

    # remove manual labels from csv list (unlabeled only)
    all_files = [x for x in all_files if 'manual_labels' not in x]

    # fail early with a readable message instead of the opaque
    # "No objects to concatenate" error that pd.concat raises on an empty list
    if not all_files:
        raise FileNotFoundError(
            'no input CSVs found in ../data_mats/' + inst_type + '/')

    # keep all averaged dfs (e.g. series) in a list for concatenation later
    avg_list = []

    for inst_file in all_files:

        # file name without directory or extension (e.g. 'pdc_1'); os.path
        # handles the platform's path separator and strips only the extension
        inst_name = os.path.splitext(os.path.basename(inst_file))[0]

        file_df = pd.read_csv(inst_file, index_col=0)

        # throw out the leading meta-data columns, keep only the measurements
        measurements = file_df.iloc[:, N_META_COLS:]

        # save averaged df as series, named after its source file so the name
        # becomes the row label in the summary
        series_avg = measurements.mean(axis=0)
        series_avg.name = inst_name

        avg_list.append(series_avg)

    # aggregate averaged series into a dataframe (one column per input file),
    # then transpose to get groups as rows and measurements as columns
    summary_df = pd.concat(avg_list, axis=1).transpose()

    # save dataframe to csv
    summary_df.to_csv('../data_mats/' + inst_type + '.csv')

# Visualize Labeled Matrix 

In [3]:
# Read back the labeled summary written earlier in this notebook
# (groups as rows, measurements as columns; first CSV column is the row index).
labeled_csv = '../data_mats/labeled_data.csv'
inst_df = pd.read_csv(labeled_csv, index_col=0)

# Hand the matrix to the shared clustergrammer network and cluster it.
net.load_df(inst_df)
net.cluster(enrichrgram=False)

# Last expression of the cell, so its return value is what the notebook displays.
net.widget()

# Visualize Unlabeled Matrix

In [4]:
# Read back the unlabeled summary written earlier in this notebook
# (groups as rows, measurements as columns; first CSV column is the row index).
unlabeled_csv = '../data_mats/unlabeled_data.csv'
inst_df = pd.read_csv(unlabeled_csv, index_col=0)

# Hand the matrix to the shared clustergrammer network and cluster it.
net.load_df(inst_df)
net.cluster(enrichrgram=False)

# Last expression of the cell, so its return value is what the notebook displays.
net.widget()