# Auto Label
This notebook auto-labels the unlabeled dataset based on its similarity to the labeled dataset. I will start by generating a list of common measurements (columns), and then look at the two datasets stacked on top of each other and clustered. 

In [1]:
import numpy as np
import pandas as pd
from copy import deepcopy
from clustergrammer_widget import *
# Shared Network object; later cells call load_df / cluster / widget on it.
net = Network(clustergrammer_widget)

In [2]:
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# load `scipy.stats` on every SciPy version, so `scipy.stats.spearmanr` could
# raise AttributeError on a fresh kernel.
import scipy.stats

# Short alias for the Spearman rank-correlation function used further down.
spearmanr = scipy.stats.spearmanr

###  Load Labeled and Unlabeled Data


In [3]:
# Read both matrices, using the first CSV column as the row index.
df_labeled, df_unlabeled = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data_mats/labeled_data.csv',
        '../data_mats/unlabeled_data.csv',
    )
]

In [4]:
# (n_rows, n_columns) of the labeled matrix
df_labeled.shape

(82, 37)

In [5]:
# (n_rows, n_columns) of the unlabeled matrix
df_unlabeled.shape

(147, 41)

In [6]:
# Column labels (measurements) of the labeled matrix as a plain Python list
labeled_cols = df_labeled.columns.tolist()

In [7]:
# Column labels (measurements) of the unlabeled matrix as a plain Python list
unlabeled_cols = df_unlabeled.columns.tolist()

In [8]:
# Find the common measurements (intersection of the two column lists).
# The labeled column order is kept, with duplicates dropped, instead of going
# through list(set(...)): set iteration order for strings can change between
# kernel restarts, which made the column order of every downstream frame
# non-reproducible.
unlabeled_col_set = set(unlabeled_cols)
common_cols = []
for col in labeled_cols:
    if col in unlabeled_col_set and col not in common_cols:
        common_cols.append(col)

### Get Common Measurements (Columns)

In [9]:
# Keep every row, but only the measurements shared with the unlabeled matrix.
df_labeled_comm = df_labeled.loc[:, common_cols]

In [10]:
# Keep every row, but only the measurements shared with the labeled matrix.
df_unlabeled_comm = df_unlabeled.loc[:, common_cols]

### Add row categories 

In [11]:
# Pair every labeled row name with the 'type: labeled' category as a
# (name, category) tuple, then install the tuples as the index of a deep copy
# so df_labeled_comm itself keeps its plain row names.
old_rows = df_labeled_comm.index.tolist()
new_rows = [(row_name, 'type: labeled') for row_name in old_rows]

df_labeled_comm_cat = deepcopy(df_labeled_comm)
df_labeled_comm_cat.index = new_rows

In [12]:
# Pair every unlabeled row name with the 'type: unlabeled' category as a
# (name, category) tuple, then install the tuples as the index of a deep copy
# so df_unlabeled_comm itself keeps its plain row names.
old_rows = df_unlabeled_comm.index.tolist()
new_rows = [(row_name, 'type: unlabeled') for row_name in old_rows]

df_unlabeled_comm_cat = deepcopy(df_unlabeled_comm)
df_unlabeled_comm_cat.index = new_rows

### Visualize unlabeled and labeled clusters together

In [13]:
# Stack the labeled rows on top of the unlabeled rows. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
df_merge = pd.concat([df_labeled_comm_cat, df_unlabeled_comm_cat])

In [14]:
# Load the stacked labeled + unlabeled frame, cluster it, and display the widget.
net.load_df(df_merge)
# NOTE(review): enrichrgram=False presumably turns off the Enrichr integration — confirm against clustergrammer docs.
net.cluster(enrichrgram=False)
net.widget()

# Assign Labels
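Loop through the rows of `df_unlabeled_comm` and calculate the Spearman correlation between each unlabeled row and each of the labeled clusters (rows of `df_labeled_comm`), using only the common measurements.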

In [15]:
# Row names of each matrix; these are the groups that get correlated pairwise.
unlabeled_groups, labeled_groups = (
    df_unlabeled_comm.index.tolist(),
    df_labeled_comm.index.tolist(),
)

In [16]:
# Transpose so that each group (originally a row) becomes a column that can be
# pulled out by name as a Series.
tdf_unlabeled_comm, tdf_labeled_comm = df_unlabeled_comm.T, df_labeled_comm.T

### Save Correlations as Series and Pandas DF

In [61]:
# Build one Series per unlabeled group holding its Spearman correlation with
# every labeled group. The Series is named after the unlabeled group and
# indexed by the labeled group names.
corr_list = []
for inst_unlabeled in unlabeled_groups:
    unlabeled_series = tdf_unlabeled_comm[inst_unlabeled]

    # element 0 of the spearmanr result is the correlation; the rest is discarded
    inst_corrs = [
        spearmanr(unlabeled_series, tdf_labeled_comm[inst_labeled])[0]
        for inst_labeled in labeled_groups
    ]

    corr_list.append(
        pd.Series(inst_corrs, index=list(labeled_groups), name=inst_unlabeled)
    )
    

In [63]:
# Each Series becomes a column under axis=1; transposing puts the unlabeled
# groups on the rows and the labeled groups on the columns.
inst_df = pd.concat(corr_list, axis=1).T

In [64]:
# (unlabeled groups, labeled groups)
inst_df.shape

(147, 82)

In [65]:
# Load the unlabeled-by-labeled correlation matrix, cluster it, and display the widget.
# NOTE(review): this reuses the same `net` object that displayed df_merge earlier.
net.load_df(inst_df)
net.cluster()
net.widget()

In [50]:
# Sanity check: the first unlabeled group's Series has one entry per labeled group.
corr_list[0].shape

(82,)

In [32]:
# Toy labels and their matching integer values for a quick Series sorting experiment.
names = ['zero', 'one', 'two']
val = list(range(len(names)))

In [21]:
# Toy Series: the integer values labelled by their names.
tmp_s = pd.Series(val, index=names)

In [29]:
# NOTE(review): the saved output is already in descending order because this
# cell was last run (In [29]) after the in-place sort cell (In [27]); on a
# fresh top-to-bottom run it would show zero, one, two.
tmp_s

two     2
one     1
zero    0
dtype: int64

In [27]:
# Sort by value, largest first; inplace=True mutates tmp_s itself rather than
# binding a new Series.
tmp_s.sort_values(ascending=False, inplace=True)

In [28]:
# Show the Series after the descending sort.
tmp_s

two     2
one     1
zero    0
dtype: int64