# Auto Label
This notebook auto-labels the unlabeled dataset based on its similarity to the labeled dataset. I will start by generating a list of common measurements (columns), and then look at the two datasets stacked on top of each other and clustered.

In [66]:
import numpy as np
import pandas as pd
from copy import deepcopy
from clustergrammer_widget import *
# Shared Network object; the visualization cells further down call
# load_df / cluster / widget on it.
# NOTE(review): Network and clustergrammer_widget presumably both come from
# the star import above -- confirm.
net = Network(clustergrammer_widget)

In [67]:
# Import the stats submodule explicitly: a bare `import scipy` does not
# guarantee that `scipy.stats` is loaded, so the attribute lookup below could
# fail on a fresh kernel unless another package happened to import it first.
import scipy.stats

# Short alias used by the correlation loop later in the notebook.
spearmanr = scipy.stats.spearmanr

###  Load Labeled and Unlabeled Data


In [68]:
# Read both measurement tables; the first CSV column becomes the row index.
df_labeled, df_unlabeled = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data_mats/labeled_data.csv',
        '../data_mats/unlabeled_data.csv',
    )
)

In [69]:
# (rows, columns) of the labeled table
df_labeled.shape

(82, 37)

In [70]:
# (rows, columns) of the unlabeled table
df_unlabeled.shape

(147, 41)

In [71]:
# column names (measurements) of the labeled table as a plain list
labeled_cols = df_labeled.columns.tolist()

In [72]:
# column names (measurements) of the unlabeled table as a plain list
unlabeled_cols = df_unlabeled.columns.tolist()

In [73]:
# Find common measurements (intersection of the two column lists).
# The result is sorted because a bare list(set(...)) has an arbitrary order
# that can change between kernel restarts (string hashing is randomized),
# which would make the column order of the derived tables non-reproducible.
# Assumes the column names are mutually comparable (e.g. all strings).
common_cols = sorted(set(labeled_cols).intersection(unlabeled_cols))

### Get Common Measurements (Columns)

In [74]:
# keep only the measurements that also exist in the unlabeled table
df_labeled_comm = df_labeled[common_cols]

In [75]:
# keep only the measurements that also exist in the labeled table
df_unlabeled_comm = df_unlabeled[common_cols]

### Add row categories 

In [76]:
# Tag every labeled row with a 'type: labeled' category by replacing each row
# name with a (name, category) tuple. The tagging is applied to a copy so
# df_labeled_comm keeps its plain row names.
old_rows = df_labeled_comm.index.tolist()
new_rows = [(row_name, 'type: labeled') for row_name in old_rows]

df_labeled_comm_cat = deepcopy(df_labeled_comm)
df_labeled_comm_cat.index = new_rows

In [77]:
# Tag every unlabeled row with a 'type: unlabeled' category by replacing each
# row name with a (name, category) tuple. The tagging is applied to a copy so
# df_unlabeled_comm keeps its plain row names.
old_rows = df_unlabeled_comm.index.tolist()
new_rows = [(row_name, 'type: unlabeled') for row_name in old_rows]

df_unlabeled_comm_cat = deepcopy(df_unlabeled_comm)
df_unlabeled_comm_cat.index = new_rows

### Visualize unlabeled and labeled clusters together

In [78]:
# Stack the labeled rows on top of the unlabeled rows.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; both frames share the same columns (common_cols).
df_merge = pd.concat([df_labeled_comm_cat, df_unlabeled_comm_cat])

In [79]:
# Load the stacked labeled + unlabeled frame into the shared Network object,
# cluster it, and render the interactive widget as the cell output.
# NOTE(review): enrichrgram=False presumably turns off the Enrichr gene-set
# feature -- confirm against the clustergrammer documentation.
net.load_df(df_merge)
net.cluster(enrichrgram=False)
net.widget()

# Assign Labels
For each cluster (row) in df_unlabeled_comm, calculate the Spearman correlation to each of the df_labeled_comm clusters. The best-correlated labeled cluster supplies the predicted label.

In [80]:
# row names of each table; after the transposes in the next cell these are
# the column keys used to pull out one row's measurements at a time
unlabeled_groups = df_unlabeled_comm.index.tolist()
labeled_groups = df_labeled_comm.index.tolist()

In [81]:
# Transposed views: measurements become rows and each original row becomes a
# column, so a single row's measurements can be selected by name.
tdf_unlabeled_comm = df_unlabeled_comm.T
tdf_labeled_comm = df_labeled_comm.T

### Save Correlations as Series and Pandas DF

In [110]:
# For every unlabeled row, compute its Spearman correlation against every
# labeled row, keep the full correlation profile, and predict the row's
# category from the best-correlated labeled row.
corr_list = []       # one correlation Series per unlabeled row
predict_group = {}   # unlabeled row name -> predicted category

for inst_unlabeled in unlabeled_groups:

    unlabeled_series = tdf_unlabeled_comm[inst_unlabeled]

    # only keep the correlation (first element) from each spearmanr result
    inst_corrs = [
        spearmanr(unlabeled_series, tdf_labeled_comm[inst_labeled])[0]
        for inst_labeled in labeled_groups
    ]

    # make series, name is the unlabeled group, index is the labeled groups
    corr_series = pd.Series(name=inst_unlabeled, data=inst_corrs, index=labeled_groups)

    corr_list.append(corr_series)

    # sort_values returns a new Series, so corr_series (already stored in
    # corr_list) keeps its original order; NaN correlations sort last
    corr_sort = corr_series.sort_values(ascending=False)

    # Best match by position. `.iloc[0]` is used instead of `corr_sort[0]`
    # because integer keys on a string-labelled Series only work through a
    # deprecated positional fallback.
    top_corr = corr_sort.iloc[0]
    top_name = corr_sort.index[0]

    # remove number from cat type: drop the last '_'-separated token, e.g. a
    # name such as 'b_cell_naive_3' would become 'b_cell_naive'
    predict_cat = '_'.join(top_name.split('_')[:-1])

    # save predicted group as dictionary
    predict_group[inst_unlabeled] = predict_cat

# Transfer Labels to Unlabeled DF


In [113]:
# Pair each unlabeled row name with the group predicted for it.
inst_rows = df_unlabeled.index.tolist()
new_rows = [(row_name, predict_group[row_name]) for row_name in inst_rows]

In [114]:
# inspect the (row name, predicted group) pairs built in the previous cell
new_rows

[('b_cell_1', 'b_cell_naive'),
 ('b_cell_10', 'b_cell_naive'),
 ('b_cell_11', 'b_cell_naive'),
 ('b_cell_12', 'b_cell_transitional'),
 ('b_cell_13', 'b_cell_naive'),
 ('b_cell_14', 'b_cell_memory'),
 ('b_cell_15', 'b_cell_naive'),
 ('b_cell_16', 'b_cell_memory'),
 ('b_cell_17', 'b_cell_plasmablast'),
 ('b_cell_18', 'b_cell_memory'),
 ('b_cell_19', 'b_cell_naive'),
 ('b_cell_2', 'b_cell_memory'),
 ('b_cell_20', 'b_cell_memory'),
 ('b_cell_21', 'b_cell_transitional'),
 ('b_cell_3', 'undefined'),
 ('b_cell_4', 'b_cell_naive'),
 ('b_cell_5', 'b_cell_naive'),
 ('b_cell_6', 'b_cell_memory'),
 ('b_cell_7', 'b_cell_naive'),
 ('b_cell_8', 'undefined'),
 ('b_cell_9', 'undefined'),
 ('basophil_1', 'cd4_cd8_t_cell'),
 ('basophil_10', 'nkt_cell'),
 ('basophil_11', 'b_cell_naive'),
 ('basophil_12', 'pdc'),
 ('basophil_13', 'cd4_neg_cd8_neg_t_cell'),
 ('basophil_14', 'undefined'),
 ('basophil_15', 'cd8_t_cell_naive'),
 ('basophil_16', 'nkt_cell'),
 ('basophil_17', 'cd14_monocyte'),
 ('basophil_18', '

In [83]:
# Combine the per-row correlation Series into one matrix: each Series becomes
# a column under axis=1, and the transpose turns them into rows.
inst_df = pd.concat(corr_list, axis=1).T

In [84]:
# expect (number of unlabeled rows, number of labeled rows)
inst_df.shape

(147, 82)

In [85]:
# Cluster and display the unlabeled-by-labeled correlation matrix using the
# same shared Network object as the earlier visualization.
net.load_df(inst_df)
net.cluster()
net.widget()

In [50]:
# length of a single correlation Series (one value per labeled row)
corr_list[0].shape

(82,)

In [32]:
# Toy labels and matching integer values for a throwaway Series experiment.
names = ['zero', 'one', 'two']
val = list(range(len(names)))

In [21]:
# scratch: toy Series built from the val / names lists
# NOTE(review): looks like a throwaway check of sort_values -- consider removing.
tmp_s = pd.Series(data=val, index=names)

In [29]:
# scratch: display the toy Series
# NOTE(review): the saved output is already in descending order because this
# cell (In [29]) was last run after the in-place sort cell (In [27]).
tmp_s

two     2
one     1
zero    0
dtype: int64

In [27]:
# scratch: descending sort that mutates tmp_s in place (no new Series returned)
tmp_s.sort_values(ascending=False, inplace=True)

In [28]:
# scratch: display the toy Series after the in-place sort
tmp_s

two     2
one     1
zero    0
dtype: int64