# Auto Label
This notebook will auto-label the unlabeled dataset based on its similarity to the labeled dataset. I will start by generating a list of common measurements (columns) and will also look at the two datasets stacked on top of each other and clustered.

In [1]:
import numpy as np
import pandas as pd
from copy import deepcopy
from clustergrammer_widget import *
# Shared Network object; the cells below reuse it to load, cluster and
# display every matrix in this notebook.
net = Network(clustergrammer_widget)

In [2]:
# Import the stats submodule explicitly: a bare `import scipy` is not
# guaranteed to load `scipy.stats`, in which case `scipy.stats.spearmanr`
# raises AttributeError. `import scipy.stats` still binds the name `scipy`.
import scipy.stats

# Short alias used by the correlation loop further down.
spearmanr = scipy.stats.spearmanr

###  Load Labeled and Unlabeled Data
labeled: chik

unlabeled: zika

In [4]:
# Load the labeled (chik) matrix and export it as a DataFrame.
# The same `net` object is reused for both files, so each export must happen
# before the next load_file call.
# NOTE(review): the file names suggest the columns are already z-scored - confirm.
net.load_file('../data_mats/chik_col-zscore.txt')
df_chik = net.export_df()
# Load the unlabeled (zika) matrix the same way.
net.load_file('../data_mats/zika_col-zscore.txt')
df_zika = net.export_df()

In [5]:
df_chik.shape

(82, 37)

In [7]:
df_zika.shape

(147, 41)

In [9]:
chik_cols = df_chik.columns.tolist()

In [10]:
zika_cols = df_zika.columns.tolist()

In [11]:
# find common measurements (intersection)
# Walk chik_cols in order instead of converting a set to a list: set order is
# arbitrary and can change between kernel sessions, which would make the
# column order of every downstream DataFrame non-reproducible.
zika_col_set = set(zika_cols)
common_cols = []
for inst_col in chik_cols:
    # keep each shared column once, in chik column order
    if inst_col in zika_col_set and inst_col not in common_cols:
        common_cols.append(inst_col)

### Get Common Measurements (Columns)

In [12]:
df_chik_comm = df_chik[common_cols]

In [13]:
df_zika_comm = df_zika[common_cols]

In [15]:
df_chik_comm.shape

(82, 33)

In [16]:
df_zika_comm.shape

(147, 33)

### Add row categories 

In [18]:
# Turn every chik row name into a (name, category) tuple so the source
# dataset travels with the row as a category.
new_rows = [(row_name, 'type: chik') for row_name in df_chik_comm.index.tolist()]

# Work on a copy so df_chik_comm keeps its original row names.
df_chik_comm_cat = deepcopy(df_chik_comm)
df_chik_comm_cat.index = new_rows

In [19]:
# Turn every zika row name into a (name, category) tuple so the source
# dataset travels with the row as a category.
new_rows = [(row_name, 'type: zika') for row_name in df_zika_comm.index.tolist()]

# Work on a copy so df_zika_comm keeps its original row names.
df_zika_comm_cat = deepcopy(df_zika_comm)
df_zika_comm_cat.index = new_rows

### Visualize unlabeled and labeled clusters together

In [31]:
# Stack the labeled (chik) rows on top of the unlabeled (zika) rows.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0; the stacked result is the same.
df_merge = pd.concat([df_chik_comm_cat, df_zika_comm_cat])
df_merge.shape

(229, 33)

In [21]:
# Load the stacked chik + zika matrix into the shared Network object.
net.load_df(df_merge)
# Cluster with the enrichrgram option turned off.
net.cluster(enrichrgram=False)
# Last expression of the cell, so whatever widget() returns is rendered as
# the cell output (presumably the interactive heatmap).
net.widget()

# Assign Labels
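Loop through the rows of df_zika_comm and calculate the Spearman correlation of each one to each of the df_chik_comm clusters (rows). The most correlated Chik cluster is taken as the predicted label.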

In [22]:
# Row names of both matrices; the correlation loop iterates over these.
zika_groups = df_zika_comm.index.tolist()
chik_groups = df_chik_comm.index.tolist()

In [23]:
# Transpose so that each group (row) becomes a column and can be pulled out
# by name as a Series over the common measurements.
tdf_zika_comm = df_zika_comm.transpose()
tdf_chik_comm = df_chik_comm.transpose()

### Save Correlations as Series and Pandas DF

In [24]:
# For every unlabeled zika group, compute its Spearman correlation to every
# labeled chik group over the common measurements, keep the full correlation
# profile, and record the best-correlated chik group as the prediction.
corr_list = []      # one Series per zika group, indexed by chik group name
predict_group = {}  # zika group name -> dict with the prediction details
for inst_zika in zika_groups:

    zika_series = tdf_zika_comm[inst_zika]

    # only keep the correlation (first element) from the calculation
    inst_corrs = [
        spearmanr(zika_series, tdf_chik_comm[inst_chik])[0]
        for inst_chik in chik_groups
    ]

    # make series, name is the zika group
    corr_series = pd.Series(name=inst_zika, data=inst_corrs, index=chik_groups)
    corr_list.append(corr_series)

    # sort_values returns a new Series, so the Series stored in corr_list
    # keeps the original chik order
    corr_sort = corr_series.sort_values(ascending=False)

    # get the predicted category from sorting the series.
    # Use .iloc for the first element: plain corr_sort[0] on a label-indexed
    # Series is deprecated positional access, and on an integer index it
    # would be treated as a label lookup instead.
    top_corr = corr_sort.iloc[0]
    predict_cat = corr_sort.index[0]

    # save predicted group as dictionary
    predict_group[inst_zika] = {
        'predict_cat': predict_cat,
        'predict_corr': top_corr,
        'orig_cat': inst_zika,
    }

### Save Zika vs Chik Labels

In [25]:
# Write one "<zika group>: <predicted chik group>" line per zika group,
# sorted by zika group name. The with-block closes the file even if a write
# raises, which the manual open()/close() pair did not guarantee.
with open('../data_mats/unlabeled_vs_predicted.txt', 'w') as f:
    for inst_key in sorted(predict_group.keys()):
        f.write(inst_key + ': ' + predict_group[inst_key]['predict_cat'] + '\n')

# Transfer Chik-Labels to Zika DataFrame


In [75]:
# Build a five-part tuple label for every zika row: row name, virus,
# original category, predicted category and correlation of the best match.
inst_rows = df_zika.index.tolist()
new_rows = []
zika_predict_rows = []
for zika_name in inst_rows:
    prediction = predict_group[zika_name]
    matched_chik = prediction['predict_cat']

    # a category is the name without its last '_'-separated token
    orig_label = 'original: ' + '_'.join(zika_name.split('_')[:-1])
    predict_label = 'predict: ' + '_'.join(matched_chik.split('_')[:-1])
    corr_label = 'correlation: ' + str(prediction['predict_corr'])

    # save predictions as tuple
    # NOTE: the first element is the matched chik row name, not the zika name
    zika_predict_rows.append(
        (matched_chik, 'virus: zika', orig_label, predict_label, corr_label)
    )


In [76]:
df_zika_predict = deepcopy(df_zika)

In [77]:
df_zika_predict.index = zika_predict_rows

In [78]:
df_zika_predict.to_csv('../data_mats/predict_zika.csv')
print(df_zika_predict.shape)

(147, 41)


In [79]:
# Load the zika matrix whose rows now carry the predicted-label tuples.
net.load_df(df_zika_predict)
# Z-score normalize along the column axis.
# NOTE(review): the input file name ('zika_col-zscore.txt') suggests the
# columns were already z-scored - confirm that normalizing again is intended.
net.normalize(axis='col', norm_type='zscore')
# Cluster with the enrichrgram option turned off.
net.cluster(enrichrgram=False)
# Last expression of the cell, so whatever widget() returns is rendered as
# the cell output (presumably the interactive heatmap).
net.widget()

# Generate Chik 'Predicted' Labels for Stacking Chik and Zika
Here I will generate similar tuple-labels for the Chik DataFrame so that I can stack the Chik and Zika DataFrames and visualize them as one heatmap.

In [96]:
# Give every chik row a five-part tuple label: row name, virus, original
# category, predicted category and correlation. Here the predicted category
# is set equal to the original one and the correlation text is fixed to
# 'correlation: 1.0'.
inst_rows = df_chik.index.tolist()
new_rows = []
chik_predict_rows = []
for chik_name in inst_rows:

    # a category is the name without its last '_'-separated token
    category = '_'.join(chik_name.split('_')[:-1])

    # save predictions as tuple
    chik_predict_rows.append((
        chik_name,
        'virus: chik',
        'original: ' + category,
        'predict: ' + category,
        'correlation: 1.0',
    ))

### Chik Predict Comm (keep only common measurements)

In [97]:
df_chik_comm_predict = deepcopy(df_chik_comm)
print(df_chik_comm_predict.shape)

(82, 33)


In [98]:
df_chik_comm_predict.index = chik_predict_rows

### Zika Predict Comm (keep only common measurements)

In [99]:
df_zika_comm_predict = deepcopy(df_zika_comm)
print(df_zika_comm_predict.shape)

(147, 33)


In [100]:
df_zika_comm_predict.index = zika_predict_rows

In [101]:
# Stack the chik rows on top of the zika rows (both restricted to the common
# measurements and carrying tuple labels). pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
df_predict_merge = pd.concat([df_chik_comm_predict, df_zika_comm_predict])

In [102]:
df_predict_merge.shape

(229, 33)

# Stack Chik and Zika with Predicted Labels and Correlations

In [103]:
# Load the stacked chik + zika matrix with the tuple labels.
net.load_df(df_predict_merge)
# Color the 'virus: chik' rows blue and the 'virus: zika' rows red.
# NOTE(review): the second argument (1) presumably selects the first category
# in each row tuple (the 'virus: ...' entry) - confirm against clustergrammer docs.
net.set_cat_color('row', 1, 'virus: chik', 'blue')
net.set_cat_color('row', 1, 'virus: zika', 'red')
# Cluster with the enrichrgram option turned off.
net.cluster(enrichrgram=False)
# Last expression of the cell, so whatever widget() returns is rendered as
# the cell output (presumably the interactive heatmap).
net.widget()

# Visualize Unlabeled vs Labeled Correlation Heatmap

In [104]:
# Each Series in corr_list is named after a zika group and indexed by chik
# group. Concatenating them as columns and transposing gives a matrix with
# one row per zika group and one column per chik group.
inst_df = pd.concat(corr_list,  axis=1).transpose()

In [105]:
inst_df.shape

(147, 82)

In [106]:
# Load the zika-vs-chik correlation matrix.
net.load_df(inst_df)
# NOTE(review): unlike the other cells, cluster() is called with its default
# arguments here (enrichrgram is not explicitly disabled) - confirm intended.
net.cluster()
# Last expression of the cell, so whatever widget() returns is rendered as
# the cell output (presumably the interactive heatmap).
net.widget()

In [50]:
corr_list[0].shape

(82,)

In [32]:
val = [0, 1, 2]
names = ['zero', 'one', 'two']

In [21]:
tmp_s = pd.Series(data=val, index=names)

In [29]:
tmp_s

two     2
one     1
zero    0
dtype: int64

In [27]:
tmp_s.sort_values(ascending=False, inplace=True)

In [28]:
tmp_s

two     2
one     1
zero    0
dtype: int64