# 1.0 Working on Signature Methods

In [1]:
# The star import presumably provides both Network and clustergrammer_widget,
# which are used on the next line — TODO confirm against the package.
from clustergrammer_widget import *
# Shared network object: every visualization cell below loads a DataFrame into it.
net = Network(clustergrammer_widget)
# Registry of the named DataFrames built throughout the notebook (e.g. df['ini']).
df = {}

# cby supplies row_tuple_to_multiindex, used when computing category signatures.
import clustergrammer_groupby as cby
# NOTE(review): gene_exp_10x is not referenced in the visible cells — confirm it is still needed.
import gene_exp_10x

In [2]:
### signature distance calculation requirements
from sklearn.metrics import pairwise_distances
from scipy.stats import ttest_ind 
from sklearn.metrics import f1_score


import pandas as pd
import numpy as np
from copy import deepcopy

import matplotlib.pyplot as plt
%matplotlib inline 

### Load Default Dataset

In [3]:
net.load_file('../data/rc_two_cats.txt')
df['ini'] = net.export_df()
df['ini'].shape

### Drop Gender Category
# Every column label is truncated to its first two entries, which removes the
# trailing category from each column tuple.
df['ini'].columns = [
    tuple(col_label)[:2]
    for col_label in df['ini'].columns.tolist()
]

### Set category colors

In [4]:
# Color assigned to each value of the column category at index 1.
category_colors = [
    ('Category: one', 'red'),
    ('Category: two', 'blue'),
    ('Category: three', 'yellow'),
    ('Category: four', 'black'),
    ('Category: five', 'purple'),
]
for cat_label, cat_color in category_colors:
    net.set_cat_color(axis='col', cat_index=1, cat_name=cat_label, inst_color=cat_color)

In [5]:
# Load the initial matrix into the shared network, cluster it, and render the
# widget (kept as the last expression so the notebook displays it).
net.load_df(df['ini'])
net.cluster()
net.widget()

Widget Javascript not detected.  It may not be installed or enabled properly.


### Calculate Signatures of Categories

In [6]:
# Transpose so samples become rows, then let the helper turn the row tuples
# into a MultiIndex; later cells select on its 'Category' level.
df['ini_mi'] = cby.row_tuple_to_multiindex(df['ini'].transpose())

In [7]:
df['ini_mi'].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,"(Gene: CDK4, Gene Type: Interesting)","(Gene: LMTK3, Gene Type: Not Interesting)","(Gene: LRRK2, Gene Type: Not Interesting)","(Gene: UHMK1, Gene Type: Not Interesting)","(Gene: EGFR, Gene Type: Interesting)","(Gene: STK32A, Gene Type: Interesting)","(Gene: NRK, Gene Type: Interesting)","(Gene: ERBB2, Gene Type: Not Interesting)","(Gene: ERBB4, Gene Type: Not Interesting)","(Gene: AAK1, Gene Type: Not Interesting)",...,"(Gene: ROS1, Gene Type: Interesting)","(Gene: MAP2K4, Gene Type: Interesting)","(Gene: SRC, Gene Type: Interesting)","(Gene: TGFBR1, Gene Type: Interesting)","(Gene: CAMK2B, Gene Type: Not Interesting)","(Gene: STK24, Gene Type: Interesting)","(Gene: DCLK3, Gene Type: Not Interesting)","(Gene: LATS1, Gene Type: Not Interesting)","(Gene: NEK9, Gene Type: Not Interesting)","(Gene: MYLK3, Gene Type: Not Interesting)"
Cell Line,Category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
H1650,two,-0.792804,0.177621,-0.697876,0.850547,1.412416,-0.38804,1.408537,0.906642,-0.452907,3.579052,...,-0.312364,0.119311,-0.294264,-0.000864,-0.276737,-0.318076,-0.670178,-0.695253,-0.337849,-0.368173
H23,two,0.527687,-0.016061,-0.55561,-0.26328,0.018988,-0.592627,-0.017369,-0.684771,-0.392791,0.923308,...,0.701257,0.593671,-0.618072,0.735638,-0.426081,-0.814111,3.224534,4.299877,-0.535266,0.209192
CAL-12T,two,0.000623,5.422114,-0.360498,0.179253,0.902252,-0.244137,-0.367128,0.015261,-0.374174,-0.651094,...,0.475208,0.489153,-0.252534,-0.68029,-0.16016,0.646545,0.14551,-0.175587,0.80316,0.266318
H358,one,0.356723,1.30704,-0.460237,0.398647,-0.178137,0.740365,0.313254,0.160568,-0.527418,0.952744,...,-0.585297,0.841683,-0.786607,0.040926,-0.890033,0.268372,0.107432,-0.061022,0.275911,-0.100657
H1975,two,0.933286,0.355815,-0.680761,1.537664,0.781819,3.023348,-0.162887,0.365002,-0.320103,-0.212733,...,-0.122694,1.064674,-0.228027,0.35933,-0.437405,-9.425121,-1.120493,-0.391646,0.981343,-0.336792


### Collect the Top Differentially Expressed Genes Using a T-test

In [8]:
# Selection settings: set num_top_genes to an integer to keep that many genes
# per category (lowest p-values first); leave it as False to keep every gene
# whose p-value is below pval_cutoff.
num_top_genes = False
pval_cutoff = 0.05

inst_level = 'Category'
cell_types = sorted(set(df['ini_mi'].index.get_level_values(inst_level).tolist()))

all_genes = df['ini_mi'].columns.tolist()
keep_genes_dict = {}

for inst_ct in cell_types:

    # split the samples into "this category" versus "everything else"
    in_group = df['ini_mi'].xs(key=inst_ct, level=inst_level)
    out_group = df['ini_mi'].drop(inst_ct, level=inst_level)

    # one t-test per gene (column); genes are then ordered by ascending p-value
    inst_stats, inst_pvals = ttest_ind(in_group, out_group, axis=0)
    ser_pval = pd.Series(data=inst_pvals, index=all_genes).sort_values()

    ser_pval_keep = (
        ser_pval[ser_pval < pval_cutoff]
        if num_top_genes == False
        else ser_pval[:num_top_genes]
    )

    print(ser_pval_keep)
    keep_genes_dict[inst_ct] = ser_pval_keep.index.tolist()

# union of the genes kept for any category, de-duplicated and sorted
keep_genes = sorted({
    inst_gene
    for inst_genes in keep_genes_dict.values()
    for inst_gene in inst_genes
})

(Gene: NEK9, Gene Type: Not Interesting)    0.000724
(Gene: ULK4, Gene Type: Interesting)        0.002321
(Gene: ROS1, Gene Type: Interesting)        0.010923
(Gene: MAPK11, Gene Type: Interesting)      0.013899
dtype: float64
(Gene: MAPK4, Gene Type: Interesting)         0.008790
(Gene: TGFBR1, Gene Type: Interesting)        0.010232
(Gene: SRPK3, Gene Type: Not Interesting)     0.013685
(Gene: UHMK1, Gene Type: Not Interesting)     0.016737
(Gene: CAMK2B, Gene Type: Not Interesting)    0.025516
(Gene: GRK1, Gene Type: Not Interesting)      0.036758
dtype: float64
(Gene: STK32A, Gene Type: Interesting)     0.000172
(Gene: STK31, Gene Type: Interesting)      0.002281
(Gene: KDR, Gene Type: Not Interesting)    0.006222
(Gene: NRK, Gene Type: Interesting)        0.011511
(Gene: CDK4, Gene Type: Interesting)       0.045237
dtype: float64
(Gene: LRRK2, Gene Type: Not Interesting)    0.002408
dtype: float64
(Gene: PRKCE, Gene Type: Not Interesting)    0.005459
(Gene: STK39, Gene Type: Inter

### Visualize the Average Levels of the Informative Genes

In [9]:
# Average every gene within each category, then transpose so genes are rows
# and categories are columns.
category_means = df['ini_mi'].groupby(level=inst_level).mean()
df['ini_gbm'] = category_means.transpose()

# Relabel each column as a (name, '<level>: <name>') tuple.
df['ini_gbm'].columns = [
    (cat_name, inst_level + ': ' + cat_name)
    for cat_name in df['ini_gbm'].columns.tolist()
]

In [10]:
# Category signature: the per-category averages restricted to the informative
# genes. `.loc` replaces `.ix`, which was removed in pandas 1.0.
df['cat_sig'] = df['ini_gbm'].loc[keep_genes]

# Category Signature

In [11]:
# Load the category signature matrix, cluster it, and render the widget.
net.load_df(df['cat_sig'])
net.cluster()
net.widget()

Widget Javascript not detected.  It may not be installed or enabled properly.


### Visualize Informative Dimensions in Original Data
Generate a matrix with only category-informative dimensions/genes.

In [12]:
# Original data restricted to the informative genes. `.loc` replaces `.ix`,
# which was removed in pandas 1.0.
df['ini_info'] = df['ini'].loc[keep_genes]
df['ini_info'].shape

(19, 29)

In [13]:
# Load the informative-gene subset of the original data, cluster it, and render the widget.
net.load_df(df['ini_info'])
net.cluster()
net.widget()

Widget Javascript not detected.  It may not be installed or enabled properly.


# Predict Category Based on Signature

In [14]:
def predict_cat_from_sig(df_data_ini, df_sig, dist_type='cosine'):
    """Assign every sample (column) of a data matrix to its most similar signature.

    Similarity is computed as ``1 - pairwise_distances(..., metric=dist_type)``
    between each signature column and each sample column, using only the rows
    present in ``df_sig``.

    Parameters
    ----------
    df_data_ini : pandas.DataFrame
        Features as rows, samples as columns. Every row label of ``df_sig``
        must exist in its index. Each column label must be a tuple whose
        second entry has the form ``'<title>: <value>'``; ``<value>`` is
        recorded as the sample's true category.
    df_sig : pandas.DataFrame
        Features as rows, one column per signature. If a column label is a
        tuple, its first entry is used as the predicted category name;
        otherwise the label itself is used.
    dist_type : str, optional
        Metric name forwarded to ``pairwise_distances`` (default ``'cosine'``).

    Returns
    -------
    df_cat : pandas.DataFrame
        The selected rows of ``df_data_ini`` with
        ``'Predict Category: <name>'`` appended to every column tuple.
    df_sim : pandas.DataFrame
        Similarity matrix, signatures as rows and samples as columns.
    df_sim_top : pandas.DataFrame
        Same labels as ``df_sim``; 1 marks the best-matching signature of each
        sample, 0 everywhere else.
    y_info : dict
        ``{'true': [...], 'pred': [...]}`` category names, in sample order.
    """
    print('df_data_ini: ', df_data_ini.shape)

    # restrict the data to the features the signatures are defined on
    # (.loc replaces .ix, which was removed in pandas 1.0)
    keep_rows = df_sig.index.tolist()
    df_data = df_data_ini.loc[keep_rows]
    print('df_data: ', df_data.shape)

    # calculate sim_mat of df_data and df_sig
    cell_types = df_sig.columns.tolist()
    barcodes = df_data.columns.tolist()
    sim_mat = 1 - pairwise_distances(df_sig.transpose(), df_data.transpose(), metric=dist_type)
    df_sim = pd.DataFrame(data=sim_mat, index=cell_types, columns=barcodes).transpose()
    print(df_sim.shape)

    # position of the best-matching signature for every sample; NaN
    # similarities are ranked last and ties go to the first signature column
    sim_values = df_sim.values
    ranked = np.where(np.isnan(sim_values), -np.inf, sim_values)
    top_idx = ranked.argmax(axis=1)
    top_list = [cell_types[inst_idx] for inst_idx in top_idx]

    # make binary matrix marking the top cell type identified for each sample
    one_hot = np.zeros(sim_values.shape)
    one_hot[np.arange(len(top_idx)), top_idx] = 1
    df_sim_top = pd.DataFrame(data=one_hot, index=df_sim.index, columns=df_sim.columns)

    y_info = {}
    y_info['true'] = []
    y_info['pred'] = []

    # add predicted cell type category to the column labels of the input data
    new_cols = []
    for inst_barcode, found_ct in zip(barcodes, top_list):
        # tuple labels carry the category name first; plain labels are used
        # whole (indexing a string with [0] would keep only its first character)
        pred_name = found_ct[0] if isinstance(found_ct, tuple) else found_ct

        inst_col = list(inst_barcode)
        inst_col.append('Predict Category: ' + pred_name)
        new_cols.append(tuple(inst_col))

        # store true and predicted lists; split once so a category value that
        # itself contains ': ' is kept whole
        y_info['true'].append(inst_col[1].split(': ', 1)[1])
        y_info['pred'].append(pred_name)

    # work on a copy so the caller's frame keeps its original column labels
    df_cat = df_data.copy()
    df_cat.columns = new_cols

    return df_cat, df_sim.transpose(), df_sim_top.transpose(), y_info


### Run prediction function

In [15]:
# Predict a category for every column of the initial matrix from the category
# signature; the three returned frames are stored in df and the true/predicted
# label lists are kept in y_info for scoring.
df['pred_cat'], df['sig_sim'], df['sig_max'], y_info = predict_cat_from_sig(df['ini'], df['cat_sig'])

df_data_ini:  (38, 29)
df_data:  (19, 29)
(29, 5)


In [16]:
net.load_df(df['pred_cat'])

# Color assigned to each value of the column category at index 2
# (the predicted category appended by predict_cat_from_sig).
predicted_colors = [
    ('Predict Category: one', 'red'),
    ('Predict Category: two', 'blue'),
    ('Predict Category: three', 'yellow'),
    ('Predict Category: four', 'black'),
    ('Predict Category: five', 'purple'),
]
for pred_label, pred_color in predicted_colors:
    net.set_cat_color(axis='col', cat_index=2, cat_name=pred_label, inst_color=pred_color)

In [17]:
# Load the data whose column labels carry the predicted category, cluster it, and render the widget.
net.load_df(df['pred_cat'])
net.cluster()
net.widget()

Widget Javascript not detected.  It may not be installed or enabled properly.


In [18]:
# Load the signature-vs-sample similarity matrix, cluster it, and render the widget.
net.load_df(df['sig_sim'])
net.cluster()
net.widget()

Widget Javascript not detected.  It may not be installed or enabled properly.


In [19]:
# Load the binary matrix marking each sample's top-matching signature, cluster it, and render the widget.
net.load_df(df['sig_max'])
net.cluster()
net.widget()

Widget Javascript not detected.  It may not be installed or enabled properly.


### F1 Scores

In [20]:
# Report the F1 score of the predictions under each averaging scheme, one per line.
for averaging in ('macro', 'micro', 'weighted'):
    print(f1_score(y_info['true'], y_info['pred'], average=averaging))

0.825734265734
0.827586206897
0.820352061731
