# Compute metrics across peak subsets

In [78]:
import sys
# append paths pointing to Data directory on your machine
sys.path.append('/home/katie/bp_repo/multitask_profile_model_SPI_GATA')

import pickle
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py

import profile_models
import profile_performance
from profile_performance import load_metrics, compute_performance_metrics, log_performance_metrics
from all_functions import save_preds_by_subset

sys.path.append('/home/katie/bp_repo/reports/')
from util import import_profiles

In [46]:
# Configuration: filesystem roots (machine-specific absolute paths).
subsets_path_root = '/home/katie/bp_repo/reports/katie_notebooks/round2_tasks_mar2022/TASK_1/'
model_path_root = '/home/katie/bp_repo/pipeline_outputs/'
save_path_root = '/home/katie/bp_repo/reports/katie_notebooks/summer2022_analysis/preds_by_subset/'

# One saved state dict per pipeline run directory under model_path_root.
model_paths = [
    f'{model_path_root}{stem}/model.state_dict'
    for stem in ('ctcf_chipseq_jul6', 'ctcf_cutnrun_jul6', 'fosl2_chipseq_jul6', 'fosl2_cutnrun_jul6')
]

# Controls are switched on only for runs whose path contains 'chipseq'.
models = [
    profile_models.ModelLoader(model_path, controls='chipseq' in model_path, num_tasks=1).load_model()
    for model_path in model_paths
]

# Per-model metadata; every list below is index-aligned with `models`.
tfs = ['CTCF', 'CTCF', 'FOSL2', 'FOSL2']
assays = ['chip-seq', 'cutnrun', 'chip-seq', 'cutnrun']
controls = [assay == 'chip-seq' for assay in assays]
save_paths = [f'{save_path_root}{tf}/{assay}_model/' for tf, assay in zip(tfs, assays)]
peak_subsets = ['unique', 'shared']

In [None]:
# Run save_preds_by_subset once per (model, peak subset) pair, restricted to
# the 'test' chromosome subset. The metadata lists are index-aligned with
# `models`, so they are walked in lockstep.
for model, tf, assay, control, save_path in zip(models, tfs, assays, controls, save_paths):
    for peak_subset in peak_subsets:
        save_preds_by_subset(model, tf, assay, control, save_path, peak_subset, chrom_subset='test')

In [31]:
# Each pipeline run's metrics.pkl paired with the row label to show for it.
pipeline_metric_files = [
    ('/home/katie/bp_repo/pipeline_outputs/ctcf_chipseq_jul6/metrics.pkl', 'CTCF ChIP-seq'),
    ('/home/katie/bp_repo/pipeline_outputs/ctcf_cutnrun_jul6/metrics.pkl', 'CTCF CUT&RUN'),
    ('/home/katie/bp_repo/pipeline_outputs/fosl2_chipseq_jul6/metrics.pkl', 'FOSL2 ChIP-seq'),
    ('/home/katie/bp_repo/pipeline_outputs/fosl2_cutnrun_jul6/metrics.pkl', 'FOSL2 CUT&RUN'),
]

# load_metrics takes the paths and the labels as two parallel lists.
load_metrics([pkl_path for pkl_path, _ in pipeline_metric_files],
             [label for _, label in pipeline_metric_files])

Unnamed: 0,Test profile NLL,Test profile cross entropy,Test profile JSD,Test profile Pearson,Test profile Spearman,Test profile MSE,Test count Pearson,Test count Spearman,Test count MSE
CTCF ChIP-seq,200.276242,5.829124,0.138512,0.768911,0.581777,2e-06,0.507791,0.502402,0.398164
CTCF CUT&RUN,176.613637,5.101787,0.360888,0.259449,0.25568,1.9e-05,0.568966,0.571402,0.669476
FOSL2 ChIP-seq,277.316059,6.509193,0.117633,0.617717,0.527766,1e-06,0.600541,0.594788,0.32285
FOSL2 CUT&RUN,79.438789,5.034164,0.420013,0.222759,0.240627,1.9e-05,0.154552,0.140001,1.34379


In [97]:
# Load the saved predictions for every (model, peak-subset) pair and keep only
# the rows that fall on the test chromosome.
#
# Result layout:
#   all_preds_dict[path_id][key] -> dict with
#       one entry per name in data_ids   (rows restricted to the test chromosome)
#       'coords'        all coordinates returned by import_profiles(...)[2]
#       'coords_test'   DataFrame of the test-chromosome coordinates
#       'indexes_test'  row positions of those coordinates in the .preds file
all_preds_dict = {}

path_ids = ['CTCF chip-seq model', 'CTCF cutnrun model', 'FOSL2 chip-seq model', 'FOSL2 cutnrun model']

keys = ['shared_chip_loader',
 'shared_cut_loader',
 'unique_chip_loader',
 'unique_cut_loader']

data_ids = ['true_counts', 'log_pred_counts', 'true_profs', 'log_pred_profs']

# Chromosome treated as the test set.
test_chrom = 'chr1'

# path_ids is index-aligned with save_paths (one entry per model).
for path_id, path in zip(path_ids, save_paths):
    preds_dict = {}

    for key in keys:
        preds_file = f'{path}{key}.preds'
        coords = import_profiles(preds_file)[2]

        # After reset_index(), column 0 is the original row position ('index')
        # and column 1 is the first coords column, which is compared against
        # the chromosome name.
        df = pd.DataFrame(coords).reset_index()
        coords_test = df[df.iloc[:, 1] == test_chrom]
        indexes_test = list(coords_test['index'])

        # Indexing an h5py dataset copies the selected rows into memory, so
        # nothing refers to the file once the block exits and the handle can be
        # closed immediately instead of staying open for the whole session.
        with h5py.File(preds_file, 'r') as file:
            predictions = file['predictions']
            subset = {data_id: predictions[data_id][indexes_test] for data_id in data_ids}

        subset['coords'] = coords
        subset['coords_test'] = coords_test
        subset['indexes_test'] = indexes_test
        preds_dict[key] = subset

    all_preds_dict[path_id] = preds_dict

Importing predictions:   0%|          | 0/1 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/5 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/1 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/5 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/1 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/5 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/1 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/5 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/2 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/4 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/2 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/4 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/2 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/4 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/2 [00:00<?, ?it/s]

Importing predictions:   0%|          | 0/4 [00:00<?, ?it/s]

## Save metrics (pickling)

In [106]:
# save metrics
# only need to run once
#
# For every (model, peak-subset) pair: compute the performance metrics from the
# test-chromosome arrays in all_preds_dict, pass them through
# log_performance_metrics, and pickle that call's return value to
# '<save_path><key>_metrics.pkl'.
for path_id, save_path in zip(path_ids, save_paths):
    for key in keys:
        subset_preds = all_preds_dict[path_id][key]
        metrics = profile_performance.compute_performance_metrics(
            subset_preds['true_profs'], subset_preds['log_pred_profs'],
            subset_preds['true_counts'], subset_preds['log_pred_counts'],
            prof_smooth_kernel_sigma=7, prof_smooth_kernel_width=81, print_updates=False
        )
        logged_metrics = profile_performance.log_performance_metrics(metrics)

        # The context manager flushes and closes the pickle file right away
        # rather than leaving the handle for the garbage collector.
        with open(save_path + key + '_metrics.pkl', 'wb') as metrics_file:
            pickle.dump(logged_metrics, metrics_file)

	Test profile NLL: 118.424368
	Test profile cross entropy: 6.022300
	Test profile JSD: 0.217622
	Test profile Pearson: 0.637231
	Test profile Spearman: 0.501469
	Test profile MSE: 0.000004
	Test count Pearson: 0.194651
	Test count Spearman: 0.165882
	Test count MSE: 0.277827
	Test profile NLL: 67.967874
	Test profile cross entropy: 6.799979
	Test profile JSD: 0.397298
	Test profile Pearson: 0.161837
	Test profile Spearman: 0.193510
	Test profile MSE: 0.000009
	Test count Pearson: 0.175710
	Test count Spearman: 0.195075
	Test count MSE: 1.369871
	Test profile NLL: 118.424368
	Test profile cross entropy: 6.022300
	Test profile JSD: 0.217622
	Test profile Pearson: 0.637231
	Test profile Spearman: 0.501469
	Test profile MSE: 0.000004
	Test count Pearson: 0.194651
	Test count Spearman: 0.165882
	Test count MSE: 0.277827
	Test profile NLL: 67.967874
	Test profile cross entropy: 6.799979
	Test profile JSD: 0.397298
	Test profile Pearson: 0.161837
	Test profile Spearman: 0.193510
	Test profile

In [155]:
# Sanity check: how many test-chromosome rows were kept for the CTCF cutnrun
# model on the 'shared_cut_loader' subset.
len(all_preds_dict['CTCF cutnrun model']['shared_cut_loader']['indexes_test'])

4100

In [156]:
# Sanity check: how many test-chromosome rows were kept for the CTCF cutnrun
# model on the 'unique_cut_loader' subset.
# NOTE(review): the recorded output (4100) equals the 'shared_cut_loader'
# count — confirm that the shared and unique .preds files really hold
# different peak sets.
len(all_preds_dict['CTCF cutnrun model']['unique_cut_loader']['indexes_test'])

4100

In [148]:
# Shape of the test-only true_counts array; its first axis is the number of
# rows selected via 'indexes_test'.
all_preds_dict['CTCF cutnrun model']['shared_cut_loader']['true_counts'].shape

(4100, 1, 2)

## Load metrics into a pretty dataframe :]

In [119]:
# Build two parallel lists: the per-subset metrics pickle paths and a
# human-readable row label for each one.
pickles, ids = [], []
for path_id, save_path in zip(path_ids, save_paths):
    for key in keys:
        # Drop the trailing '_loader' token: 'shared_chip_loader' -> 'shared chip'.
        subset_label = ' '.join(key.split('_')[:-1])
        pickles.append(f'{save_path}{key}_metrics.pkl')
        ids.append(f'{path_id} on {subset_label} peaks')

In [125]:
# Load every per-subset metrics pickle into one table; `ids` supplies the row
# labels (see the recorded output of the cells below).
metrics = load_metrics(pickles, ids)

In [131]:
# Count-correlation columns for every row belonging to the FOSL2 cutnrun model.
# NOTE(review): in the recorded output the 'shared' and 'unique' rows are
# identical for each peak type — verify the underlying subset files differ.
metrics.loc[
    [row_label for row_label in metrics.index if 'FOSL2 cutnrun' in row_label],
    ['Test count Pearson', 'Test count Spearman'],
]

Unnamed: 0,Test count Pearson,Test count Spearman
FOSL2 cutnrun model on shared chip peaks,-0.006213,-0.018478
FOSL2 cutnrun model on shared cut peaks,0.211839,0.209289
FOSL2 cutnrun model on unique chip peaks,-0.006213,-0.018478
FOSL2 cutnrun model on unique cut peaks,0.211839,0.209289


In [132]:
# Display the full metrics table (one row per model / peak-subset pair).
metrics

Unnamed: 0,Test profile NLL,Test profile cross entropy,Test profile JSD,Test profile Pearson,Test profile Spearman,Test profile MSE,Test count Pearson,Test count Spearman,Test count MSE
CTCF chip-seq model on shared chip peaks,118.424368,6.0223,0.217622,0.637231,0.501469,4e-06,0.194651,0.165882,0.277827
CTCF chip-seq model on shared cut peaks,67.967874,6.799979,0.397298,0.161837,0.19351,9e-06,0.17571,0.195075,1.369871
CTCF chip-seq model on unique chip peaks,118.424368,6.0223,0.217622,0.637231,0.501469,4e-06,0.194651,0.165882,0.277827
CTCF chip-seq model on unique cut peaks,67.967874,6.799979,0.397298,0.161837,0.19351,9e-06,0.17571,0.195075,1.369871
CTCF cutnrun model on shared chip peaks,58.504822,5.549674,0.442124,0.178287,0.173227,2.1e-05,0.115782,0.141856,2.050756
CTCF cutnrun model on shared cut peaks,117.197684,5.478706,0.398888,0.172835,0.143052,1.3e-05,0.129719,0.1217,0.471928
CTCF cutnrun model on unique chip peaks,58.504822,5.549674,0.442124,0.178287,0.173227,2.1e-05,0.115782,0.141856,2.050756
CTCF cutnrun model on unique cut peaks,117.197684,5.478706,0.398888,0.172835,0.143052,1.3e-05,0.129719,0.1217,0.471928
FOSL2 chip-seq model on shared chip peaks,183.829077,6.562229,0.160945,0.527547,0.44749,2e-06,0.008305,0.0,0.180043
FOSL2 chip-seq model on shared cut peaks,139.039445,6.850489,0.225605,0.249811,0.271661,2e-06,0.240208,0.256843,0.464291
