---
title: Debugging Borzoi Personalized Predictions Test
author: Sabrina Mi
date: 10/16/2023
execute:
  code-fold: true
---

Use human gene ENSG00000161011 to compute CAGE tracks for personalized genomes.

In [1]:
#| include: false
# Borzoi Setup
prefix = '/home/s1mi/borzoi_tutorial/'

import json
import os
import time
import warnings

import h5py
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import pandas as pd
import pysam
import pyfaidx
import tensorflow as tf

from baskerville import seqnn
from baskerville import gene as bgene
from baskerville import dna

from borzoi_helpers import *

params_file = prefix + 'params_pred.json'
targets_file = prefix + 'targets_human.txt' #Subset of targets_human.txt

seq_len = 524288
n_folds = 1       #To use only one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'.
rc = True         #Average across reverse-complement prediction

#Read model parameters

with open(params_file) as params_open :
    
    params = json.load(params_open)
    
    params_model = params['model']
    params_train = params['train']

#Read targets

targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
target_index = targets_df.index

#Create local index of strand_pair (relative to sliced targets)
if rc :
    strand_pair = targets_df.strand_pair
    
    target_slice_dict = {ix : i for i, ix in enumerate(target_index.values.tolist())}
    slice_pair = np.array([
        target_slice_dict[ix] if ix in target_slice_dict else ix for ix in strand_pair.values.tolist()
    ], dtype='int32')

#Initialize model ensemble

models = []
for fold_ix in range(n_folds) :
    
    model_file = prefix + "saved_models/f" + str(fold_ix) + "/model0_best.h5"

    seqnn_model = seqnn.SeqNN(params_model)
    seqnn_model.restore(model_file, 0)
    seqnn_model.build_slice(target_index)
    if rc :
        seqnn_model.strand_pair.append(slice_pair)
    seqnn_model.build_ensemble(rc, '0')
    
    models.append(seqnn_model)

fasta_open = pysam.Fastafile(prefix + 'hg38.fa')
transcriptome = bgene.Transcriptome(prefix + 'gencode41_basic_nort.gtf')



2023-10-17 03:13:50.091055: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-17 03:13:57.864020: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NOT_INITIALIZED: initialization error
2023-10-17 03:13:57.864167: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: polaris-login-01
2023-10-17 03:13:57.864189: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: polaris-login-01
2023-10-17 03:13:57.864601: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 470.103.4
2023-10-17 03:13:57.864664: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnosti



In [36]:
search_gene = 'ENSG00000161011'
chrom = 'chr5'
gene_start = 179820905
gene_end = 179838078
tss = gene_start
center = (gene_start + gene_end) // 2
start = center - seq_len // 2
end = center + seq_len // 2
individuals = ['NA20521', 'NA18934']

In [20]:
gene_keys = [gene_key for gene_key in transcriptome.genes.keys() if search_gene in gene_key]

gene = transcriptome.genes[gene_keys[0]]

#Determine output sequence start
seq_out_start = start + seqnn_model.model_strides[0]*seqnn_model.target_crops[0]
seq_out_len = seqnn_model.model_strides[0]*seqnn_model.target_lengths[0]

#Determine output positions of gene exons
gene_slice = gene.output_slice(seq_out_start, seq_out_len, seqnn_model.model_strides[0], False)

In [41]:
def find_variants_in_vcf_file(cyvcf2_object, interval_object, samples, mode="phased"):
    start = max(interval_object['start'], 0)
    query = f"{interval_object['chr']}:{start}-{interval_object['end']}"
    variants_dictionary = {}
    variants_dictionary['chr'] = interval_object['chr']
    variants_dictionary['positions'] = tuple(variant.POS for variant in cyvcf2_object(query))
    if mode == 'phased':
        delim = '|'
    elif mode == 'unphased':
        delim = '/'
    for i, sample in enumerate(samples):
        if sample in cyvcf2_object.samples:
            variants_dictionary[sample] = tuple([variant.genotypes[i][0:2], variant.gt_bases[i].split(delim)] for variant in cyvcf2_object(query))
    return variants_dictionary

def predict_on_sequence(models, sample_input):
    prediction_output = {}
    for haplotype, sequence_encoding in sample_input.items():
        prediction = predict_tracks(models, sequence_encoding)
        prediction_output[haplotype] = prediction
    return prediction_output



In [20]:
import cyvcf2
target_interval = {'chr': chrom, 'start': start, 'end': end}
vcf_dir = "/grand/TFXcan/imlab/data/1000G/vcf_snps_only"
path_to_vcf = os.path.join(vcf_dir, f"ALL.{chrom}.shapeit2_integrated_SNPs_v2a_27022019.GRCh38.phased.vcf.gz")
vcf_chr = cyvcf2.cyvcf2.VCF(path_to_vcf, samples=individuals)
variants_array = find_variants_in_vcf_file(vcf_chr, target_interval, individuals, mode="phased")

In [43]:
def mutate_sequence(sequence_one_hot, start, poses, alts):
    
    #Induce mutation(s)
    sequence_one_hot_mut = np.copy(sequence_one_hot)

    for pos, alt in zip(poses, alts) :
        alt_ix = -1
        if alt == 'A' :
            alt_ix = 0
        elif alt == 'C' :
            alt_ix = 1
        elif alt == 'G' :
            alt_ix = 2
        elif alt == 'T' :
            alt_ix = 3

        sequence_one_hot_mut[pos-start-1] = 0.
        sequence_one_hot_mut[pos-start-1, alt_ix] = 1.
    return sequence_one_hot_mut

def replace_variants_in_reference_sequence(variants_array, individuals):
    poses = variants_array['positions']
    variant_encoded = {}
    for individual in individuals:
        alts_1 = [variants_array[individual][i][1][0] for i in range(len(poses))]
        alts_2 = [variants_array[individual][i][1][1] for i in range(len(poses))]
        sequence_one_hot = process_sequence(fasta_open, chrom, gene_start, gene_end)
        haplotype1_encoded = mutate_sequence(sequence_one_hot, gene_start, poses, alts_1)
        haplotype2_encoded = mutate_sequence(sequence_one_hot, gene_start, poses, alts_2)
        variant_encoded[individual] = {'haplotype1': haplotype1_encoded, 'haplotype2': haplotype2_encoded}
    return variant_encoded

In [44]:
samples_variants_encoded = replace_variants_in_reference_sequence(variants_array, individuals)

In [48]:
for individual in individuals:
    sample_input = samples_variants_encoded[individual]
    sample_predictions = predict_on_sequence(models, sample_input)
    with h5py.File(f'{individual}/test.h5', "w") as hf:
        for hap in sample_predictions.keys():
            hf[hap]= np.squeeze(sample_predictions[hap], axis=0)

In [46]:
import os
import h5py
old_predictions = {}
new_predictions = {}
for individual in individuals:
    predictions_file = os.path.join("/grand/TFXcan/imlab/users/sabrina/borzoi-personalized-test", individual, f'chr5_179567347_180091635_predictions.h5')
    with h5py.File(predictions_file, "r") as hf:
        haplo1 = hf['haplotype1'][:]
        haplo2 = hf['haplotype2'][:]
        print(haplo1.shape)
    old_predictions[individual] = {'haplotype1': haplo1, 'haplotype2': haplo2}
    with h5py.File(os.path.join(individual, 'test.h5'), 'r') as hf:
        haplo1 = hf['haplotype1'][:]
        haplo2 = hf['haplotype2'][:]
        print(haplo1.shape)
    new_predictions[individual] = {'haplotype1': haplo1, 'haplotype2': haplo2}
    

(4, 16352, 7611)
(1, 16352, 7611)
(4, 16352, 7611)
(1, 16352, 7611)


In [66]:
old_CAGE_predictions = []
new_CAGE_predictions = []
tss_bin = (tss - seq_out_start) // 32
for individual in individuals:
    old_CAGE_haplo1 = np.mean(old_predictions[individual]['haplotype1'][:, tss_bin-1:tss_bin + 2, [870,871]])
    old_CAGE_haplo2 = np.mean(old_predictions[individual]['haplotype2'][:, tss_bin-1:tss_bin + 2, [870,871]])
    old_CAGE = (old_CAGE_haplo1 + old_CAGE_haplo2) / 2
    new_CAGE_haplo1 = np.mean(new_predictions[individual]['haplotype1'][:, tss_bin-1:tss_bin + 2, [870,871]])
    new_CAGE_haplo2 = np.mean(new_predictions[individual]['haplotype2'][:, tss_bin-1:tss_bin + 2, [870,871]])
    new_CAGE = (new_CAGE_haplo1 + new_CAGE_haplo2) / 2    
    old_CAGE_predictions.append(old_CAGE)
    new_CAGE_predictions.append(new_CAGE)
    


In [59]:
print(old_CAGE_predictions)
print(new_CAGE_predictions)

[41.0625, 41.0]
[42.78125, 42.78125]


In [33]:
sequence_one_hot = process_sequence(fasta_open, chrom, gene_start, gene_end)
reference_prediction = predict_tracks(models, sequence_one_hot)

In [48]:
reference_CAGE = np.mean(reference_prediction[..., tss_bin-1:tss_bin + 2, [870,871]])
print(reference_CAGE)

42.8


Since it looks like the new predictions are closer to reference, it could mean our bug was how we indexed variants into the reference sequence.