---
title: Code Snippets for Borzoi Personalized Prediction
author: Sabrina Mi
date: 9/27/2023
execute:
  include: false
---

## Step 1: VCF to One-hot Encoding

In [127]:
import pandas as pd
import numpy as np
import cyvcf2
import pysam
import h5py
import os
from borzoi_helpers import process_sequence, predict_tracks

In [2]:
# Read the gene list file into a list with one entry per line.
with open(
    "/home/s1mi/Github/deep-learning-in-genomics/posts/2023-08-31-Br-personalized-prediction-on-more-genes/gene_list.txt",
    "r",
) as file:
    rn7_gene_list = file.read().splitlines()

# Tab-separated ortholog table, indexed by its "ensembl_gene_id" column
# (judging by the file name, presumably rn7 -> hg38 ortholog pairs).
rn7_hg38_ortho = pd.read_csv(
    "/home/s1mi/enformer_rat_data/annotation/rn7_hg38.ortholog_genes.txt",
    index_col="ensembl_gene_id",
    sep="\t",
)

# Tab-separated hg38 gene annotation table; kept with its default integer index.
hg38_annot = pd.read_csv(
    "/home/s1mi/enformer_rat_data/annotation/hg38.gene.txt",
    sep="\t",
)

In [3]:
# Keep only the genes that have a row in the ortholog table. The gene list is
# filtered in file order (dict.fromkeys drops duplicates while preserving
# order) instead of going through a set: set iteration order for strings
# changes between interpreter sessions, which would make positional lookups
# such as gene_df.iloc[0] select a different gene on every kernel restart.
ortho_genes = [gene for gene in dict.fromkeys(rn7_gene_list) if gene in rn7_hg38_ortho.index]
## convert to hg38
gene_df = rn7_hg38_ortho.loc[ortho_genes]
## annotate hg38 genes
# The merge is on columns, so the left index is discarded; the
# "ensembl_gene_id" column selected below comes from hg38_annot.
gene_df = gene_df.merge(hg38_annot, left_on="hsapiens_homolog_ensembl_gene", right_on="ensembl_gene_id", how="inner")
gene_df = gene_df[["ensembl_gene_id", "chromosome_name", "transcript_start", "transcript_end"]]

In [39]:
def find_variants_in_vcf_file(cyvcf2_object, interval_object, samples, mode="phased"):
    """Collect the variants in an interval and the genotypes of the requested samples.

    Parameters
    ----------
    cyvcf2_object : callable VCF reader
        Called with a ``"chr:start-end"`` region string. It must yield records
        exposing ``POS``, ``genotypes`` and ``gt_bases`` and carry a
        ``samples`` list (the interface used from ``cyvcf2.VCF``).
    interval_object : dict
        Mapping with ``'chr'``, ``'start'`` and ``'end'`` keys. A negative
        start is clamped to 0 when building the region string.
    samples : iterable of str
        Sample names to extract. Names that are not in
        ``cyvcf2_object.samples`` are skipped and get no key in the result.
    mode : {"phased", "unphased"}
        Selects the separator used to split each ``gt_bases`` entry into
        alleles (``'|'`` for phased, ``'/'`` for unphased).

    Returns
    -------
    dict
        ``'chr'`` (chromosome name), ``'positions'`` (tuple of ``POS``
        values) and, for every sample found, a tuple holding one
        ``[genotype[0:2], alleles]`` entry per variant, in the same order as
        ``'positions'``.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"phased"`` nor ``"unphased"``.
    """
    delimiters = {'phased': '|', 'unphased': '/'}
    if mode not in delimiters:
        raise ValueError(f"mode must be 'phased' or 'unphased', got {mode!r}")
    delim = delimiters[mode]

    start = max(interval_object['start'], 0)
    query = f"{interval_object['chr']}:{start}-{interval_object['end']}"

    # Per-variant genotype arrays are laid out in the reader's own sample
    # order, which need not match the order of the requested `samples`, so
    # each sample's column is looked up by name rather than by its position
    # in `samples`.
    vcf_samples = list(cyvcf2_object.samples)
    sample_columns = {
        sample: vcf_samples.index(sample)
        for sample in samples
        if sample in vcf_samples
    }

    # Single pass over the region: positions and every sample's calls are
    # gathered together instead of re-querying the file once per sample.
    positions = []
    calls = {sample: [] for sample in sample_columns}
    for variant in cyvcf2_object(query):
        positions.append(variant.POS)
        genotypes = variant.genotypes
        gt_bases = variant.gt_bases
        for sample, column in sample_columns.items():
            calls[sample].append([genotypes[column][0:2], gt_bases[column].split(delim)])

    variants_dictionary = {}
    variants_dictionary['chr'] = interval_object['chr']
    variants_dictionary['positions'] = tuple(positions)
    for sample, sample_calls in calls.items():
        variants_dictionary[sample] = tuple(sample_calls)
    return variants_dictionary

In [54]:
def create_mapping_dictionary(variants_array, samples, interval_start):
    """Turn per-sample allele calls into one-hot rows keyed by haplotype.

    Parameters
    ----------
    variants_array : dict
        Must hold a ``'positions'`` sequence and, for every name in
        ``samples``, a sequence of entries whose element ``[1]`` is the pair
        of allele letters for that variant.
    samples : iterable of str
        Sample names to encode; each must be a key of ``variants_array``.
    interval_start : int
        Value subtracted from every position.

    Returns
    -------
    dict
        ``'positions'``: tuple of ``position - interval_start`` values, plus
        one entry per sample of the form
        ``{'haplotype1': (...), 'haplotype2': (...)}`` where each tuple holds
        a length-4 float32 one-hot array (A, C, G, T order) per variant.

    Alleles other than ``A``, ``C``, ``G`` or ``T`` raise ``KeyError``.

    NOTE(review): VCF positions are 1-based, so whether
    ``position - interval_start`` is the correct array index depends on the
    coordinate convention of ``interval_start`` -- confirm against how the
    reference sequence is fetched.
    """
    import numpy as np

    # One float32 one-hot vector per base, columns in A, C, G, T order.
    base_to_one_hot = {}
    for column, base in enumerate('ACGT'):
        encoding = np.zeros(4, dtype=np.float32)
        encoding[column] = 1
        base_to_one_hot[base] = encoding

    samples_haplotype_map = {
        'positions': tuple(position - interval_start for position in variants_array['positions'])
    }
    for sample in samples:
        # Element [1] of every call is the (allele1, allele2) pair.
        allele_pairs = [call[1] for call in variants_array[sample]]
        first_alleles = tuple(base_to_one_hot[pair[0]] for pair in allele_pairs)
        second_alleles = tuple(base_to_one_hot[pair[1]] for pair in allele_pairs)
        samples_haplotype_map[sample] = {
            'haplotype1': first_alleles,
            'haplotype2': second_alleles,
        }
    return samples_haplotype_map


In [83]:
def resize(region, seq_len=524288):
    """Return a window of exactly ``seq_len`` bases centred on ``region``.

    Parameters
    ----------
    region : dict
        Mapping with ``'chr'``, ``'start'`` and ``'end'`` keys.
    seq_len : int
        Length of the returned window (``end - start``).

    Returns
    -------
    dict
        New ``{'chr', 'start', 'end'}`` mapping; ``region`` is not modified.
        ``start`` is not clipped and can be negative when the region lies
        closer than ``seq_len // 2`` to coordinate 0.
    """
    center_bp = (region['end'] + region['start']) // 2
    start = center_bp - seq_len // 2
    # Derive the end from the start so the window is exactly seq_len long even
    # when seq_len is odd (center + seq_len // 2 would come up one base short).
    end = start + seq_len
    return {"chr": region['chr'], "start": start, "end": end}

In [96]:
def replace_variants_in_reference_sequence(query_sequences_encoded, mapping_dict, samples):
    """Build per-sample haplotype encodings by overwriting reference rows with variant alleles.

    Parameters
    ----------
    query_sequences_encoded : array-like
        Encoded reference sequence, indexed by position along its first axis.
        It is copied for every haplotype and never modified.
    mapping_dict : dict
        Must hold ``'positions'`` (row offsets into the reference) and, for
        every sample, ``{'haplotype1': ..., 'haplotype2': ...}`` sequences
        aligned with ``'positions'``.
    samples : iterable of str
        Sample names to encode; each must be a key of ``mapping_dict``.

    Returns
    -------
    dict
        ``{sample: {'haplotype1': array, 'haplotype2': array}}``.

    Variants whose offset falls outside ``[0, len(reference))`` are ignored.
    Without that check a negative offset would silently wrap around and
    overwrite a row at the end of the sequence, and an offset equal to the
    sequence length would raise ``IndexError``.
    """
    import numpy as np

    positions = mapping_dict['positions']
    sequence_length = len(query_sequences_encoded)
    # (variant index, row offset) pairs that actually land inside the sequence.
    in_window = [
        (i, position)
        for i, position in enumerate(positions)
        if 0 <= position < sequence_length
    ]

    variant_encoded = {}
    for sample in samples:
        haplotypes = {}
        for haplotype in ("haplotype1", "haplotype2"):
            encoded = np.copy(query_sequences_encoded)
            alleles = mapping_dict[sample][haplotype]
            for i, position in in_window:
                encoded[position] = alleles[i]
            haplotypes[haplotype] = encoded
        variant_encoded[sample] = haplotypes
    return variant_encoded


In [111]:
def get_model(model_dir, n_folds=4, rc=True):
    """Load the model folds stored under ``model_dir``.

    Parameters
    ----------
    model_dir : str
        Directory containing ``params_pred.json``, ``targets_human.txt`` and
        ``saved_models/f<fold>/model0_best.h5`` for every fold to load.
    n_folds : int
        Number of folds to load, starting at fold 0. Use 1 to load a single
        model.
    rc : bool
        Passed to ``build_ensemble``; per the original cell, averages across
        the reverse-complement prediction. When true, the strand-pair index
        derived from the targets table is attached to every model.

    Returns
    -------
    list
        One restored ``seqnn.SeqNN`` object per fold.
    """
    # Imports stay function-local so the function is self-contained when it
    # is copied out of the notebook.
    import json
    import os

    import numpy as np
    import pandas as pd
    from baskerville import seqnn

    params_file = os.path.join(model_dir, 'params_pred.json')
    targets_file = os.path.join(model_dir, 'targets_human.txt')

    # Read model parameters
    with open(params_file) as params_open:
        params = json.load(params_open)
    params_model = params['model']

    # Read targets
    targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')
    target_index = targets_df.index

    # Create local index of strand_pair (relative to sliced targets). Entries
    # of strand_pair that are not in the target index are kept unchanged.
    slice_pair = None
    if rc:
        strand_pair = targets_df.strand_pair
        target_slice_dict = {ix: i for i, ix in enumerate(target_index.values.tolist())}
        slice_pair = np.array(
            [target_slice_dict.get(ix, ix) for ix in strand_pair.values.tolist()],
            dtype='int32',
        )

    # Initialize model ensemble
    models = []
    for fold_ix in range(n_folds):
        model_file = os.path.join(model_dir, "saved_models/f" + str(fold_ix) + "/model0_best.h5")

        seqnn_model = seqnn.SeqNN(params_model)
        seqnn_model.restore(model_file, 0)
        seqnn_model.build_slice(target_index)
        if rc:
            seqnn_model.strand_pair.append(slice_pair)
        seqnn_model.build_ensemble(rc, '0')

        models.append(seqnn_model)
    return models


In [None]:
def predict_on_sequence(models, sample_input):
    """Run ``predict_tracks`` on every haplotype encoding of one sample.

    Parameters
    ----------
    models
        Passed through unchanged as the first argument of ``predict_tracks``.
    sample_input : dict
        Maps a haplotype name to its encoded sequence.

    Returns
    -------
    dict
        Maps each haplotype name to the value ``predict_tracks`` returned for
        its sequence, in the iteration order of ``sample_input``.
    """
    return {
        haplotype: predict_tracks(models, sequence_encoding)
        for haplotype, sequence_encoding in sample_input.items()
    }

In [None]:
# Handle on the hg38 reference FASTA, used as a notebook-level global by the
# prediction function. pysam.FastaFile is the current class name;
# pysam.Fastafile is only its deprecated backwards-compatibility alias.
fasta_open = pysam.FastaFile('/home/s1mi/borzoi_tutorial/hg38.fa')
# Directory that get_model reads the parameter file, targets table and saved
# model folds from.
model_dir = '/home/s1mi/borzoi_tutorial'

In [131]:
def enformer_predict_on_region(target_interval, samples, path_to_vcf, output_dir, models=None):
    """Predict on both haplotypes of every sample for one interval and save the results.

    Parameters
    ----------
    target_interval : dict
        Mapping with ``'chr'``, ``'start'`` and ``'end'`` keys. It is used to
        fetch the reference sequence, to query the VCF and to name the
        output files.
    samples : list of str
        Sample names to extract from the VCF and predict on.
    path_to_vcf : str
        VCF file to read genotypes from; opened restricted to ``samples``.
    output_dir : str
        Root output directory. One file is written per sample at
        ``<output_dir>/<sample>/<chr>_<start>_<end>_predictions.h5``.
    models : list, optional
        Already-loaded models. When omitted they are loaded with
        ``get_model(model_dir)``; passing them in avoids reloading every
        fold on each call.

    Each output file holds one dataset per haplotype, named after the
    haplotype key, containing that haplotype's prediction with its leading
    axis squeezed out. Reads the notebook-level globals ``fasta_open`` and
    ``model_dir``. Returns ``None``.
    """
    if models is None:
        models = get_model(model_dir)
    sequence_one_hot_ref = process_sequence(fasta_open, target_interval["chr"], target_interval["start"], target_interval["end"])

    vcf_chr = cyvcf2.VCF(path_to_vcf, samples=samples)
    try:
        variants_array = find_variants_in_vcf_file(vcf_chr, target_interval, samples, mode="phased")
    finally:
        # Genotypes are fully materialised above, so the handle can be released.
        vcf_chr.close()

    mapping_dict = create_mapping_dictionary(variants_array, samples, target_interval["start"])
    samples_variants_encoded = replace_variants_in_reference_sequence(sequence_one_hot_ref, mapping_dict, samples)

    file_name = f'{target_interval["chr"]}_{target_interval["start"]}_{target_interval["end"]}_predictions.h5'
    for sample in samples:
        sample_input = samples_variants_encoded[sample]
        sample_predictions = predict_on_sequence(models, sample_input)

        # Create the sample's directory, not the .h5 path itself: making a
        # directory at the file path would stop h5py from creating the file.
        sample_dir = os.path.join(output_dir, sample)
        os.makedirs(sample_dir, exist_ok=True)
        output_path = os.path.join(sample_dir, file_name)

        with h5py.File(output_path, "w") as hf:
            for hap, prediction in sample_predictions.items():
                # Store each haplotype as its own dataset.
                hf[hap] = np.squeeze(prediction, axis=0)


In [132]:
# Example run: first gene of gene_df, two samples.
gene_annot = gene_df.iloc[0]
samples = ["NA21143", "NA21144"]
vcf_dir = "/grand/TFXcan/imlab/data/1000G/vcf_snps_only"
output_dir = "/home/s1mi/Github/deep-learning-in-genomics/posts/2023-09-26-borzoi-personalized-test/"
# The f-string formats the chromosome name whether the column was parsed as
# strings or as integers ('chr' + value raises TypeError for an integer).
interval_object = {'chr': f"chr{gene_annot['chromosome_name']}", 'start': gene_annot["transcript_start"], 'end': gene_annot["transcript_end"]}
# Window of the default resize() length (524288 bp) centred on the transcript.
target_interval = resize(interval_object)
path_to_vcf = os.path.join(vcf_dir, f"ALL.{interval_object['chr']}.shapeit2_integrated_SNPs_v2a_27022019.GRCh38.phased.vcf.gz")
# Predict on the resized window rather than the raw transcript span, so the
# sequence fetch, the variant query and the variant offsets all refer to the
# same fixed-length interval.
enformer_predict_on_region(target_interval, samples, path_to_vcf, output_dir)



Model: "model_41"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 sequence (InputLayer)          [(None, 524288, 4)]  0           []                               
                                                                                                  
 stochastic_reverse_complement_  ((None, 524288, 4),  0          ['sequence[0][0]']               
 8 (StochasticReverseComplement   ())                                                             
 )                                                                                                
                                                                                                  
 stochastic_shift_8 (Stochastic  (None, 524288, 4)   0           ['stochastic_reverse_complement_8
 Shift)                                                          [0][0]']                  