In [1]:
from IPython.display import clear_output
! pip install alphagenome python-dotenv
clear_output()

In [2]:
import sys
from pathlib import Path

# Allow importing `utils` from the parent directory of the notebook's working dir.
# The membership test keeps sys.path from growing by one duplicate entry every
# time this cell is re-run.
_project_root = str(Path('..').resolve())
if _project_root not in sys.path:
    sys.path.append(_project_root)

from utils import (
    get_dna_model,
    get_output_metadata,
    load_gtf_feather,
    prepare_gtf_views,
    build_transcript_extractors,
)

# Model handle and its output metadata, both obtained from the project helpers
# imported from `utils` above.
dna_model = get_dna_model()
output_metadata = get_output_metadata(dna_model)

# Annotation table plus the two derived views returned by prepare_gtf_views
# (all transcripts / longest transcript, judging by the names — TODO confirm).
gtf = load_gtf_feather()
_gtf_views = prepare_gtf_views(gtf)
gtf_transcript, gtf_longest_transcript = _gtf_views

# One extractor per view, in the same order as the views.
_extractors = build_transcript_extractors(gtf_transcript, gtf_longest_transcript)
transcript_extractor, longest_transcript_extractor = _extractors


gencode.v46.annotation.gtf.gz.feather already exists!


In [3]:
from pathlib import Path
import pandas as pd
# Lift every pandas display limit (rows, columns, cell width, total width).
# NOTE(review): with max_colwidth unset, any displayed frame prints each
# sequence string in full.
for _display_option in (
    "display.max_rows",
    "display.max_columns",
    "display.max_colwidth",
    "display.width",
):
    pd.set_option(_display_option, None)

# Locations of the input files and the models / gene analysed in this notebook.
DATA_DIR = Path("../../data")
PROMOTER_PATH = DATA_DIR.joinpath("promoters")
MODELS = ["DNABERT-2", "NT-v2-500m"]
GENE = "HBB"


In [4]:
# Read one promoter CSV per model into `promoter_dfs`, keyed by model name.
# The first CSV column becomes the index; the summary printed for each table
# labels the index values as strategies and the columns as iterations.
promoter_dfs = {}

for model in MODELS:
    file_path = PROMOTER_PATH.joinpath(model, f"{GENE}_promoter.csv")
    df = pd.read_csv(file_path, index_col=0)
    promoter_dfs[model] = df
    print(
        f"[{model}] loaded: shape={df.shape}",
        f"  strategies={list(df.index)}",
        f"  iterations={list(df.columns)}",
        sep="\n",
    )


[DNABERT-2] loaded: shape=(4, 31)
  strategies=['greedy', 'sampling_t0.5', 'sampling_t1.0', 'sampling_t1.5']
  iterations=['iteration_0', 'iteration_1', 'iteration_2', 'iteration_3', 'iteration_4', 'iteration_5', 'iteration_6', 'iteration_7', 'iteration_8', 'iteration_9', 'iteration_10', 'iteration_11', 'iteration_12', 'iteration_13', 'iteration_14', 'iteration_15', 'iteration_16', 'iteration_17', 'iteration_18', 'iteration_19', 'iteration_20', 'iteration_21', 'iteration_22', 'iteration_23', 'iteration_24', 'iteration_25', 'iteration_26', 'iteration_27', 'iteration_28', 'iteration_29', 'iteration_30']
[NT-v2-500m] loaded: shape=(4, 31)
  strategies=['greedy', 'sampling_t0.5', 'sampling_t1.0', 'sampling_t1.5']
  iterations=['iteration_0', 'iteration_1', 'iteration_2', 'iteration_3', 'iteration_4', 'iteration_5', 'iteration_6', 'iteration_7', 'iteration_8', 'iteration_9', 'iteration_10', 'iteration_11', 'iteration_12', 'iteration_13', 'iteration_14', 'iteration_15', 'iteration_16', 'iter

In [5]:
# Row (decoding strategy) of each promoter table to take sequences from.
DECODING_STR = "sampling_t1.0"

# Short snapshot label -> column name in the promoter tables.
_SNAPSHOT_COLUMNS = {
    "iter00": "iteration_0",
    "iter10": "iteration_10",
    "iter20": "iteration_20",
    "iter30": "iteration_30",
}

# For every model keep the four snapshot sequences of the chosen strategy and
# print their lengths plus the first/last 50 characters of the iteration-0 one.
seqs_by_model = {}
for model, df in promoter_dfs.items():
    seqs = {
        label: df.loc[DECODING_STR, column]
        for label, column in _SNAPSHOT_COLUMNS.items()
    }
    seqs_by_model[model] = seqs
    length_summary = " ".join(f"{k}={len(v)}" for k, v in seqs.items())
    print(f"[{model}] lengths: " + length_summary)
    first_50, last_50 = seqs['iter00'][:50], seqs['iter00'][-50:]
    print(f"  iter00: {first_50}...{last_50}")


[DNABERT-2] lengths: iter00=1000 iter10=1003 iter20=951 iter30=900
  iter00: GAACTTGAATCAAGGAAATGATTTTAAAACGCAGTATTCTTAGTGGACTA...CAGGAGCCAGGGCTGGGCATAAAAGTCAGGGCAGAGCCATCTATTGCTTA
[NT-v2-500m] lengths: iter00=1000 iter10=1010 iter20=1020 iter30=1020
  iter00: GAACTTGAATCAAGGAAATGATTTTAAAACGCAGTATTCTTAGTGGACTA...CAGGAGCCAGGGCTGGGCATAAAAGTCAGGGCAGAGCCATCTATTGCTTA


In [6]:
# Side-by-side table: one row per model, one column per snapshot iteration,
# each cell holding the sequence of the DECODING_STR strategy.
# NOTE(review): displaying this frame prints every sequence in full because the
# display limits were lifted in the configuration cell.
_snapshot_columns = [f"iteration_{step}" for step in (0, 10, 20, 30)]
compare_df = pd.DataFrame({
    model: {
        column: promoter_dfs[model].loc[DECODING_STR, column]
        for column in _snapshot_columns
    }
    for model in MODELS
}).transpose()
compare_df


Unnamed: 0,iteration_0,iteration_10,iteration_20,iteration_30
DNABERT-2,GAACTTGAATCAAGGAAATGATTTTAAAACGCAGTATTCTTAGTGGACTAGAGGAAAAAAATAATCTGAGCCAAGTAGAAGACCTTTTCCCCTCCTACCCCTACTTTCTAAGTCACAGAGGCTTTTTGTTCCCCCAGACACTCTTGCAGATTAGTCCAGGCAGAAACAGTTAGATGTCCCCAGTTAACCTCCTATTTGACACCACTGATTACCCCATTGATAGTCACACTTTGGGTTGTAAGTGACTTTTTATTTATTTGTATTTTTGACTGCATTAAGAGGTCTCTAGTTTTTTATCTCTTGTTTCCCAAAACCTAATAAGTAACTAATGCACAGAGCACATTGATTTGTATTTATTCTATTTTTAGACATAATTTATTAGCATGCATGAGCAAATTAAGAAAAACAACAACAAATGAATGCATATATATGTATATGTATGTGTGTATATATACACACATATATATATATATTTTTTCTTTTCTTACCAGAAGGTTTTAATCCAAATAAGGAGAAGATATGCTTAGAACCGAGGTAGAGTTTTCATCCATTCTGTCCTGTAAGTATTTTGCATATTCTGGAGACGCAGGAAGAGATCCATCTACATATCCCAAAGCTGAATTATGGTAGACAAAACTCTTCCACTTTTAGTGCATCAACTTCTTATTTGTGTAATAAGAAAATTGGGAAAACGATCTTCAATATGCTTACCAAGCTGTGATTCCAAATATTACGTAAATACACTTGCAAAGGAGGATGTTTTTAGTAGCAATTTGTACTGATGGTATGGGGCCAAGAGATATATCTTAGAGGGAGGGCTGAGGGTTTGAAGTCCAACTCCTAAGCCAGTGCCAGAAGAGCCAAGGACAGGTACGGCTGTCATCACTTAGACCTCACCCTGTGGAGCCACACCCTAGGGTTGGCCAATCTACTCCCAGGAGCAGGGAGGGCAGGAGCCAGGGCTGGGCATAAAAGTCAGGGCAGAGCCATCTATTGCTTA,GCAATCATTTCTCAGGAATGCTTTGCGACGCAGCTAAGACATAACAGGCGCTAGAGAGGCAAATAATAGCGTCATAAAAATGTTGAAGGGATAATTATTAATTTCTTGCCCTTCTGTCTAAGTCACCAGGAGAGTGGCTATACTAGCTTTGGCATCAAATTTTTCCAAGTTAGATGGGGAATCTTCCTGTAATCCTCATTTGTGCCATTGATAAAAACAAATCAGCTGTGCCACTGACTTATAAAACTTTTAAAATATGTATTTTAAAATTAGCTTGAACCATGAGCTATCATTTGAAAGTTTCCTCTAAAGAGAGAGGGGGAATGTCTAGGGGGTGTTGTATTGAAATGAATACATATTTATTGTTTTTCTAGAAAGTTATTTAAATTTCTCTTCATGAATTGGGGTTTTTTTGCATATAGAATATACAAGATTCTTGTAGTTATTAAACACACATACAGAAAATCATTAATTTTAATGATAACCCAAAGCCTCCAAAATCTGTGATTTATTTTAAAAGGCAATCGAATATTAGGATTATCGAGGAAATTCACCAAGTCCCCACAAATCTCATGGGAACCCCCAAGCTGGAACTCTTAATGGGAAAAAATGTAAATCAACAAAAAAAAAACAAACAAAAGCTGCCACTTCACATTATATATATATATAAATACACACTCTAATAATAAACACATAACGATATATATATAAATTCTGCCCTAAGGAGACAAGGCTTATCTTGTAAAGATGGGGAGTGAATTCCGCAGTGTCTCAGGAGATGTCTCATCACTCATGGGAGTTGTGGATATAGTTCCTTCCCAATTCCTGGCCAAGGGCCCTGTAAGCCAGTTCCATCCGGCTGAATCTCTTTACGGCCTGAAGGCCCCGACCTGGCCACCCTGCTTTAATGGGGTTTCAAGACCCCAACAAATTTTTTCAACCGGTTGCTGGCAGGACTTGTAGTGCTACGGAGTGCATTTACCTACAGG
AGGCTCTGCAGTAA,CAAAAATAATAAATAATAACAGGACCTCTGTCTTAATAATAGCTAATAAAATATATACCTATAGTAGATCATGTAACATAGGTTCACTGTAACTGAATCCAATAAATTAGTTAACTTTTATCCTCCTTCCATTGCGAAGAACTGAGACACTTTATCTCCCTCACATGAATCCCCATCTTCCTGGATTGAGGGAGTGTTGATAAAAACAAATCAGCTCACATTATGAGCCAACTGGTTTATAAAACAGTTAAAATAACATTGAACCAGGTACAAGAACAGCCTGGAGGCAACTCAGTGAAGTTGAGGTCAAATCTATGCATGATTACAAAAGGGCCTTTGAACTTACAGACTATCAGCTAGGCAGAATCAAAATTCAGAGTTGGTATTTTGTCATTCTGAAAGAGGCCAACGCAGTTATAATACAAATAGAAATAATGACCTATCATTAATCTATTTGATAAAAGGAGAGGTAATTATAATTAAACCCTGTTTATTTTGTAGCTCTTTAAACTCAGCTATTATTGCTGTATTAATAATTAATCCTAGCACTTTGTTTCTCCAAGAAATAAACAGCCTTGGTTGGGACGGAACAGCATGAGTTACTCGTGCTTTTCACATTACGAAGTAACAACATTCTTATTTCTCTCTTTTATCTTTTTTCCCCCCAATTAGCCCCCTCTCCCATGCCTTTAAGCTTAAAAGATATCTGCTCTGTAGGAGCAGTGGAAGGTCTGGATGCCTTTCTTTTGGGGTTGTTGCCACAGGAAAGCTTGTTCTCACCTGTTCAGCAGAAGCGTGCGGATGTCCCTCAAGGTACAGCCAGTTTTAACTGCCTCACTACAATGAGAAGTTATCAGGAGGCTAGTGGACCAAGGGAGAGTAGGTTTCTGAGTGTAAGTTCTTGTAGTGCTATCACCAGCAGAGAGTACATCACCAATCTCATGGCCTATT,TAATTACATTCCTTACAATATTTCCTATTTTAATACAATGTAATAAATAACTTTAGTACAAAATATGGACCTGATGCTGAACCAGTGTAATAAGATGGATTTCTCTGCTCTCCACAACCAGGCTACAAACAAACAAAAATGTATACTTCGAAATCTTGAGTCCACTTCTTAGTTTTTGAGTTTGATACTCCTGATTCACAAACTCTACCCTGTGTGTGTGAATATGTGTCTTTTCTGTCAACTCAAATCCATATGAAAAAGCCTGGTAAACCTCGAGTTTTGTTTTATCTATGTAACTAAGTTTTTTTGGAGGTACTTAAAGCCTTTCGTCACTAGCTTCTATCATAAATCTCACTCCTTTGTAATTTATAAATATTTAATACATATAAAAATTGGTTTGCTTAAAGTTACTAACTCATAATGACCTATAAAGATTTGTAAATGATGCTCTTTCAAGTGTGTGAGAATGAGGGCTCTTGCTACCTGGGAAGGATAACCTTTGGCTGCAAAATAAATATATAGTATTTTTCTCCAATCTTTGTGCTCCCTTACCTAATGCTGACATCTTTTATTCTTGATAACCTGCCACCCCTCAGTAATCTGCTTATTCTTACTGTAAAGTTTTTGGCTCTGAATTTTCTGCCATTATTATCATAAGCTTAAAACTTTTGTGGTTTAACACTTGACTTGAAGGCAAAACCAGGTTTGGGCAGGAAGGGAAGATTGGGAACCCTGCAGTTTTCAGCAGAACCGCTATTCGGATGCAAGTTTCTAATACACTAAGTTTTAACTGTACCAAAACCTGCTAGGAACCTAAGCAGACTTATGAACCTTTACAAGGAATCGACCCTAGGAGGATGGAGAAAGCTTGCAAATAAATCAGTCATCTCATGGCCTATT
NT-v2-500m,GAACTTGAATCAAGGAAATGATTTTAAAACGCAGTATTCTTAGTGGACTAGAGGAAAAAAATAATCTGAGCCAAGTAGAAGACCTTTTCCCCTCCTACCCCTACTTTCTAAGTCACAGAGGCTTTTTGTTCCCCCAGACACTCTTGCAGATTAGTCCAGGCAGAAACAGTTAGATGTCCCCAGTTAACCTCCTATTTGACACCACTGATTACCCCATTGATAGTCACACTTTGGGTTGTAAGTGACTTTTTATTTATTTGTATTTTTGACTGCATTAAGAGGTCTCTAGTTTTTTATCTCTTGTTTCCCAAAACCTAATAAGTAACTAATGCACAGAGCACATTGATTTGTATTTATTCTATTTTTAGACATAATTTATTAGCATGCATGAGCAAATTAAGAAAAACAACAACAAATGAATGCATATATATGTATATGTATGTGTGTATATATACACACATATATATATATATTTTTTCTTTTCTTACCAGAAGGTTTTAATCCAAATAAGGAGAAGATATGCTTAGAACCGAGGTAGAGTTTTCATCCATTCTGTCCTGTAAGTATTTTGCATATTCTGGAGACGCAGGAAGAGATCCATCTACATATCCCAAAGCTGAATTATGGTAGACAAAACTCTTCCACTTTTAGTGCATCAACTTCTTATTTGTGTAATAAGAAAATTGGGAAAACGATCTTCAATATGCTTACCAAGCTGTGATTCCAAATATTACGTAAATACACTTGCAAAGGAGGATGTTTTTAGTAGCAATTTGTACTGATGGTATGGGGCCAAGAGATATATCTTAGAGGGAGGGCTGAGGGTTTGAAGTCCAACTCCTAAGCCAGTGCCAGAAGAGCCAAGGACAGGTACGGCTGTCATCACTTAGACCTCACCCTGTGGAGCCACACCCTAGGGTTGGCCAATCTACTCCCAGGAGCAGGGAGGGCAGGAGCCAGGGCTGGGCATAAAAGTCAGGGCAGAGCCATCTATTGCTTA,GAACTTCTTTAAATAGCTTAGATATTTTTTTCAGAAATCTCGAGATACTAATATTCTTAGGTTGGTTAGACTAGGACTAAGACCTTCTCACACTCTCCCCTCACGTAGTTTGTGGTCTAGTTAGATGCAAAGAGCAGGTAGGCACAATGATATGCAAAGGCAACCGAGCAAAAATGTCCCACACTTGATAAACTTTTGAGGATGCTGATTCTTAGATTAGAATTTTACAAACGTTTACCAAGTGACCTTATTCTTAGATTGTTTAGTTAGACAGGGACTAGATCTCTACACTCAAACCATGTGAAGAGCAGAAGGACACAAAGAGAGGCAAGCACTAGCACATAGGAAAAAATCGGGCTTAATTCAGAGGATGTTGCCCTTTAGCTTAAAGTTAAGAGTTAGAAAGGTGTAAGAAATATGTGTATATATATGTATATATATATATATATATATATATATATATATATACGTATGTAGTATGTATTAATTTAATAAGTCTAATCCCTTAAGCACAAAGATATGAAAAAGGTGAGTAATATTTTCGTATTGTCTTACTCGTAAGTATATTTTATGAGTAATTAAACTTAATTAGAGAGTTAGATGTCCTTCTTCTTCAAACTAAGAAGAAGGGCACAAAGCTCTAGCACATAGTGCATTGTGCTGCCCTTTGTGTAATAAAATTCCTGGGAATTTTTAAGTCTTTAGCTGAGGTAGAGTTTTTCTCAAAACCAATTAATATTCTTAGATTGTTAGACTTGGACTAAGACCTTTTCACACTCTACCCCTAGTGTTTTTGTGCTAGTCTTAGGCAGGGAGAGCAATGAATACTTAATAGAGTTAAGAAGTATTAAAAAAGGTGTAAGTTATCTCAGTATTGTCTTTTCACTTAAGCATATTTTATGAATATTAAACTTAGAATGAGGCAGTCAAACTCCCTAGCACAGAACACATTGTTAATGCGGTTATGCATAATAAAATAGTTTCGGAA
TTTAGATTTGAGATATTTTTCG,GAACTTTTACATTATTATGCGATGAGTTTTCATGCTCTACCTAGAGCTATTCTTGTTTAGGTTGGTTAAGTCATGTATAAGACCTTCTCACACTCTTGAGCAAGCAGATTTGTGGTCTAGCCTTAGGCAAAGCTAAAGTTAGCACAAGAGATAAATACAGCAGGGAGTTAGGTTTGTCCCAGATGATTAAAAGATTATTTAATGTAAGTCCTTCTTAATAGTTGGTGTAAGCCAGTAGATTTAATCCTTATTCTTAGATTCAGCTAGATTGCTAAAGAATAAGAATCATAAACTAGGAATAATTGTCTTAGTTAAGGCAAAAGATTTATATTTCTTAGATAAGAATTTAAAGCAACAATGAATTCATAGAAAGTTAGTTCATGTTAGTTGAGCAATAACCAGACTTAAGTAAGTAATATGTATATATACATATGTATATATATATATATATATATATATATATATATATATATATATATATATATTGATTCAATTATCTTCAAGTCTTAGTTAGCAGATAGTAAGCGAATTTTTTGTTTTAAAGAGGTTTTTTTGTTTTTTTGTTTGTTGATGAGTTTTTTGTTGATCTTAGAGCAGCAGAGATTAACCCTAAGCAAAAGCAAAATTTAGGCACAAGAGAATAAACCAGGGAGCTAGCTTTATGTCCCGGATTAGATAAGTTAATTGAGACTTTTACATACTTAGCCGATGAGTTTTCATGCTCTACCGAGAGATCATTTCTTAGTTGGTTAAGTCATGTATAAGACCTTCACACTCTTGAGCAACTAGAGCAGTTGTCTAGTCAGGAGGCAAAAAGTTTATCTTAACAAATCTGAGTTAAACTGTCTGCCTAAGTAAAGAGTGAATAAGATTATTTAAAAAGGTCTGCATGTTAGTGAGATAAAAATTTTAACTTACTAGAGATACAGAAAGACCTAGCACATTTAAGAAAACAATAAAGACGGCATTTACCCCCCCCCCAAAAAAAAAAAAGCCCGCCATTTTAGCACAATTAGAAGA,AATATTTTACATATTATATGTGTAAGTTTTCATGTTTTCAAAATTGCTCGTAAACCAGTTTGAGATTTATGTTTTACACTAATAGAGTAGCCAAGATGAGCATTTTATGAAGTTGAAAACTCTATTAAATCTTTTAGTTTAGCATAAAGAACTCTAACAGCAGGCAGCAAAATTGCCCCAAGAGAAAGTTTAGATTATTTCAATAAGAATACCACTTTAACACTAGAATATAGCCTTTAGAAACTTCAAATATATCAATTCAGCTAGATTTAGAGCAAAGAATTATTAAAATTTCTTAATAATTGTTCTAAAACAATGACTTTTAAAAGTAAAACCCTTCCAAAATTTTATATTTTAAGTTTAAAGTCAAAATTGTTTGTGTTTGATTGAACTAAGCCAAACTTGAGATTTATGTTACAAATATGAAAATTACAATTAGATATATATATATATATATATATATATATATATATATATATATATATAGATTCAGCTAGATTTGAGCAAAGAATTTATTAAAACTTTATTATTTTTGCTCTAAAATCAATTAAGGTTTTCTTTTGTTTTATCTCTTAGATTACTGACTTTAGAGAAAGGTTTATGTTACTAAATATAAGTTCTTCAATATTTCACAAAAATAAATGTACAGCAGGCAGCGAATTTCGCCCCAATAAGAGTTAAGATTATTAACTTTTACAGAAATTAGACTTTAAAAATAAAGCTCTACCAAATGTTAAGATTATAGCTTTAGTCATTATTTGGTAGACTTTATGTCATGATAGATTCAAGTAGAGTTGTTTTTAATTTTGTCAAAAACTTAGAAAACAGTTTTAGTAAACAAAACCTAATTGTCCCATTGCCAAGTTTGTATAATTTATTCGGAATGGTTTACATATAGATTTGAATAATTTTGCATTACATCTCTAGTCTAAGACCGAGTTTTACCAAATACAAAT
GAAAACCAGCAATTGGTCCCCCTCCCAAAAAAAAAGGAAAAAGTAAAGAGAATAAAACTTTATT


In [7]:
# Display the iteration-0 sequence stored for DNABERT-2 (full string, untruncated).
seqs_by_model["DNABERT-2"]["iter00"]

'GAACTTGAATCAAGGAAATGATTTTAAAACGCAGTATTCTTAGTGGACTAGAGGAAAAAAATAATCTGAGCCAAGTAGAAGACCTTTTCCCCTCCTACCCCTACTTTCTAAGTCACAGAGGCTTTTTGTTCCCCCAGACACTCTTGCAGATTAGTCCAGGCAGAAACAGTTAGATGTCCCCAGTTAACCTCCTATTTGACACCACTGATTACCCCATTGATAGTCACACTTTGGGTTGTAAGTGACTTTTTATTTATTTGTATTTTTGACTGCATTAAGAGGTCTCTAGTTTTTTATCTCTTGTTTCCCAAAACCTAATAAGTAACTAATGCACAGAGCACATTGATTTGTATTTATTCTATTTTTAGACATAATTTATTAGCATGCATGAGCAAATTAAGAAAAACAACAACAAATGAATGCATATATATGTATATGTATGTGTGTATATATACACACATATATATATATATTTTTTCTTTTCTTACCAGAAGGTTTTAATCCAAATAAGGAGAAGATATGCTTAGAACCGAGGTAGAGTTTTCATCCATTCTGTCCTGTAAGTATTTTGCATATTCTGGAGACGCAGGAAGAGATCCATCTACATATCCCAAAGCTGAATTATGGTAGACAAAACTCTTCCACTTTTAGTGCATCAACTTCTTATTTGTGTAATAAGAAAATTGGGAAAACGATCTTCAATATGCTTACCAAGCTGTGATTCCAAATATTACGTAAATACACTTGCAAAGGAGGATGTTTTTAGTAGCAATTTGTACTGATGGTATGGGGCCAAGAGATATATCTTAGAGGGAGGGCTGAGGGTTTGAAGTCCAACTCCTAAGCCAGTGCCAGAAGAGCCAAGGACAGGTACGGCTGTCATCACTTAGACCTCACCCTGTGGAGCCACACCCTAGGGTTGGCCAATCTACTCCCAGGAGCAGGGAGGGCAGGAGCCAGGGCTGGGCATAAAAGTCAGGGCAGAGCCATCTATTGCTT

In [8]:
# from utils import get_sequence

# seq, meta = get_sequence("HBB", gtf)
# print(len(seq), meta["gene"], meta["chrom"])
# print(seq[:50]+"...")

In [9]:
# Take the gene from the configuration constant instead of repeating the literal,
# so the locus lookup cannot drift from the promoter files loaded for GENE.
GENE_NAME = GENE

# All rows of the annotation table that are transcripts of this gene.
_gene_transcripts = gtf[(gtf['gene_name'] == GENE_NAME) & (gtf['Feature'] == 'transcript')]
if _gene_transcripts.empty:
    # Fail with a readable message rather than an out-of-bounds IndexError.
    raise ValueError(f"No transcript rows found in the GTF for gene {GENE_NAME!r}")

# The first matching transcript row is used as the gene's locus.
# NOTE(review): this is simply the first row in table order, not necessarily the
# canonical or longest transcript — confirm that is the intended choice.
gene_gtf = _gene_transcripts.iloc[0]
strand, start, end = gene_gtf['Strand'], int(gene_gtf['Start']), int(gene_gtf['End'])
strand, start, end

('-', 5225463, 5227071)

In [14]:
# Display the iteration-0 sequence currently stored for DNABERT-2.
# (The bare `seqs_by_model.items()` call that preceded this line was removed: its
# result was discarded, so it had no effect.)
seqs_by_model['DNABERT-2']['iter00']

'GAACTTGAATCAAGGAAATGATTTTAAAACGCAGTATTCTTAGTGGACTAGAGGAAAAAAATAATCTGAGCCAAGTAGAAGACCTTTTCCCCTCCTACCCCTACTTTCTAAGTCACAGAGGCTTTTTGTTCCCCCAGACACTCTTGCAGATTAGTCCAGGCAGAAACAGTTAGATGTCCCCAGTTAACCTCCTATTTGACACCACTGATTACCCCATTGATAGTCACACTTTGGGTTGTAAGTGACTTTTTATTTATTTGTATTTTTGACTGCATTAAGAGGTCTCTAGTTTTTTATCTCTTGTTTCCCAAAACCTAATAAGTAACTAATGCACAGAGCACATTGATTTGTATTTATTCTATTTTTAGACATAATTTATTAGCATGCATGAGCAAATTAAGAAAAACAACAACAAATGAATGCATATATATGTATATGTATGTGTGTATATATACACACATATATATATATATTTTTTCTTTTCTTACCAGAAGGTTTTAATCCAAATAAGGAGAAGATATGCTTAGAACCGAGGTAGAGTTTTCATCCATTCTGTCCTGTAAGTATTTTGCATATTCTGGAGACGCAGGAAGAGATCCATCTACATATCCCAAAGCTGAATTATGGTAGACAAAACTCTTCCACTTTTAGTGCATCAACTTCTTATTTGTGTAATAAGAAAATTGGGAAAACGATCTTCAATATGCTTACCAAGCTGTGATTCCAAATATTACGTAAATACACTTGCAAAGGAGGATGTTTTTAGTAGCAATTTGTACTGATGGTATGGGGCCAAGAGATATATCTTAGAGGGAGGGCTGAGGGTTTGAAGTCCAACTCCTAAGCCAGTGCCAGAAGAGCCAAGGACAGGTACGGCTGTCATCACTTAGACCTCACCCTGTGGAGCCACACCCTAGGGTTGGCCAATCTACTCCCAGGAGCAGGGAGGGCAGGAGCCAGGGCTGGGCATAAAAGTCAGGGCAGAGCCATCTATTGCTT

In [None]:
from utils import reverse_complement, get_sequence

# Fetch the reference promoter window next to the TSS and, for a minus-strand
# gene, reverse-complement both the reference and the stored model sequences.
seq_original = ''
# Window length = length of the DNABERT-2 iteration-0 sequence.
PROMO_LENGTH = len(seqs_by_model['DNABERT-2']['iter00'])

if strand == '-':
    # Minus strand: the transcript End is used as the TSS and the window runs
    # from TSS to TSS + PROMO_LENGTH.
    # NOTE(review): whether get_sequence already returns a strand-aware
    # sequence is not visible here — confirm the orientation.
    TSS = end
    seq_original, _ = get_sequence(GENE_NAME, start=TSS, end=TSS + PROMO_LENGTH)
    seq_original = reverse_complement(seq_original)

    # The reverse complement of the stored sequences must happen exactly once:
    # applying it a second time would silently restore the original orientation.
    # Remember which dict object has already been flipped so that re-running this
    # cell is a no-op, while a freshly rebuilt `seqs_by_model` is flipped again.
    if globals().get('_seqs_rc_applied_to') is not seqs_by_model:
        for model, iters in seqs_by_model.items():
            for it, seq in iters.items():
                iters[it] = reverse_complement(seq)
        _seqs_rc_applied_to = seqs_by_model
else:  # strand=='+' or '.'
    # Plus (or unstranded) gene: the transcript Start is used as the TSS and the
    # window is the PROMO_LENGTH bases ending at it; nothing is flipped.
    TSS = start
    seq_original, _ = get_sequence(GENE_NAME, start=TSS - PROMO_LENGTH, end=TSS)


gencode.v46.annotation.gtf.gz.feather already exists!


In [12]:
# Sanity check: the reference sequence fetched via get_sequence should be identical
# to the DNABERT-2 iteration-0 sequence (the recorded output of this cell is True).
seq_original==seqs_by_model['DNABERT-2']['iter00']

True

In [None]:


# NOTE(review): this cell has no execution count, and no visible cell defines
# `genome`, `dna_client`, `chrom`, `ori_seq_rc`, `seq_iter10`/`seq_iter20`/
# `seq_iter30`, `interval` or `ontology_terms` — confirm they exist before running.
# NOTE(review): `start` is passed as the variant position; for the minus-strand
# gene above the promoter window was taken at `end` — verify the coordinate.

# One variant per later snapshot, each built from the same `ori_seq_rc` argument.
# The iteration-0 entry is commented out in the original:
#   genome.Variant(chrom, start, ori_seq_rc, seq_iter00),
variants = [
    genome.Variant(chrom, start, ori_seq_rc, alt_seq)
    for alt_seq in (seq_iter10, seq_iter20, seq_iter30)
]

# Output types requested from the model.
_OutputType = dna_client.OutputType
requested_outputs = {
    _OutputType.CAGE,
    _OutputType.PROCAP,
    _OutputType.RNA_SEQ,
}

# Run the prediction for all variants over `interval`, showing a progress bar.
outputs = dna_model.predict_variants(
    intervals=interval,
    variants=variants,
    requested_outputs=requested_outputs,
    ontology_terms=ontology_terms,
    progress_bar=True,
)