In [None]:
import pandas as pd
import numpy as np
import topiary
import re, copy, os

In [None]:
# Exercise _prepare_for_blast using the dataframe stored in "good-df.csv".

df = pd.read_csv("../good-df.csv")

# Call patterns keyed by name: LY96 is supplied as a list of strings, LY86
# as a pre-compiled regular expression.
call_dict = {
    "LY96": ["lymphocyte antigen 96", "esop1"],
    "LY86": re.compile("lymphocyte antigen 86"),
}
local_rev_blast_db = "GRCh38"

# Baseline keyword arguments. The run-specific entries are layered on top of
# a copy below so this dictionary itself is never modified.
default_kwargs = dict(
    local_rev_blast_db=None,
    ncbi_rev_blast_db=None,
    ignorecase=True,
    max_del_best=100,
    min_call_prob=0.95,
    use_start_end=True,
)

kwargs = copy.deepcopy(default_kwargs)
kwargs.update(
    df=df,
    call_dict=call_dict,
    local_rev_blast_db=local_rev_blast_db,
)

# _prepare_for_blast hands back five values; unpack them into the names the
# later cells rely on (sequence_list and patterns in particular).
out = topiary.ncbi._reverse_blast._prepare_for_blast(**kwargs)
(out_df,
 sequence_list,
 patterns,
 out_max_del_best,
 out_min_call_prob) = out


In [None]:
# Run local reverse blast. Assumes presence of GRCh38 blastdb in the
# working directory. Writes out local_hit_dfs/hit_{i}.csv with the
# blast hits for each sequence in good-df.csv (one file per entry in the
# list returned by _run_blast).
local_rev_blast_db = "GRCh38"
local_hit_dir = "local_hit_dfs"

default_kwargs = {"local_rev_blast_db":None,
                  "ncbi_rev_blast_db":None,
                  "ncbi_taxid":None,
                  "hitlist_size":50,
                  "e_value_cutoff":0.01,
                  "gapcosts":(11,1),
                  "local_num_threads":-1}

# Run local blast: start from a copy of the defaults, then point the call at
# the local database and the sequences produced by _prepare_for_blast.
kwargs = copy.deepcopy(default_kwargs)
kwargs["sequence_list"] = sequence_list
kwargs["local_rev_blast_db"] = local_rev_blast_db

local_hit_dfs = topiary.ncbi._reverse_blast._run_blast(**kwargs)

# exist_ok=True lets the cell be re-run without failing when the output
# directory is already there.
os.makedirs(local_hit_dir, exist_ok=True)

for i, d in enumerate(local_hit_dfs):
    d.to_csv(os.path.join(local_hit_dir, f"hit_{i}.csv"))


In [None]:
# Run ncbi reverse blast against nr. Writes out ncbi_hit_dfs/hit_{i}.csv
# with the blast hits for each sequence in good-df.csv (one file per entry
# in the list returned by _run_blast).

ncbi_rev_blast_db = "nr"
ncbi_taxid = 9606
ncbi_hit_dir = "ncbi_hit_dfs"

default_kwargs = {"local_rev_blast_db":None,
                  "ncbi_rev_blast_db":None,
                  "ncbi_taxid":None,
                  "hitlist_size":50,
                  "e_value_cutoff":0.01,
                  "gapcosts":(11,1),
                  "local_num_threads":-1}

# Run ncbi blast: start from a copy of the defaults, then select the NCBI
# database and taxid. local_rev_blast_db is left as None for this run.
kwargs = copy.deepcopy(default_kwargs)
kwargs["sequence_list"] = sequence_list
kwargs["ncbi_rev_blast_db"] = ncbi_rev_blast_db
kwargs["ncbi_taxid"] = ncbi_taxid

ncbi_hit_dfs = topiary.ncbi._reverse_blast._run_blast(**kwargs)

# exist_ok=True lets the cell be re-run without failing when the output
# directory is already there.
os.makedirs(ncbi_hit_dir, exist_ok=True)

for i, d in enumerate(ncbi_hit_dfs):
    d.to_csv(os.path.join(ncbi_hit_dir, f"hit_{i}.csv"))

In [None]:
# Make reverse blast calls from both sets of hits produced above: the local
# hits (paired with ncbi_rev_blast_db=None) and the ncbi hits (paired with
# ncbi_rev_blast_db="nr"). Uses df, patterns, local_hit_dfs and ncbi_hit_dfs
# from the earlier cells.
hit_dfs_list = [local_hit_dfs,ncbi_hit_dfs]
ncbi_rev_blast_db_list = [None,"nr"]

# NOTE(review): max_del_best is 10 here but 100 in the _prepare_for_blast
# cell -- confirm the difference is intentional.
default_kwargs = {"max_del_best":10,
                  "min_call_prob":0.95,
                  "ncbi_rev_blast_db":None}

# One result per (hit_dfs, ncbi_rev_blast_db) pair, in the same order as
# hit_dfs_list, so the local result is not lost when the ncbi one is made.
out_dfs = []
for hit_dfs, rev_blast_db in zip(hit_dfs_list, ncbi_rev_blast_db_list):

    kwargs = copy.deepcopy(default_kwargs)
    kwargs["df"] = df
    kwargs["hit_dfs"] = hit_dfs
    kwargs["patterns"] = patterns
    kwargs["ncbi_rev_blast_db"] = rev_blast_db

    out_dfs.append(topiary.ncbi._reverse_blast._make_reverse_blast_calls(**kwargs))

# Keep out_df bound to the last (ncbi) result, as it was before results were
# collected in out_dfs.
out_df = out_dfs[-1]
