# Setting

In [None]:
from __init__ import *
from primer_modules import *
from ml_modules import *

import os
import ast
import numpy as np
import pandas as pd
import time
import bisect
from multiprocessing import Pool, cpu_count
from tqdm.auto import tqdm
from collections import defaultdict, Counter
from Bio import SeqIO, Seq, Entrez
from Levenshtein import distance

In [None]:
import joblib

# Locations of the serialized classifier, regressor and feature scaler.
CLASSIFIER = '/home/jupyter/ADAPT_PCR_share/safe/resources/ml/0721_LSTM_Classifier.pth'
REGRESSOR = '/home/jupyter/ADAPT_PCR_share/safe/resources/ml/0725_combined_model.pth'
SCALER = '/home/jupyter/ADAPT_PCR_share/safe/resources/ml/0728_scaler.joblib'
# Names of the per-primer-pair features; presumably the column order expected by
# the scaler and the models — TODO confirm against the training code.
FEATS = ['f_len','f_Tm','f_GC','f_indel','f_mm','r_len','r_Tm','r_GC','r_indel','r_mm','prod_len','prod_Tm']

scaler = joblib.load(SCALER)
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so these checkpoints must come from a trusted source. `torch` is not
# imported in this cell; presumably it arrives via the star imports above.
classifier = torch.load(CLASSIFIER, weights_only=False)
regressor = torch.load(REGRESSOR, weights_only=False)
# Switch both loaded models to evaluation mode.
classifier.eval()
regressor.eval()

# BBP

In [None]:
# Directory holding the primer/gBlock workbooks and the per-virus FASTA files.
SEQPATH = '/home/jupyter/ADAPT_PCR_share/safe/resources/Seqs_used_for_design_from_Elyse/'
# Number of leading characters sliced off each gBlock sequence when the target
# sequence (tseq) is built in the cells below.
consensus_header = 25

In [None]:
# Primer table: the 'Primers' sheet of the dPCR workbook, indexed by its first column.
priexcel = pd.ExcelFile(f'{SEQPATH}/BPPv2_dPCR_Sequences.xlsx')
pritbl = priexcel.parse(sheet_name='Primers', index_col=0)
pritbl.head(1)

In [None]:
# gBlock (consensus target) table from the 'gBlocks' sheet.
conexcel = pd.ExcelFile('%s/BPPv2_ENS_Current.xlsx' % SEQPATH)
# Drop the rows labelled 0 and 13, keep the first 26 of the remaining rows and
# index them by 'Sequence Name'.
# NOTE(review): these hard-coded positions depend on the current sheet layout —
# confirm whenever the workbook is updated.
contbl = conexcel.parse('gBlocks').drop([0,13]).iloc[:26].set_index('Sequence Name')
contbl.head(1)

In [None]:
from itertools import product

# IUPAC nucleotide codes: each symbol maps to the list of concrete bases it
# stands for. The order of the bases fixes the order of the expanded sequences.
IUPAC_CODES = {
    symbol: list(bases)
    for symbol, bases in [
        ("A", "A"), ("C", "C"), ("G", "G"), ("T", "T"),
        ("R", "AG"), ("Y", "CT"), ("S", "GC"), ("W", "AT"),
        ("K", "GT"), ("M", "AC"),
        ("B", "CGT"), ("D", "AGT"), ("H", "ACT"), ("V", "ACG"),
        ("N", "ACGT"),
    ]
}


def expand_degenerate_sequence(seq):
    """Return every concrete sequence encoded by a degenerate IUPAC string.

    The input is upper-cased first. The result enumerates the Cartesian
    product of the per-position base choices, with the rightmost position
    varying fastest. An empty input yields [""].

    Raises:
        ValueError: if a symbol is not a key of IUPAC_CODES.
    """
    choices_per_position = []
    for symbol in seq.upper():
        options = IUPAC_CODES.get(symbol)
        if options is None:
            raise ValueError(f"Unknown base symbol: {symbol}")
        choices_per_position.append(options)

    return ["".join(combo) for combo in product(*choices_per_position)]

# Quick demonstration: expand a sequence with one degenerate symbol and print
# the resulting concrete sequences.
example_seq = "ATGS"
expanded = expand_degenerate_sequence(example_seq)
print(expanded)

def get_fasta_seqs(fasta):
    """Parse a FASTA file into a dict mapping record id -> sequence string.

    If several records share an id, the last one parsed wins.
    """
    seqs_by_id = {}
    for record in SeqIO.parse(fasta, 'fasta'):
        seqs_by_id[record.id] = str(record.seq)
    return seqs_by_id

def measure_dist(seq, ref, dcut=15):
    """Locate `seq` in `ref`, exactly or within an edit-distance cutoff.

    Args:
        seq: query sequence.
        ref: sequence to search in.
        dcut: largest distance (as computed by `distance`) accepted for an
            approximate hit.

    Returns:
        (position, distance): (index, 0) for an exact occurrence; otherwise the
        first window of len(seq) whose distance to `seq` is <= dcut, together
        with that distance; (-1, 999) when `ref` is shorter than `seq` or no
        window qualifies.
    """
    if len(ref) < len(seq):
        return (-1, 999)
    exact_pos = ref.find(seq)
    if exact_pos != -1:
        return (exact_pos, 0)
    width = len(seq)
    # +1 so that the final window ref[-width:] is examined as well.
    for i in range(len(ref) - width + 1):
        d = distance(seq, ref[i:i + width])
        if d <= dcut:
            return (i, d)
    return (-1, 999)

def get_min_dist(seq, ref):
    """Return the smallest distance between `seq` and any same-length window of `ref`.

    Returns:
        0 when `seq` occurs in `ref`; 999 when `ref` is shorter than `seq`
        (always an int, so callers can format the result with '%i');
        otherwise the minimum of `distance` over every window of len(seq).
    """
    if len(ref) < len(seq):
        return 999
    if seq in ref:
        return 0
    width = len(seq)
    # +1 includes the final window; it also guarantees at least one window when
    # len(ref) == len(seq), so min() never receives an empty sequence.
    return min(distance(seq, ref[i:i + width]) for i in range(len(ref) - width + 1))

## Assort target seqs

### CCHFV

In [None]:
# Expand the degenerate CCHFV primers and report, for every forward/reverse
# combination, whether the forward primer and the reverse-complemented reverse
# primer occur exactly in the gBlock target.
virus = 'CCHFV'
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F', 'Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R', 'Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1', 'Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    fwd_found = fseq in tseq
    for rseq in rseqs:
        rev_found = reverse_complement_dna(rseq) in tseq
        print('For, Rev in Target: %i, %i' % (fwd_found, rev_found))

In [None]:
# Plot the sequence-length distribution of each candidate FASTA side by side.
tarfs = [f'{SEQPATH}/{virus}_all_2022.fasta', f'{SEQPATH}/{virus}_L_all.fasta']
fig, axes = plt.subplots(nrows=1, ncols=len(tarfs), figsize=(2*len(tarfs), 2), sharey=True)

for ax, tarf in zip(axes, tarfs):
    tarseqs = get_fasta_seqs(tarf)
    tlens = list(map(len, tarseqs.values()))
    ax.hist(tlens)
    # Title: file name up to its first dot.
    ax.set_title(tarf.split('/')[-1].split('.')[0], fontsize=9)

In [None]:
# Count the sequences of the first FASTA in which the consensus target is
# located (measure_dist returns 999 as its "not found" distance).
tarseqs = get_fasta_seqs(tarfs[0])
inccon = []
for candidate in tarseqs.values():
    if measure_dist(tseq, candidate)[1] < 999:
        inccon.append(candidate)
print('# all variants: %s' % len(tarseqs))
print('# seq including the consensus: %s' % len(inccon))

In [None]:
# Copy the chosen FASTA into ASSORTPATH with a shell `cp` (IPython magic).
# NOTE(review): ASSORTPATH is not defined in the visible cells; presumably it
# comes from the star imports — confirm.
select = tarfs[0]
!cp $select $ASSORTPATH

### YFV

In [None]:
virus = 'YFV'
consensus_header = 25
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))

In [None]:
tarfs = [ f'{SEQPATH}/{virus}_all_2022.fasta', f'{SEQPATH}/{virus}_Africa_2015.fasta' ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True)

for tarf, ax in zip(tarfs, axes):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title(tarf.split('/')[-1].split('.')[0],fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(tarfs[0])
inccon = [seq for seq in tarseqs.values() if measure_dist(tseq, seq)[1]<999]
print('# all variants: %s' % len(tarseqs))
print('# seq including the consensus: %s' % len(inccon))

In [None]:
select = tarfs[0]
!cp $select $ASSORTPATH

### EBOV

In [None]:
virus = 'EBOVS'
consensus_header = 25
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))

In [None]:
# Single input file. squeeze=False makes plt.subplots return a 2-D axes array
# even for one panel, so the path does not have to be listed twice (which
# parsed and plotted the same file twice) just to get an iterable of axes.
tarfs = [ f'{SEQPATH}/{virus}_all.fasta' ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True, squeeze=False)

for tarf, ax in zip(tarfs, axes[0]):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title(tarf.split('/')[-1].split('.')[0],fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(tarfs[0])
inccon = [seq for seq in tarseqs.values() if measure_dist(tseq, seq)[1]<999]
print('# all variants: %s' % len(tarseqs))
print('# seq including the consensus: %s' % len(inccon))

In [None]:
virus = 'EBOVZ'
consensus_header = 25
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))

In [None]:
tarfs = [ f'{SEQPATH}/{virus}_from2015.fasta', f'{SEQPATH}/{virus}_from2022.fasta'  ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True)

for tarf, ax in zip(tarfs, axes):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(tarfs[1])
for seq in tarseqs.values():
    if measure_dist(tseq, seq)[1]<999:
        print('Seq includes the consensus')
        break
# inccon = [seq for seq in tarseqs.values() ]
# print('# all variants: %s' % len(tarseqs))
# print('# seq including the consensus: %s' % len(inccon))

In [None]:
select = tarfs[1]
!cp $select $ASSORTPATH

### WNV

In [None]:
virus = 'WNV'
consensus_header = 25
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))

In [None]:
tarfs = [ f'{SEQPATH}/{virus}_2022.fasta', f'{SEQPATH}/{virus}_Africa_2015.fasta'  ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True)

for tarf, ax in zip(tarfs, axes):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(tarfs[0])
for seq in tarseqs.values():
    if measure_dist(tseq, seq)[1]<999:
        print('Seq includes the consensus')
        break

In [None]:
select = tarfs[0]
!cp $select $ASSORTPATH

### HIV2

In [None]:
virus = 'HIV2'
consensus_header = 25
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))

In [None]:
# Single input file. squeeze=False makes plt.subplots return a 2-D axes array
# even for one panel, so the path does not have to be listed twice (which
# parsed and plotted the same file twice) just to get an iterable of axes.
tarfs = [ f'{SEQPATH}/{virus}_all.fasta' ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True, squeeze=False)

for tarf, ax in zip(tarfs, axes[0]):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(tarfs[0])
for seq in tarseqs.values():
    if measure_dist(tseq, seq)[1]<999:
        print('Seq includes the consensus')
        break

In [None]:
select = tarfs[0]
!cp $select $ASSORTPATH

### CHIK

In [None]:
virus = 'CHIK'
consensus_header = 25
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))

In [None]:
tarfs = [ f'{SEQPATH}/{virus}_all_2022.fasta', f'{SEQPATH}/{virus}_Africa_2015.fasta'  ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True)

for tarf, ax in zip(tarfs, axes):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(tarfs[0])
for seq in tarseqs.values():
    if measure_dist(tseq, seq)[1]<999:
        print('Seq includes the consensus')
        break

In [None]:
select = tarfs[0]
!cp $select $ASSORTPATH

### MBV

In [None]:
virus = 'MBV'
consensus_header = 25
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))

In [None]:
tarfs = [ f'{SEQPATH}/{virus}_from2022.fasta', f'{SEQPATH}/{virus}_from2015.fasta'  ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True)

for tarf, ax in zip(tarfs, axes):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(tarfs[0])
for seq in tarseqs.values():
    if measure_dist(tseq, seq)[1]<999:
        print('Seq includes the consensus')
        break

In [None]:
select = tarfs[0]
!cp $select $ASSORTPATH

### HBV

In [None]:
virus = 'HBV'
consensus_header = 25
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))

In [None]:
tarfs = [ f'{SEQPATH}/{virus}_all_2022.fasta', f'{SEQPATH}/{virus}_Africa_2015.fasta'  ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True)

for tarf, ax in zip(tarfs, axes):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(tarfs[0])
for seq in tarseqs.values():
    if measure_dist(tseq, seq)[1]<999:
        print('Seq includes the consensus')
        break

In [None]:
select = tarfs[0]
!cp $select $ASSORTPATH

### MMV

In [None]:
virus = 'MMV'
consensus_header = 25
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))

In [None]:
# Single input file. squeeze=False makes plt.subplots return a 2-D axes array
# even for one panel, so the path does not have to be listed twice (which
# parsed and plotted the same file twice) just to get an iterable of axes.
tarfs = [ f'{SEQPATH}/{virus}_all_2022.fasta' ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True, squeeze=False)

for tarf, ax in zip(tarfs, axes[0]):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(tarfs[0])
for seq in tarseqs.values():
    if measure_dist(tseq, seq)[1]<999:
        print('Seq includes the consensus')
        break

In [None]:
select = tarfs[0]
!cp $select $ASSORTPATH

### RBV

In [None]:
virus = 'RBV'
consensus_header = 25
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))

In [None]:
tarfs = [ f'{SEQPATH}/{virus}_all_2022.fasta', f'{SEQPATH}/{virus}_Africa_2015.fasta'  ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True)

for tarf, ax in zip(tarfs, axes):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(tarfs[0])
for seq in tarseqs.values():
    if measure_dist(tseq, seq)[1]<999:
        print('Seq includes the consensus')
        break

In [None]:
select = tarfs[0]
!cp $select $ASSORTPATH

### RVFV

In [None]:
virus = 'RVFV'
consensus_header = 25
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))

In [None]:
tarfs = [ f'{SEQPATH}/{virus}_all_2022.fasta', f'{SEQPATH}/{virus}_Africa_2015.fasta'  ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True)

for tarf, ax in zip(tarfs, axes):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(tarfs[0])
for seq in tarseqs.values():
    if measure_dist(tseq, seq)[1]<999:
        print('Seq includes the consensus')
        break

In [None]:
select = tarfs[0]
!cp $select $ASSORTPATH

### HIV1

In [None]:
virus = 'HIV1'
consensus_header = 25
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))

In [None]:
tarfs = [ f'{SEQPATH}/{virus}_all_2022.fasta', f'{SEQPATH}/{virus}_Africa_2015.fasta'  ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True)

for tarf, ax in zip(tarfs, axes):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(tarfs[0])
for seq in tarseqs.values():
    if measure_dist(tseq, seq)[1]<999:
        print('Seq includes the consensus')
        break

In [None]:
select = tarfs[0]
!cp $select $ASSORTPATH

### Zika

In [None]:
virus = 'Zika'
consensus_header = 25
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))

In [None]:
tarfs = [ f'{SEQPATH}/{virus.upper()}_all_2022.fasta', f'{SEQPATH}/{virus.upper()}_Africa_2015.fasta'  ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True)

for tarf, ax in zip(tarfs, axes):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(tarfs[0])
for seq in tarseqs.values():
    if measure_dist(tseq, seq)[1]<999:
        print('Seq includes the consensus')
        break

In [None]:
select = tarfs[0]
!cp $select $ASSORTPATH

### DVT

In [None]:
virus = 'DVT'
consensus_header = 25
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R1','Sequence'])
tseq = contbl.loc[f'DengueT1_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))

In [None]:
# Check the shared DVT forward primer and the R2 reverse primer against the
# DengueT2 gBlock target.
virus = 'DVT'
consensus_header = 25  # leading characters sliced off the gBlock sequence
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R2','Sequence'])
tseq = contbl.loc['DengueT2_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))
# Report the closest-window distance for every expanded variant that is absent
# from the target, not only for the last fseq/rseq left bound by the loops above.
for fseq in fseqs:
    if fseq not in tseq:
        print('For - Target dist: %i' % get_min_dist(fseq, tseq))
for rseq in rseqs:
    rseq_rc = reverse_complement_dna(rseq)
    if rseq_rc not in tseq:
        print('Rev - Target dist: %i' % get_min_dist(rseq_rc, tseq))

In [None]:
# Check the shared DVT forward primer and the R3 reverse primer against the
# DengueT3 gBlock target.
virus = 'DVT'
consensus_header = 25  # leading characters sliced off the gBlock sequence
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R3','Sequence'])
tseq = contbl.loc['DengueT3_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))
# Report the closest-window distance for every expanded variant that is absent
# from the target, not only for the last fseq/rseq left bound by the loops above.
for fseq in fseqs:
    if fseq not in tseq:
        print('For - Target dist: %i' % get_min_dist(fseq, tseq))
for rseq in rseqs:
    rseq_rc = reverse_complement_dna(rseq)
    if rseq_rc not in tseq:
        print('Rev - Target dist: %i' % get_min_dist(rseq_rc, tseq))

In [None]:
# Check the shared DVT forward primer and the R4 reverse primer against the
# DengueT4 gBlock target.
virus = 'DVT'
consensus_header = 25  # leading characters sliced off the gBlock sequence
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R4','Sequence'])
tseq = contbl.loc['DengueT4_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))
# Report the closest-window distance for every expanded variant that is absent
# from the target, not only for the last fseq/rseq left bound by the loops above.
for fseq in fseqs:
    if fseq not in tseq:
        print('For - Target dist: %i' % get_min_dist(fseq, tseq))
for rseq in rseqs:
    rseq_rc = reverse_complement_dna(rseq)
    if rseq_rc not in tseq:
        print('Rev - Target dist: %i' % get_min_dist(rseq_rc, tseq))

In [None]:
# One FASTA per DVT file number (1-4); plot their sequence-length distributions.
tarfs = [f'{SEQPATH}/{virus}{n}_all_2022.fasta' for n in range(1, 5)]
fig, axes = plt.subplots(nrows=1, ncols=len(tarfs), figsize=(2*len(tarfs), 2), sharey=True)

for ax, tarf in zip(axes, tarfs):
    tarseqs = get_fasta_seqs(tarf)
    tlens = list(map(len, tarseqs.values()))
    ax.hist(tlens)
    # Title: file name up to its first dot, plus the number of sequences.
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
# For each of the four DengueT gBlocks, report whether at least one sequence of
# the matching FASTA contains the consensus target (distance below the 999
# "not found" sentinel); stop scanning a file at the first hit.
for i in range(1, 5):
    gblock_name = f'DengueT{i}_v2_gblock_1'
    tseq = contbl.loc[gblock_name, 'Sequence'][consensus_header:]
    tarseqs = get_fasta_seqs(tarfs[i-1])
    for seq in tarseqs.values():
        if measure_dist(tseq, seq)[1] >= 999:
            continue
        print(f'Seq includes the consensus DVT{i}')
        break

In [None]:
select = f'{SEQPATH}/DVT*_all_2022.fasta'
#!cp $select $ASSORTPATH

### HCV

In [None]:
# Check the HCV forward primer and the R1 reverse primer against the HCV
# gBlock target.
virus = 'HCV'
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R1','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))
# Report the closest-window distance for every expanded variant that is absent
# from the target, not only for the last fseq/rseq left bound by the loops above.
for fseq in fseqs:
    if fseq not in tseq:
        print('For - Target dist: %i' % get_min_dist(fseq, tseq))
for rseq in rseqs:
    rseq_rc = reverse_complement_dna(rseq)
    if rseq_rc not in tseq:
        print('Rev - Target dist: %i' % get_min_dist(rseq_rc, tseq))

In [None]:
# Check the forward primer and the R2 reverse primer of the current `virus`
# (set in an earlier cell) against its gBlock target.
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R2','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))
# Report the closest-window distance for every expanded variant that is absent
# from the target, not only for the last fseq/rseq left bound by the loops above.
for fseq in fseqs:
    if fseq not in tseq:
        print('For - Target dist: %i' % get_min_dist(fseq, tseq))
for rseq in rseqs:
    rseq_rc = reverse_complement_dna(rseq)
    if rseq_rc not in tseq:
        print('Rev - Target dist: %i' % get_min_dist(rseq_rc, tseq))

In [None]:
tarfs = [ f'{SEQPATH}/{virus.upper()}_2022.fasta', f'{SEQPATH}/{virus.upper()}_Africa.fasta'  ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True)

for tarf, ax in zip(tarfs, axes):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
def measure_dist(seq, ref, dcut=5):
    """Locate `seq` in `ref`, exactly or within an edit-distance cutoff.

    This redefinition uses a default cutoff of 5 and replaces any earlier
    definition of the same name for all cells executed afterwards.

    Args:
        seq: query sequence.
        ref: sequence to search in.
        dcut: largest distance (as computed by `distance`) accepted for an
            approximate hit.

    Returns:
        (position, distance): (index, 0) for an exact occurrence; otherwise the
        first window of len(seq) whose distance to `seq` is <= dcut, together
        with that distance; (-1, 999) when `ref` is shorter than `seq` or no
        window qualifies.
    """
    if len(ref) < len(seq):
        return (-1, 999)
    exact_pos = ref.find(seq)
    if exact_pos != -1:
        return (exact_pos, 0)
    width = len(seq)
    # +1 so that the final window ref[-width:] is examined as well.
    for i in range(len(ref) - width + 1):
        d = distance(seq, ref[i:i + width])
        if d <= dcut:
            return (i, d)
    return (-1, 999)

# For every sequence of the first FASTA in which both the first forward variant
# and the reverse complement of the first reverse variant are located (exactly
# or within measure_dist's cutoff), print the offset between the two match
# positions. Only index 0 of fseqs/rseqs is used here.
tarseqs = get_fasta_seqs(tarfs[0])
for seq in tarseqs.values():
    fpos, fd = measure_dist(fseqs[0], seq)
    rpos, rd = measure_dist(reverse_complement_dna(rseqs[0]), seq)
    # 999 is measure_dist's "not found" distance.
    if fd<999 and rd<999:
        print(rpos-fpos)
        #print('Seq includes the consensus')
        #break

In [None]:
select = tarfs[0]
!cp $select $ASSORTPATH

### ONN

In [None]:
# Check the ONN forward and reverse primers against the ONN gBlock target.
virus = 'ONN'
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))
# Report the closest-window distance for every expanded variant that is absent
# from the target, not only for the last fseq/rseq left bound by the loops above.
for fseq in fseqs:
    if fseq not in tseq:
        print('For - Target dist: %i' % get_min_dist(fseq, tseq))
for rseq in rseqs:
    rseq_rc = reverse_complement_dna(rseq)
    if rseq_rc not in tseq:
        print('Rev - Target dist: %i' % get_min_dist(rseq_rc, tseq))

In [None]:
# Single input file. squeeze=False makes plt.subplots return a 2-D axes array
# even for one panel, so the path does not have to be listed twice (which
# parsed and plotted the same file twice) just to get an iterable of axes.
tarfs = [ f'{SEQPATH}/{virus}.fasta' ]
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True, squeeze=False)

for tarf, ax in zip(tarfs, axes[0]):
    tarseqs = get_fasta_seqs(tarf)
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(tarfs[0])
for seq in tarseqs.values():
    if measure_dist(tseq, seq)[1]<999:
        print('Seq includes the consensus')
        break

In [None]:
select = tarfs[0]
!cp $select $ASSORTPATH

### LASV

In [None]:
# Check the LASV_S forward and reverse primers against the LASV gBlock target.
virus = 'LASV'
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_S_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_S_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))
# Report the closest-window distance for every expanded variant that is absent
# from the target, not only for the last fseq/rseq left bound by the loops above.
for fseq in fseqs:
    if fseq not in tseq:
        print('For - Target dist: %i' % get_min_dist(fseq, tseq))
for rseq in rseqs:
    rseq_rc = reverse_complement_dna(rseq)
    if rseq_rc not in tseq:
        print('Rev - Target dist: %i' % get_min_dist(rseq_rc, tseq))

In [None]:
# os.listdir returns names in arbitrary order, so sort them to make tarfs[0]
# (used by the following cells) reproducible. squeeze=False keeps `axes` a 2-D
# array so the loop also works when only one file matches.
# Note: tarfs holds bare file names here, not full paths.
tarfs = sorted(f for f in os.listdir(f'{SEQPATH}') if f.startswith(f'{virus}_S_'))
fig, axes = plt.subplots(1, len(tarfs), figsize=(2*len(tarfs), 2), sharey=True, squeeze=False)

for tarf, ax in zip(tarfs, axes[0]):
    tarseqs = get_fasta_seqs(f'{SEQPATH}/{tarf}')
    tlens = [ len(seq) for seq in tarseqs.values() ]
    ax.hist(tlens)
    ax.set_title('%s (n=%i)' % (tarf.split('/')[-1].split('.')[0], len(tlens)), fontsize=9)

In [None]:
tarseqs = get_fasta_seqs(f'{SEQPATH}/{tarfs[0]}')
for seq in tarseqs.values():
    if measure_dist(tseq, seq)[1]<15:
        print('Seq includes the consensus')
        break

In [None]:
select = tarfs[0]
!cp $select $ASSORTPATH

### Mpox

In [None]:
# Check the Mpox_B1 forward and reverse primers against the Mpox_B1 gBlock target.
virus = 'Mpox_B1'
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))
# Report the closest-window distance for every expanded variant that is absent
# from the target, not only for the last fseq/rseq left bound by the loops above.
for fseq in fseqs:
    if fseq not in tseq:
        print('For - Target dist: %i' % get_min_dist(fseq, tseq))
for rseq in rseqs:
    rseq_rc = reverse_complement_dna(rseq)
    if rseq_rc not in tseq:
        print('Rev - Target dist: %i' % get_min_dist(rseq_rc, tseq))

In [None]:
# Check the Mpox_CladeI forward and reverse primers against the Mpox_CladeI
# gBlock target.
virus = 'Mpox_CladeI'
fseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_F','Sequence'])
rseqs = expand_degenerate_sequence(pritbl.loc[f'{virus}_v2_R','Sequence'])
tseq = contbl.loc[f'{virus}_v2_gblock_1','Sequence'][consensus_header:]
print(f'For: {fseqs}\nRev: {rseqs}\nTarget: {tseq}')
for fseq in fseqs:
    for rseq in rseqs:
        print('For, Rev in Target: %i, %i' % (fseq in tseq, reverse_complement_dna(rseq) in tseq))
# Report the closest-window distance for every expanded variant that is absent
# from the target, not only for the last fseq/rseq left bound by the loops above.
for fseq in fseqs:
    if fseq not in tseq:
        print('For - Target dist: %i' % get_min_dist(fseq, tseq))
for rseq in rseqs:
    rseq_rc = reverse_complement_dna(rseq)
    if rseq_rc not in tseq:
        print('Rev - Target dist: %i' % get_min_dist(rseq_rc, tseq))

## Combine target seqs

In [None]:
# Pair each FASTA in ASSORTPATH (files starting with 'bbp' excluded) with a
# virus name: the text before the first '_' of the file name, or before the
# first '.' when the name contains no underscore.
allvs = []
for fname in sorted(os.listdir(ASSORTPATH)):
    if fname.startswith('bbp'):
        continue
    separator = '_' if '_' in fname else '.'
    allvs.append((fname.split(separator)[0], fname))
allvns = [pair[0] for pair in allvs]
fastas = dict(allvs)  # virus name -> file name (a later duplicate name wins)
print('# virus: %i' % len(allvs), allvs[0])

In [None]:
# for virus in allvns:
#     parsed = f'{SAMPATH}/{virus}.parsed'
#     mapped = f'{SAMPATH}/{virus}.target.mapped'
#     mapful = f'{SAMPATH}/{virus}.target.mapped.full'
    
#     newparsed = f'{SAMPATH}/{virus}.{virus}.parsed'
#     newmapped = f'{SAMPATH}/{virus}.{virus}.mapped'
#     newmapful = f'{SAMPATH}/{virus}.{virus}.mapped.full'
#     #!mv $parsed $newparsed
#     !mv $mapped $newmapped
#     !mv $mapful $newmapful

In [None]:
# bbpf = f'{ASSORTPATH}/bbp_combined_0810.fa'
# with open(bbpf, 'wt') as out:
#     for vname, fasta in allvs:
#         for s in SeqIO.parse(f'{ASSORTPATH}/{fasta}', 'fasta'):
#             seqname = f'{vname}_{s.id}'
#             seq = str(s.seq)
#             out.write(f'>{seqname}\n{seq}\n')
# !head -1 $bbpf

### LASV and HCV

In [None]:
fs_lassa = [ f for f in os.listdir(SEQPATH) if 'LASV_S_' in f ]
fs_lassa

In [None]:
# Merge all LASV_S FASTA files into one id -> sequence dict, printing for each
# file its name, number of sequences and shortest sequence length.
seqs_lassa = {}
for f in fs_lassa:
    seqs = get_fasta_seqs(f'{SEQPATH}/{f}')
    shortest = np.min(list(map(len, seqs.values())))
    print(f, len(seqs), shortest)
    seqs_lassa.update(seqs)  # ids repeated across files are overwritten
print(len(seqs_lassa))

In [None]:
fs_hcv = [ f for f in os.listdir(SEQPATH) if 'HCV' in f ]
fs_hcv

In [None]:
# Merge all HCV FASTA files into one id -> sequence dict, printing for each
# file its name and number of sequences.
seqs_hcv = {}
for f in fs_hcv:
    seqs = get_fasta_seqs(f'{SEQPATH}/{f}')
    n_seqs = len(seqs)
    print(f, n_seqs)
    seqs_hcv.update(seqs)  # ids repeated across files are overwritten
print(len(seqs_hcv))

## Get published primer seqs

In [None]:
# Collect the published primers per virus as (primer_name, sequence) pairs.
# Primer names follow '<virus>_pub_<running number>_<f|r>'; degenerate primers
# contribute one entry per expanded sequence.
pub_pris = defaultdict(list)
vcnts = defaultdict(int)  # running primer number per (virus, orientation)
for pname, row in pritbl.iterrows():
    if pname.startswith('Mpox_B1_v2'):
        vn = 'mpoxIIb'
    elif pname.startswith('Mpox_CladeI'):
        vn = 'mpoxIaIb'
    elif pname == 'DVT_v2_F':
        # The single DVT forward primer is registered under DVT1..DVT4.
        # NOTE(review): only the first expanded sequence is kept, and neither
        # allvns nor vcnts is consulted on this path — confirm this is intended.
        seq = expand_degenerate_sequence(row['Sequence'])[0]
        for i in range(1,5):
            vn = f'DVT{i}'
            pub_pris[vn].append((f'{vn}_pub_1_f', seq))
        continue
    elif pname.startswith('DVT'):
        # e.g. 'DVT_v2_R1' -> virus 'DVT1', primer name 'DVT_v2_R'
        vn = pname.split('_')[0] + pname[-1]
        pname = pname[:-1]
    elif pname.startswith('Zika'):
        vn = 'ZIKA'
    else:
        vn = pname.split('_')[0]
    if vn not in allvns:
        continue

    # Orientation is the last '_' field; a two-character field such as 'R1'
    # is reduced to its first letter.
    ori = pname.split('_')[-1]
    ori = ori[0].lower() if len(ori) == 2 else ori.lower()

    for seq in expand_degenerate_sequence(row['Sequence']):
        vcnts[(vn,ori)] += 1
        pub_pris[vn].append((f'{vn}_pub_{vcnts[(vn,ori)]}_{ori}', seq))

for vn in pub_pris:
    print(vn, len(pub_pris[vn]))
# len(pub_pris) counts viruses, so report the primer total separately.
print('# viruses with published primers: %i' % len(pub_pris))
print('# published primers: %i' % sum(len(pris) for pris in pub_pris.values()))

## Generate primers

### Prep

In [None]:
## Sequences of the targets
#TARGETS = '/home/jupyter/ADAPT_PCR_share/safe/design/pipeline/H5N1_all_seqs_01_24_to_05_25.fasta'

## Sequences of the host
HOST = '/home/jupyter/ADAPT_PCR_share/safe/resources/genomes/Homo_sapiens.GRCh38.tx.sort.fa.gz'

## Bowtie2 index path
IDXPATH = '/home/jupyter/ADAPT_PCR_share/safe/resources/genomes/bowtie2'

## Output path
WORKPATH = '/home/jupyter/ADAPT_PCR_share/safe/design/BBP'

## Output name (a directory will be made with this name under the output path)
OUTNAME = '0810_test'

In [None]:
# Create the run directory and the sub-directories this notebook uses.
# exist_ok=True makes the cell safe to re-run.
OUTPATH = '%s/%s' % (WORKPATH, OUTNAME)
os.makedirs(OUTPATH, exist_ok=True)

PRIPATH = '%s/primer_seqs' % OUTPATH
TARPATH = '%s/target_seqs' % OUTPATH
SAMPATH = '%s/alignments' % OUTPATH
INPPATH = '%s/inputs' % OUTPATH
OUPPATH = '%s/outputs' % OUTPATH
TMPPATH = '%s/temp' % OUTPATH
# INPPATH and OUPPATH are included because later cells read inputs from and
# write results to them. TARPATH is left alone: it must already hold the
# target FASTA files.
for path in [PRIPATH, SAMPATH, INPPATH, OUPPATH, TMPPATH]:
    os.makedirs(path, exist_ok=True)

LOG = '%s/log' % OUTPATH
# with open(LOG, 'wt') as out:
#     out.write('''Input files
#   TARGETS = '/home/jupyter/ADAPT_PCR_share/safe/design/pipeline/H5N1_all_seqs_01_24_to_05_25.fasta'
#   SPECIFICITY = '/home/jupyter/ADAPT_PCR_share/safe/design/H5/target_seqs/h1h3_combined.fa'
#   HOST = '/home/jupyter/ADAPT_PCR_share/safe/resources/genomes/Homo_sapiens.GRCh38.tx.sort.fa.gz'
#   IDXPATH = '/home/jupyter/ADAPT_PCR_share/safe/resources/genomes/bowtie2'
#   PARAMFILE = '/home/jupyter/ADAPT_PCR_share/safe/design/pipeline/params.txt'
#   WORKPATH = '/home/jupyter/ADAPT_PCR_share/safe/design/pipeline'
#   OUTNAME = '0728_H5'\n\n''')

In [None]:
# Pair every file in TARPATH with a virus name: the text before the first
# underscore, or before the first dot when the file name has no underscore.
allvs = [(fname.partition('_' if '_' in fname else '.')[0], fname)
         for fname in sorted(os.listdir(TARPATH))]
allvns = [pair[0] for pair in allvs]
fastas = dict(allvs)
print('# virus: %i' % len(allvs), allvs[0])

In [None]:
# Load "key = value" run parameters from params.txt in the run directory.
# Every line is echoed; lines without '=' are otherwise ignored.
PARAMFILE = f'{OUTPATH}/params.txt'
PARAMS = {}
with open(PARAMFILE, 'rt') as paramfile:
    for l in paramfile:
        print(l.strip())
        if '=' not in l:
            continue
        # Split on the first '=' only, so a value may itself contain '='.
        param, value = (x.strip() for x in l.split('=', 1))
        # Integers stay integers; decimal values become floats so numeric
        # thresholds can be compared; anything else is kept as a string.
        for cast in (int, float):
            try:
                PARAMS[param] = cast(value)
                break
            except ValueError:
                continue
        else:
            PARAMS[param] = value

In [None]:
from Bio.SeqUtils import MeltingTemp, gc_fraction
def get_tm(seq):
    """Return the melting temperature of *seq* from ``MeltingTemp.Tm_NN``.

    The call fixes Na=50, Mg=1.5 and dNTPs=0.6; all other arguments keep
    Biopython's defaults.
    """
    conditions = dict(Na=50, Mg=1.5, dNTPs=.6)
    return MeltingTemp.Tm_NN(seq, **conditions)

def get_gc(seq):
    """Return the GC content of *seq* as a percentage (``gc_fraction`` times 100)."""
    fraction = gc_fraction(seq)
    return fraction*100

def get_dg_vienna(seq1, seq2):
    # Return the energy reported by RNAduplex for the pair seq1/seq2.
    # This uses IPython's `!` shell capture, so it only runs inside
    # IPython/Jupyter. `RNAduplex` must be a notebook variable holding the
    # executable path; it is not defined in this cell.
    inp = '"%s\n%s"'%(seq1,seq2)
    # stderr is redirected to a file named "tmp" in the working directory.
    res = !echo -e $inp | $RNAduplex - --noconv --paramFile=DNA 2>tmp
    # The last whitespace-separated field of the first output line is parsed
    # after dropping its first and last characters (presumably parentheses).
    dg = float(res[0].split()[-1][1:-1])
    return dg    

def get_primers_single_target(target_seq, step):
    """Tile fixed-length primer candidates along one target sequence.

    Windows of ``PARAMS['PRIMER_LEN']`` bases are taken every *step* bases
    from both *target_seq* and its reverse complement, for offsets below
    ``len(target_seq) - PRIMER_LEN - AMPLEN_MIN``. Windows containing 'N'
    are skipped.

    Returns:
        (forps, revps): dicts keyed by primer sequence. A forward primer maps
        to its offset; a reverse primer maps to ``len(target_seq) - offset``.
        A sequence seen at several offsets keeps the last one.
    """
    primer_len = PARAMS['PRIMER_LEN']
    total_len = len(target_seq)
    last_start = total_len - primer_len - PARAMS['AMPLEN_MIN']
    rc_seq = reverse_complement_dna(target_seq)

    forward, reverse = {}, {}
    for offset in range(0, last_start, step):
        window = slice(offset, offset + primer_len)
        for strand_seq, store, position in (
            (target_seq, forward, offset),
            (rc_seq, reverse, total_len - offset),
        ):
            candidate = strand_seq[window]
            if 'N' in candidate:
                continue
            store[candidate] = position
    return forward, reverse

def get_primers_multi_target(target_seqs, step=None):
    """Tile primers over several targets and pre-filter them.

    Args:
        target_seqs: iterable of target sequences.
        step: tiling step. ``None`` (the default) reads
            ``PARAMS['TILING_STEP']`` at call time, so re-loading PARAMS takes
            effect without redefining this function.

    Returns:
        (forps, revps, for_filt, rev_filt): all unique forward/reverse primers
        (sequence -> position, as produced by ``get_primers_single_target``)
        followed by the subsets that pass the filters below.
    """
    if step is None:
        step = PARAMS['TILING_STEP']

    forps, revps = {}, {}
    for tseq in target_seqs:
        flist, rlist = get_primers_single_target(tseq, step)
        forps.update(flist)
        revps.update(rlist)

    for_filt, rev_filt = {}, {}
    for plist, filt in zip([forps,revps],[for_filt,rev_filt]):
        for pseq, pos in plist.items():
            gc = get_gc(pseq)
            tm = get_tm(pseq)
            # Keep primers whose Tm is inside [TM_MIN, TM_MAX] and whose GC
            # value does not exceed GC_MAX.
            if not (PARAMS['TM_MIN']<=tm<=PARAMS['TM_MAX'] and gc<=PARAMS['GC_MAX']):
                continue
            # The self-duplex energy needs an external call per primer, so it
            # is evaluated only for primers that passed the cheaper filters.
            if PARAMS['DG_MIN']<=get_dg_vienna(pseq, pseq):
                filt[pseq] = pos
    return forps, revps, for_filt, rev_filt

def count_primer_pairs(sts, ens, minlen=None, maxlen=None):
    """Count (start, end) combinations with ``minlen <= end - start <= maxlen``.

    Args:
        sts: iterable of start positions.
        ens: iterable of end positions (need not be sorted).
        minlen: smallest allowed distance. ``None`` reads
            ``PARAMS['AMPLEN_MIN']`` at call time.
        maxlen: largest allowed distance. ``None`` reads
            ``PARAMS['AMPLEN_MAX']`` at call time.

    Returns:
        The number of qualifying pairs, found by binary search over the
        sorted end positions.
    """
    if minlen is None:
        minlen = PARAMS['AMPLEN_MIN']
    if maxlen is None:
        maxlen = PARAMS['AMPLEN_MAX']

    ens_sorted = sorted(ens)
    count = 0
    for st in sts:
        left = bisect.bisect_left(ens_sorted, st + minlen)
        right = bisect.bisect_right(ens_sorted, st + maxlen)
        count += (right - left)
    return count

### Generate

In [None]:
# Look up the design sequence of every virus in the gBlock sheet. The sheet
# spells some virus names differently, so those are translated first; the
# first `consensus_header` characters of each sequence are dropped.
consen_seqs = {}
for vn0 in allvns:
    if vn0.startswith('DVT'):
        vn = vn0.replace('DVT','DengueT')
    else:
        vn = {'ZIKA': 'Zika', 'mpoxIIb': 'Mpox_B1', 'mpoxIaIb': 'Mpox_CladeI'}.get(vn0, vn0)
    tseq = contbl.loc[f'{vn}_v2_gblock_1','Sequence'][consensus_header:]
    consen_seqs[vn0] = tseq
print(len(consen_seqs))

In [None]:
# Choose a 1500-base design window per virus around the position where the
# consensus (gBlock) sequence is located in the target genomes.
slicers = {}
for vn in ['LASV']:# allvns:
    fasta = f'{TARPATH}/{fastas[vn]}'
    tarseqs = list(get_fasta_seqs(fasta).values())
    if not tarseqs:
        raise ValueError(f'no target sequences found in {fasta}')
    consen = consen_seqs[vn]
    # Use the first target in which the consensus is found.
    # NOTE(review): 999 looks like measure_dist's "not found" sentinel - confirm.
    for seq in tarseqs:
        st, dist = measure_dist(consen, seq)
        if dist < 999:
            break
    else:
        # Fallback: anchor on the first published forward primer, measured
        # against the last target. pub_pris is keyed by virus name and holds
        # (primer_name, sequence) tuples, so the primer must be looked up
        # inside that list; .get() avoids adding an empty defaultdict entry.
        pubname = f'{vn}_pub_1_f'
        pubseq = dict(pub_pris.get(vn, [])).get(pubname)
        if pubseq is None:
            raise KeyError(f'no published primer {pubname} to anchor the design window')
        st, dist = measure_dist(pubseq, seq)

    # Clamp the window to the start or end of the genome; otherwise snap it
    # to a 500-base grid beginning one grid cell before the hit.
    if st < 500:
        sl = slice(None, 1500)
    elif st > min(map(len, tarseqs)) - 1500:
        sl = slice(-1500, None)
    else:
        sl = slice((st//500)*500-500, (st//500)*500+1000)
    slicers[vn] = sl
    print(vn, sl)

In [None]:
pub_pris['LASV']

In [None]:
# Tile candidate primers over the design window of each virus and write them,
# together with the published primers, to "<PRIPATH>/<virus>_pre.fa".
start = time.time()
for vname in ['LASV']:# allvns:
    TARGETS = f'{TARPATH}/{fastas[vname]}'
    sl = slicers[vname]
    target_seqs = [str(s.seq) for s in SeqIO.parse(TARGETS,'fasta')]
    # The mean length is taken over the full sequences, before slicing.
    tlen = np.average([len(tseq) for tseq in target_seqs])
    target_seqs = [ tseq[sl] for tseq in target_seqs ]
    forps, revps, for_filt, rev_filt = get_primers_multi_target(target_seqs)
    # Pair counts are computed from the positions stored as dict values.
    allps = count_primer_pairs(forps.values(), revps.values())
    selps = count_primer_pairs(for_filt.values(), rev_filt.values())
    print('%s (n=%i, mean len=%.0f bp, design=%s-%s)' % (vname, len(target_seqs), tlen, sl.start, sl.stop))
    print('Unique primers: %i forward, %i reverse, %i pairs' % (len(forps), len(revps), allps))
    print('Pre-selected: %i forward, %i reverse, %i pairs' % (len(for_filt), len(rev_filt), selps))

    # FASTA layout: published primers first, then the filtered candidates
    # named "<virus>_ai_<n>_f" / "<virus>_ai_<n>_r" with n starting at 1.
    prefa = f'{PRIPATH}/{vname}_pre.fa'
    with open(prefa, 'wt') as out:
        for pname, seq in pub_pris[vname]:
            out.write(f'>{pname}\n{seq}\n')
        for i, pseq in enumerate(for_filt.keys()):
            out.write('>%s_ai_%s_f\n%s\n'%(vname, i+1, pseq))
        for i, pseq in enumerate(rev_filt.keys()):
            out.write('>%s_ai_%s_r\n%s\n'%(vname, i+1, pseq))

#     with open(LOG, 'a') as out:
#         out.write(' %s (n=%i, mean len=%.0f bp, design=%s-%s)\n' % (vname, len(target_seqs), tlen, sl.start, sl.stop))
#         out.write(' Unique primers: %i forward, %i reverse, %i pairs\n' % (len(forps), len(revps), allps))
#         out.write(' Pre-selected: %i forward, %i reverse, %i pairs\n' % (len(for_filt), len(rev_filt), selps))
    
runtime = (time.time() - start)
print('Run time: %.3f sec' % runtime)

In [None]:
allfa = f'{PRIPATH}/all_pre.fa'
# with open(allfa, 'a') as allout:
#     for pname in sorted(pub_pris.keys()):
#         seq = pub_pris[pname]
#         allout.write(f'>{pname}\n{seq}\n')
!wc -l $allfa

In [None]:
# for vname, fasta in allvs:
#     prefa = f'{PRIPATH}/{vname}_pre.fa'
#     allps = get_fasta_seqs(prefa)
#     print(vname, len([p for p in allps if 'pub' in p]))
#     with open(prefa, 'a') as out:
#         for pname, seq in pub_pris[vname]:
#             if pname not in allps:
#                 out.write(f'>{pname}\n{seq}\n')
#                 print(pname)
# !tail -4 $prefa

## Map

In [None]:
# Progress check for the mapping step (IPython shell magics): take the first
# field of the second-to-last SAM line and print the line number(s) at which
# it occurs in the primer FASTA, next to the FASTA's total line count.
# NOTE(review): both paths are hard-coded to the 0810_test run rather than
# built from SAMPATH/PRIPATH.
virus = 'LASV'
sam = f'/home/jupyter/ADAPT_PCR_share/safe/design/BBP/0810_test/alignments/{virus}.{virus}.sam'
fa = f'/home/jupyter/ADAPT_PCR_share/safe/design/BBP/0810_test/primer_seqs/{virus}_pre.fa'
key = f'"$(tail -2 {sam} | head -1 | cut -f1)"'
!wc -l $fa
!grep -n $key $fa

## Make input

In [None]:
#!du -h $INPPATH/*

In [None]:
def _ensure_list(x):
    if isinstance(x, list):
        return x
    if pd.isna(x):
        return []
    return [x]

# Build forward/reverse primer pairs from a primer-to-target alignment table.
# A pair is kept for a target when the reverse hit starts MIN_BP..MAX_BP bases
# after the forward hit on that same target.
MIN_BP, MAX_BP = 55, 180
names = ['pname','orientation','tname','start','pseq','tseq','match']

virus = 'ONN'
mapped = f'/home/jupyter/ADAPT_PCR_share/safe/design/BBP/0810_test/alignments/{virus}.{virus}.mapped.full'
#evalf = f'/home/jupyter/ADAPT_PCR_share/safe/design/BBP/0810_test/inputs/{virus}.{virus}.input'

# Collapse identical (primer, aligned primer, aligned target) rows and keep
# the list of targets and start positions each alignment was seen at.
raw = pd.read_table(mapped, sep='\t', names=names)
grouped = raw.groupby(['pname','pseq','tseq'])
maptbl = grouped.first()
maptbl['tnames'] = grouped['tname'].apply(list)
maptbl['starts'] = grouped['start'].apply(list)
maptbl = maptbl.reset_index()
# Gaps are '-' characters; mismatches are aligned columns that are neither a
# '|' in the match string nor a gap.
maptbl['pgap'] = maptbl['pseq'].apply(lambda x: x.count('-'))
maptbl['tgap'] = maptbl['tseq'].apply(lambda x: x.count('-'))
maptbl['indel'] = maptbl['pgap'] + maptbl['tgap']
maptbl['mm'] = maptbl['pseq'].apply(len) - maptbl['match'].apply(lambda x: x.count('|')) - maptbl['indel']
maptbl['pseq_raw'] = maptbl['pseq'].apply(lambda x: x.replace('-',''))
maptbl['len'] = maptbl['pseq_raw'].apply(len)
maptbl['Tm'] = maptbl['pseq_raw'].apply(get_tm)
# NOTE(review): GC is stored as a 0-1 fraction here, while get_gc() returns a
# percentage - confirm which scale the feature scaler expects.
maptbl['GC'] = maptbl['pseq_raw'].apply(gc_fraction)

# Forward primers must have orientation % 256 == 0, reverse primers
# orientation % 256 == 16; reverse alignments are passed through rev_com_enc.
subcols = ['pname','pseq','pseq_raw','tseq','orientation','mm','indel','len','Tm','GC','tnames','starts']
fors = maptbl.loc[maptbl.apply(lambda row:(row['orientation']%256==0)&('_f' in row['pname']),axis=1), subcols]
revs = maptbl.loc[maptbl.apply(lambda row:(row['orientation']%256==16)&('_r' in row['pname']),axis=1), subcols]
revs['pseq'] = revs['pseq'].apply(rev_com_enc)
revs['tseq'] = revs['tseq'].apply(rev_com_enc)

# Index reverse hits by target: rev_index[target] = [(reverse row id, start), ...].
# The loop variables carry an 'r' suffix so they do not overwrite the global
# `tnames` dict that other cells index by virus name.
revs_meta = revs.reset_index().rename(columns={"index": "r_id"})
rev_index = defaultdict(list)
for r in revs_meta.itertuples(index=False):
    tnamesr = _ensure_list(getattr(r, "tnames"))
    startsr = _ensure_list(getattr(r, "starts"))
    r_id    = getattr(r, "r_id")
    for t, st in zip(tnamesr, startsr):
        rev_index[t].append((r_id, int(st)))

# 1) Header flag for the (currently disabled) output file.
header_flag = True

# 2) Walk the forward primers one row at a time and match reverse primers.
for f in fors.itertuples():
    f_id    = f.Index
    tnamesf = _ensure_list(getattr(f, "tnames"))
    startsf = _ensure_list(getattr(f, "starts"))
    if not tnamesf or not startsf:
        continue

    # reverse row id -> set of targets on which both primers hit in range
    targets_by_r = defaultdict(set)

    for t_f, st_f in zip(tnamesf, startsf):
        cand = rev_index.get(t_f)
        if not cand:
            continue
        for r_id, st_r in cand:
            d = st_r - int(st_f)   # signed distance (reverse - forward)
            if MIN_BP <= d <= MAX_BP:
                targets_by_r[r_id].add(t_f)

    if not targets_by_r:
        continue

    # 3) Cross-merge this forward row with only the reverse rows it matched.
    rev_ids = list(targets_by_r.keys())
    revsub = revs.loc[rev_ids].copy()
    # Attach the shared-target list, looked up by row id so row order is irrelevant.
    revsub["targets"] = [sorted(list(targets_by_r[r_id])) for r_id in revsub.index]

    forsub = fors.loc[[f_id]].copy().drop(['tnames','starts'],axis=1)
    pairs = forsub.merge(revsub.drop(['tnames','starts'],axis=1), how="cross", suffixes=("_f", "_r"))

    # Product features are placeholders for now.
    pairs["prod_len"] = 125
    pairs["prod_Tm"] = 85

    #pairs.to_csv(evalf, mode="a", index=False, header=header_flag)
    #header_flag = False


## Evaluation with ML

In [None]:
def encode_row(row):
    """One-hot encode the four aligned sequences of one primer-pair record.

    Forward and reverse encodings are stacked along axis 0, separately for
    primers and targets; the target stack and the primer stack are then joined
    along axis 1 (target columns first).
    """
    fwd_primer, fwd_target, rev_primer, rev_target = (
        one_hot_encode(row[col]) for col in ('pseq_f', 'tseq_f', 'pseq_r', 'tseq_r')
    )
    primer_block = np.concatenate((fwd_primer, rev_primer), axis=0)
    target_block = np.concatenate((fwd_target, rev_target), axis=0)
    return np.concatenate((target_block, primer_block), axis=1)

def one_hot_encode_pbs_gap_parallel(df_seqs):
    """Encode every row of *df_seqs* with ``encode_row`` in a process pool.

    The pool uses one worker per CPU. The per-row arrays are stacked and
    returned as a float32 tensor (the original author notes the stacked shape
    as (batch, 56, 10)).
    """
    records = df_seqs.to_dict('records')
    with Pool(processes=cpu_count()) as workers:
        encoded_rows = workers.map(encode_row, records)

    stacked = np.array(encoded_rows)
    return torch.tensor(stacked, dtype=torch.float32)

In [None]:
import ast
from sklearn.preprocessing import MultiLabelBinarizer
featcols = ['len_f','Tm_f','GC_f','indel_f','mm_f','len_r','Tm_r','GC_r','indel_r','mm_r','prod_len','prod_Tm']
newcols = ['f_len','f_Tm','f_GC','f_indel','f_mm','r_len','r_Tm','r_GC','r_indel','r_mm','prod_len','prod_Tm']
inpcols = ['pname_f','pname_r','targets']

In [None]:
# tnames[virus] = record ids of that virus' target FASTA, in file order.
# Later cells use this list to fix the column order of the score tables and
# as the denominator for coverage.
tnames = {}
for virus in allvns:
    tnames[virus] = [s.id for s in SeqIO.parse(f'{TARPATH}/{fastas[virus]}', 'fasta')]

# Hand-picked subset of virus names; only its size is printed here.
selected = ['CCHFV','CHIK','EBOVZ','HCV','LASV','mpoxIaIb','mpoxIIb','WNV','YFV','ZIKA']
print(len(selected))

In [None]:
# virus = 'MMV'
# fa = f'{TARPATH}/{virus}_all_2022.fasta'
# pseq = 'CCTATGGGAGGTATAGAAGGG'
# rseq = reverse_complement_dna('AGCAGCCAGGTATAAATAGGG')
# for s in SeqIO.parse(fa, 'fasta'):
#     tseq = str(s.seq)
#     if pseq in tseq and rseq in tseq:
#         print(s.id, tseq.find(pseq), tseq.find(rseq))

In [None]:
virus = 'MMV'
clsf = f'{OUPPATH}/{virus}.{virus}.cls'
regf = f'{OUPPATH}/{virus}.{virus}.reg'
!rm -f $clsf $regf

In [None]:
# Score every candidate primer pair against its own virus with the classifier
# and the regressor, and append the pairs that pass the cut-offs to
# "<virus>.<virus>.cls" / "<virus>.<virus>.reg".
covcut = .1   # minimum fraction of targets with a non-zero score
scocut = .5   # minimum mean of the non-zero scores
for virus in ['LASV']:# allvns: 
    inpf = f'{INPPATH}/{virus}.{virus}.input'
    clsf = f'{OUPPATH}/{virus}.{virus}.cls'
    regf = f'{OUPPATH}/{virus}.{virus}.reg'
    # Skip viruses already evaluated; only the classifier file is checked.
    if os.path.exists(clsf):
        continue
    # The input is processed in chunks of 100000 rows.
    for i, chunk in tqdm(enumerate(pd.read_csv(inpf, chunksize=100000)), desc=virus):
        # 'targets' is stored as the text of a Python list.
        chunk['targets'] = chunk['targets'].apply(ast.literal_eval)

        inps = chunk[inpcols].copy()
        inps_feat = chunk[featcols]
        # Rename the feature columns to the names in `newcols` before scaling.
        inps_feat.columns = newcols
        inps_seq = chunk[['pseq_f','tseq_f','pseq_r','tseq_r']]

        seq_input = one_hot_encode_pbs_gap_parallel(inps_seq)
        fea_input = scaler.transform(inps_feat)
        # Dummy labels: the dataset takes a label argument, predictions ignore it.
        labels = np.array([0]*len(inps))
        dataset = PcrDataset(seq_input, fea_input, labels)
        loader = DataLoader(dataset, batch_size=64, shuffle=False)

        predict_cls, predict_reg = [], []
        with torch.no_grad():
            for seq_in, fea_in, lab in loader:
                seq_in, fea_in, lab = seq_in.to(device).float(), fea_in.to(device).float(), lab.to(device).float()
                out_cls = classifier(seq_in)
                out_reg = regressor(fea_in, seq_in)
                # squeeze() turns a batch of one into a 0-d array, which
                # np.concatenate rejects, so it is re-wrapped as 1-d.
                if len(seq_in)==1:
                    predict_cls.append(np.array([out_cls.squeeze().detach().cpu().numpy()]))
                    predict_reg.append(np.array([out_reg.squeeze().detach().cpu().numpy()]))
                else:
                    predict_cls.append(out_cls.squeeze().detach().cpu().numpy())
                    predict_reg.append(out_reg.squeeze().detach().cpu().numpy())
            predict_cls = np.concatenate(predict_cls)
            predict_reg = np.round(np.concatenate(predict_reg), decimals=3)
        inps.loc[:,'classifier'] = predict_cls
        inps.loc[:,'regressor'] = predict_reg

        # One indicator column per target name that occurs in this chunk.
        mlb = MultiLabelBinarizer()
        onehot = mlb.fit_transform(inps['targets'])
        target_cols = list(mlb.classes_)
        
        for label, savef in zip(['classifier','regressor'],[clsf,regf]):
            # Spread each row's score over its targets, then keep the maximum
            # per (forward, reverse) primer pair and target.
            targets_df = pd.DataFrame(onehot, columns=target_cols, index=inps.index)
            targets_df = targets_df.mul(inps[label], axis=0)
            evaltbl = pd.concat([inps.drop(columns=['targets','classifier','regressor']), targets_df], axis=1)
            agg_dict = {c: "max" for c in evaltbl.columns[2:]}
            evaltbl = evaltbl.groupby(['pname_f','pname_r']).agg(agg_dict)
            # Fixed column order across chunks; targets never hit get 0.
            evaltbl = evaltbl.reindex(columns=tnames[virus], fill_value=0)

            filt = evaltbl[(evaltbl!=0).sum(axis=1)/len(tnames[virus]) > covcut]
            filt = filt[filt.replace(0, np.nan).mean(axis=1) > scocut]
            # Published x published pairs are appended regardless of the
            # cut-offs, so a pair that also passed them appears twice.
            pubidx = [(f,r) for f,r in evaltbl.index if 'pub' in f and 'pub' in r]
            if pubidx:
                filt = pd.concat([filt, evaltbl.reindex(pubidx)])
            # Appended per chunk; the header is written with the first chunk only.
            filt.to_csv(savef, mode='a', header=(i==0), float_format="%.2f")

## Selection

In [None]:
# Collapse rows that share a (pname_f, pname_r) pair in the classifier table,
# keeping the per-column maximum, and rewrite the file in place.
for virus in ['LASV']:# allvns:
    clsf = f'{OUPPATH}/{virus}.{virus}.cls'
    clstbl = pd.read_csv(clsf)
    agg_dict = dict.fromkeys(clstbl.columns[2:], "max")
    clstbl = clstbl.groupby(['pname_f','pname_r']).agg(agg_dict)
    clstbl.to_csv(clsf)

In [None]:
# Collapse rows that share a (pname_f, pname_r) pair in the regressor table,
# keeping the per-column maximum, and rewrite the file in place.
for virus in ['LASV']:# allvns:
    regf = f'{OUPPATH}/{virus}.{virus}.reg'
    regtbl = pd.read_csv(regf)
    agg_dict = dict.fromkeys(regtbl.columns[2:], "max")
    regtbl = regtbl.groupby(['pname_f','pname_r']).agg(agg_dict)
    regtbl.to_csv(regf)

In [None]:
# Rank primer pairs by coverage x score, save the ranking to
# "<virus>.<virus>.result" and write the primers of the top 100 pairs (plus
# every published primer) to "<virus>_target.fa".
for virus in ['LASV']:# allvns:
    clsf = f'{OUPPATH}/{virus}.{virus}.cls'
    regf = f'{OUPPATH}/{virus}.{virus}.reg'
    resf = f'{OUPPATH}/{virus}.{virus}.result'
    
    regtbl = pd.read_csv(regf, index_col=[0,1])
    #agg_dict = {c: "max" for c in regtbl.columns[2:]}
    #regtbl = regtbl.groupby(['pname_f','pname_r']).agg(agg_dict)

    clstbl = pd.read_csv(clsf, index_col=[0,1])
    #agg_dict = {c: "max" for c in clstbl.columns[2:]}
    #clstbl = clstbl.groupby(['pname_f','pname_r']).agg(agg_dict)
    # Align the classifier rows to the regressor rows.
    clstbl = clstbl.reindex(regtbl.index)
    
    # coverage: percentage of targets with classifier score above 0.5
    # score: mean regressor value over those targets (zeros are ignored)
    coverage = ((clstbl>.5).mean(axis=1) * 100).reset_index(name='coverage')
    scores = (regtbl * (clstbl>.5)).replace(0, np.nan).mean(axis=1).reset_index(name='score')
    res = coverage.merge(scores, on=['pname_f','pname_r'])
    
    res['target'] = res['coverage'] * res['score']
    res.to_csv(resf)
    
    select = res.sort_values('target',ascending=False).iloc[:100]
    priseqs = { s.id:str(s.seq) for s in SeqIO.parse(f'{PRIPATH}/{virus}_pre.fa','fasta') }
    with open(f'{PRIPATH}/{virus}_target.fa', 'wt') as out:
        allpns = set(select['pname_f']) | set(select['pname_r'])
        for pname in allpns:
            out.write(f'>{pname}\n{priseqs[pname]}\n')
        # Published primers are always written, even when not selected.
        for pname in priseqs:
            if 'pub' in pname and pname not in allpns:
                out.write(f'>{pname}\n{priseqs[pname]}\n')

In [None]:
# Re-create "<virus>_target.fa" from an existing result file without
# recomputing the scores: top 100 pairs by 'target' plus all published primers.
virus = 'LASV'
resf = f'{OUPPATH}/{virus}.{virus}.result'
res = pd.read_csv(resf,index_col=0)
select = res.sort_values('target',ascending=False).iloc[:100]
priseqs = { s.id:str(s.seq) for s in SeqIO.parse(f'{PRIPATH}/{virus}_pre.fa','fasta') }
with open(f'{PRIPATH}/{virus}_target.fa', 'wt') as out:
    allpns = set(select['pname_f']) | set(select['pname_r'])
    for pname in allpns:
        out.write(f'>{pname}\n{priseqs[pname]}\n')
    # Published primers are always written, even when not selected.
    for pname in priseqs:
        if 'pub' in pname and pname not in allpns:
            out.write(f'>{pname}\n{priseqs[pname]}\n')

In [None]:
# One panel per virus: coverage vs. mean score of all primer pairs (black),
# the top 100 by 'target' (orange) and the published pairs (red).
pubs = {}
# The 5x5 grid holds at most 25 viruses; zip() silently drops any beyond that.
fig, axes = plt.subplots(5, 5, figsize=(10, 10), sharex=True, sharey=True)
for virus, ax in zip(allvns, axes.flatten()):
    resf = f'{OUPPATH}/{virus}.{virus}.result'
    res = pd.read_csv(resf,index_col=0)
    covs = res['coverage']
    scos = res['score']
    # NOTE(review): 'target' is recomputed here as sqrt(coverage * score),
    # whereas the result file stores coverage * score. The ranking is the
    # same; only the values differ.
    res['target'] = np.sqrt(covs * scos)
    
    select = res.sort_values('target',ascending=False).iloc[:100]   
    pub = res[res['pname_f'].apply(lambda x:'pub' in x) & res['pname_r'].apply(lambda x:'pub' in x)]
    # With no published pair, plot a placeholder at coverage 0, score 0.5.
    if pub.empty:
        pub = pd.DataFrame([[0]*len(res.columns)], columns=res.columns)
        pub['pname_f'] = f'{virus}_pub_1_f'
        pub['pname_r'] = f'{virus}_pub_1_r'
        pub['score'] = .5
    pubs[virus] = pub
    ax.scatter(covs, scos, s=1, alpha=.05, color='black')
    ax.scatter(select['coverage'], select['score'], s=5, alpha=1, color='tab:orange', label='Selected')
    ax.scatter(pub['coverage'], pub['score'], s=5, alpha=1, color='tab:red', label='Manual')
    ax.set_ylim(.45, 1.2)
    ax.text(3, 1.2, f'{virus}\nn = {len(covs)}', fontsize=9, va='top')
# The legend is attached to the last panel that was drawn.
ax.legend(loc=(1.05,.6), handlelength=1)
# Remove the unused panels of the grid.
for ax in axes.flatten()[len(allvns):]:
    fig.delaxes(ax)
fig.subplots_adjust(wspace=.1,hspace=.1)
fig.text(.5, .06, 'Coverage (%)', ha='center')
fig.text(.06, .5, 'Mean score', va='center', rotation=90)
# NOTE(review): "<OUTPATH>/figures" is not created anywhere in the visible cells.
fig.savefig(f'{OUTPATH}/figures/0827_bbp_target_v3.png', dpi=500, bbox_inches='tight')

## Examine cross-reactivity

In [None]:
# Cross-reactivity: run the classifier on each virus' primer pairs against the
# genomes of every other virus and save one wide table "<virus>.cross.cls".
for virus in ['LASV']:# allvns:
    savef = f'{OUPPATH}/{virus}.cross.cls'
    outtbl = pd.DataFrame()
    for cross in tqdm(allvns, desc=virus):
        if virus==cross:
            continue
        # Each input file is read whole (no chunking).
        inpf = f'{INPPATH}/{virus}.{cross}.input'
        chunk = pd.read_csv(inpf)
        # 'targets' is stored as the text of a Python list.
        chunk['targets'] = chunk['targets'].apply(ast.literal_eval)

        inps = chunk[inpcols].copy()
        inps_feat = chunk[featcols]
        inps_feat.columns = newcols
        inps_seq = chunk[['pseq_f','tseq_f','pseq_r','tseq_r']]

        seq_input = one_hot_encode_pbs_gap_parallel(inps_seq)
        fea_input = scaler.transform(inps_feat)
        # Dummy labels: the dataset takes a label argument, predictions ignore it.
        labels = np.array([0]*len(inps))
        dataset = PcrDataset(seq_input, fea_input, labels)
        loader = DataLoader(dataset, batch_size=64, shuffle=False)

        predict_cls = []
        with torch.no_grad():
            for seq_in, fea_in, lab in loader:
                seq_in, fea_in, lab = seq_in.to(device).float(), fea_in.to(device).float(), lab.to(device).float()
                out_cls = classifier(seq_in)
                # A batch of one squeezes to a 0-d array; re-wrap it as 1-d.
                if len(seq_in)==1:
                    predict_cls.append(np.array([out_cls.squeeze().detach().cpu().numpy()]))
                else:
                    predict_cls.append(out_cls.squeeze().detach().cpu().numpy())
            predict_cls = np.concatenate(predict_cls)
        inps.loc[:,'classifier'] = predict_cls

        mlb = MultiLabelBinarizer()
        onehot = mlb.fit_transform(inps['targets'])
        target_cols = list(mlb.classes_)

        # Maximum classifier score per primer pair and off-target sequence.
        targets_df = pd.DataFrame(onehot, columns=target_cols, index=inps.index)
        targets_df = targets_df.mul(inps['classifier'], axis=0)
        evaltbl = pd.concat([inps.drop(columns=['targets','classifier']), targets_df], axis=1)
        agg_dict = {c: "max" for c in evaltbl.columns[2:]}
        evaltbl = evaltbl.groupby(['pname_f','pname_r']).agg(agg_dict)
        # Outer-join the tables of the individual viruses; pairs without a hit get 0.
        if outtbl.empty:
            outtbl = evaltbl
        else:
            outtbl = outtbl.join(evaltbl, how='outer').fillna(0)
    outtbl.round(2).to_csv(savef)

In [None]:
# Host reactivity: run the classifier on the primer pairs against the host
# transcript input, chunk by chunk, appending to "<virus>.GRCH38_tx_sort.cls".
# NOTE(review): the file is opened in append mode and never removed first, so
# re-running this cell adds to an existing file.
for virus in ['LASV']: 
    inpf = f'{INPPATH}/{virus}.GRCH38_tx_sort.input'
    savef = f'{OUPPATH}/{virus}.GRCH38_tx_sort.cls'
    for i, chunk in tqdm(enumerate(pd.read_csv(inpf, chunksize=100000)), desc=virus):
        # 'targets' is stored as the text of a Python list.
        chunk['targets'] = chunk['targets'].apply(ast.literal_eval)

        inps = chunk[inpcols].copy()
        inps_feat = chunk[featcols]
        inps_feat.columns = newcols
        inps_seq = chunk[['pseq_f','tseq_f','pseq_r','tseq_r']]

        seq_input = one_hot_encode_pbs_gap_parallel(inps_seq)
        fea_input = scaler.transform(inps_feat)
        # Dummy labels: the dataset takes a label argument, predictions ignore it.
        labels = np.array([0]*len(inps))
        dataset = PcrDataset(seq_input, fea_input, labels)
        loader = DataLoader(dataset, batch_size=64, shuffle=False)

        predict_cls = []
        with torch.no_grad():
            for seq_in, fea_in, lab in loader:
                seq_in, fea_in, lab = seq_in.to(device).float(), fea_in.to(device).float(), lab.to(device).float()
                out_cls = classifier(seq_in)
                # A batch of one squeezes to a 0-d array; re-wrap it as 1-d.
                if len(seq_in)==1:
                    predict_cls.append(np.array([out_cls.squeeze().detach().cpu().numpy()]))
                else:
                    predict_cls.append(out_cls.squeeze().detach().cpu().numpy())
            predict_cls = np.concatenate(predict_cls)
        inps.loc[:,'classifier'] = predict_cls
        
        mlb = MultiLabelBinarizer()
        onehot = mlb.fit_transform(inps['targets'])
        target_cols = list(mlb.classes_)

        targets_df = pd.DataFrame(onehot, columns=target_cols, index=inps.index)
        targets_df = targets_df.mul(inps['classifier'], axis=0)
        evaltbl = pd.concat([inps.drop(columns=['targets','classifier']), targets_df], axis=1)
        agg_dict = {c: "max" for c in evaltbl.columns[2:]}
        evaltbl = evaltbl.groupby(['pname_f','pname_r']).agg(agg_dict)
        # NOTE(review): the columns are the targets seen in this chunk only and
        # are not reindexed to a fixed list, while the header is written for
        # the first chunk only. Rows from later chunks may therefore not line
        # up with that header.
        evaltbl.to_csv(savef, mode='a', header=(i==0), float_format="%.2f")

In [None]:
# Combine on-target ranking with cross- and host-reactivity into "<virus>.final".
# NOTE(review): presumably the number of host transcripts and of cross-virus
# sequences; both are hard-coded - confirm against the inputs.
allhuman = 77601
allcross = 17623
numselect = 100

for virus in ['LASV']:
    outf = f'{OUPPATH}/{virus}.final'
    resf = f'{OUPPATH}/{virus}.{virus}.result'
    res = pd.read_csv(resf,index_col=0)
    
    # Top `numselect` pairs by 'target', plus every published x published pair.
    select = res.sort_values('target', ascending=False).iloc[:numselect]
    pub = res[res['pname_f'].apply(lambda x:'pub' in x) & res['pname_r'].apply(lambda x:'pub' in x)]
    select = pd.concat([select, pub]).drop_duplicates(subset=['pname_f','pname_r'])
    select = select.set_index(['pname_f','pname_r']).sort_index()
    # With no published pair, add an all-zero placeholder row.
    if pub.empty:
        select.loc[(f'{virus}_pub_1_f',f'{virus}_pub_1_r'),:] = 0
    
    # Per row: number of off-target columns with classifier score above 0.5.
    crossf = f'{OUPPATH}/{virus}.cross.cls'
    crosstbl = pd.read_csv(crossf)
    crosstbl['cross'] = (crosstbl.iloc[:,2:]>.5).sum(axis=1)
    
    # Lines that the CSV parser rejects are skipped when reading the host table.
    humanf = f'{OUPPATH}/{virus}.GRCH38_tx_sort.cls'
    humantbl = pd.read_csv(humanf, on_bad_lines='skip')
    humantbl['human'] = (humantbl.iloc[:,2:]>.5).sum(axis=1)
    
    for pair, row in select.iterrows():
        # Rows whose forward and reverse names both belong to this pair.
        ps = set(pair)
        select.loc[pair,'cross'] = crosstbl.loc[crosstbl['pname_f'].isin(ps) 
                                                & crosstbl['pname_r'].isin(ps),'cross'].sum()/allcross*100
        select.loc[pair,'human'] = humantbl.loc[humantbl['pname_f'].isin(ps) 
                                                & humantbl['pname_r'].isin(ps),'human'].sum()/allhuman*100
    # One pseudo-hit is added to each term so the sum is never zero.
    select['off-target'] = (select['cross']+100/allcross) + (select['human']+100/allhuman)
    select.to_csv(outf)

In [None]:
# One panel per virus: on-target value ('target') against combined off-target
# reactivity ('off-target') for the final primer pairs; published pairs in black.
os.makedirs(f'{OUTPATH}/figures', exist_ok=True)  # savefig does not create directories
fig, axes = plt.subplots(5, 5, figsize=(10, 10), sharex=True, sharey=True)
for virus, ax in zip(allvns, axes.flatten()):
    finf = f'{OUPPATH}/{virus}.final'
    res = pd.read_csv(finf)
    pub = res[res['pname_f'].apply(lambda x:'pub' in x) & res['pname_r'].apply(lambda x:'pub' in x)]
    ax.scatter(res['target'], res['off-target'], s=5, alpha=1, color='tab:orange',label='selected')
    ax.scatter(pub['target'], pub['off-target'], s=5, alpha=1, color='black',label='manual')
    ax.text(40, 32, f'{virus}', fontsize=9)

# The legend is attached to the last panel that was drawn.
ax.legend(loc=(1.05,.6), handlelength=1)
# Remove the unused panels of the grid.
for ax in axes.flatten()[len(allvns):]:
    fig.delaxes(ax)
fig.subplots_adjust(wspace=.1,hspace=.1)
# Axis labels name the plotted columns: x is 'target', y is 'off-target'.
fig.text(.5, .06, 'Target score', ha='center')
fig.text(.07, .5, 'Off-target reactivity (%)', va='center', rotation=90)
# NOTE(review): the cross-vs-host figure cell saves to this same file name and
# overwrites this figure when both cells run - pick distinct names.
fig.savefig(f'{OUTPATH}/figures/0827_bbp_off_v2.png', dpi=500, bbox_inches='tight')

In [None]:
# One panel per virus: cross-reactivity against host reactivity (both in %)
# for the final primer pairs; published pairs are drawn in black.
fig, axes = plt.subplots(5, 5, figsize=(10, 10), sharex=True, sharey=True)
for virus, ax in zip(allvns, axes.flatten()):
    finf = f'{OUPPATH}/{virus}.final'
    res = pd.read_csv(finf) 
    pub = res[res['pname_f'].apply(lambda x:'pub' in x) & res['pname_r'].apply(lambda x:'pub' in x)]
    ax.scatter(res['cross'], res['human'], s=5, alpha=1, color='tab:orange',label='selected')
    ax.scatter(pub['cross'], pub['human'], s=5, alpha=1, color='black',label='manual')
    ax.text(0, 1.5, f'{virus}', fontsize=9)
    
#ax.legend(loc=(1.05,.6), handlelength=1)
# Remove the unused panels of the grid.
for ax in axes.flatten()[len(allvns):]:
    fig.delaxes(ax)
fig.subplots_adjust(wspace=.1,hspace=.1)
fig.text(.5, .07, 'Cross reactivity (%)', ha='center')
fig.text(.07, .5, 'Host reactivity (%)', va='center', rotation=90)
# NOTE(review): "<OUTPATH>/figures" is not created anywhere in the visible
# cells, and the preceding figure cell saves to this same file name.
fig.savefig(f'{OUTPATH}/figures/0827_bbp_off_v2.png', dpi=500, bbox_inches='tight')

In [None]:
# Per virus, compare the best-scoring primer pair overall with the best
# published pair, using final = target / sqrt(off-target).
finals = {}
for virus in allvns:
    outf = f'{OUPPATH}/{virus}.final'
    sumtbl = pd.read_csv(outf)
    sumtbl['final'] = sumtbl['target'] / np.sqrt(sumtbl['off-target'])
    # Drop a 'gmean' column if the file has one.
    if 'gmean' in sumtbl.columns:
        sumtbl = sumtbl.drop(columns=['gmean'])
    # Ascending sort, so the last row has the highest 'final'. Because the
    # table includes the published pairs, `best` may itself be a published pair.
    best = sumtbl.sort_values('final').iloc[-1]
    pub = sumtbl[sumtbl['pname_f'].apply(lambda x:'pub' in x) & sumtbl['pname_r'].apply(lambda x:'pub' in x)]
    # Raises IndexError if the table holds no published x published pair.
    pub = pub.sort_values('final').iloc[-1]
    
    # Suffix the field names so both rows fit into one Series.
    best.index = [c + '_ai' for c in best.index]
    pub.index = [c + '_pub' for c in pub.index]
    
    comb = pd.concat([best.drop(['pname_f_ai','pname_r_ai']), 
                      pub.drop(['pname_f_pub','pname_r_pub'])])
    # Column label: virus name and its number of target sequences.
    finals['%s\n(n = %i)' % (virus, len(tnames[virus]))] = comb
    
final = pd.DataFrame(finals)
final.T.astype(float).round(2)

In [None]:
final.T.astype(float).round(2).to_csv(f'{OUPPATH}/0905_assembled.csv')