## ***Sampling candidate sequences using the Random Heuristic Method***
- Reference: Lee, Wook, and Kyungsook Han. "Constructive prediction of potential RNA aptamers for a protein target." IEEE/ACM Transactions on Computational Biology and Bioinformatics (2019).

In [1]:
from preprocess import load_docking_benchmark_dataset
# Load docking-benchmark testset
docking_benchmark_dataset_path = "__benchmark_dataset/benchmark_docking.csv"
# The loader returns five values. Judging by the names and the shapes printed in
# the recorded output, pseqs/rseqs are presumably the protein/RNA sequences and
# px/rx the matching feature matrices (56 x 399 and 56 x 340 in the recorded
# run) -- TODO confirm against preprocess.load_docking_benchmark_dataset.
pseqs, rseqs, px, rx, df = load_docking_benchmark_dataset(docking_benchmark_dataset_path)
# Preview the first rows of the benchmark table (one protein/RNA pair per row).
df.head()

> Benchmark        : benchmark_docking.csv
- protein features : (56, 399)
- rna features     : (56, 340)


Unnamed: 0,test-ID,protein-ID,rna-ID,protein,protein-length,rna,rna-length
0,0,Bovine_factor_IX,17030508-Bovine factor IX-1,MQRVNMIMAESPGLITICLLGYLLSAECTVFLDHENANKILNRPKR...,461,GGGAGCUCAGCCUUCACUGCCUACGCGGGCGUUUACGUAACGGCUU...,114
1,1,Bovine_factor_IX,17030508-Bovine factor IX-Ma-1,MQRVNMIMAESPGLITICLLGYLLSAECTVFLDHENANKILNRPKR...,461,GGGAUCGUGGUAGUGCGAAGCCAGUAAACGAGGGCACCACGGUCC,45
2,2,oligoadenylate_synthetase,9452437-oligoadenylatesynthetase-4,MAVAQELYGFPASKLDSFVAQWLQPTREWKEEVLETVQTVEQFLRQ...,511,UAUCCUCGCCCGUGCACGCCCUCCACUAAGCCCA,34
3,3,Envelope_glycoprotein_GP120,14610191-gp120-10,LLNGSLAEEIVIRTENIADNTKDIIVQFNKTVSIACTRPHNNTRRG...,214,GUUUAUAUAUACACAGGUUAAGCGUAACUUCGCUGGACAGCAAGAA...,50
4,4,Envelope_glycoprotein_GP120,14610191-gp120-9,LLNGSLAEEIVIRTENIADNTKDIIVQFNKTVSIACTRPHNNTRRG...,214,CACCUACCUAAUUAUUAAACUUUGGGCAGUAUCCCGCUUUGCUUCU...,50


#### *Sampling with benchmark-A classifier and docking-benchmark (56 protein-RNA pairs)*

In [2]:
from rand_hue import RandomHeuristicSampling
# Path handed to the sampler as its score function. The recorded output shows an
# RF model with 61 trees being loaded from this path. The metric values embedded
# in the name (mcc, ppv, acc, ...) are presumably that classifier's evaluation
# scores -- TODO confirm.
score_function_path = "classifiers/TESTING-A/mcc0.496-ppv1.000-acc0.826-sn0.303-sp1.000-npv0.812-yd0.303-61trees"
# Tag for this run; the recorded post-sampling output files are reported under
# aptamers/RAND_A/.
tag = "RAND_A"
sampler = RandomHeuristicSampling(score_function_path=score_function_path, tag=tag)


> Load RF model with 61 trees
- from : classifiers/TESTING-A/mcc0.496-ppv1.000-acc0.826-sn0.303-sp1.000-npv0.812-yd0.303-61trees





In [3]:
# In our paper, n_samples 6000000, n_jobs 30
# In the recorded run this call only reported that
# aptamers/pre-samples-n6000000-j30-bp27.txt already exists; presumably the
# existing file is reused instead of being regenerated -- TODO confirm.
# NOTE(review): the meaning of bp=27 is not visible here -- confirm in rand_hue.
sampler.pre_sampling(n_samples=6000000, n_jobs=30, bp=27)

- pre-sampling results already exists : aptamers/pre-samples-n6000000-j30-bp27.txt


In [4]:
# Run post-sampling for the benchmark sequences loaded in the first cell.
# The recorded output shows the pre-sampled sequences being loaded, followed by
# one "process complete" line per aptamers/RAND_A/output-NN.txt file.
# NOTE(review): top_k=10 presumably keeps the ten best-scoring candidates per
# target -- confirm in rand_hue.
sampler.post_sampling(target_pseqs=pseqs, target_rseqs=rseqs, top_k=10)

- Load pre-sampled sequences : aptamers/pre-samples-n6000000-j30-bp27.txt
- process complete : aptamers/RAND_A/output-00.txt
- process complete : aptamers/RAND_A/output-01.txt
- process complete : aptamers/RAND_A/output-02.txt
- process complete : aptamers/RAND_A/output-03.txt
- process complete : aptamers/RAND_A/output-04.txt
- process complete : aptamers/RAND_A/output-05.txt
- process complete : aptamers/RAND_A/output-06.txt
- process complete : aptamers/RAND_A/output-07.txt
- process complete : aptamers/RAND_A/output-08.txt
- process complete : aptamers/RAND_A/output-09.txt
- process complete : aptamers/RAND_A/output-10.txt
- process complete : aptamers/RAND_A/output-11.txt
- process complete : aptamers/RAND_A/output-12.txt
- process complete : aptamers/RAND_A/output-13.txt
- process complete : aptamers/RAND_A/output-14.txt
- process complete : aptamers/RAND_A/output-15.txt
- process complete : aptamers/RAND_A/output-16.txt
- process complete : aptamers/RAND_A/output-17.txt
- proces

#### *Sampling with benchmark-B classifier and docking-benchmark (56 protein-RNA pairs)*

In [5]:
# Rebinds score_function_path, tag and sampler for the benchmark-B run; the
# sampler object from the benchmark-A section is no longer reachable through
# `sampler` after this cell.
# The recorded output shows an RF model with 79 trees being loaded from this
# path. The metric values embedded in the name are presumably that classifier's
# evaluation scores -- TODO confirm.
score_function_path = "classifiers/TESTING-B/mcc0.593-ppv0.688-acc0.768-sn0.982-sp0.554-npv0.969-yd0.536-79trees"
# Tag for this run; the recorded post-sampling output files are reported under
# aptamers/RAND_B/.
tag = "RAND_B"
sampler = RandomHeuristicSampling(score_function_path=score_function_path, tag=tag)


> Load RF model with 79 trees
- from : classifiers/TESTING-B/mcc0.593-ppv0.688-acc0.768-sn0.982-sp0.554-npv0.969-yd0.536-79trees



In [6]:
# In our paper, n_samples 6000000, n_jobs 30
# Same arguments as the benchmark-A run. In the recorded run this call only
# reported that aptamers/pre-samples-n6000000-j30-bp27.txt already exists;
# presumably the existing file is reused -- TODO confirm.
# NOTE(review): the meaning of bp=27 is not visible here -- confirm in rand_hue.
sampler.pre_sampling(n_samples=6000000, n_jobs=30, bp=27)

- pre-sampling results already exists : aptamers/pre-samples-n6000000-j30-bp27.txt


In [7]:
# Run post-sampling for the benchmark sequences loaded in the first cell, now
# with the benchmark-B sampler. The recorded output shows the pre-sampled
# sequences being loaded, followed by one "process complete" line per
# aptamers/RAND_B/output-NN.txt file.
# NOTE(review): top_k=10 presumably keeps the ten best-scoring candidates per
# target -- confirm in rand_hue.
sampler.post_sampling(target_pseqs=pseqs, target_rseqs=rseqs, top_k=10)

- Load pre-sampled sequences : aptamers/pre-samples-n6000000-j30-bp27.txt
- process complete : aptamers/RAND_B/output-00.txt
- process complete : aptamers/RAND_B/output-01.txt
- process complete : aptamers/RAND_B/output-02.txt
- process complete : aptamers/RAND_B/output-03.txt
- process complete : aptamers/RAND_B/output-04.txt
- process complete : aptamers/RAND_B/output-05.txt
- process complete : aptamers/RAND_B/output-06.txt
- process complete : aptamers/RAND_B/output-07.txt
- process complete : aptamers/RAND_B/output-08.txt
- process complete : aptamers/RAND_B/output-09.txt
- process complete : aptamers/RAND_B/output-10.txt
- process complete : aptamers/RAND_B/output-11.txt
- process complete : aptamers/RAND_B/output-12.txt
- process complete : aptamers/RAND_B/output-13.txt
- process complete : aptamers/RAND_B/output-14.txt
- process complete : aptamers/RAND_B/output-15.txt
- process complete : aptamers/RAND_B/output-16.txt
- process complete : aptamers/RAND_B/output-17.txt
- proces