## ***Sampling candidate sequences using the Monte-Carlo Tree Search method***

In [1]:
from preprocess import load_docking_benchmark_dataset
# Load the docking-benchmark test set from its CSV file.
docking_benchmark_dataset_path = "__benchmark_dataset/benchmark_docking.csv"
# The loader returns five values. pseqs and rseqs are later passed to the sampler
# as the target protein / RNA sequences, and df is the benchmark table.
# NOTE(review): px / rx are presumably the protein / RNA feature matrices whose
# shapes the loader prints -- confirm against preprocess.py.
pseqs, rseqs, px, rx, df = load_docking_benchmark_dataset(docking_benchmark_dataset_path)
# Preview the first rows of the benchmark table.
df.head()

> Benchmark        : benchmark_docking.csv
- protein features : (56, 399)
- rna features     : (56, 340)


Unnamed: 0,test-ID,protein-ID,rna-ID,protein,protein-length,rna,rna-length
0,0,Bovine_factor_IX,17030508-Bovine factor IX-1,MQRVNMIMAESPGLITICLLGYLLSAECTVFLDHENANKILNRPKR...,461,GGGAGCUCAGCCUUCACUGCCUACGCGGGCGUUUACGUAACGGCUU...,114
1,1,Bovine_factor_IX,17030508-Bovine factor IX-Ma-1,MQRVNMIMAESPGLITICLLGYLLSAECTVFLDHENANKILNRPKR...,461,GGGAUCGUGGUAGUGCGAAGCCAGUAAACGAGGGCACCACGGUCC,45
2,2,oligoadenylate_synthetase,9452437-oligoadenylatesynthetase-4,MAVAQELYGFPASKLDSFVAQWLQPTREWKEEVLETVQTVEQFLRQ...,511,UAUCCUCGCCCGUGCACGCCCUCCACUAAGCCCA,34
3,3,Envelope_glycoprotein_GP120,14610191-gp120-10,LLNGSLAEEIVIRTENIADNTKDIIVQFNKTVSIACTRPHNNTRRG...,214,GUUUAUAUAUACACAGGUUAAGCGUAACUUCGCUGGACAGCAAGAA...,50
4,4,Envelope_glycoprotein_GP120,14610191-gp120-9,LLNGSLAEEIVIRTENIADNTKDIIVQFNKTVSIACTRPHNNTRRG...,214,CACCUACCUAAUUAUUAAACUUUGGGCAGUAUCCCGCUUUGCUUCU...,50


#### *Sampling with the benchmark-A classifier on the docking benchmark (56 protein-RNA pairs)*

In [2]:
from mcts_seq import MCTSeq
# Path of the benchmark-A classifier that the sampler uses as its score function.
# NOTE(review): the directory name appears to encode evaluation metrics
# (mcc, ppv, acc, sn, sp, npv, yd) and the tree count -- confirm with the training code.
score_function_path = "classifiers/TESTING-A/mcc0.496-ppv1.000-acc0.826-sn0.303-sp1.000-npv0.812-yd0.303-61trees"
# Tag identifying this run; the sampling log reports output files under aptamers/MCTS_A/.
tag = "MCTS_A"
# Build the sampler for the benchmark-A classifier.
sampler = MCTSeq(score_function_path=score_function_path, tag=tag)


> Load RF model with 61 trees
- from : classifiers/TESTING-A/mcc0.496-ppv1.000-acc0.826-sn0.303-sp1.000-npv0.812-yd0.303-61trees





In [3]:
# The original method uses 1000 iterations; this run uses n_iter=100.
# The benchmark protein / RNA sequences are passed as the sampling targets.
sampler.sampling_with_truth(target_pseqs=pseqs, target_rseqs=rseqs, top_k=0, n_iter=100) # top_k=0 means that all samples are saved

> MCTS-seq sampling start processing 1 / 56
- True aptamer length : 114
CucGuuCgaGGUUgGUauuAAGuuGgCGuUcgAcGaAuGAgccGaUaUACGaAcgGGAguUAcgCAcGCGuUCgcAaauGAGCUuUaUgaaUcGUuCaUUCgugUUCauauacA
- Total number of candidates : 10498 (original 11400)
- process complete : aptamers/MCTS_A/output-00.txt
> MCTS-seq sampling start processing 2 / 56
- True aptamer length : 45
UUGAgaccGAcAAuAcAguCaGaccaUCuuAGgcGaUuuuggaAc
- Total number of candidates : 3722 (original 4500)
- process complete : aptamers/MCTS_A/output-01.txt
> MCTS-seq sampling start processing 3 / 56
- True aptamer length : 34
cAgCuUCAACCCCAUAUCaaugcuUCCaAAgugc
- Total number of candidates : 2002 (original 3400)
- process complete : aptamers/MCTS_A/output-02.txt
> MCTS-seq sampling start processing 4 / 56
- True aptamer length : 50
uAuAUccGucCcGAuuACcagcCcacCGacCucgguuugAcuGGGAUCcu
- Total number of candidates : 4133 (original 5000)
- process complete : aptamers/MCTS_A/output-03.txt
> MCTS-seq sampling start processing 5 / 56
- True apt

#### *Sampling with the benchmark-B classifier on the docking benchmark (56 protein-RNA pairs)*

In [4]:
# Path of the benchmark-B classifier that the sampler uses as its score function.
# NOTE(review): the directory name appears to encode evaluation metrics
# (mcc, ppv, acc, sn, sp, npv, yd) and the tree count -- confirm with the training code.
score_function_path = "classifiers/TESTING-B/mcc0.593-ppv0.688-acc0.768-sn0.982-sp0.554-npv0.969-yd0.536-79trees"
# Tag identifying this run; the sampling log reports output files under aptamers/MCTS_B/.
tag = "MCTS_B"
# Rebind `sampler` to a new MCTSeq built from the benchmark-B classifier
# (MCTSeq is imported in the benchmark-A cell, which must be run first).
sampler = MCTSeq(score_function_path=score_function_path, tag=tag)


> Load RF model with 79 trees
- from : classifiers/TESTING-B/mcc0.593-ppv0.688-acc0.768-sn0.982-sp0.554-npv0.969-yd0.536-79trees



In [None]:
# Same call as the benchmark-A run (top_k=0, n_iter=100), now on the benchmark-B sampler.
sampler.sampling_with_truth(target_pseqs=pseqs, target_rseqs=rseqs, top_k=0, n_iter=100) # top_k=0 means that all samples are saved

> MCTS-seq sampling start processing 1 / 56
- True aptamer length : 114
ucggauugcgCgGUcguAGAacCCCGcCgaGCAUAUuauGcguGGacAAgcCuAggggcgauGGuCguUgcCcCAuuCacacgGAuuaaGUggcAuauGGgAccggauAGcGcu
- Total number of candidates : 10710 (original 11400)
- process complete : aptamers/MCTS_B/output-00.txt
> MCTS-seq sampling start processing 2 / 56
- True aptamer length : 45
cggUUgaUgccccccAGCCCCUUCacuAUGgGCCGCCCUCGccGu
- Total number of candidates : 3534 (original 4500)
- process complete : aptamers/MCTS_B/output-01.txt
> MCTS-seq sampling start processing 3 / 56
- True aptamer length : 34
UgcCgacACCCcGCuucCCaguuuugauaACuag
- Total number of candidates : 2200 (original 3400)
- process complete : aptamers/MCTS_B/output-02.txt
> MCTS-seq sampling start processing 4 / 56
- True aptamer length : 50
uUGGAugUUGUauUcGggCuauCCcCaUccGcUCgCcCcCcccgCCCuaC
- Total number of candidates : 4304 (original 5000)
- process complete : aptamers/MCTS_B/output-03.txt
> MCTS-seq sampling start processing 5 / 56
- True apt