## ***Sampling candidate sequences using the Monte-Carlo Tree Search method***

In [7]:
from preprocess import load_docking_benchmark_dataset
# Docking-benchmark test set: read the CSV once and keep the pieces later cells use.
docking_benchmark_dataset_path = "__benchmark_dataset/benchmark_docking.csv"
(
    pseqs,  # protein sequences (passed as target_pseqs to the sampler below)
    rseqs,  # RNA-aptamer sequences (passed as target_rseqs to the sampler below)
    px,     # presumably the protein feature matrix — TODO confirm against preprocess
    rx,     # presumably the RNA feature matrix — TODO confirm against preprocess
    df,     # benchmark table; previewed as this cell's output
) = load_docking_benchmark_dataset(docking_benchmark_dataset_path)
df.head()

> Benchmark        : benchmark_docking.csv
- protein features : (56, 399)
- rna features     : (56, 340)


Unnamed: 0,test-ID,protein-ID,rna-ID,protein,protein-length,rna,rna-length
0,0,Bovine_factor_IX,17030508-Bovine factor IX-1,MQRVNMIMAESPGLITICLLGYLLSAECTVFLDHENANKILNRPKR...,461,GGGAGCUCAGCCUUCACUGCCUACGCGGGCGUUUACGUAACGGCUU...,114
1,1,Bovine_factor_IX,17030508-Bovine factor IX-Ma-1,MQRVNMIMAESPGLITICLLGYLLSAECTVFLDHENANKILNRPKR...,461,GGGAUCGUGGUAGUGCGAAGCCAGUAAACGAGGGCACCACGGUCC,45
2,2,oligoadenylate_synthetase,9452437-oligoadenylatesynthetase-4,MAVAQELYGFPASKLDSFVAQWLQPTREWKEEVLETVQTVEQFLRQ...,511,UAUCCUCGCCCGUGCACGCCCUCCACUAAGCCCA,34
3,3,Envelope_glycoprotein_GP120,14610191-gp120-10,LLNGSLAEEIVIRTENIADNTKDIIVQFNKTVSIACTRPHNNTRRG...,214,GUUUAUAUAUACACAGGUUAAGCGUAACUUCGCUGGACAGCAAGAA...,50
4,4,Envelope_glycoprotein_GP120,14610191-gp120-9,LLNGSLAEEIVIRTENIADNTKDIIVQFNKTVSIACTRPHNNTRRG...,214,CACCUACCUAAUUAUUAAACUUUGGGCAGUAUCCCGCUUUGCUUCU...,50


#### *Sampling with the benchmark-A classifier on the docking benchmark (56 protein–RNA pairs)*

In [9]:
from mcts_seq import MCTSeq
# Score function used when the caller does not pick one (benchmark-A random forest).
_DEFAULT_SCORE_FUNCTION_PATH = "classifiers/TESTING-A/mcc0.496-ppv1.000-acc0.826-sn0.303-sp1.000-npv0.812-yd0.303-61trees"


def MCTS_seq_sampling(tag, top_k, n_iter, bp, score_function_path=None):
    """Run MCTS-seq sampling for every target in the docking benchmark.

    Builds an ``MCTSeq`` sampler and calls ``sampling_with_truth`` on the
    notebook-level ``pseqs`` / ``rseqs``, so the dataset-loading cell must
    have been executed first.

    Parameters
    ----------
    tag : str
        Run label forwarded to ``MCTSeq``.
    top_k : int
        Number of candidates to keep; ``0`` saves all candidates.
    n_iter : int
        Number of MCTS iterations (the original method uses 1000).
    bp : int
        Length of the sampled sequences; ``0`` uses the length of the
        target RNA-aptamer sequence.
    score_function_path : str, optional
        Path of the classifier used as the score function. Defaults to the
        benchmark-A (TESTING-A) model so existing calls behave as before;
        pass another path to sample with a different classifier.

    Returns
    -------
    None
    """
    if score_function_path is None:
        score_function_path = _DEFAULT_SCORE_FUNCTION_PATH

    sampler = MCTSeq(score_function_path=score_function_path, tag=tag)
    sampler.sampling_with_truth(
        target_pseqs=pseqs,  # target protein sequences (for evaluation)
        target_rseqs=rseqs,  # target RNA-aptamer sequences (for evaluation)
        top_k=top_k,
        n_iter=n_iter,
        bp=bp,
    )

In [10]:
# Classifier-A sampling with bp=10; the log below shows outputs under aptamers/MCTS-A-10/.
MCTS_seq_sampling(tag="MCTS-A-10", top_k=100, n_iter=1000, bp=10)


> Load RF model with 61 trees
- from : classifiers/TESTING-A/mcc0.496-ppv1.000-acc0.826-sn0.303-sp1.000-npv0.812-yd0.303-61trees

> MCTS-seq sampling start processing 1 / 56
- True aptamer length : 114
CGAUCgcCuu
- Total number of candidates : 10 (original 10000)
- process complete : aptamers/MCTS-A-10/output-00.txt / reward : 0.45901639344262296 (original 0.08196721311475409)
- process complete : aptamers/MCTS-A-10/output-00-sequential.txt
> MCTS-seq sampling start processing 2 / 56
- True aptamer length : 45
CGAUCgcCug
- Total number of candidates : 10 (original 10000)
- process complete : aptamers/MCTS-A-10/output-01.txt / reward : 0.45901639344262296 (original 0.13114754098360656)
- process complete : aptamers/MCTS-A-10/output-01-sequential.txt
> MCTS-seq sampling start processing 3 / 56
- True aptamer length : 34
ccUCuc

KeyboardInterrupt: 

In [None]:
# Classifier-A sampling with bp=20; this run is labelled with tag MCTS-A-20.
MCTS_seq_sampling(tag="MCTS-A-20", top_k=100, n_iter=1000, bp=20)

In [None]:
# Classifier-A sampling with bp=30; this run is labelled with tag MCTS-A-30.
MCTS_seq_sampling(tag="MCTS-A-30", top_k=100, n_iter=1000, bp=30)

In [None]:
# Classifier-A sampling with bp=40; this run is labelled with tag MCTS-A-40.
MCTS_seq_sampling(tag="MCTS-A-40", top_k=100, n_iter=1000, bp=40)

In [None]:
# Classifier-A sampling with bp=50; this run is labelled with tag MCTS-A-50.
MCTS_seq_sampling(tag="MCTS-A-50", top_k=100, n_iter=1000, bp=50)

In [None]:
# Classifier-A sampling with bp=60; this run is labelled with tag MCTS-A-60.
MCTS_seq_sampling(tag="MCTS-A-60", top_k=100, n_iter=1000, bp=60)

In [None]:
# Classifier-A sampling with bp=70; this run is labelled with tag MCTS-A-70.
MCTS_seq_sampling(tag="MCTS-A-70", top_k=100, n_iter=1000, bp=70)

In [None]:
# Classifier-A sampling with bp=80; this run is labelled with tag MCTS-A-80.
MCTS_seq_sampling(tag="MCTS-A-80", top_k=100, n_iter=1000, bp=80)

In [None]:
# Classifier-A sampling with bp=90; this run is labelled with tag MCTS-A-90.
MCTS_seq_sampling(tag="MCTS-A-90", top_k=100, n_iter=1000, bp=90)

In [None]:
# Classifier-A sampling with bp=100; this run is labelled with tag MCTS-A-100.
MCTS_seq_sampling(tag="MCTS-A-100", top_k=100, n_iter=1000, bp=100)

#### *Sampling with the benchmark-B classifier on the docking benchmark (56 protein–RNA pairs)*

In [None]:
# Benchmark-B run with bp=10. The tag is MCTS-B-10 (not MCTS-A-10) so this run
# does not share its label, and output location, with the benchmark-A bp=10 run.
# NOTE(review): MCTS_seq_sampling loads the classifiers/TESTING-A model for this
# call — confirm whether a benchmark-B classifier should be used instead.
MCTS_seq_sampling(tag="MCTS-B-10", top_k=100, n_iter=1000, bp=10)

In [None]:
# Benchmark-B run with bp=20, labelled MCTS-B-20.
# NOTE(review): MCTS_seq_sampling loads the TESTING-A classifier for this call — confirm.
MCTS_seq_sampling(tag="MCTS-B-20", top_k=100, n_iter=1000, bp=20)

In [None]:
# Benchmark-B run with bp=30, labelled MCTS-B-30.
# NOTE(review): MCTS_seq_sampling loads the TESTING-A classifier for this call — confirm.
MCTS_seq_sampling(tag="MCTS-B-30", top_k=100, n_iter=1000, bp=30)

In [None]:
# Benchmark-B run with bp=40, labelled MCTS-B-40.
# NOTE(review): MCTS_seq_sampling loads the TESTING-A classifier for this call — confirm.
MCTS_seq_sampling(tag="MCTS-B-40", top_k=100, n_iter=1000, bp=40)

In [None]:
# Benchmark-B run with bp=50, labelled MCTS-B-50.
# NOTE(review): MCTS_seq_sampling loads the TESTING-A classifier for this call — confirm.
MCTS_seq_sampling(tag="MCTS-B-50", top_k=100, n_iter=1000, bp=50)