# Goal

* Design primers for Christensenella

# Var

In [17]:
work_dir = '/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/primers_r5/'

# Init

In [18]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(LeyLabRMisc)

In [19]:
df.dims()

# Primer design

## llprimer pipeline

### Config

In [5]:
cat_file(file.path(work_dir, 'config.yaml'))

#-- I/O --#
samples_file: experiments/christensenella/genbank/samples_christ_hq.txt

# output location
output_dir: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/primers_r4/

# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/

#-- software parameters --#
# See the README for a description
params:
  ionice: -c 3
  cgp:
    prodigal: ""    
    mmseqs:
      method: cluster    # or linclust (faster)
      run: --min-seq-id 0.9 -c 0.8
    core_genes: --frac 1 --max-clusters 1000
    blastx: -evalue 1e-10 -max_target_seqs 3
    blastx_nontarget: -evalue 1e-5 -max_target_seqs 30
    align:
      method: linsi
      params: --auto --maxiterate 1000
    primer3:
      number: --num-primers 200
      size: --opt-size 20 --min-size 18 --max-size 24
      product: --opt-prod-size 150 --min-prod-size 100 --max-prod-size 200
      Tm: --opt-tm 62 --min-tm 55 --max-tm 70 --max-tm-diff 1
      PCR: --oligo-D

### Run

```
(snakemake) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/
./snakemake_sge.sh experiments/christensenella/genbank/primers_r2/config.yaml 50 --notemp
```

## Summary

In [20]:
# table on final primer metadata
F = file.path(work_dir, 'cgp', 'primers_final_info.tsv')
info = read.delim(F, sep='\t')
info

gene_type,cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,⋯,position_start,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd
<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<int>,⋯,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
cds,3,22,167,167,0,22f,PRIMER_LEFT,MGKTCKGTSATYGARGCGAC,20,⋯,345,365,61.39241,2.093550,57.50000,5.590170,30.99646,23.27834,-12.64141,21.98370
cds,3,22,167,167,0,22r,PRIMER_RIGHT,CCVAGCGARATVACRGCCAT,20,⋯,492,512,62.15616,1.997969,56.66667,4.859127,17.09865,19.12572,-14.36313,20.60622
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
rrna,1,500,105,105,0,500f,PRIMER_LEFT,GGTAGTCCACGCCGTAAACG,20,⋯,792,812,61.70626,0,60,0,39.44303,0,-25.45991,0
rrna,1,500,105,105,0,500r,PRIMER_RIGHT,ACCTTGCGATCGTACTCCCC,20,⋯,877,897,62.33122,0,60,0,0.00000,0,18.12689,0


In [21]:
info$primer_set %>% unique_n('Primer sets')
info$cluster_id %>% unique_n('Primer sets')
info %>%
    group_by(cluster_id) %>%
    summarize(n_primer_sets = primer_set %>% unique %>% length,
              .groups='drop') %>%
    .$n_primer_sets %>% summary_x('Primer sets per gene cluster')

No. of unique Primer sets: 400 
No. of unique Primer sets: 82 


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
Primer sets per gene cluster,1,2,4,9.731707,8,162,64.206,26.212


In [24]:
# primer sets per gene type
info %>%
    distinct(gene_type, cluster_id, primer_set) %>%
    group_by(gene_type) %>%
    summarize(n_primer_sets = primer_set %>% unique %>% length,
              .groups='drop') 

gene_type,n_primer_sets
<fct>,<int>
cds,358
rrna,162


In [25]:
df.dims(10, 30)
info %>%
    head(n=10)
df.dims()

Unnamed: 0_level_0,gene_type,cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,degeneracy,degeneracy_3prime,position_start,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd
Unnamed: 0_level_1,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,cds,3,22,167,167,0,22f,PRIMER_LEFT,MGKTCKGTSATYGARGCGAC,20,64,1,345,365,61.39241,2.09355,57.5,5.59017,30.99646,23.27834,-12.641412,21.9837
2,cds,3,22,167,167,0,22r,PRIMER_RIGHT,CCVAGCGARATVACRGCCAT,20,36,1,492,512,62.15616,1.997969,56.66667,4.859127,17.09865,19.125723,-14.363129,20.60622
3,cds,3,43,175,175,0,43f,PRIMER_LEFT,TCAAYATCGGCAGCGCGTTC,20,2,1,634,654,63.52057,1.397559,57.5,2.5,0.0,0.0,8.660936,0.0
4,cds,3,43,175,175,0,43r,PRIMER_RIGHT,GTRCGYTCRAGCACGCTGTG,20,8,1,789,809,64.26538,2.08768,62.5,4.330127,50.50432,7.172953,15.252237,17.19286
5,cds,3,54,166,166,0,54f,PRIMER_LEFT,ARATCAAYATCGGCAGCGCG,20,4,1,631,651,62.43346,1.541262,55.0,3.535534,17.93556,17.976394,-0.75049,0.0
6,cds,3,54,166,166,0,54r,PRIMER_RIGHT,ACGCTGTGCATBGTYTCCAT,20,6,1,777,797,61.59511,1.099373,50.83333,3.435921,27.1457,19.842885,-15.725861,15.61321
7,cds,3,278,126,126,0,278f,PRIMER_LEFT,MGKTCKGTSATYGARGCGAC,20,64,1,345,365,61.39241,2.09355,57.5,5.59017,30.99646,23.27834,-12.641412,21.9837
8,cds,3,278,126,126,0,278r,PRIMER_RIGHT,RTCRAGYACCATRTTGCCCTG,21,16,1,450,471,60.50139,2.354485,52.38095,4.761905,22.29399,22.293988,-12.885432,18.19178
9,cds,3,350,176,176,0,350f,PRIMER_LEFT,TCAAYATCGGCAGCGCGTTC,20,2,1,634,654,63.52057,1.397559,57.5,2.5,0.0,0.0,8.660936,0.0
10,cds,3,350,176,176,0,350r,PRIMER_RIGHT,CGTRCGYTCRAGCACGCTG,19,8,1,791,810,64.15562,2.169601,65.78947,4.558028,55.6176,12.010899,21.792655,12.71115


In [26]:
# general stats
info$amplicon_size_avg %>% summary_x('amplicon_size_avg')
info$seq_degeneracy %>% summary_x('degen')
info$Tm_avg %>% summary_x('Tm-avg')
info$Tm_sd %>% summary_x('Tm-sd')
info$GC_avg %>% summary_x('GC-avg')
info$GC_sd %>% summary_x('GC-sd')

Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
amplicon_size_avg,100,119,136,141.4479,161,200,34.783,14.2


“NAs introduced by coercion”


Unnamed: 0,Length,Class,Mode,sd,sd_err_of_mean
degen,0,,,,


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
Tm-avg,57.03484,60.33245,61.17492,61.14643,62.06415,64.70111,2.491,1.017


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
Tm-sd,0,0,1.663903,1.359697,2.083074,3.097817,1.211,0.494


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
GC-avg,42.5,52.38095,55,54.27032,57.5,67.5,8.075,3.297


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
GC-sd,0,0,3.535534,2.979665,4.475088,6.299408,2.501,1.021


### CDS primers that are the best

In [40]:
# blastx on non-targets
F = file.path(work_dir, 'cgp', 'nontarget', 'cds_blastx.tsv')
cds_blast = read.delim(F, sep='\t') %>%
    filter(!grepl('Catabacter|Christensenella', sscinames))
cds_blast 

cluster_id,query,subject,subject_name,pident,length,mismatch,qstart,qend,sstart,send,evalue,slen,qlen,sscinames,staxids,pident_rank
<int>,<fct>,<fct>,<fct>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<fct>,<fct>,<int>
1,1e017375b4d44109b09daf0557f8f7c3,MBD5559572.1,GNAT family N-acetyltransferase [Clostridia bacterium],72.185,151,42,1,453,1,151,1.14e-76,151,456,Clostridia bacterium,2044939,2
1,1e017375b4d44109b09daf0557f8f7c3,PWL49310.1,hypothetical protein DBY39_01400 [Clostridiales bacterium],43.791,153,84,1,453,1,153,9.32e-38,153,456,Clostridiales bacterium,1898207,4
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
156,844a979832a0426591c598e2570282b4,NLI60856.1,elongation factor P [Clostridiales bacterium],75.676,185,45,1,555,1,185,3.20e-103,185,558,Clostridiales bacterium,1898207,29
156,844a979832a0426591c598e2570282b4,NLX70507.1,elongation factor P [Clostridiales bacterium],76.216,185,44,1,555,1,185,3.42e-103,185,558,Clostridiales bacterium,1898207,26


In [44]:
# clusters with the most distant non-target relatives
cds_blast_f = cds_blast %>%
    group_by(cluster_id) %>%
    mutate(max_pident = max(pident)) %>%
    ungroup() %>%
    filter(max_pident < 80)
cds_blast_f %>% unique_n('clusters', cluster_id)

No. of unique clusters: 51 


In [57]:
# filtering primers
info_f = info %>%
    filter(gene_type == 'cds') %>%
    semi_join(cds_blast_f, c('cluster_id')) %>%
    filter(degeneracy <= 8, Tm_sd <= 1.5) %>%
    group_by(gene_type, cluster_id, primer_set) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    filter(n == 2) 

df.dims(30,30)
info_f %>%
    arrange(Tm_sd, degeneracy, primer_set) %>%
    head(n=30)
df.dims()

gene_type,cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,degeneracy,degeneracy_3prime,position_start,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd,n
<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
cds,87,95,157,157,0,95f,PRIMER_LEFT,AGCTACGACCTGATCCCCAA,20,1,1,48,68,60.99849,0,55.0,0,0.0,0,-29.59591,0,2
cds,87,95,157,157,0,95r,PRIMER_RIGHT,TGCGTCCCTGTCTTTTCAGC,20,1,1,185,205,61.53486,0,55.0,0,34.80076,0,-37.51648,0,2
cds,87,96,158,158,0,96f,PRIMER_LEFT,AAGCTACGACCTGATCCCCA,20,1,1,47,67,60.99849,0,55.0,0,0.0,0,-29.59591,0,2
cds,87,96,158,158,0,96r,PRIMER_RIGHT,TGCGTCCCTGTCTTTTCAGC,20,1,1,185,205,61.53486,0,55.0,0,34.80076,0,-37.51648,0,2
cds,87,101,159,159,0,101f,PRIMER_LEFT,AGCTACGACCTGATCCCCAA,20,1,1,48,68,60.99849,0,55.0,0,0.0,0,-29.59591,0,2
cds,87,101,159,159,0,101r,PRIMER_RIGHT,CGTGCGTCCCTGTCTTTTCA,20,1,1,187,207,61.52293,0,55.0,0,0.0,0,-35.82261,0,2
cds,87,102,160,160,0,102f,PRIMER_LEFT,AAGCTACGACCTGATCCCCA,20,1,1,47,67,60.99849,0,55.0,0,0.0,0,-29.59591,0,2
cds,87,102,160,160,0,102r,PRIMER_RIGHT,CGTGCGTCCCTGTCTTTTCA,20,1,1,187,207,61.52293,0,55.0,0,0.0,0,-35.82261,0,2
cds,87,139,161,161,0,139f,PRIMER_LEFT,AAGCTACGACCTGATCCCCAA,21,1,1,47,68,61.57581,0,52.38095,0,0.0,0,-29.59591,0,2
cds,87,139,161,161,0,139r,PRIMER_RIGHT,CCGTGCGTCCCTGTCTTTTC,20,1,1,188,208,62.20623,0,60.0,0,0.0,0,-35.82261,0,2


In [59]:
# cluster info
F = file.path(work_dir, 'cgp', 'core_clusters_info.tsv')
clst_info = read.delim(F, sep='\t')
clst_info

gene_type,cluster_id,seq_uuid,seq_orig_name,contig_id,taxon,start,end,score,strand,annotation,cluster_name,clust_id
<fct>,<int>,<fct>,<fct>,<fct>,<fct>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<int>
cds,3,12dedcbcfa454b34be2b5533276587ea,JKEIJBMC_00467,CABKWJ010000002.1,CABKWJ010000001_1_Christensenella_timonensis_isolate_MGYG_HGUT_01550,178589,179638,.,-,Cell shape-determining protein Mbl,252b7a9ba11c4a8982495531ad40561a,3
cds,3,22bb4e430c0140fbbe96e566f40837f6,KICEOJCJ_01701,CP029256.1,CP029256_1_Christensenella_minuta_strain_DSM_22607_chromosome,1855857,1856906,.,-,Cell shape-determining protein Mbl,252b7a9ba11c4a8982495531ad40561a,3
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
rrna,1,30635833e6d143f2857300329b123190,EGMHHNPD_02779,CABMKF010000023.1,CABMKF010000001_1_Christensenellaceae_bacterium_isolate_MGYG_HGUT_02411,170,1703,0,+,16S ribosomal RNA,2,1
rrna,1,824cbac467a24f2db12551504d9d1238,MBFOIHCP_02856,JACOON010000011.1,JACOON010000001_1_Christensenella_sp__NSJ_35_HPD3569,3489,5022,0,-,16S ribosomal RNA,2,1


In [65]:
# checking cluster info
df.dims(10,30)
clst_info %>%
    filter(gene_type == 'cds', cluster_id %in% c(87, 129))
df.dims()

gene_type,cluster_id,seq_uuid,seq_orig_name,contig_id,taxon,start,end,score,strand,annotation,cluster_name,clust_id
<fct>,<int>,<fct>,<fct>,<fct>,<fct>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<int>
cds,87,5867640b1563453c9865c22b2b7b4780,JKEIJBMC_00820,CABKWJ010000002.1,CABKWJ010000001_1_Christensenella_timonensis_isolate_MGYG_HGUT_01550,565457,565753,.,+,50S ribosomal protein L23,89175f88659b44088d2170d881abee78,87
cds,87,035d9810b4ac4a2888b9d2bcd70d635d,KICEOJCJ_01994,CP029256.1,CP029256_1_Christensenella_minuta_strain_DSM_22607_chromosome,2138600,2138896,.,-,50S ribosomal protein L23,89175f88659b44088d2170d881abee78,87
cds,87,3cd0d805ed6a49dc865f87aec2871667,HOHPHKNP_00652,LT700187.1,LT700187_1_Christensenella_sp__Marseille_P2438,646803,647099,.,-,50S ribosomal protein L23,89175f88659b44088d2170d881abee78,87
cds,87,89175f88659b44088d2170d881abee78,EGMHHNPD_01541,CABMKF010000004.1,CABMKF010000001_1_Christensenellaceae_bacterium_isolate_MGYG_HGUT_02411,284994,285290,.,-,50S ribosomal protein L23,89175f88659b44088d2170d881abee78,87
cds,87,c9186319dbd9495880d79281763c9f51,MBFOIHCP_01868,JACOON010000005.1,JACOON010000001_1_Christensenella_sp__NSJ_35_HPD3569,8579,8875,.,+,50S ribosomal protein L23,89175f88659b44088d2170d881abee78,87
cds,129,f75e5d6845d34829ae5dd05136d89be9,JKEIJBMC_00769,CABKWJ010000002.1,CABKWJ010000001_1_Christensenella_timonensis_isolate_MGYG_HGUT_01550,515617,516948,.,-,Replication-associated recombination protein A,fe745918481747ffa57aec99addf736a,129
cds,129,da588e9f3529487aa48453b887ab91b0,KICEOJCJ_02427,CP029256.1,CP029256_1_Christensenella_minuta_strain_DSM_22607_chromosome,2614566,2615882,.,+,Replication-associated recombination protein A,fe745918481747ffa57aec99addf736a,129
cds,129,ceecc01f345d4e83a7e311674514385b,HOHPHKNP_01050,LT700187.1,LT700187_1_Christensenella_sp__Marseille_P2438,1070486,1071865,.,+,Replication-associated recombination protein A,fe745918481747ffa57aec99addf736a,129
cds,129,fe745918481747ffa57aec99addf736a,EGMHHNPD_00815,CABMKF010000002.1,CABMKF010000001_1_Christensenellaceae_bacterium_isolate_MGYG_HGUT_02411,318466,319788,.,-,Replication-associated recombination protein A,fe745918481747ffa57aec99addf736a,129
cds,129,76b7152b86284628be83e8120b567e0f,MBFOIHCP_02249,JACOON010000006.1,JACOON010000001_1_Christensenella_sp__NSJ_35_HPD3569,90018,91334,.,-,Replication-associated recombination protein A,fe745918481747ffa57aec99addf736a,129


In [66]:
# checking cluster non-targets
cds_blast %>%
    filter(cluster_id %in% c(87, 129))

cluster_id,query,subject,subject_name,pident,length,mismatch,qstart,qend,sstart,send,evalue,slen,qlen,sscinames,staxids,pident_rank
<int>,<fct>,<fct>,<fct>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<fct>,<fct>,<int>
87,89175f88659b44088d2170d881abee78,HBU12197.1,50S ribosomal protein L23 [Clostridiales bacterium],79.592,98,20,1,294,1,98,1.95e-47,98,297,Clostridiales bacterium,1898207,2
87,89175f88659b44088d2170d881abee78,PWM37934.1,50S ribosomal protein L23 [Clostridiales bacterium],72.449,98,27,1,294,1,98,9.96e-42,98,297,Clostridiales bacterium,1898207,3
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
129,fe745918481747ffa57aec99addf736a,NLT17416.1,replication-associated recombination protein A [Clostridiales bacterium],61.224,441,166,1,1314,1,439,0,446,1323,Clostridiales bacterium,1898207,26
129,fe745918481747ffa57aec99addf736a,MBE5784129.1,replication-associated recombination protein A [Clostridiales bacterium],60.373,429,161,1,1275,1,424,0,428,1323,Clostridiales bacterium,1898207,29


In [67]:
# writing primers
outF = file.path(work_dir, 'best_primers.tsv')
info_f %>% write_table(outF)

File written: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/primers_r5//best_primers.tsv 
