# Goal

* Design primers with the llprimer pipeline

# Var

In [44]:
# Root directory of this experiment; the other paths in this notebook are built from it
work_dir = '/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/'
# Genome table (columns: Taxon, Fasta, Domain, Taxid).
# Derived from work_dir so that relocating the experiment only requires editing one path.
# work_dir ends in '/', so file.path() yields a doubled separator; that is harmless on POSIX systems.
genomes_file = file.path(work_dir, 'genomes', 'LLG_output', 'samples_genomes_hq.txt')

# Number of threads passed to setDTthreads() during initialization
threads = 8

# Init

In [45]:
library(dplyr)
library(tidyr)
library(data.table)
library(tidytable)
library(ggplot2)
library(LeyLabRMisc)

In [46]:
# Check the working directory (the recorded run reported that it already exists;
# presumably make_dir creates it when missing -- confirm against LeyLabRMisc)
make_dir(work_dir)
# Limit data.table to the configured number of threads
data.table::setDTthreads(threads)
# Apply the default table display dimensions (LeyLabRMisc helper; exact defaults not shown here)
df.dims()

Directory already exists: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/ 


# Load

In [47]:
# Read the tab-delimited genome table (one row per genome: Taxon, Fasta, Domain, Taxid)
genomes = genomes_file %>% read.delim(sep='\t')
# Display the table
genomes

Taxon,Fasta,Domain,Taxid
<chr>,<chr>,<chr>,<int>
GCA_014384805.1_ASM1438480v1.fna,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/LLG_output/drep/drep/dereplicated_genomes/GCA_014384805.1_ASM1438480v1_genomic.fna.fna,d__Bacteria,990719
GCA_900087015.1_PRJEB13910.fna,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/LLG_output/drep/drep/dereplicated_genomes/GCA_900087015.1_PRJEB13910_genomic.fna.fna,d__Bacteria,990719
⋮,⋮,⋮,⋮
GCA_900761075.1_SRS042628_64.fna,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/LLG_output/drep/drep/dereplicated_genomes/GCA_900761075.1_SRS042628_64_genomic.fna.fna,d__Bacteria,990719
GUT_GENOME097725.fna,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/LLG_output/drep/drep/dereplicated_genomes/GUT_GENOME097725.fna.fna,d__Bacteria,990719


# LLPRIMER

### Config

In [11]:
# Path to the llprimer config of the clst0.8-perc0.8 run.
# Deliberately not named `F`: assigning to `F` masks R's built-in shorthand for FALSE
# in the global environment, which silently breaks any later `F`/`T` style code.
config_file = file.path(work_dir, 'primers', 'clst0.8-perc0.8', 'config.yaml')
# Print the config contents for the record
cat_file(config_file)

#-- I/O --#
samples_file: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/LLG_output/samples_genomes_hq.txt

# output location
output_dir: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/primers/clst0.8-perc0.8/

# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/

#-- software parameters --#
# See the README for a description
params:
  ionice: -c 3
  cgp:
    prokka: ""
    mmseqs:
      method: cluster    # or linclust (faster)
      run: --min-seq-id 0.8 -c 0.8
    vsearch: --id 0.94
    core_genes:
      cds: --perc-genomes-cds 80 --copies-per-genome-cds 1 --max-clusters-cds 500
      rRNA: --perc-genomes-rrna 80 --copies-per-genome-rrna 10 --max-clusters-rrna 500
    align:
      method: linsi
      params: --auto --maxiterate 1000
    primer3:
      number: --num-primers 10000 --consensus-threshold 0.3
      size: --opt-size 20 --min-size 18 --max-

### Run

```
(snakemake) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llprimer
$ ./snakemake_sge.sh experiments/christensenellaceae/primers/clst0.8-perc0.8/config.yaml 20 -Fqn
```

# Results

In [13]:
# primers
# Final primer table of the clst0.8-perc0.8 run (two rows per primer set: forward + reverse)
primer_info = file.path(work_dir, 'primers', 'clst0.8-perc0.8', 'cgp', 'primers_final_info.tsv') %>%
    read.delim(sep='\t')
# Report how many distinct primer sets were designed
unique_n(primer_info, 'primer sets', primer_set)
# Display the table
primer_info

No. of unique primer sets: 40 


gene_type,cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,⋯,position_start,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd
<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<int>,⋯,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
cds,5,402,166,166,0,402f,PRIMER_LEFT,CBGGCGGYGTNGTNAAGAA,19,⋯,142,161,61.55090,2.635150,58.77193,5.189544,10.20467,17.76559,0.2535095,26.46468
cds,5,402,166,166,0,402r,PRIMER_RIGHT,GCWACNGGYCCRAARATACG,20,⋯,288,308,59.79884,2.222096,55.00000,5.000000,20.89467,18.59058,-1.8372890,23.07496
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
cds,11,9962,132,132,0,9962f,PRIMER_LEFT,GACCARATGGCHGTDCAYG,19,⋯,1359,1378,58.86172,2.408386,56.14035,5.114870,39.83293,0.5380754,2.239179,11.31561
cds,11,9962,132,132,0,9962r,PRIMER_RIGHT,RACCATRTCCTGYGTMGGS,19,⋯,1472,1491,59.16021,2.727392,57.89474,5.263158,27.49098,21.3391895,-11.843880,21.34608


In [40]:
# target gene info
# Per-sequence info of the core gene clusters, restricted to clusters
# that ended up with at least one primer set in primer_info
gene_annot = file.path(work_dir, 'primers', 'clst0.8-perc0.8', 'cgp', 'core_clusters_info.tsv') %>%
    read.delim(sep='\t') %>%
    semi_join(primer_info, by='cluster_id')
gene_annot

gene_type,cluster_id,seq_uuid,seq_orig_name,contig_id,taxon,start,end,score,strand,annotation,cluster_name,clust_id
<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<int>
cds,5,d68322c8ca364d04828552a0efd486a3,IDAENNDI_01588,JACRSS010000003.1,GCA_014384805.1_ASM1438480v1.fna,304023,304391,.,-,50S ribosomal protein L14,c817bb98130f4bfe85d1460703167e0b,5
cds,5,e8de575f15c84893ae8b2cc66c718df4,GAOLNLCB_00828,FLKP01000002.1,GCA_900087015.1_PRJEB13910.fna,568960,569328,.,+,50S ribosomal protein L14,c817bb98130f4bfe85d1460703167e0b,5
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
cds,11,b39721f8437543599e39b83a413827ed,PGOCOPND_00111,GUT_GENOME127701_4,GUT_GENOME127701.fna,4667,8263,.,-,DNA-directed RNA polymerase subunit beta',be25748cb9f14982b9e9c26951a2bb9f,11
cds,11,5ece5e1d2dda400e85cf8f25ff63df03,NGKEMLEB_01211,GUT_GENOME097725_46,GUT_GENOME097725.fna,6033,9602,.,-,DNA-directed RNA polymerase subunit beta',be25748cb9f14982b9e9c26951a2bb9f,11


In [39]:
# non-target gene annotations
# BLASTX hits of the cluster sequences against non-target proteins,
# restricted to clusters that have primers in primer_info
gene_nontarget = file.path(work_dir, 'primers', 'clst0.8-perc0.8', 'cgp', 'nontarget', 'cds_blastx.tsv') %>%
    read.delim(sep='\t') %>%
    semi_join(primer_info, by='cluster_id')
gene_nontarget

cluster_id,query,subject,subject_name,pident,length,mismatch,qstart,qend,sstart,send,evalue,slen,qlen,sscinames,staxids,pident_rank
<int>,<chr>,<chr>,<chr>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<chr>,<chr>,<int>
5,c817bb98130f4bfe85d1460703167e0b,PWM37926.1,50S ribosomal protein L14 [Clostridiales bacterium],90.984,122,11,1,366,1,122,1.47e-62,122,369,Clostridiales bacterium,1898207,1
5,c817bb98130f4bfe85d1460703167e0b,HBU12873.1,50S ribosomal protein L14 [Clostridiales bacterium],86.885,122,16,1,366,1,122,1.19e-61,122,369,Clostridiales bacterium,1898207,2
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
11,be25748cb9f14982b9e9c26951a2bb9f,WP_087679536.1,DNA-directed RNA polymerase subunit beta' [Garciella nitratireducens],73.305,1165,305,4,3498,3,1161,0,1191,3765,Garciella nitratireducens;Garciella nitratireducens DSM 15102,218205;1121911,20
11,be25748cb9f14982b9e9c26951a2bb9f,HHU64062.1,DNA-directed RNA polymerase subunit beta' [Clostridiales bacterium],73.842,1166,302,1,3495,2,1165,0,1179,3765,Clostridiales bacterium,1898207,16


## Primer quality

In [24]:
# summary
# Headline counts: distinct primer sets, and distinct gene clusters they target
unique_n(primer_info, 'primer sets', primer_set)
unique_n(primer_info, 'gene clusters', cluster_id)

No. of unique primer sets: 40 
No. of unique gene clusters: 3 


In [26]:
# primers per cluster
# Number of distinct primer sets designed for each gene cluster
# (each primer set spans two rows, so distinct sets are counted rather than rows)
primer_info %>%
    group_by(cluster_id) %>%
    summarize(n_primer_sets = n_distinct(primer_set), .groups='drop')

Unnamed: 0_level_0,cluster_id,n_primer_sets
Unnamed: 0_level_1,<int>,<int>
1,5,4
2,9,1
3,11,35


In [30]:
# List the columns available in the primer table
colnames(primer_info)

In [31]:
# primer quality
# Amplicon size: the value is repeated on the forward and reverse row of a primer set,
# so only the forward (PRIMER_LEFT) rows are summarized
primer_info %>%
    filter(primer_type == 'PRIMER_LEFT') %>%
    pull(amplicon_size_avg) %>%
    summary_x('mean amplicon size')
# Degeneracy is a per-primer property, so all rows (forward and reverse) are summarized
primer_info$degeneracy %>% summary_x('degeneracy')
primer_info$degeneracy_3prime %>% summary_x('degeneracy (3-prime)')

Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
mean amplicon size,100,129,137,150.35,166,245,49.531,20.221


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
degeneracy,16,36,64,62,96,128,40.393,16.49


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
degeneracy (3-prime),1,2,4,3.75,6,12,3.932,1.605


In [38]:
# arrange
# Temporarily enlarge the table display (arguments presumably rows, columns -- confirm in LeyLabRMisc)
df.dims(20,40)
# Rank primer sets by their worst primer: first by the highest 3'-end degeneracy
# within the set, then by the highest overall degeneracy within the set
primer_info %>%
    group_by(primer_set) %>%
    mutate(max_degeneracy_3prime = max(degeneracy_3prime)) %>%
    mutate(max_degeneracy = max(degeneracy)) %>%
    arrange(max_degeneracy_3prime, max_degeneracy) %>%
    head(20)
# Restore the default display dimensions
df.dims()

gene_type,cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,degeneracy,degeneracy_3prime,position_start,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd,max_degeneracy_3prime,max_degeneracy
<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>
cds,11,1049,184,184,0,1049f,PRIMER_LEFT,AARCGYGTNGAYTATTCCGG,20,32,1,1002,1022,59.0004,2.364568,50.0,5.0,13.721826,20.571387,2.1324727,14.52208,1,72
cds,11,1049,184,184,0,1049r,PRIMER_RIGHT,GVCGBACYTTYTCYACCAT,19,72,1,1167,1186,57.568,2.840075,51.75439,5.752139,7.23861,16.235568,-20.0896822,17.90398,1,72
cds,11,8754,185,185,0,8754f,PRIMER_LEFT,AARCGYGTNGAYTATTCCGG,20,32,1,1002,1022,59.0004,2.364568,50.0,5.0,13.721826,20.571387,2.1324727,14.52208,1,72
cds,11,8754,185,185,0,8754r,PRIMER_RIGHT,GGVCGBACYTTYTCYACCAT,20,72,1,1167,1187,60.1919,2.71436,54.16667,5.464532,22.153767,19.01177,-21.2695423,19.77932,1,72
cds,5,402,166,166,0,402f,PRIMER_LEFT,CBGGCGGYGTNGTNAAGAA,19,96,1,142,161,61.5509,2.63515,58.77193,5.189544,10.204665,17.765594,0.2535095,26.46468,1,96
cds,5,402,166,166,0,402r,PRIMER_RIGHT,GCWACNGGYCCRAARATACG,20,64,1,288,308,59.79884,2.222096,55.0,5.0,20.894667,18.590578,-1.837289,23.07496,1,96
cds,5,6199,164,164,0,6199f,PRIMER_LEFT,GGCGGYGTNGTNAAGAARAA,20,64,2,144,164,59.56126,2.338347,50.0,5.0,9.93412,17.215286,-12.4820591,22.94864,2,64
cds,5,6199,164,164,0,6199r,PRIMER_RIGHT,GCWACNGGYCCRAARATACG,20,64,1,288,308,59.79884,2.222096,55.0,5.0,20.894667,18.590578,-1.837289,23.07496,2,64
cds,9,9711,195,195,0,9711f,PRIMER_LEFT,YGGYGCDACHTAYCAGGT,18,72,1,242,260,58.49755,2.755565,56.48148,6.071702,27.525254,25.320481,-0.0674942,14.6949,2,72
cds,9,9711,195,195,0,9711r,PRIMER_RIGHT,TCNGCCATYYTRTGCATWTC,20,64,2,417,437,56.71061,2.310079,45.0,5.0,12.004334,17.85578,-6.8335623,15.52953,2,72


### Gene cluster annotations

In [42]:
# One row per (cluster, annotation) pair: which gene each primer-bearing cluster encodes
distinct(gene_annot, cluster_id, annotation)

cluster_id,annotation
<int>,<chr>
5,50S ribosomal protein L14
9,30S ribosomal protein S7
11,DNA-directed RNA polymerase subunit beta'


### Most unique clusters

In [21]:
# most unique clusters
# For each cluster keep its rank-1 non-target BLASTX hit (pident_rank == 1) and list
# the clusters with the lowest percent identity first, i.e. those least similar to
# any non-target protein.
# Reads gene_nontarget: the pident / pident_rank columns exist only in the BLASTX table,
# not in gene_annot, so filtering gene_annot fails on a fresh kernel.
df.dims(10)
gene_nontarget %>%
    filter(pident_rank == 1) %>%
    arrange(pident) %>%
    head(n=10)
# Restore the default display dimensions
df.dims()

Unnamed: 0_level_0,cluster_id,query,subject,subject_name,pident,length,mismatch,qstart,qend,sstart,send,evalue,slen,qlen,sscinames,staxids,pident_rank
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<chr>,<chr>,<int>
1,11,be25748cb9f14982b9e9c26951a2bb9f,PWM39277.1,DNA-directed RNA polymerase subunit beta' [Clostridiales bacterium],82.95,1173,199,4,3522,3,1174,0.0,1197,3765,Clostridiales bacterium,1898207,1
2,5,c817bb98130f4bfe85d1460703167e0b,PWM37926.1,50S ribosomal protein L14 [Clostridiales bacterium],90.984,122,11,1,366,1,122,1.47e-62,122,369,Clostridiales bacterium,1898207,1
3,9,6a192bf234e3490f9fb15bda4d790b5a,WP_046442671.1,30S ribosomal protein S7 [Catabacter hongkongensis],94.872,156,8,1,468,1,156,2.9199999999999998e-108,156,471,Catabacter hongkongensis,270498,1
