# Goal

* Design primers for the Christensenellales genomes with the llprimer pipeline

# Variables

In [1]:
# Root directory of this experiment; every other path in the notebook is built from it
work_dir = '/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellales/'
# Genome table (one row per genome), derived from work_dir so that relocating the
# experiment only requires editing one path. work_dir ends in '/', so file.path()
# produces a doubled slash here; that is harmless on POSIX file systems.
genomes_file = file.path(work_dir, 'genomes', 'LLG_output', 'samples_genomes_hq.txt')
# params
threads = 8

# Init

In [2]:
library(dplyr)
library(tidyr)
library(data.table)
library(tidytable)
library(ggplot2)
library(LeyLabRMisc)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last



Attaching package: ‘tidytable’


The following object is masked from ‘package:stats’:

    dt




In [3]:
# Session setup. df.dims() and make_dir() are not defined in this notebook
# (presumably LeyLabRMisc helpers -- TODO confirm).
# No-argument call: assumed to (re)set the data.frame display limits to the helper's defaults
df.dims()
# Let data.table use the thread count chosen in the variables cell
setDTthreads(threads)
# Ensure the working directory exists (the recorded output shows it only reports when it already does)
make_dir(work_dir)

Directory already exists: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellales/ 


# Load

In [4]:
# Read the tab-delimited genome table (columns per the recorded output:
# Taxon, Fasta, Domain, Taxid) and display it
genomes <- read.delim(file = genomes_file, sep = '\t')
genomes

Taxon,Fasta,Domain,Taxid
<chr>,<chr>,<chr>,<int>
1006216__metabat2__High.023.fna.gz,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellales/genomes/LLG_output/drep/drep/dereplicated_genomes/1006216__metabat2__High.023.fna.gz.fna,d__Bacteria,1262781
ERR3450721__metabat2__High.043.fna.gz,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellales/genomes/LLG_output/drep/drep/dereplicated_genomes/ERR3450721__metabat2__High.043.fna.gz.fna,d__Bacteria,1262781
⋮,⋮,⋮,⋮
MI-326-H__metabat2_low_PE.034.fna.gz,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellales/genomes/LLG_output/drep/drep/dereplicated_genomes/MI-326-H__metabat2_low_PE.034.fna.gz.fna,d__Bacteria,1262897
MI-408-H__metabat2_high_PE.004.fna.gz,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellales/genomes/LLG_output/drep/drep/dereplicated_genomes/MI-408-H__metabat2_high_PE.004.fna.gz.fna,d__Bacteria,1898207


# LLPRIMER

### Config

In [5]:
# Print the llprimer config used for this run.
# The path is stored in `config_file` rather than `F`: in R, `F` is an ordinary
# variable that is pre-set to FALSE, so assigning to it silently changes the
# meaning of `F` for every later cell in the session.
# NOTE(review): this config lives under 'clst0.8-perc0.9', but the `output_dir` it
# prints and the paths read in the Results cells use 'clst0.8-perc0.8' -- confirm
# which directory is intended.
config_file = file.path(work_dir, 'primers', 'clst0.8-perc0.9', 'config.yaml')
cat_file(config_file)

#-- I/O --#
samples_file: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellales/genomes//LLG_output/samples_genomes_hq.txt

# output location
output_dir: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellales/primers/clst0.8-perc0.8/

# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/

#-- software parameters --#
# See the README for a description
params:
  ionice: -c 3
  cgp:
    prokka: ""
    mmseqs:
      method: cluster    # or linclust (faster)
      run: --min-seq-id 0.8 -c 0.8
    vsearch: --id 0.94
    core_genes:
      cds: --perc-genomes-cds 90 --copies-per-genome-cds 1 --max-clusters-cds 500
      rRNA: --perc-genomes-rrna 90 --copies-per-genome-rrna 10 --max-clusters-rrna 500
    align:
      method: linsi
      params: --auto --maxiterate 1000
    primer3:
      consensus: --consensus-threshold 0.34
      number: --num-raw-primers 5000 --num-final-primers 20
   

### Run

```
(snakemake) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llprimer
$ screen -L -S llprimer-christ ./snakemake_sge.sh experiments/christensenellales/primers/clst0.8-perc0.9/config.yaml 30 -F
```

# --WAITING--

# Results

In [None]:
# primers
primer_info = read.delim(file.path(work_dir, 'primers', 'clst0.8-perc0.8', 'cgp', 'primers_final_info.tsv'), sep='\t')
primer_info %>% unique_n('primer sets', primer_set)
primer_info

In [None]:
# target gene info
gene_annot = read.delim(file.path(work_dir, 'primers', 'clst0.8-perc0.8', 'cgp', 'core_clusters_info.tsv'), 
                        sep='\t') %>%
    semi_join(primer_info, c('cluster_id')) 
gene_annot

In [None]:
# non-target gene annotations
gene_nontarget = read.delim(file.path(work_dir, 'primers', 'clst0.8-perc0.8', 'cgp', 'nontarget', 'cds_blastx.tsv'), 
                        sep='\t') %>%
    semi_join(primer_info, c('cluster_id')) 
gene_nontarget

## Primer quality

In [None]:
# summary
primer_info %>% unique_n('primer sets', primer_set)
primer_info %>% unique_n('gene clusters', cluster_id)

In [None]:
# primers per cluster
primer_info %>%
    distinct(cluster_id, primer_set) %>%
    group_by(cluster_id) %>%
    summarize(n_primer_sets = n(), .groups='drop')

In [None]:
# List the columns available in the primer table
colnames(primer_info)

In [None]:
# primer quality
primer_info %>% filter(primer_type == 'PRIMER_LEFT') %>% .$amplicon_size_avg %>% summary_x('mean amplicon size')
primer_info %>% .$degeneracy %>% summary_x('degeneracy')
primer_info %>% .$degeneracy_3prime %>% summary_x('degeneracy (3-prime)')

In [None]:
# arrange
df.dims(30,40)
primer_info %>%
    group_by(primer_set) %>%
    mutate(max_degeneracy_3prime = max(degeneracy_3prime),
           max_degeneracy = max(degeneracy)) %>%
    arrange(max_degeneracy_3prime, max_degeneracy) %>%
    head(n=30)
df.dims()

### Gene cluster annotations

In [None]:
# One row per (cluster, annotation) pair among the primer-targeted clusters
distinct(gene_annot, cluster_id, annotation)

### Most unique clusters

In [None]:
# most unique clusters
df.dims(10)
gene_nontarget %>%
    filter(pident_rank == 1) %>%
    arrange(pident) %>%
    head(n=10)
df.dims()