# Goal

* Designing primers for Methanothermobacter
  * Download genomes
  * QC genomes
  * Design primers

# Var

In [1]:
work_dir = '/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/'
clade = 'Methanothermobacter'
taxid = 145260

# Init

In [2]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(LeyLabRMisc)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [3]:
df.dims()
make_dir(work_dir)

Directory already exists: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter 


# Genome download

* Downloading genomes from NCBI

```
OUTDIR=/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/
mkdir -p $OUTDIR
ncbi-genome-download -p 12 -s genbank -F fasta --genera Methanothermobacter -o $OUTDIR archaea
```

# Genome quality

In [4]:
D = file.path(work_dir, 'genbank')
files = list_files(D, '.fna.gz')
samps = data.frame(Name = files %>% as.character %>% basename,
                   Fasta = files,
                   Domain = 'Archaea',
                   Taxid = taxid) %>%
    mutate(Name = gsub('\\.fna\\.gz$', '', Name),
           Fasta = gsub('/+', '/', Fasta))
samps

# writing file
outfile = file.path(work_dir, 'genomes_raw.txt')
write_table(samps, outfile)

Name,Fasta,Domain,Taxid
<chr>,<chr>,<fct>,<dbl>
GCA_000008645.1_ASM864v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/genbank/archaea/GCA_000008645.1/GCA_000008645.1_ASM864v1_genomic.fna.gz,Archaea,145260
GCA_000145295.1_ASM14529v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/genbank/archaea/GCA_000145295.1/GCA_000145295.1_ASM14529v1_genomic.fna.gz,Archaea,145260
⋮,⋮,⋮,⋮
GCA_014889545.1_ASM1488954v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/genbank/archaea/GCA_014889545.1/GCA_014889545.1_ASM1488954v1_genomic.fna.gz,Archaea,145260
GCA_900095815.1_SIV6_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/genbank/archaea/GCA_900095815.1/GCA_900095815.1_SIV6_genomic.fna.gz,Archaea,145260


File written: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/genomes_raw.txt 


## LLG

#### Config

In [None]:
cat_file(file.path(work_dir, 'config_llg.yaml'))

#### Run

```
(snakemake) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llg
$ screen -L -S llg-thermo ./snakemake_sge.sh /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/config_llg.yaml 30 -F
```

### Samples table of high-quality genomes

In [5]:
# checkM summary
checkm = file.path(work_dir, 'LLG_output', 'checkM', 'checkm_qa_summary.tsv') %>%
    read.delim(sep='\t') 
checkm

Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,Strain.heterogeneity,Genome.size..bp.,X..ambiguous.bases,⋯,X0,X1,X2,X3,X4,X5.,assembly.Id,assembler.Id,taxon.Id,File
<fct>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<fct>,<fct>,<lgl>,<fct>
GCA_000008645.1_ASM864v1_genomic,p__Euryarchaeota (UID3),148,188,125,100,0.31,16.67,1751377,0,⋯,0,184,3,1,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|methanothermobacter|LLG_output|checkM|1|checkm|markers_qa_summary.tsv.1,markers_qa_summary.tsv.1,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/checkM/1/checkm/markers_qa_summary.tsv.1
GCA_000828575.1_ASM82857v1_genomic,p__Euryarchaeota (UID3),148,188,125,100,0.00,0.00,1731018,0,⋯,0,188,0,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|methanothermobacter|LLG_output|checkM|1|checkm|markers_qa_summary.tsv.2,markers_qa_summary.tsv.2,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/checkM/1/checkm/markers_qa_summary.tsv.2
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
GCA_014361435.1_ASM1436143v1_genomic,p__Euryarchaeota (UID3),148,188,125,97.6,2.87,44.44,1612916,30,⋯,3,176,9,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|methanothermobacter|LLG_output|checkM|2|checkm|markers_qa_summary.tsv.9,markers_qa_summary.tsv.9,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/checkM/2/checkm/markers_qa_summary.tsv.9
GCA_900095815.1_SIV6_genomic,p__Euryarchaeota (UID3),148,188,125,100.0,0.00,0.00,1686891,0,⋯,0,188,0,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|methanothermobacter|LLG_output|checkM|2|checkm|markers_qa_summary.tsv.10,markers_qa_summary.tsv.10,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/checkM/2/checkm/markers_qa_summary.tsv.10


In [6]:
# dRep summary
drep = file.path(work_dir, 'LLG_output', 'drep', 'checkm_markers_qa_summary.tsv') %>%
    read.delim(sep='\t') %>%
    mutate(Bin.Id = gsub('.+/', '', genome),
           Bin.Id = gsub('\\.fna$', '', Bin.Id))
drep

genome,completeness,contamination,Bin.Id
<fct>,<dbl>,<dbl>,<chr>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000008645.1_ASM864v1_genomic.fna,100,0.31,GCA_000008645.1_ASM864v1_genomic
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000828575.1_ASM82857v1_genomic.fna,100,0.00,GCA_000828575.1_ASM82857v1_genomic
⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014361435.1_ASM1436143v1_genomic.fna,97.6,2.87,GCA_014361435.1_ASM1436143v1_genomic
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_900095815.1_SIV6_genomic.fna,100.0,0.00,GCA_900095815.1_SIV6_genomic


In [7]:
# de-replicated genomes
drep_gen = file.path(work_dir, 'LLG_output', 'drep', 'dereplicated_genomes.tsv') %>%
    read.delim(sep='\t')
drep_gen

Name,Fasta
<fct>,<fct>
GCA_012521115.1_ASM1252111v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/drep/drep/dereplicated_genomes/GCA_012521115.1_ASM1252111v1_genomic.fna
GCA_900095815.1_SIV6_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/drep/drep/dereplicated_genomes/GCA_900095815.1_SIV6_genomic.fna
⋮,⋮
GCA_003264935.1_ASM326493v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/drep/drep/dereplicated_genomes/GCA_003264935.1_ASM326493v1_genomic.fna
GCA_012840175.1_ASM1284017v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/drep/drep/dereplicated_genomes/GCA_012840175.1_ASM1284017v1_genomic.fna


In [8]:
# GTDBTk summary
tax = file.path(work_dir, 'LLG_output', 'gtdbtk', 'gtdbtk_ar_summary.tsv') %>%
    read.delim(, sep='\t') %>%
    separate(classification, 
             c('Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species'),
             sep=';') %>%
    select(-note, -classification_method, -pplacer_taxonomy,
           -other_related_references.genome_id.species_name.radius.ANI.AF.)
tax

user_genome,Domain,Phylum,Class,Order,Family,Genus,Species,fastani_reference,fastani_reference_radius,⋯,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings
<fct>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<fct>,<fct>,⋯,<fct>,<fct>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<int>,<fct>,<fct>
GCA_000008645.1_ASM864v1_genomic,d__Archaea,p__Methanobacteriota,c__Methanobacteria,o__Methanobacteriales,f__Methanothermobacteraceae,g__Methanothermobacter,s__Methanothermobacter thermautotrophicus,GCF_000008645.1,95.0,⋯,1.0,GCF_000008645.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter thermautotrophicus,100,1,95.73,11,,
GCA_000828575.1_ASM82857v1_genomic,d__Archaea,p__Methanobacteriota,c__Methanobacteria,o__Methanobacteriales,f__Methanothermobacteraceae,g__Methanothermobacter,s__Methanothermobacter sp000828575,GCF_000828575.1,95.0,⋯,1.0,GCF_000828575.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter sp000828575,100,1,97.19,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
GCA_014361435.1_ASM1436143v1_genomic,d__Archaea,p__Methanobacteriota,c__Methanobacteria,o__Methanobacteriales,f__Methanothermobacteraceae,g__Methanothermobacter,s__Methanothermobacter thermautotrophicus,GCF_000008645.1,95.0,⋯,0.97,GCF_000008645.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter thermautotrophicus,97.33,0.97,88.56,11,,
GCA_900095815.1_SIV6_genomic,d__Archaea,p__Methanobacteriota,c__Methanobacteria,o__Methanobacteriales,f__Methanothermobacteraceae,g__Methanothermobacter,s__Methanothermobacter wolfeii,GCF_900095815.1,95.0,⋯,1.0,GCF_900095815.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter wolfeii,100.00,1.00,96.45,11,,


In [9]:
# checking overlap
cat('-- drep --\n')
overlap(basename(as.character(drep_gen$Fasta)), 
        basename(as.character(drep$genome)))
cat('-- checkm --\n')
overlap(drep$Bin.Id, checkm$Bin.Id)
cat('-- gtdbtk --\n')
overlap(drep$Bin.Id, tax$user_genome)

-- drep --
intersect(x,y): 16 
setdiff(x,y): 0 
setdiff(y,x): 4 
union(x,y): 20 
-- checkm --
intersect(x,y): 20 
setdiff(x,y): 0 
setdiff(y,x): 0 
union(x,y): 20 
-- gtdbtk --
intersect(x,y): 20 
setdiff(x,y): 0 
setdiff(y,x): 0 
union(x,y): 20 


In [10]:
# joining based on Bin.Id
drep = drep %>%
    inner_join(checkm, c('Bin.Id')) %>%
    mutate(GEN = genome %>% as.character %>% basename) %>%
    inner_join(drep_gen %>% mutate(GEN = Fasta %>% as.character %>% basename),
               by=c('GEN')) %>%
    inner_join(tax, c('Bin.Id'='user_genome')) #%>%
drep

genome,completeness,contamination,Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,⋯,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings
<fct>,<dbl>,<dbl>,<chr>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<fct>,<fct>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<int>,<fct>,<fct>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000008645.1_ASM864v1_genomic.fna,100,0.31,GCA_000008645.1_ASM864v1_genomic,p__Euryarchaeota (UID3),148,188,125,100,0.31,⋯,1.0,GCF_000008645.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter thermautotrophicus,100,1,95.73,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000828575.1_ASM82857v1_genomic.fna,100,0.00,GCA_000828575.1_ASM82857v1_genomic,p__Euryarchaeota (UID3),148,188,125,100,0.00,⋯,1.0,GCF_000828575.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter sp000828575,100,1,97.19,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014361435.1_ASM1436143v1_genomic.fna,97.6,2.87,GCA_014361435.1_ASM1436143v1_genomic,p__Euryarchaeota (UID3),148,188,125,97.6,2.87,⋯,0.97,GCF_000008645.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter thermautotrophicus,97.33,0.97,88.56,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_900095815.1_SIV6_genomic.fna,100.0,0.00,GCA_900095815.1_SIV6_genomic,p__Euryarchaeota (UID3),148,188,125,100.0,0.00,⋯,1.0,GCF_900095815.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter wolfeii,100.00,1.00,96.45,11,,


In [11]:
# filtering by quality
hq_genomes = drep %>%
    filter(completeness >= 90,
           contamination < 5,
           Strain.heterogeneity < 50)
hq_genomes

genome,completeness,contamination,Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,⋯,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings
<fct>,<dbl>,<dbl>,<chr>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<fct>,<fct>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<int>,<fct>,<fct>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000008645.1_ASM864v1_genomic.fna,100,0.31,GCA_000008645.1_ASM864v1_genomic,p__Euryarchaeota (UID3),148,188,125,100,0.31,⋯,1.0,GCF_000008645.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter thermautotrophicus,100,1,95.73,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000828575.1_ASM82857v1_genomic.fna,100,0.00,GCA_000828575.1_ASM82857v1_genomic,p__Euryarchaeota (UID3),148,188,125,100,0.00,⋯,1.0,GCF_000828575.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter sp000828575,100,1,97.19,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014361435.1_ASM1436143v1_genomic.fna,97.6,2.87,GCA_014361435.1_ASM1436143v1_genomic,p__Euryarchaeota (UID3),148,188,125,97.6,2.87,⋯,0.97,GCF_000008645.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter thermautotrophicus,97.33,0.97,88.56,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_900095815.1_SIV6_genomic.fna,100.0,0.00,GCA_900095815.1_SIV6_genomic,p__Euryarchaeota (UID3),148,188,125,100.0,0.00,⋯,1.0,GCF_900095815.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter wolfeii,100.00,1.00,96.45,11,,


In [12]:
hq_genomes %>% colnames

In [15]:
hq_genomes %>% summary_x('genome size', Genome.size..bp.)
hq_genomes %>% summary_x('contigs', X..contigs)
hq_genomes %>% summary_x('No. of genes', X..predicted.genes)
hq_genomes %>% summary_x('coding density', Coding.density)

Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
genome size,1423235,1544220,1670554,1633141,1721110,1751377,122128.9,49858.92


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
contigs,1,1,2,49.75,21.5,331,130.382,53.228


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
No. of genes,1571,1743.25,1793,1772,1841.25,1889,109.607,44.747


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
coding density,89.25,89.4875,89.965,89.99062,90.38,91.02,0.634,0.259


In [16]:
# summarizing the taxonomy
df.dims(20)
hq_genomes %>%
    group_by(Family, Genus) %>%
    summarize(n_genomes = n(), .groups='drop')
df.dims()

Family,Genus,n_genomes
<chr>,<chr>,<int>
f__Methanothermobacteraceae,g__Methanothermobacter,13
f__Methanothermobacteraceae_A,g__Methanothermobacter_A,3


In [18]:
# writing samples table for LLPRIMER
outfile = file.path(work_dir, 'samples_genomes_hq.txt')
hq_genomes %>%
    select(Bin.Id, Fasta) %>%
    rename('Taxon' = Bin.Id) %>%
    mutate(Taxon = gsub('_chromosome.+', '', Taxon),
           Taxon = gsub('_bin_.+', '', Taxon),
           Taxon = gsub('_genomic', '', Taxon),
           Taxon = gsub('_annotated_assembly', '', Taxon),
           Taxid = taxid,
           Domain = 'Archaea') %>%
    write_table(outfile)

File written: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/samples_genomes_hq.txt 


# Primer design

## cluster cutoff = 0.9

### Config

In [19]:
F = file.path(work_dir, 'primers', 'clust0.9', 'config.yaml')
cat_file(F)

#-- I/O --#
samples_file: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/samples_genomes_hq.txt 

# output location
output_dir: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/primers/clust0.9/

# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/

#-- software parameters --#
# See the README for a description
params:
  ionice: -c 3
  cgp:
    prokka: ""
    mmseqs:
      method: cluster    # or linclust (faster)
      run: --min-seq-id 0.9 -c 0.8
    vsearch: --id 0.94
    core_genes:
      cds: --perc-genomes-cds 100 --copies-per-genome-cds 1 --max-clusters-cds 500
      rRNA: --perc-genomes-rrna 100 --copies-per-genome-rrna 10 --max-clusters-rrna 500
    align:
      method: linsi
      params: --auto --maxiterate 1000
    primer3:
      number: --num-primers 500
      size: --opt-size 20 --min-size 18 --max-size 24
      product: --opt-prod-size 150 --min-pr

### Run

```
(snakemake) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llprimer
$ screen -L  -S llprimer-thermo ./snakemake_sge.sh /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/primers/clust0.9/config.yaml 30 -F
```

### Summary

In [32]:
# table on final primer metadata
F = file.path(work_dir, 'primers', 'clust0.9', 'cgp', 'primers_final_info.tsv')
info = read.delim(F, sep='\t')
info

gene_type,cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,⋯,position_start,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd
<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<int>,⋯,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
cds,5,333,130,130,0,333f,PRIMER_LEFT,CTNAARAGRACCGGVATGAC,20,⋯,36,56,57.12141,2.005692,50.83333,4.930066,0.00000,0.00000,4.884934,4.791501
cds,5,333,130,130,0,333r,PRIMER_RIGHT,GDAGCATBAGKATRTCGCCT,20,⋯,146,166,57.76496,2.461643,50.00000,4.859127,30.85192,16.69434,-16.710578,14.804801
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
cds,9,321,143,143,0,321f,PRIMER_LEFT,CACCMTTCGCTGCMATGGA,19,⋯,154,173,61.36825,1.352074,57.89474,3.721615,23.78751,24.003296,18.82438,12.504622
cds,9,321,143,143,0,321r,PRIMER_RIGHT,YCTTATRGCYGCYTGTGCAC,20,⋯,277,297,60.64556,2.193200,55.00000,5.000000,41.99857,3.357727,12.39512,3.385369


In [36]:
info$primer_set %>% unique_n('Primer sets')
info$cluster_id %>% unique_n('Gene clusters')
info %>%
    group_by(cluster_id) %>%
    summarize(n_primer_sets = primer_set %>% unique %>% length,
              .groups='drop') %>%
    .$n_primer_sets %>% summary_x('Primer sets per gene cluster')

No. of unique Primer sets: 8 
No. of unique Gene clusters: 3 


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
Primer sets per gene cluster,2,2,2,2.666667,3,4,0.8,0.327


In [34]:
# primer sets per gene type
info %>%
    distinct(gene_type, cluster_id, primer_set) %>%
    group_by(gene_type) %>%
    summarize(n_primer_sets = primer_set %>% unique %>% length,
              .groups='drop') 

gene_type,n_primer_sets
<fct>,<int>
cds,8


In [37]:
df.dims(10, 30)
info %>%
    head(n=10)
df.dims()

Unnamed: 0_level_0,gene_type,cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,degeneracy,degeneracy_3prime,position_start,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd
Unnamed: 0_level_1,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,cds,5,333,130,130,0,333f,PRIMER_LEFT,CTNAARAGRACCGGVATGAC,20,48,1,36,56,57.12141,2.005692,50.83333,4.930066,0.0,0.0,4.884934,4.791501
2,cds,5,333,130,130,0,333r,PRIMER_RIGHT,GDAGCATBAGKATRTCGCCT,20,36,1,146,166,57.76496,2.461643,50.0,4.859127,30.85192,16.69434,-16.710578,14.804801
3,cds,5,334,131,131,0,334f,PRIMER_LEFT,CTNAARAGRACCGGVATGAC,20,48,1,36,56,57.12141,2.005692,50.83333,4.930066,0.0,0.0,4.884934,4.791501
4,cds,5,334,131,131,0,334r,PRIMER_RIGHT,AGDAGCATBAGKATRTCGCC,20,36,1,147,167,57.76496,2.461643,50.0,4.859127,20.09595,18.08273,-15.538548,14.168021
5,cds,6,213,190,190,0,213f,PRIMER_LEFT,GCCAGGAYTWTTCGCAGCAA,20,4,1,2,22,61.03726,1.028183,52.5,2.5,0.0,0.0,-22.590589,0.0
6,cds,6,213,190,190,0,213r,PRIMER_RIGHT,YCTTATDGCYGAGTTBGGCTGT,22,36,1,170,192,61.90299,1.866712,50.0,4.417388,44.21936,25.45319,6.06944,24.252183
7,cds,6,214,192,192,0,214f,PRIMER_LEFT,TTGCCAGGAYTWTTCGCAGC,20,4,1,0,20,61.03726,1.028183,52.5,2.5,0.0,0.0,-22.590589,0.0
8,cds,6,214,192,192,0,214r,PRIMER_RIGHT,YCTTATDGCYGAGTTBGGCTGT,22,36,1,170,192,61.90299,1.866712,50.0,4.417388,44.21936,25.45319,6.06944,24.252183
9,cds,6,342,191,191,0,342f,PRIMER_LEFT,GCCAGGAYTWTTCGCAGCAA,20,4,1,2,22,61.03726,1.028183,52.5,2.5,0.0,0.0,-22.590589,0.0
10,cds,6,342,191,191,0,342r,PRIMER_RIGHT,TYCTTATDGCYGAGTTBGGCTG,22,36,1,171,193,61.24301,1.992234,50.0,4.417388,44.21936,25.45319,6.946059,23.189566


In [40]:
# general stats
info$amplicon_size_avg %>% summary_x('amplicon_size_avg')
info$degeneracy %>% summary_x('degen')
info$Tm_avg %>% summary_x('Tm-avg')
info$Tm_sd %>% summary_x('Tm-sd')
info$GC_avg %>% summary_x('GC-avg')
info$GC_sd %>% summary_x('GC-sd')

Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
amplicon_size_avg,130,139.25,166.5,164,191.25,193,25.917,10.58


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
degen,4,4,26,22.5,36,48,17.46,7.128


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
Tm-avg,57.12141,59.92541,61.03726,60.29585,61.27432,61.90299,1.691,0.69


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
Tm-sd,1.028183,1.271101,1.929473,1.736803,2.035822,2.461643,0.524,0.214


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
GC-avg,50,50,51.66667,52.49726,53.125,57.89474,2.922,1.193


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
GC-sd,2.5,3.416211,4.417388,4.001331,4.859127,5,0.948,0.387


#### Best primer sets

In [74]:
# blastx on non-targets
F = file.path(work_dir, 'primers', 'clust0.9', 'cgp', 'nontarget', 'cds_blastx.tsv')
cds_blast = read.delim(F, sep='\t') %>%
    filter(!grepl('Methanothermobacter', sscinames)) %>%
    inner_join(info %>% 
                  filter(gene_type == 'cds') %>%
                  select(cluster_id),
              c('cluster_id')) %>%
    mutate(pident = ifelse(is.na(pident), 0, pident))
cds_blast 

cluster_id,query,subject,subject_name,pident,length,mismatch,qstart,qend,sstart,send,evalue,slen,qlen,sscinames,staxids,pident_rank
<int>,<fct>,<fct>,<fct>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<fct>,<fct>,<int>
5,f570da5737fe43bbbd62003d768ba21e,WP_040681975.1,30S ribosomal protein S28e [Methanobrevibacter boviskoreani],95.588,68,3,1,204,1,68,3.04e-38,76,207,Methanobrevibacter boviskoreani JH1;Methanobrevibacter boviskoreani,1214066;1348249,1
5,f570da5737fe43bbbd62003d768ba21e,WP_040681975.1,30S ribosomal protein S28e [Methanobrevibacter boviskoreani],95.588,68,3,1,204,1,68,3.04e-38,76,207,Methanobrevibacter boviskoreani JH1;Methanobrevibacter boviskoreani,1214066;1348249,1
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
9,30c7323605f94b65b547d2f2a4aa6bdb,AXV38867.1,30S ribosomal protein S11 [Methanobacterium sp. BAmetb5],88.235,119,12,1,351,1,119,1.31e-21,132,393,Methanobacterium sp. BAmetb5,2025351,19
9,30c7323605f94b65b547d2f2a4aa6bdb,AXV38867.1,30S ribosomal protein S11 [Methanobacterium sp. BAmetb5],88.235,119,12,1,351,1,119,1.31e-21,132,393,Methanobacterium sp. BAmetb5,2025351,19


In [75]:
# clusters with the most distant non-target relatives
cds_blast %>%
    group_by(cluster_id) %>%
    mutate(max_pident = max(pident)) %>% 
    .$max_pident %>% summary_x('Clostest non-target (seqid)')

Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
Clostest non-target (seqid),95.588,96.33275,97.936,97.68775,99.291,99.291,1.513,0.618


In [77]:
# clusters with the most distant non-target relatives
cds_blast_f = cds_blast %>%
    group_by(cluster_id) %>%
    mutate(max_pident = max(pident)) %>%
    ungroup() %>%
    filter(max_pident < 96)
cds_blast_f %>% unique_n('clusters', cluster_id)

No. of unique clusters: 1 


In [86]:
# filtering primers
info_f = info %>%
    filter(gene_type == 'cds') %>%
    semi_join(cds_blast_f, c('cluster_id')) %>%
    filter(degeneracy <= 32, Tm_sd <= 2) %>%
    group_by(gene_type, cluster_id, primer_set) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    filter(n == 2) 

df.dims(30,30)
info_f %>%
    arrange(Tm_sd, degeneracy, primer_set) %>%
    head(n=30)
df.dims()

gene_type,cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,degeneracy,degeneracy_3prime,position_start,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd,n
<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>


## cluster cutoff = 0.8

### Config

In [90]:
F = file.path(work_dir, 'primers', 'clust0.8', 'config.yaml')
cat_file(F)

#-- I/O --#
samples_file: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/samples_genomes_hq.txt 

# output location
output_dir: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/primers/clust0.8/

# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/

#-- software parameters --#
# See the README for a description
params:
  ionice: -c 3
  cgp:
    prokka: ""
    mmseqs:
      method: cluster    # or linclust (faster)
      run: --min-seq-id 0.8 -c 0.8
    vsearch: --id 0.94
    core_genes:
      cds: --perc-genomes-cds 100 --copies-per-genome-cds 1 --max-clusters-cds 500
      rRNA: --perc-genomes-rrna 100 --copies-per-genome-rrna 10 --max-clusters-rrna 500
    align:
      method: linsi
      params: --auto --maxiterate 1000
    primer3:
      number: --num-primers 500
      size: --opt-size 20 --min-size 18 --max-size 24
      product: --opt-prod-size 150 --min-pr

### Run

```
(snakemake) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llprimer
$ screen -L  -S llprimer-thermo ./snakemake_sge.sh /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/primers/clust0.8/config.yaml 30 -F
```

### Summary

In [128]:
# table on final primer metadata
F = file.path(work_dir, 'primers', 'clust0.8', 'cgp', 'primers_final_info.tsv')
info = read.delim(F, sep='\t')
info

gene_type,cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,⋯,position_start,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd
<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<int>,⋯,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
cds,3,213,190,190,0,213f,PRIMER_LEFT,GCCAGGAYTWTTCGCAGCAA,20,⋯,2,22,61.03726,1.028183,52.5,2.500000,0.00000,0.00000,-22.59059,0.00000
cds,3,213,190,190,0,213r,PRIMER_RIGHT,YCTTATDGCYGAGTTBGGCTGT,22,⋯,170,192,61.90299,1.866712,50.0,4.417388,44.21936,25.45319,6.06944,24.25218
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
cds,125,408,109,109,0,408f,PRIMER_LEFT,HGGRCCHTGYAARACAGAGA,20,⋯,143,163,59.21052,2.552145,50.83333,5.464532,16.807718,23.77506,0.7344449,19.06387
cds,125,408,109,109,0,408r,PRIMER_RIGHT,RTGBCCYTGNACCTCTGAAC,20,⋯,232,252,59.95410,2.170267,55.83333,4.930066,7.937665,18.16043,-15.3833549,20.93216


In [129]:
# table on final primer metadata
F = file.path(work_dir, 'primers', 'clust0.8', 'cgp', 'core_clusters_info.tsv')
clst = read.delim(F, sep='\t')
clst

gene_type,cluster_id,seq_uuid,seq_orig_name,contig_id,taxon,start,end,score,strand,annotation,cluster_name,clust_id
<fct>,<int>,<fct>,<fct>,<fct>,<fct>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<int>
cds,3,d85616f3d68f4455844f7755aa79b6c9,HPDPEDLM_01030,AE000666.1,GCA_000008645.1_ASM864v1,962996,963421,.,+,30S ribosomal protein S12,d16595ef8d514d03ac5bbb85608b03a8,3
cds,3,e3f1768c4cbf472f9badc186476f4384,NBKGJNCK_00972,AP011952.1,GCA_000828575.1_ASM82857v1,945750,946175,.,+,30S ribosomal protein S12,d16595ef8d514d03ac5bbb85608b03a8,3
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
cds,125,040c562ea89c4d84a09638e52f565ea9,MOKPFFML_01010,JACIVR010000031.1,GCA_014361435.1_ASM1436143v1,2157,2873,.,-,Tetrahydromethanopterin S-methyltransferase subunit A 1,b570f8fcf92943edad36db04c6b5605e,125
cds,125,8029893a987a4bbaabc9d94cbd95122f,LEKEIHCA_01522,LT608329.1,GCA_900095815.1_SIV6,1438914,1439630,.,-,Tetrahydromethanopterin S-methyltransferase subunit A 1,b570f8fcf92943edad36db04c6b5605e,125


In [130]:
info$primer_set %>% unique_n('Primer sets')
info$cluster_id %>% unique_n('Gene clusters')
info %>%
    group_by(cluster_id) %>%
    summarize(n_primer_sets = primer_set %>% unique %>% length,
              .groups='drop') %>%
    .$n_primer_sets %>% summary_x('Primer sets per gene cluster')

No. of unique Primer sets: 66 
No. of unique Gene clusters: 18 


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
Primer sets per gene cluster,1,1,2,3.833333,4,24,8.93,3.645


In [131]:
# primer sets per gene type
info %>%
    distinct(gene_type, cluster_id, primer_set) %>%
    group_by(gene_type) %>%
    summarize(n_primer_sets = primer_set %>% unique %>% length,
              .groups='drop') 

gene_type,n_primer_sets
<fct>,<int>
cds,66


In [132]:
df.dims(10, 30)
info %>%
    head(n=10)
df.dims()

Unnamed: 0_level_0,gene_type,cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,degeneracy,degeneracy_3prime,position_start,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd
Unnamed: 0_level_1,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,cds,3,213,190,190,0,213f,PRIMER_LEFT,GCCAGGAYTWTTCGCAGCAA,20,4,1,2,22,61.03726,1.028183,52.5,2.5,0.0,0.0,-22.590589,0.0
2,cds,3,213,190,190,0,213r,PRIMER_RIGHT,YCTTATDGCYGAGTTBGGCTGT,22,36,1,170,192,61.90299,1.866712,50.0,4.417388,44.21936,25.45319,6.06944,24.25218
3,cds,3,214,192,192,0,214f,PRIMER_LEFT,TTGCCAGGAYTWTTCGCAGC,20,4,1,0,20,61.03726,1.028183,52.5,2.5,0.0,0.0,-22.590589,0.0
4,cds,3,214,192,192,0,214r,PRIMER_RIGHT,YCTTATDGCYGAGTTBGGCTGT,22,36,1,170,192,61.90299,1.866712,50.0,4.417388,44.21936,25.45319,6.06944,24.25218
5,cds,3,342,191,191,0,342f,PRIMER_LEFT,GCCAGGAYTWTTCGCAGCAA,20,4,1,2,22,61.03726,1.028183,52.5,2.5,0.0,0.0,-22.590589,0.0
6,cds,3,342,191,191,0,342r,PRIMER_RIGHT,TYCTTATDGCYGAGTTBGGCTG,22,36,1,171,193,61.24301,1.992234,50.0,4.417388,44.21936,25.45319,6.946059,23.18957
7,cds,3,343,193,193,0,343f,PRIMER_LEFT,TTGCCAGGAYTWTTCGCAGC,20,4,1,0,20,61.03726,1.028183,52.5,2.5,0.0,0.0,-22.590589,0.0
8,cds,3,343,193,193,0,343r,PRIMER_RIGHT,TYCTTATDGCYGAGTTBGGCTG,22,36,1,171,193,61.24301,1.992234,50.0,4.417388,44.21936,25.45319,6.946059,23.18957
9,cds,4,36,114,114,0,36f,PRIMER_LEFT,GTRTTYCTDGCARMCCACCA,20,48,1,699,719,59.47537,2.613721,51.66667,5.527708,13.25298,18.84885,-13.066518,17.50463
10,cds,4,36,114,114,0,36r,PRIMER_RIGHT,HCCHCCCATBGTGAAGTTGT,20,27,1,793,813,60.0953,1.471249,51.66667,4.082483,19.83603,24.24986,-26.236222,29.11761


In [133]:
# general stats
info$amplicon_size_avg %>% summary_x('amplicon_size_avg')
info$degeneracy %>% summary_x('degen')
info$Tm_avg %>% summary_x('Tm-avg')
info$Tm_sd %>% summary_x('Tm-sd')
info$GC_avg %>% summary_x('GC-avg')
info$GC_sd %>% summary_x('GC-sd')

Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
amplicon_size_avg,100,118,135,139.2636,149,198,33.33,13.607


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
degen,4,24,42,41.5942,48,128,42.4,17.31


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
Tm-avg,54.22175,57.72056,58.754,58.73787,59.71873,63.95472,3.147,1.285


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
Tm-sd,1.028183,1.663817,2.191219,2.108862,2.488311,2.982226,0.674,0.275


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
GC-avg,37.12121,44.16667,50,48.90713,52.5,63.33333,8.746,3.571


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
GC-sd,2.5,4.287014,4.695301,4.728682,5.018892,6.614378,1.323,0.54


In [134]:
# blastx on non-targets
F = file.path(work_dir, 'primers', 'clust0.8', 'cgp', 'nontarget', 'cds_blastx.tsv')
cds_blast = read.delim(F, sep='\t') %>%
    filter(!grepl('Methanothermobacter', sscinames)) %>%
    inner_join(info %>% 
                  filter(gene_type == 'cds') %>%
                  select(cluster_id),
              c('cluster_id')) %>%
    mutate(pident = ifelse(is.na(pident), 0, pident))
cds_blast 

cluster_id,query,subject,subject_name,pident,length,mismatch,qstart,qend,sstart,send,evalue,slen,qlen,sscinames,staxids,pident_rank
<int>,<fct>,<fct>,<fct>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<fct>,<fct>,<int>
3,d16595ef8d514d03ac5bbb85608b03a8,MBC7100273.1,,99.291,141,1,1,423,1,141,1.02e-94,141,426,Methanobacteriaceae archaeon;Methanobacteriales archaeon,2099680;2478476,1
3,d16595ef8d514d03ac5bbb85608b03a8,MBC7100273.1,,99.291,141,1,1,423,1,141,1.02e-94,141,426,Methanobacteriaceae archaeon;Methanobacteriales archaeon,2099680;2478476,1
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
125,b570f8fcf92943edad36db04c6b5605e,WP_066973001.1,tetrahydromethanopterin S-methyltransferase subunit A [Methanobrevibacter filiformis],78.027,223,47,7,669,3,225,2.92e-119,241,717,Methanobrevibacter filiformis,55758,30
125,b570f8fcf92943edad36db04c6b5605e,WP_066973001.1,tetrahydromethanopterin S-methyltransferase subunit A [Methanobrevibacter filiformis],78.027,223,47,7,669,3,225,2.92e-119,241,717,Methanobrevibacter filiformis,55758,30


In [135]:
# clusters with the most distant non-target relatives
cds_blast %>%
    group_by(cluster_id) %>%
    mutate(max_pident = max(pident)) %>% 
    .$max_pident %>% summary_x('Clostest non-target (seqid)')

Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
Clostest non-target (seqid),80.909,83.019,87.745,88.44145,90.476,100,6.699,2.735


In [136]:
# clusters with the most distant non-target relatives
cds_blast_f = cds_blast %>%
    group_by(cluster_id) %>%
    mutate(max_pident = max(pident)) %>%
    ungroup() %>%
    filter(max_pident < 90)
cds_blast_f %>% unique_n('clusters', cluster_id)

No. of unique clusters: 8 


In [137]:
# filtering primers
info_f = info %>%
    filter(gene_type == 'cds') %>%
    semi_join(cds_blast_f, c('cluster_id')) %>%
    filter(degeneracy <= 16, Tm_sd <= 2) %>%
    group_by(gene_type, cluster_id, primer_set) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    filter(n == 2) 
info_f 

gene_type,cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,⋯,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd,n
<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<int>,⋯,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
cds,43,379,143,143,0,379f,PRIMER_LEFT,YTGCARGAYGARCACAGCC,19,⋯,340,60.67547,1.960398,57.89474,5.263158,30.84537,18.40668,-11.62453,15.38513,2
cds,43,379,143,143,0,379r,PRIMER_RIGHT,GAYTCGTGTGMTGGYTGTGC,20,⋯,464,61.63802,1.898903,57.50000,4.330127,0.00000,0.00000,-19.28684,12.79022,2
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
cds,92,194,162,162,0,194f,PRIMER_LEFT,ATMARRACMCACTGCAGGGA,20,⋯,218,59.46321,1.950136,50.0000,5.00000,31.66945,25.00642,23.65815,2.913841,2
cds,92,194,162,162,0,194r,PRIMER_RIGHT,TCCGTGYTCAACYTTYTTCCT,21,⋯,360,59.45613,1.528516,45.2381,4.12393,0.00000,0.00000,-28.69132,6.035690,2


In [141]:
df.dims(30,30)
info_f %>%
    arrange(cluster_id, Tm_sd, degeneracy, primer_set) %>%
    head(n=30)
df.dims()

gene_type,cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,degeneracy,degeneracy_3prime,position_start,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd,n
<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
cds,43,379,143,143,0,379r,PRIMER_RIGHT,GAYTCGTGTGMTGGYTGTGC,20,8,1,444,464,61.63802,1.898903,57.5,4.330127,0.0,0.0,-19.28684,12.790224,2
cds,43,379,143,143,0,379f,PRIMER_LEFT,YTGCARGAYGARCACAGCC,19,16,1,321,340,60.67547,1.960398,57.89474,5.263158,30.84537,18.40668,-11.62453,15.385125,2
cds,92,174,161,161,0,174r,PRIMER_RIGHT,TCCGTGYTCAACYTTYTTCCT,21,8,1,339,360,59.45613,1.528516,45.2381,4.12393,0.0,0.0,-28.69132,6.03569,2
cds,92,194,162,162,0,194r,PRIMER_RIGHT,TCCGTGYTCAACYTTYTTCCT,21,8,1,339,360,59.45613,1.528516,45.2381,4.12393,0.0,0.0,-28.69132,6.03569,2
cds,92,174,161,161,0,174f,PRIMER_LEFT,TMARRACMCACTGCAGGGAC,20,16,1,199,219,60.41481,1.91584,55.0,5.0,31.66945,25.00642,23.65815,2.913841,2
cds,92,194,162,162,0,194f,PRIMER_LEFT,ATMARRACMCACTGCAGGGA,20,16,1,198,218,59.46321,1.950136,50.0,5.0,31.66945,25.00642,23.65815,2.913841,2


In [139]:
# annotationes of clusters hit
df.dims(30)
clst %>%
    semi_join(info_f, c('gene_type', 'cluster_id')) %>%
    distinct(cluster_id, annotation)
df.dims()

cluster_id,annotation
<int>,<fct>
43,Coenzyme F420 hydrogenase subunit gamma
92,30S ribosomal protein S19


In [140]:
# closest non-target hits
df.dims(30)
cds_blast_f %>%
    filter(pident == max_pident) %>%
    semi_join(info_f, c('cluster_id')) %>%
    distinct(cluster_id, subject_name, sscinames, max_pident)
df.dims()

cluster_id,subject_name,sscinames,max_pident
<int>,<fct>,<fct>,<dbl>
43,coenzyme F420 hydrogenase subunit gamma [Methanobacteriaceae archaeon],Methanobacteriaceae archaeon,89.956
92,30S ribosomal protein S19 [Methanobacterium sp. PtaB.Bin024],Methanobacterium sp. PtaB.Bin024,86.765
92,30S ribosomal protein S19 [Methanobacteriaceae archaeon],Methanobacteriaceae archaeon,86.765


#### Writing table

In [143]:
outF = file.path(work_dir, 'primers', 'clust0.8', 'primers_c43-379fr.tsv')
info_f %>%
    filter(primer_set == '379') %>%
    write_table(outF)

File written: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/primers/clust0.8/primers_c43-379fr.tsv 


# sessionInfo

In [127]:
sessionInfo()

R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.5 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Georg_animal_feces/envs/tidyverse/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] LeyLabRMisc_0.1.6 ggplot2_3.3.1     tidyr_1.1.0       dplyr_1.0.0      

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.4.6     magrittr_1.5     munsell_0.5.0    tidyselect_1.1.0
 [5] uuid_0.1-4       colorspace_1.4-1 R6_2.4.1         rlang_0.4.6     
 [9] tools_3.6.3