# Goal

* Designing primers for Methanothermobacter
  * Download genomes
  * QC genomes
  * Design primers

# Var

In [79]:
work_dir = '/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter'
clade = 'Methanothermobacter'
taxid = 145260

# Init

In [80]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(LeyLabRMisc)

In [81]:
df.dims()
make_dir(work_dir)

Directory already exists: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter 


# Genome download

* Downloading genomes from NCBI

```
OUTDIR=/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/
mkdir -p $OUTDIR
ncbi-genome-download -p 12 -s genbank -F fasta --genera Methanothermobacter -o $OUTDIR archaea
```

# Genome quality

In [11]:
D = file.path(work_dir, 'genbank')
files = list_files(D, '.fna.gz')
samps = data.frame(Name = files %>% as.character %>% basename,
                   Fasta = files,
                   Domain = 'Archaea',
                   Taxid = taxid) %>%
    mutate(Name = gsub('\\.fna\\.gz$', '', Name),
           Fasta = gsub('/+', '/', Fasta))
samps

# writing file
outfile = file.path(work_dir, 'genomes_raw.txt')
write_table(samps, outfile)

Name,Fasta,Domain,Taxid
<chr>,<chr>,<fct>,<dbl>
GCA_000008645.1_ASM864v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/genbank/archaea/GCA_000008645.1/GCA_000008645.1_ASM864v1_genomic.fna.gz,Bacteria,145260
GCA_000145295.1_ASM14529v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/genbank/archaea/GCA_000145295.1/GCA_000145295.1_ASM14529v1_genomic.fna.gz,Bacteria,145260
⋮,⋮,⋮,⋮
GCA_014889545.1_ASM1488954v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/genbank/archaea/GCA_014889545.1/GCA_014889545.1_ASM1488954v1_genomic.fna.gz,Bacteria,145260
GCA_900095815.1_SIV6_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/genbank/archaea/GCA_900095815.1/GCA_900095815.1_SIV6_genomic.fna.gz,Bacteria,145260


File written: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/genomes_raw.txt 


## LLG

#### Config

In [12]:
cat_file(file.path(work_dir, 'config_llg.yaml'))

# table with genome --> fasta_file information
samples_file: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/genomes_raw.txt

# output location
output_dir: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/

# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/

# batch processing of genomes for certain steps
## increase to better parallelize
batches: 2 

# Domain of genomes ('Archaea' or 'Bacteria)
## Use "Skip" if provided as a "Domain" column in the genome table
Domain: Skip

# software parameters
# Use "Skip" to skip any of these steps. If no params for rule, use ""
# dRep MAGs are not further analyzed, but you can de-rep & then use the de-rep genome table as input.
params:
  ionice: -c 3
  # assembly assessment
  seqkit: ""
  quast: Skip #""
  multiqc_on_quast: "" 
  checkm: ""
  # de-replication (requires checkm)
  drep: -comp 90 -con 5 -sa 0.999
  

#### Run

```
(snakemake) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llg
$ screen -L -S llg-thermo ./snakemake_sge.sh /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/config_llg.yaml 30 -F
```

### Samples table of high-quality genomes

In [21]:
# checkM summary
checkm = file.path(work_dir, 'LLG_output', 'checkM', 'checkm_qa_summary.tsv') %>%
    read.delim(sep='\t') 
checkm

Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,Strain.heterogeneity,Genome.size..bp.,X..ambiguous.bases,⋯,X0,X1,X2,X3,X4,X5.,assembly.Id,assembler.Id,taxon.Id,File
<fct>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<fct>,<fct>,<lgl>,<fct>
GCA_000008645.1_ASM864v1_genomic,p__Euryarchaeota (UID3),148,188,125,100,0.31,16.67,1751377,0,⋯,0,184,3,1,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|methanothermobacter|LLG_output|checkM|1|checkm|markers_qa_summary.tsv.1,markers_qa_summary.tsv.1,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/checkM/1/checkm/markers_qa_summary.tsv.1
GCA_000828575.1_ASM82857v1_genomic,p__Euryarchaeota (UID3),148,188,125,100,0.00,0.00,1731018,0,⋯,0,188,0,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|methanothermobacter|LLG_output|checkM|1|checkm|markers_qa_summary.tsv.2,markers_qa_summary.tsv.2,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/checkM/1/checkm/markers_qa_summary.tsv.2
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
GCA_014361435.1_ASM1436143v1_genomic,p__Euryarchaeota (UID3),148,188,125,97.6,2.87,44.44,1612916,30,⋯,3,176,9,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|methanothermobacter|LLG_output|checkM|2|checkm|markers_qa_summary.tsv.9,markers_qa_summary.tsv.9,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/checkM/2/checkm/markers_qa_summary.tsv.9
GCA_900095815.1_SIV6_genomic,p__Euryarchaeota (UID3),148,188,125,100.0,0.00,0.00,1686891,0,⋯,0,188,0,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|methanothermobacter|LLG_output|checkM|2|checkm|markers_qa_summary.tsv.10,markers_qa_summary.tsv.10,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/checkM/2/checkm/markers_qa_summary.tsv.10


In [22]:
# dRep summary
drep = file.path(work_dir, 'LLG_output', 'drep', 'checkm_markers_qa_summary.tsv') %>%
    read.delim(sep='\t') %>%
    mutate(Bin.Id = gsub('.+/', '', genome),
           Bin.Id = gsub('\\.fna$', '', Bin.Id))
drep

genome,completeness,contamination,Bin.Id
<fct>,<dbl>,<dbl>,<chr>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000008645.1_ASM864v1_genomic.fna,100,0.31,GCA_000008645.1_ASM864v1_genomic
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000828575.1_ASM82857v1_genomic.fna,100,0.00,GCA_000828575.1_ASM82857v1_genomic
⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014361435.1_ASM1436143v1_genomic.fna,97.6,2.87,GCA_014361435.1_ASM1436143v1_genomic
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_900095815.1_SIV6_genomic.fna,100.0,0.00,GCA_900095815.1_SIV6_genomic


In [23]:
# de-replicated genomes
drep_gen = file.path(work_dir, 'LLG_output', 'drep', 'dereplicated_genomes.tsv') %>%
    read.delim(sep='\t')
drep_gen

Name,Fasta
<fct>,<fct>
GCA_012521115.1_ASM1252111v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/drep/drep/dereplicated_genomes/GCA_012521115.1_ASM1252111v1_genomic.fna
GCA_900095815.1_SIV6_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/drep/drep/dereplicated_genomes/GCA_900095815.1_SIV6_genomic.fna
⋮,⋮
GCA_003264935.1_ASM326493v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/drep/drep/dereplicated_genomes/GCA_003264935.1_ASM326493v1_genomic.fna
GCA_012840175.1_ASM1284017v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/LLG_output/drep/drep/dereplicated_genomes/GCA_012840175.1_ASM1284017v1_genomic.fna


In [24]:
# GTDBTk summary
tax = file.path(work_dir, 'LLG_output', 'gtdbtk', 'gtdbtk_ar_summary.tsv') %>%
    read.delim(, sep='\t') %>%
    separate(classification, 
             c('Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species'),
             sep=';') %>%
    select(-note, -classification_method, -pplacer_taxonomy,
           -other_related_references.genome_id.species_name.radius.ANI.AF.)
tax

user_genome,Domain,Phylum,Class,Order,Family,Genus,Species,fastani_reference,fastani_reference_radius,⋯,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings
<fct>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<fct>,<fct>,⋯,<fct>,<fct>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<int>,<fct>,<fct>
GCA_000008645.1_ASM864v1_genomic,d__Archaea,p__Methanobacteriota,c__Methanobacteria,o__Methanobacteriales,f__Methanothermobacteraceae,g__Methanothermobacter,s__Methanothermobacter thermautotrophicus,GCF_000008645.1,95.0,⋯,1.0,GCF_000008645.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter thermautotrophicus,100,1,95.73,11,,
GCA_000828575.1_ASM82857v1_genomic,d__Archaea,p__Methanobacteriota,c__Methanobacteria,o__Methanobacteriales,f__Methanothermobacteraceae,g__Methanothermobacter,s__Methanothermobacter sp000828575,GCF_000828575.1,95.0,⋯,1.0,GCF_000828575.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter sp000828575,100,1,97.19,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
GCA_014361435.1_ASM1436143v1_genomic,d__Archaea,p__Methanobacteriota,c__Methanobacteria,o__Methanobacteriales,f__Methanothermobacteraceae,g__Methanothermobacter,s__Methanothermobacter thermautotrophicus,GCF_000008645.1,95.0,⋯,0.97,GCF_000008645.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter thermautotrophicus,97.33,0.97,88.56,11,,
GCA_900095815.1_SIV6_genomic,d__Archaea,p__Methanobacteriota,c__Methanobacteria,o__Methanobacteriales,f__Methanothermobacteraceae,g__Methanothermobacter,s__Methanothermobacter wolfeii,GCF_900095815.1,95.0,⋯,1.0,GCF_900095815.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter wolfeii,100.00,1.00,96.45,11,,


In [25]:
# checking overlap
cat('-- drep --\n')
overlap(basename(as.character(drep_gen$Fasta)), 
        basename(as.character(drep$genome)))
cat('-- checkm --\n')
overlap(drep$Bin.Id, checkm$Bin.Id)
cat('-- gtdbtk --\n')
overlap(drep$Bin.Id, tax$user_genome)

-- drep --
intersect(x,y): 16 
setdiff(x,y): 0 
setdiff(y,x): 4 
union(x,y): 20 
-- checkm --
intersect(x,y): 20 
setdiff(x,y): 0 
setdiff(y,x): 0 
union(x,y): 20 
-- gtdbtk --
intersect(x,y): 20 
setdiff(x,y): 0 
setdiff(y,x): 0 
union(x,y): 20 


In [26]:
# joining based on Bin.Id
drep = drep %>%
    inner_join(checkm, c('Bin.Id')) %>%
    mutate(GEN = genome %>% as.character %>% basename) %>%
    inner_join(drep_gen %>% mutate(GEN = Fasta %>% as.character %>% basename),
               by=c('GEN')) %>%
    inner_join(tax, c('Bin.Id'='user_genome')) #%>%
drep

genome,completeness,contamination,Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,⋯,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings
<fct>,<dbl>,<dbl>,<chr>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<fct>,<fct>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<int>,<fct>,<fct>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000008645.1_ASM864v1_genomic.fna,100,0.31,GCA_000008645.1_ASM864v1_genomic,p__Euryarchaeota (UID3),148,188,125,100,0.31,⋯,1.0,GCF_000008645.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter thermautotrophicus,100,1,95.73,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000828575.1_ASM82857v1_genomic.fna,100,0.00,GCA_000828575.1_ASM82857v1_genomic,p__Euryarchaeota (UID3),148,188,125,100,0.00,⋯,1.0,GCF_000828575.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter sp000828575,100,1,97.19,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014361435.1_ASM1436143v1_genomic.fna,97.6,2.87,GCA_014361435.1_ASM1436143v1_genomic,p__Euryarchaeota (UID3),148,188,125,97.6,2.87,⋯,0.97,GCF_000008645.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter thermautotrophicus,97.33,0.97,88.56,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_900095815.1_SIV6_genomic.fna,100.0,0.00,GCA_900095815.1_SIV6_genomic,p__Euryarchaeota (UID3),148,188,125,100.0,0.00,⋯,1.0,GCF_900095815.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter wolfeii,100.00,1.00,96.45,11,,


In [27]:
# filtering by quality
hq_genomes = drep %>%
    filter(completeness >= 90,
           contamination < 5,
           Strain.heterogeneity < 50)
hq_genomes

genome,completeness,contamination,Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,⋯,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings
<fct>,<dbl>,<dbl>,<chr>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<fct>,<fct>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<int>,<fct>,<fct>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000008645.1_ASM864v1_genomic.fna,100,0.31,GCA_000008645.1_ASM864v1_genomic,p__Euryarchaeota (UID3),148,188,125,100,0.31,⋯,1.0,GCF_000008645.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter thermautotrophicus,100,1,95.73,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000828575.1_ASM82857v1_genomic.fna,100,0.00,GCA_000828575.1_ASM82857v1_genomic,p__Euryarchaeota (UID3),148,188,125,100,0.00,⋯,1.0,GCF_000828575.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter sp000828575,100,1,97.19,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014361435.1_ASM1436143v1_genomic.fna,97.6,2.87,GCA_014361435.1_ASM1436143v1_genomic,p__Euryarchaeota (UID3),148,188,125,97.6,2.87,⋯,0.97,GCF_000008645.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter thermautotrophicus,97.33,0.97,88.56,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_900095815.1_SIV6_genomic.fna,100.0,0.00,GCA_900095815.1_SIV6_genomic,p__Euryarchaeota (UID3),148,188,125,100.0,0.00,⋯,1.0,GCF_900095815.1,95,d__Archaea;p__Methanobacteriota;c__Methanobacteria;o__Methanobacteriales;f__Methanothermobacteraceae;g__Methanothermobacter;s__Methanothermobacter wolfeii,100.00,1.00,96.45,11,,


In [28]:
# summarizing the taxonomy
df.dims(20)
hq_genomes %>%
    group_by(Family, Genus) %>%
    summarize(n_genomes = n(), .groups='drop')
df.dims()

Family,Genus,n_genomes
<chr>,<chr>,<int>
f__Methanothermobacteraceae,g__Methanothermobacter,13
f__Methanothermobacteraceae_A,g__Methanothermobacter_A,3


In [36]:
# writing samples table for LLPRIMER
outfile = file.path(work_dir, 'samples_genomes_hq.txt')
hq_genomes %>%
    select(Bin.Id, Fasta) %>%
    rename('Taxon' = Bin.Id) %>%
    mutate(Taxon = gsub('_chromosome.+', '', Taxon),
           Taxon = gsub('_bin_.+', '', Taxon),
           Taxon = gsub('_genomic', '', Taxon),
           Taxon = gsub('_annotated_assembly', '', Taxon),
           Taxid = taxid) %>%
    write_table(outfile)

File written: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/samples_genomes_hq.txt 


# Primer design

## cluster cutoff = 0.9

### Config

In [38]:
F = file.path(work_dir, 'primers', 'clust0.9', 'config.yaml')
cat_file(F)

#-- I/O --#
samples_file: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/samples_genomes_hq.txt

# output location
output_dir: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/primers/clust0.9/

# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/

#-- software parameters --#
# See the README for a description
params:
  ionice: -c 3
  cgp:
    prodigal: ""    
    mmseqs:
      method: cluster    # or linclust (faster)
      run: --min-seq-id 0.9 -c 0.8
    core_genes: --frac 1 --max-clusters 500
    blastx: -evalue 1e-10 -max_target_seqs 3
    blastx_nontarget: -evalue 1e-5 -max_target_seqs 30
    align:
      method: linsi
      params: --auto --maxiterate 1000
    primer3:
      number: --num-primers 500
      size: --opt-size 20 --min-size 18 --max-size 24
      product: --opt-prod-size 150 --min-prod-size 100 --max-prod-size 200
      Tm: --opt-tm 62 --min-t

### Run

```
(snakemake) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llprimer
$ screen -L  -S llprimer-thermo ./snakemake_sge.sh /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/primers/clust0.9/config.yaml 30 -F
```

### Summary

In [42]:
primer_info = read.delim(file.path(work_dir, 'primers', 'clust0.9', 'cgp', 'primers_final_info.tsv'), sep='\t')
primer_info %>% unique_n('primer sets', primer_set)
primer_info

No. of unique primer sets: 8 


cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,degeneracy,⋯,position_start,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd
<int>,<int>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<int>,<int>,⋯,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
8,400,147,147,0,400f,PRIMER_LEFT,TGTAYACHGACCTYGCAAGC,20,12,⋯,844,864,59.37567,2.097183,51.66667,4.249183,18.79468,18.79919,6.816132,16.58762
8,400,147,147,0,400r,PRIMER_RIGHT,GGCCYTCKGTDATGTAACCT,20,12,⋯,971,991,58.43315,1.967470,51.66667,4.249183,27.17770,19.82780,-11.327806,13.77950
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
12,343,193,193,0,343f,PRIMER_LEFT,TTGCCAGGAYTWTTCGCAGC,20,4,⋯,0,20,61.03726,1.028183,52.5,2.500000,0.00000,0.00000,-22.590589,0.00000
12,343,193,193,0,343r,PRIMER_RIGHT,TYCTTATDGCYGAGTTBGGCTG,22,36,⋯,171,193,61.24301,1.992234,50.0,4.417388,44.21936,25.45319,6.946059,23.18957


In [43]:
gene_annot = read.delim(file.path(work_dir, 'primers', 'clust0.9', 'cgp', 'core_clusters_blastx.tsv'), 
                        sep='\t') %>%
    mutate(cluster_id = gsub('cluster_', '', cluster_id) %>% as.Num) %>%
    semi_join(primer_info, c('cluster_id')) %>%
    mutate(gene_name = gsub(' \\[.+', '', subject_name),
           gene_taxonomy = gsub('.+\\[', '', subject_name),
           gene_taxonomy = gsub('\\]', '', gene_taxonomy))
gene_annot

cluster_id,query,subject,subject_name,pident,length,mismatch,qstart,qend,sstart,send,evalue,slen,qlen,sscinames,staxids,pident_rank,gene_name,gene_taxonomy
<dbl>,<fct>,<fct>,<fct>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<fct>,<fct>,<int>,<chr>,<chr>
8,b7814232e32442d8a235ae7e4a497c98,AAB85450.1,"ATP synthase, subunit B [Methanothermobacter thermautotrophicus str. Delta H]",100,463,0,1,1389,3,465,0,465,1392,Methanothermobacter thermautotrophicus str. Delta H,187420,2,"ATP synthase, subunit B",Methanothermobacter thermautotrophicus str. Delta H
8,b7814232e32442d8a235ae7e4a497c98,WP_013296141.1,MULTISPECIES: ATP synthase subunit B [Methanothermobacter],100,463,0,1,1389,1,463,0,463,1392,Methanothermobacter defluvii;Methanothermobacter marburgensis str. Marburg;Methanothermobacter;Methanothermobacter thermautotrophicus;Methanothermobacter thermautotrophicus str. Delta H;Methanothermobacter sp. CaT2;Methanothermobacter sp.;Methanothermobacter sp. EMTCatA1,49339;79929;145260;145262;187420;866790;1884223;2017966,2,MULTISPECIES: ATP synthase subunit B,Methanothermobacter
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
12,621c8390148945ba929b23b45a9ee5b3,WP_191216127.1,30S ribosomal protein S12 [Methanothermobacter sp. THM-1],97.872,141,3,1,423,1,141,6.29e-79,141,426,Methanothermobacter sp. THM-1,2606911,2,30S ribosomal protein S12,Methanothermobacter sp. THM-1
12,621c8390148945ba929b23b45a9ee5b3,WP_048175660.1,MULTISPECIES: 30S ribosomal protein S12 [Methanothermobacter],97.163,141,4,1,423,1,141,4.02e-78,141,426,Methanothermobacter defluvii;Methanothermobacter;Methanothermobacter thermautotrophicus;Methanothermobacter sp. CaT2;Methanothermobacter sp.;Methanothermobacter sp. EMTCatA1,49339;145260;145262;866790;1884223;2017966,3,MULTISPECIES: 30S ribosomal protein S12,Methanothermobacter


In [45]:
gene_annot = read.delim(file.path(work_dir, 'primers', 'clust0.9', 'cgp', 'core_clusters_blastx_nontarget.tsv'), 
                        sep='\t') %>%
    mutate(cluster_id = gsub('cluster_', '', cluster_id) %>% as.Num) %>%
    semi_join(primer_info, c('cluster_id')) %>%
    mutate(gene_name = gsub(' \\[.+', '', subject_name),
           gene_taxonomy = gsub('.+\\[', '', subject_name),
           gene_taxonomy = gsub('\\]', '', gene_taxonomy))
gene_annot

cluster_id,query,subject,subject_name,pident,length,mismatch,qstart,qend,sstart,send,evalue,slen,qlen,sscinames,staxids,pident_rank,gene_name,gene_taxonomy
<dbl>,<fct>,<fct>,<fct>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<fct>,<fct>,<int>,<chr>,<chr>
8,b7814232e32442d8a235ae7e4a497c98,PKL67544.1,ATP synthase subunit B [Methanobacteriales archaeon HGW-Methanobacteriales-1],93.562,466,27,1,1389,1,466,0,466,1392,Methanobacteriales archaeon HGW-Methanobacteriales-1,2013815,1,ATP synthase subunit B,Methanobacteriales archaeon HGW-Methanobacteriales-1
8,b7814232e32442d8a235ae7e4a497c98,MBC7100188.1,,93.305,463,31,1,1389,1,463,0,463,1392,Methanobacteriaceae archaeon;Methanobacteriales archaeon,2099680;2478476,2,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
12,621c8390148945ba929b23b45a9ee5b3,WP_012956606.1,30S ribosomal protein S12 [Methanobrevibacter ruminantium],86.525,141,19,1,423,1,141,1.71e-70,141,426,Methanobrevibacter ruminantium;Methanobrevibacter ruminantium M1,83816;634498,29,30S ribosomal protein S12,Methanobrevibacter ruminantium
12,621c8390148945ba929b23b45a9ee5b3,TMS40846.1,30S ribosomal protein S12 [Methanobacterium sp.],86.525,141,19,1,423,1,141,1.73e-70,141,426,Methanobacterium sp.,2164,29,30S ribosomal protein S12,Methanobacterium sp.


## cluster cutoff = 0.8

### Config

In [41]:
F = file.path(work_dir, 'primers', 'clust0.8', 'config.yaml')
cat_file(F)

#-- I/O --#
samples_file: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/samples_genomes_hq.txt

# output location
output_dir: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/primers/clust0.8/

# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/

#-- software parameters --#
# See the README for a description
params:
  ionice: -c 3
  cgp:
    prodigal: ""    
    mmseqs:
      method: cluster    # or linclust (faster)
      run: --min-seq-id 0.8 -c 0.8
    core_genes: --frac 1 --max-clusters 500
    blastx: -evalue 1e-10 -max_target_seqs 3
    blastx_nontarget: -evalue 1e-5 -max_target_seqs 30
    align:
      method: linsi
      params: --auto --maxiterate 1000
    primer3:
      number: --num-primers 500
      size: --opt-size 20 --min-size 18 --max-size 24
      product: --opt-prod-size 150 --min-prod-size 100 --max-prod-size 200
      Tm: --opt-tm 62 --min-t

### Run

```
(snakemake) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llprimer
$ screen -L  -S llprimer-thermo ./snakemake_sge.sh /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/primers/clust0.8/config.yaml 30 -F
```

### Summary

In [47]:
primer_info = read.delim(file.path(work_dir, 'primers', 'clust0.8', 'cgp', 'primers_final_info.tsv'), sep='\t')
primer_info %>% unique_n('primer sets', primer_set)
primer_info

No. of unique primer sets: 81 


cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,degeneracy,⋯,position_start,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd
<int>,<int>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<int>,<int>,⋯,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
4,249,171,171,0,249f,PRIMER_LEFT,TGMCAARGCAGAAAGRACAGT,21,8,⋯,113,134,59.73447,1.852851,45.23810,4.123930,23.66502,23.66589,-21.07523,18.42182
4,249,171,171,0,249r,PRIMER_RIGHT,GTYTTRCTBAGGGGTCTGCA,20,12,⋯,264,284,59.61943,2.155484,53.33333,4.249183,20.66544,20.66737,-25.50694,13.02455
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
138,156,141,138.1875,0.7261844,156f,PRIMER_LEFT,GAGRRTMTGYCCDGTYGACGC,21,96,⋯,605,626,63.34397,2.607826,61.11111,5.777865,41.45388,17.01681,-0.3463405,20.56855
138,156,141,138.1875,0.7261844,156r,PRIMER_RIGHT,TCYTGGCACCAKCCRCAGTT,20,8,⋯,726,746,63.95472,2.193368,57.50000,4.330127,41.81445,27.59680,-9.1657602,19.13659


In [49]:
gene_annot = read.delim(file.path(work_dir, 'primers', 'clust0.8', 'cgp', 'core_clusters_blastx.tsv'), 
                        sep='\t') %>%
    mutate(cluster_id = gsub('cluster_', '', cluster_id) %>% as.Num) %>%
    semi_join(primer_info, c('cluster_id')) %>%
    mutate(gene_name = gsub(' \\[.+', '', subject_name),
           gene_taxonomy = gsub('.+\\[', '', subject_name),
           gene_taxonomy = gsub('\\]', '', gene_taxonomy))
gene_annot

cluster_id,query,subject,subject_name,pident,length,mismatch,qstart,qend,sstart,send,evalue,slen,qlen,sscinames,staxids,pident_rank,gene_name,gene_taxonomy
<dbl>,<fct>,<fct>,<fct>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<fct>,<fct>,<int>,<chr>,<chr>
4,00f2e6d9f0a7445d854bec355188e716,SCM56344.1,30S ribosomal protein S17P {ECO:0000255|HAMAP-Rule:MF_01345} [Methanothermobacter wolfeii],100,107,0,1,321,1,107,1.40e-55,107,324,Methanothermobacter wolfeii,145261,2,30S ribosomal protein S17P {ECO:0000255|HAMAP-Rule:MF_01345},Methanothermobacter wolfeii
4,00f2e6d9f0a7445d854bec355188e716,WP_074358574.1,30S ribosomal protein S17 [Methanothermobacter sp. THM-1],100,106,0,4,321,1,106,1.21e-54,106,324,Methanothermobacter wolfeii;Methanothermobacter sp. THM-1,145261;2606911,2,30S ribosomal protein S17,Methanothermobacter sp. THM-1
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
138,586b20f2afb7485f9ac6220c90feb817,MBC7101338.1,4Fe-4S binding protein [Methanobacteriales archaeon],99.714,350,1,1,1050,1,350,0,350,1053,Methanobacteriaceae archaeon;Methanobacteriales archaeon,2099680;2478476,1,4Fe-4S binding protein,Methanobacteriales archaeon
138,586b20f2afb7485f9ac6220c90feb817,HHW17164.1,4Fe-4S binding protein [Methanothermobacter sp.],97.429,350,9,1,1050,1,350,0,350,1053,Methanothermobacter sp.,1884223,2,4Fe-4S binding protein,Methanothermobacter sp.


In [52]:
# non-target
gene_annot = read.delim(file.path(work_dir, 'primers', 'clust0.8', 'cgp', 'core_clusters_blastx_nontarget.tsv'), 
                        sep='\t') %>%
    mutate(cluster_id = gsub('cluster_', '', cluster_id) %>% as.Num) %>%
    semi_join(primer_info, c('cluster_id')) %>%
    mutate(gene_name = gsub(' \\[.+', '', subject_name),
           gene_taxonomy = gsub('.+\\[', '', subject_name),
           gene_taxonomy = gsub('\\]', '', gene_taxonomy))
gene_annot

cluster_id,query,subject,subject_name,pident,length,mismatch,qstart,qend,sstart,send,evalue,slen,qlen,sscinames,staxids,pident_rank,gene_name,gene_taxonomy
<dbl>,<fct>,<fct>,<fct>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<fct>,<fct>,<int>,<chr>,<chr>
4,00f2e6d9f0a7445d854bec355188e716,AXV38277.1,30S ribosomal protein S17 [Methanobacterium sp. BRmetb2],90.476,105,10,4,318,1,105,3.29e-48,106,324,Methanobacterium sp. BRmetb2,2025350,1,30S ribosomal protein S17,Methanobacterium sp. BRmetb2
4,00f2e6d9f0a7445d854bec355188e716,CDG64881.1,30S ribosomal protein S17 [Methanobacterium sp. MB1],87.619,105,13,4,318,1,105,3.15e-46,107,324,Methanobacterium sp. MB1,1379702,2,30S ribosomal protein S17,Methanobacterium sp. MB1
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
138,586b20f2afb7485f9ac6220c90feb817,PKL66942.1,ferredoxin [Methanobacteriales archaeon HGW-Methanobacteriales-1],59.587,339,128,34,1044,11,342,1.11e-136,344,1053,Methanobacteriales archaeon HGW-Methanobacteriales-1,2013815,28,ferredoxin,Methanobacteriales archaeon HGW-Methanobacteriales-1
138,586b20f2afb7485f9ac6220c90feb817,WP_069584504.1,MULTISPECIES: 4Fe-4S binding protein [Methanobacterium],59.357,342,131,25,1044,9,344,4.22e-133,346,1053,Methanobacterium;Methanobacterium bryantii;Methanobacterium sp. A39,2160;2161;1860100,29,MULTISPECIES: 4Fe-4S binding protein,Methanobacterium


In [56]:
# most unique clusters
df.dims(10)
gene_annot %>%
    filter(pident_rank == 1) %>%
    arrange(pident) %>%
    head(n=10)
df.dims()

Unnamed: 0_level_0,cluster_id,query,subject,subject_name,pident,length,mismatch,qstart,qend,sstart,send,evalue,slen,qlen,sscinames,staxids,pident_rank,gene_name,gene_taxonomy
Unnamed: 0_level_1,<dbl>,<fct>,<fct>,<fct>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<fct>,<fct>,<int>,<chr>,<chr>
1,79,d6ab2803147540b392ad2801ff6b0179,MBC7101283.1,,80.909,220,42,1,660,1,220,1.57e-134,220,663,Methanobacteriaceae archaeon;Methanobacteriales archaeon,2099680;2478476,1,,
2,79,d6ab2803147540b392ad2801ff6b0179,PKL68928.1,RNA ligase partner protein [Methanobacteriales archaeon HGW-Methanobacteriales-1],80.909,220,42,1,660,1,220,1.94e-134,220,663,Methanobacteriales archaeon HGW-Methanobacteriales-1,2013815,1,RNA ligase partner protein,Methanobacteriales archaeon HGW-Methanobacteriales-1
3,135,b6a70d967b744f34b354c83fe5992ffa,KUK01467.1,Glycerol-1-phosphate dehydrogenase [NAD(P)+] [Methanobacteriaceae archaeon 41_258],81.034,348,65,1,1041,18,365,0.0,365,1044,Methanobacteriaceae archaeon 41_258,1635284,1,Glycerol-1-phosphate dehydrogenase,Methanobacteriaceae archaeon 41_258
4,135,b6a70d967b744f34b354c83fe5992ffa,MBC7089473.1,NAD(P)-dependent glycerol-1-phosphate dehydrogenase [Methanobacteriaceae archaeon],81.034,348,65,1,1041,1,348,0.0,348,1044,Methanobacteriaceae archaeon,2099680,1,NAD(P)-dependent glycerol-1-phosphate dehydrogenase,Methanobacteriaceae archaeon
5,102,62a350b243c7432696c3cc9e2ba235d2,RJS49305.1,exosome complex exonuclease Rrp41 [Methanobacterium sp.],82.589,224,39,49,720,16,239,5.89e-133,239,723,Methanobacterium sp.,2164,1,exosome complex exonuclease Rrp41,Methanobacterium sp.
6,10,7468b3de4335494c9d06818d41bb5d06,HGZ27079.1,rubredoxin [Methanobacteriaceae archaeon],83.019,53,9,1,159,1,53,6.6699999999999995e-25,53,162,Methanobacteriaceae archaeon,2099680,1,rubredoxin,Methanobacteriaceae archaeon
7,10,7468b3de4335494c9d06818d41bb5d06,MBC7101474.1,,83.019,53,9,1,159,1,53,1.18e-24,53,162,Methanobacteriaceae archaeon;Methanobacteriales archaeon,2099680;2478476,1,,
8,128,59de51d2c6da4240b97c96a62524522f,OPY25465.1,50S ribosomal protein L19e [Methanobacterium sp. PtaU1.Bin242],85.811,148,21,1,444,1,148,7.54e-87,148,447,Methanobacterium sp. PtaU1.Bin242,1811676,1,50S ribosomal protein L19e,Methanobacterium sp. PtaU1.Bin242
9,45,62a0ffdf41544e5a9b86e837579339cf,OPX60279.1,30S ribosomal protein S19 [Methanobacterium sp. PtaB.Bin024],86.765,136,18,1,408,1,136,1.19e-72,136,411,Methanobacterium sp. PtaB.Bin024,1811674,1,30S ribosomal protein S19,Methanobacterium sp. PtaB.Bin024
10,45,62a0ffdf41544e5a9b86e837579339cf,NYB52739.1,30S ribosomal protein S19 [Methanobacteriaceae archaeon],86.765,136,18,1,408,1,136,3.22e-72,136,411,Methanobacteriaceae archaeon,2099680,1,30S ribosomal protein S19,Methanobacteriaceae archaeon


In [74]:
df.dims(30, 40)
primer_info %>%
    filter(cluster_id %in% c(79, 135, 102, 128, 45)) %>%
    filter(Tm_sd <= 2) %>%
    group_by(primer_set) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    filter(n == 2) %>%
    select(cluster_id, primer_set, amplicon_size_avg, primer_id, sequence, length, degeneracy, 
           position_start, position_end, Tm_avg, Tm_sd)
df.dims()

cluster_id,primer_set,amplicon_size_avg,primer_id,sequence,length,degeneracy,position_start,position_end,Tm_avg,Tm_sd
<int>,<int>,<dbl>,<fct>,<fct>,<int>,<int>,<int>,<int>,<dbl>,<dbl>
45,174,161,174f,TMARRACMCACTGCAGGGAC,20,16,199,219,60.41481,1.91584
45,174,161,174r,TCCGTGYTCAACYTTYTTCCT,21,8,339,360,59.45613,1.528516
45,194,162,194f,ATMARRACMCACTGCAGGGA,20,16,198,218,59.46321,1.950136
45,194,162,194r,TCCGTGYTCAACYTTYTTCCT,21,8,339,360,59.45613,1.528516


In [78]:
# writing out primers
outF = file.path(work_dir, 'Methanothermobacter_c45-174f-174r.tsv')
primer_info %>%
    filter(cluster_id %in% c(79, 135, 102, 128, 45)) %>%
    filter(Tm_sd <= 2) %>%
    group_by(primer_set) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    filter(n == 2) %>%
    head(n=2) %>%
    write_table(outF)

File written: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/methanothermobacter/Methanothermobacter_c45-174f-174r.tsv 


# sessionInfo

In [75]:
sessionInfo()

R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.5 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Georg_animal_feces/envs/tidyverse/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] LeyLabRMisc_0.1.6 ggplot2_3.3.1     tidyr_1.1.0       dplyr_1.0.0      

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.4.6     magrittr_1.5     munsell_0.5.0    tidyselect_1.1.0
 [5] uuid_0.1-4       colorspace_1.4-1 R6_2.4.1         rlang_0.4.6     
 [9] tools_3.6.3