# Goal

* Primer design for clade of interest

# Var

In [1]:
# Notebook-wide parameters.
# base_dir: experiment root; its trailing slash is why paths built with
#           file.path() below contain a doubled '/'.
# clade:    genus being targeted (also the name of the working sub-directory).
# taxid:    taxonomy ID written to the Taxid column of the samples tables.
base_dir <- "/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/"
clade <- "Butyricicoccus"
taxid <- 580596

# Init

In [4]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(LeyLabRMisc)

In [3]:
# Data-frame display settings (df.dims() with no arguments presumably restores
# the defaults -- it is called this way after every df.dims(n) below).
df.dims()
# Per-clade working directory; make_dir() reports when it already exists.
work_dir <- file.path(base_dir, clade)
make_dir(work_dir)

Directory already exists: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted//Butyricicoccus 


# Genome download

* Downloading genomes from NCBI

```
OUTDIR=/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/
mkdir -p $OUTDIR
ncbi-genome-download -p 12 -s genbank -F fasta --genera Butyricicoccus -o $OUTDIR bacteria
```

# Genome quality

* Filtering genomes by quality

In [5]:
# Build the genome samples table (Name, Fasta, Domain, Taxid) from the
# assemblies downloaded into <base_dir>/<clade>/genbank and write it to
# samples.txt in that directory (the file referenced by the LLG config).
D = file.path(base_dir, clade, 'genbank')
# The pattern is a regular expression: escape the dots and anchor the end so
# that only files actually ending in ".fna.gz" are picked up.
files = list_files(D, '\\.fna\\.gz$')
# Fail early with a clear message instead of data.frame()'s row-count error.
if (length(files) == 0) {
    stop('No *.fna.gz genome files found in: ', D, call. = FALSE)
}
samps = data.frame(Name = files %>% as.character %>% basename,
                   Fasta = files,
                   Domain = 'Bacteria',
                   Taxid = taxid) %>%
    mutate(Name = gsub('\\.fna\\.gz$', '', Name),   # genome name = file name minus extension
           Fasta = gsub('/+', '/', Fasta))          # collapse doubled slashes in the path
samps

# writing file
outfile = file.path(D, 'samples.txt')
write_table(samps, outfile)

Name,Fasta,Domain,Taxid
<chr>,<chr>,<fct>,<dbl>
GCA_000398925.1_Buty_pull_1_20_V1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/genbank/bacteria/GCA_000398925.1/GCA_000398925.1_Buty_pull_1_20_V1_genomic.fna.gz,Bacteria,580596
GCA_001643385.1_ASM164338v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/genbank/bacteria/GCA_001643385.1/GCA_001643385.1_ASM164338v1_genomic.fna.gz,Bacteria,580596
⋮,⋮,⋮,⋮
GCA_900167005.1_IMG-taxon_2585428076_annotated_assembly_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/genbank/bacteria/GCA_900167005.1/GCA_900167005.1_IMG-taxon_2585428076_annotated_assembly_genomic.fna.gz,Bacteria,580596
GCA_902374965.1_MGYG-HGUT-01434_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/genbank/bacteria/GCA_902374965.1/GCA_902374965.1_MGYG-HGUT-01434_genomic.fna.gz,Bacteria,580596


File written: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted//Butyricicoccus/genbank/samples.txt 


### LLG

#### Config

In [6]:
# Print the LLG config file stored in the working directory
file.path(work_dir, "config_llg.yaml") %>% cat_file()

# table with genome --> fasta_file information
samples_file: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/genbank/samples.txt

# output location
output_dir: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/LLG_output/

# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/

# batch processing of genomes for certain steps
## increase to better parallelize
batches: 2 

# Domain of genomes ('Archaea' or 'Bacteria)
## Use "Skip" if provided as a "Domain" column in the genome table
Domain: Skip

# software parameters
# Use "Skip" to skip any of these steps. If no params for rule, use ""
# dRep MAGs are not further analyzed, but you can de-rep & then use the de-rep genome table as input.
params:
  ionice: -c 3
  # assembly assessment
  seqkit: ""
  quast: Skip #""
  multiqc_on_quast: "" 
  checkm: ""
  # de-replication (requires checkm)
  drep: -c

#### Run

```
(snakemake) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llg
$ screen -L -S llg ./snakemake_sge.sh /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/config_llg.yaml 20 -F
```

### Samples table of high-quality genomes

In [7]:
# CheckM QA summary from the LLG output directory (tab-delimited)
checkm <- read.delim(
    file.path(work_dir, "LLG_output", "checkM", "checkm_qa_summary.tsv"),
    sep = "\t"
)
checkm

Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,Strain.heterogeneity,Genome.size..bp.,X..ambiguous.bases,⋯,X0,X1,X2,X3,X4,X5.,assembly.Id,assembler.Id,taxon.Id,File
<fct>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<fct>,<fct>,<lgl>,<fct>
GCA_000398925.1_Buty_pull_1_20_V1_genomic,o__Clostridiales (UID1212),172,263,149,98.66,0,0,3325721,26204,⋯,2,261,0,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|HMP_most-wanted|Butyricicoccus|LLG_output|checkM|1|checkm|markers_qa_summary.tsv.1,markers_qa_summary.tsv.1,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/LLG_output/checkM/1/checkm/markers_qa_summary.tsv.1
GCA_001643435.1_ASM164343v1_genomic,o__Clostridiales (UID1212),172,263,149,67.11,0,0,1452273,1058,⋯,78,185,0,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|HMP_most-wanted|Butyricicoccus|LLG_output|checkM|1|checkm|markers_qa_summary.tsv.2,markers_qa_summary.tsv.2,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/LLG_output/checkM/1/checkm/markers_qa_summary.tsv.2
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
GCA_900143195.1_501571.5_genomic,o__Clostridiales (UID1212),172,263,149,98.66,0,0,3284487,5872,⋯,2,261,0,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|HMP_most-wanted|Butyricicoccus|LLG_output|checkM|2|checkm|markers_qa_summary.tsv.17,markers_qa_summary.tsv.17,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/LLG_output/checkM/2/checkm/markers_qa_summary.tsv.17
GCA_902374965.1_MGYG-HGUT-01434_genomic,o__Clostridiales (UID1212),172,263,149,98.66,0,0,3325721,26204,⋯,2,261,0,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|HMP_most-wanted|Butyricicoccus|LLG_output|checkM|2|checkm|markers_qa_summary.tsv.18,markers_qa_summary.tsv.18,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/LLG_output/checkM/2/checkm/markers_qa_summary.tsv.18


In [8]:
# dRep's CheckM marker summary from the LLG output directory.
# Bin.Id is derived from the genome path: drop everything up to the last '/',
# then the trailing '.fna', so it can be matched against the other tables.
drep <- read.delim(
    file.path(work_dir, "LLG_output", "drep", "checkm_markers_qa_summary.tsv"),
    sep = "\t"
) %>%
    mutate(Bin.Id = gsub("\\.fna$", "", gsub(".+/", "", genome)))
drep

genome,completeness,contamination,Bin.Id
<fct>,<dbl>,<dbl>,<chr>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000398925.1_Buty_pull_1_20_V1_genomic.fna,98.66,0,GCA_000398925.1_Buty_pull_1_20_V1_genomic
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_001643435.1_ASM164343v1_genomic.fna,67.11,0,GCA_001643435.1_ASM164343v1_genomic
⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_900143195.1_501571.5_genomic.fna,98.66,0,GCA_900143195.1_501571.5_genomic
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_902374965.1_MGYG-HGUT-01434_genomic.fna,98.66,0,GCA_902374965.1_MGYG-HGUT-01434_genomic


In [9]:
# Table of de-replicated genomes (Name, Fasta) from the LLG dRep output
drep_gen <- read.delim(
    file.path(work_dir, "LLG_output", "drep", "dereplicated_genomes.tsv"),
    sep = "\t"
)
drep_gen

Name,Fasta
<fct>,<fct>
GCA_002157465.1_ASM215746v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/LLG_output/drep/drep/dereplicated_genomes/GCA_002157465.1_ASM215746v1_genomic.fna
GCA_016297755.1_ASM1629775v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/LLG_output/drep/drep/dereplicated_genomes/GCA_016297755.1_ASM1629775v1_genomic.fna
⋮,⋮
GCA_003202935.1_ASM320293v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/LLG_output/drep/drep/dereplicated_genomes/GCA_003202935.1_ASM320293v1_genomic.fna
GCA_009881305.1_ASM988130v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricicoccus/LLG_output/drep/drep/dereplicated_genomes/GCA_009881305.1_ASM988130v1_genomic.fna


In [10]:
# GTDBTk summary
# Reads the GTDB-Tk bacterial summary from the LLG output, splits the
# ';'-delimited `classification` string into one column per rank, and drops
# columns that are not used downstream.
tax = file.path(work_dir, 'LLG_output', 'gtdbtk', 'gtdbtk_bac_summary.tsv') %>%
    read.delim(sep='\t') %>%   # stray empty positional argument removed
    separate(classification, 
             c('Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species'),
             sep=';') %>%
    select(-note, -classification_method, -pplacer_taxonomy,
           -other_related_references.genome_id.species_name.radius.ANI.AF.)
tax

user_genome,Domain,Phylum,Class,Order,Family,Genus,Species,fastani_reference,fastani_reference_radius,⋯,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings
<fct>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<fct>,<fct>,⋯,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<int>,<fct>,<fct>
GCA_000398925.1_Buty_pull_1_20_V1_genomic,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Oscillospirales,f__Butyricicoccaceae,g__Butyricicoccus,s__Butyricicoccus pullicaecorum,GCF_900167005.1,95.0,⋯,1.0,GCF_900167005.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Butyricicoccaceae;g__Butyricicoccus;s__Butyricicoccus pullicaecorum,99.99,1.0,97.26,11,,
GCA_001643435.1_ASM164343v1_genomic,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Oscillospirales,f__Butyricicoccaceae,g__Agathobaculum,s__Agathobaculum butyriciproducens,GCF_003096535.1,95.0,⋯,0.9,GCF_003096535.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Butyricicoccaceae;g__Agathobaculum;s__Agathobaculum butyriciproducens,97.16,0.9,69.94,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
GCA_900143195.1_501571.5_genomic,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Oscillospirales,f__Butyricicoccaceae,g__Butyricicoccus,s__Butyricicoccus pullicaecorum,GCF_900167005.1,95.0,⋯,0.99,GCF_900167005.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Butyricicoccaceae;g__Butyricicoccus;s__Butyricicoccus pullicaecorum,99.97,0.99,97.14,11,,
GCA_902374965.1_MGYG-HGUT-01434_genomic,d__Bacteria,p__Firmicutes_A,c__Clostridia,o__Oscillospirales,f__Butyricicoccaceae,g__Butyricicoccus,s__Butyricicoccus pullicaecorum,GCF_900167005.1,95.0,⋯,1.0,GCF_900167005.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Butyricicoccaceae;g__Butyricicoccus;s__Butyricicoccus pullicaecorum,99.99,1.0,97.26,11,,


In [11]:
# Check that genome identifiers line up between the tables before joining.
# drep vs. de-replicated list: compared on fasta file names
cat("-- drep --\n")
overlap(
    drep_gen$Fasta %>% as.character %>% basename,
    drep$genome %>% as.character %>% basename
)
# drep vs. CheckM: compared on Bin.Id
cat("-- checkm --\n")
overlap(drep$Bin.Id, checkm$Bin.Id)
# drep vs. GTDB-Tk: Bin.Id against user_genome
cat("-- gtdbtk --\n")
overlap(drep$Bin.Id, tax$user_genome)

-- drep --
intersect(x,y): 26 
setdiff(x,y): 0 
setdiff(y,x): 10 
union(x,y): 36 
-- checkm --
intersect(x,y): 36 
setdiff(x,y): 0 
setdiff(y,x): 0 
union(x,y): 36 
-- gtdbtk --
intersect(x,y): 36 
setdiff(x,y): 0 
setdiff(y,x): 0 
union(x,y): 36 


In [12]:
# Combine all per-genome information into `drep`:
#   1. CheckM stats, matched on Bin.Id
#   2. the de-replicated genome list, matched on fasta file name (GEN);
#      being an inner join, this keeps only de-replicated genomes
#   3. GTDB-Tk taxonomy, matched on Bin.Id == user_genome
drep <- drep %>%
    inner_join(checkm, by = "Bin.Id") %>%
    mutate(GEN = basename(as.character(genome))) %>%
    inner_join(
        mutate(drep_gen, GEN = basename(as.character(Fasta))),
        by = "GEN"
    ) %>%
    inner_join(tax, by = c("Bin.Id" = "user_genome"))
drep

genome,completeness,contamination,Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,⋯,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings
<fct>,<dbl>,<dbl>,<chr>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<int>,<fct>,<fct>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000398925.1_Buty_pull_1_20_V1_genomic.fna,98.66,0.00,GCA_000398925.1_Buty_pull_1_20_V1_genomic,o__Clostridiales (UID1212),172,263,149,98.66,0.00,⋯,1.0,GCF_900167005.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Butyricicoccaceae;g__Butyricicoccus;s__Butyricicoccus pullicaecorum,99.99,1.0,97.26,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_002157465.1_ASM215746v1_genomic.fna,98.66,2.68,GCA_002157465.1_ASM215746v1_genomic,o__Clostridiales (UID1212),172,263,149,98.66,2.68,⋯,0.99,GCF_002157465.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Butyricicoccaceae;g__Butyricicoccus_A;s__Butyricicoccus_A porcorum,100.0,0.99,96.27,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014648015.1_ASM1464801v1_genomic.fna,99.33,0.67,GCA_014648015.1_ASM1464801v1_genomic,o__Clostridiales (UID1212),172,263,149,99.33,0.67,⋯,0.83,GCF_003096535.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Butyricicoccaceae;g__Agathobaculum;s__Agathobaculum butyriciproducens,95.65,0.83,97.10,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_016293105.1_ASM1629310v1_genomic.fna,92.88,0.67,GCA_016293105.1_ASM1629310v1_genomic,o__Clostridiales (UID1212),172,263,149,92.88,0.67,⋯,0.82,GCA_002395695.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Butyricicoccaceae;g__Butyricicoccus_A;s__Butyricicoccus_A sp002395695,98.29,0.82,75.63,11,,


In [13]:
# Keep only high-quality genomes: all thresholds must hold simultaneously
hq_genomes <- filter(
    drep,
    completeness >= 90 &
        contamination < 5 &
        Strain.heterogeneity < 50 &
        X..contigs < 300 &
        Mean.contig.length..bp. > 10000
)
hq_genomes

genome,completeness,contamination,Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,⋯,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings
<fct>,<dbl>,<dbl>,<chr>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<int>,<fct>,<fct>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000398925.1_Buty_pull_1_20_V1_genomic.fna,98.66,0,GCA_000398925.1_Buty_pull_1_20_V1_genomic,o__Clostridiales (UID1212),172,263,149,98.66,0,⋯,1.0,GCF_900167005.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Butyricicoccaceae;g__Butyricicoccus;s__Butyricicoccus pullicaecorum,99.99,1.0,97.26,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_002160385.1_ASM216038v1_genomic.fna,98.66,0,GCA_002160385.1_ASM216038v1_genomic,o__Clostridiales (UID1212),172,263,149,98.66,0,⋯,0.8,GCF_900167005.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Butyricicoccaceae;g__Butyricicoccus;s__Butyricicoccus pullicaecorum,96.44,0.8,97.26,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014648015.1_ASM1464801v1_genomic.fna,99.33,0.67,GCA_014648015.1_ASM1464801v1_genomic,o__Clostridiales (UID1212),172,263,149,99.33,0.67,⋯,0.83,GCF_003096535.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Butyricicoccaceae;g__Agathobaculum;s__Agathobaculum butyriciproducens,95.65,0.83,97.10,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_016293105.1_ASM1629310v1_genomic.fna,92.88,0.67,GCA_016293105.1_ASM1629310v1_genomic,o__Clostridiales (UID1212),172,263,149,92.88,0.67,⋯,0.82,GCA_002395695.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Oscillospirales;f__Butyricicoccaceae;g__Butyricicoccus_A;s__Butyricicoccus_A sp002395695,98.29,0.82,75.63,11,,


In [14]:
# Summary statistics for the retained genomes, one table per metric;
# the first argument after the data frame is the row label shown in the output
summary_x(hq_genomes, "completeness", completeness)
summary_x(hq_genomes, "contamination", contamination)
summary_x(hq_genomes, "No. of contigs", X..contigs)
summary_x(hq_genomes, "N50", N50..contigs.)
summary_x(hq_genomes, "Mean contig length", Mean.contig.length..bp.)

Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
completeness,92.88,98.66,99.33,98.67429,99.33,99.92,2.616,1.068


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
contamination,0,0.67,0.67,0.6128571,0.67,1.34,0.424,0.173


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
No. of contigs,4,55,71,70.28571,82,143,44.834,18.303


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
N50,31284,93692,110619,280090.9,218166,2682118,1038941,424145.8


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
Mean contig length,18642,36058,44722,84265.19,53821,737295,282444.8,115307.6


In [15]:
# Number of retained genomes per Family/Genus.
# df.dims(20) / df.dims() bracket the table (presumably widening the
# data-frame display for this one output, then restoring it -- TODO confirm).
df.dims(20)
summarise(
    group_by(hq_genomes, Family, Genus),
    n_genomes = n(),
    .groups = "drop"
)
df.dims()

Family,Genus,n_genomes
<chr>,<chr>,<int>
f__Butyricicoccaceae,g__Agathobaculum,17
f__Butyricicoccaceae,g__Butyricicoccus,3
f__Butyricicoccaceae,g__Butyricicoccus_A,1


In [None]:
# filtering by taxonomy
# Disabled: every line of this cell is commented out, so hq_genomes is passed
# on unchanged and no genus-level filtering is applied.
# NOTE(review): the genus below ('g__Anaerotruncus') is not among the genera
# reported in the taxonomy summary for this clade; it looks like a leftover
# from another clade's notebook -- confirm the intended genus before enabling.
#hq_genomes = hq_genomes %>%
#    filter(Genus == 'g__Anaerotruncus')
#hq_genomes

In [16]:
# Samples table for LLPRIMER: Taxon, Fasta, Taxid.
# Taxon is the Bin.Id with assembly-name suffixes stripped, in this order:
# '_chromosome…', '_bin_…', '_genomic', '_annotated_assembly'.
outfile <- file.path(work_dir, "samples_genomes_hq.txt")
hq_genomes %>%
    select(Taxon = Bin.Id, Fasta) %>%
    mutate(
        Taxon = gsub("_chromosome.+", "", Taxon),
        Taxon = gsub("_bin_.+", "", Taxon),
        Taxon = gsub("_genomic", "", Taxon),
        Taxon = gsub("_annotated_assembly", "", Taxon),
        Taxid = taxid
    ) %>%
    write_table(outfile)

File written: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted//Butyricicoccus/samples_genomes_hq.txt 


# Primer design

# --TODO--

### Config

In [None]:
# Print the primer-design config for this clade.
# The path is stored in `config_file` rather than `F`: assigning to `F` would
# mask R's built-in shorthand for FALSE in every later cell of the session.
config_file <- file.path(work_dir, 'primers', 'config.yaml')
cat_file(config_file)

### Run

```
(snakemake) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llprimer
$ screen -L -S llprimer-buty ./snakemake_sge.sh experiments/HMP_most-wanted/Butyricicoccus/primers/config.yaml 50 --notemp -F
```

### Summary

In [None]:
# Final primer table from the primer-design output; report the number of
# distinct primer sets, then show the table
primer_info <- file.path(work_dir, "primers", "cgp", "primers_final_info.tsv") %>%
    read.delim(sep = "\t")
unique_n(primer_info, "primer sets", primer_set)
primer_info

### Gene cluster annotations

In [None]:
# BLASTX annotations of the core gene clusters, restricted to clusters that
# have primers in primer_info.
# cluster_id: 'cluster_' prefix removed, then converted with as.Num
#             (presumably to numeric so it matches primer_info -- TODO confirm).
# subject_name is split into gene_name (text before ' [') and gene_taxonomy
# (text inside the trailing '[...]').
gene_annot <- file.path(work_dir, "primers", "cgp", "core_clusters_blastx.tsv") %>%
    read.delim(sep = "\t") %>%
    mutate(cluster_id = as.Num(gsub("cluster_", "", cluster_id))) %>%
    semi_join(primer_info, by = "cluster_id") %>%
    mutate(
        gene_name = gsub(" \\[.+", "", subject_name),
        gene_taxonomy = gsub("\\]", "", gsub(".+\\[", "", subject_name))
    )
gene_annot

In [None]:
# Unique (cluster, gene name) pairs, shown with a larger display limit
df.dims(50)
distinct(gene_annot, cluster_id, gene_name)
df.dims()

In [None]:
# Unique (cluster, source taxonomy) pairs, shown with a larger display limit
df.dims(50)
distinct(gene_annot, cluster_id, gene_taxonomy)
df.dims()

### Gene cluster: closest related

In [None]:
# BLASTX hits of the core gene clusters against non-target sequences,
# restricted to clusters that have primers in primer_info.
# Note: this reuses (and overwrites) the `gene_annot` variable.
# subject_name is split into gene_name (text before ' [') and gene_taxonomy
# (text inside the trailing '[...]').
gene_annot <- file.path(work_dir, "primers", "cgp", "core_clusters_blastx_nontarget.tsv") %>%
    read.delim(sep = "\t") %>%
    mutate(cluster_id = as.Num(gsub("cluster_", "", cluster_id))) %>%
    semi_join(primer_info, by = "cluster_id") %>%
    mutate(
        gene_name = gsub(" \\[.+", "", subject_name),
        gene_taxonomy = gsub("\\]", "", gsub(".+\\[", "", subject_name))
    )
gene_annot

In [None]:
# Closest non-target relatives: hits above 80% identity that are among the
# top three by pident_rank
df.dims(50)
gene_annot %>%
    filter(pident > 80 & pident_rank <= 3) %>%
    select(cluster_id, gene_name, gene_taxonomy, pident)
df.dims()

In [None]:
# Unique (cluster, gene name) pairs in the current gene_annot table
df.dims(50)
distinct(gene_annot, cluster_id, gene_name)
df.dims()

# sessionInfo

In [None]:
# Record which LLG pipeline checkout was used (pipelineInfo presumably reports
# version details for the given directory -- TODO confirm)
pipelineInfo("/ebio/abt3_projects/software/dev/ll_pipelines/llg/")

In [None]:
# Record which LLPRIMER pipeline checkout was used (pipelineInfo presumably
# reports version details for the given directory -- TODO confirm)
pipelineInfo("/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/")

In [None]:
# R version, platform, and attached package versions, for reproducibility
sessionInfo()