# Goal

* Primer design for clade of interest

# Var

In [16]:
# Root directory of the HMP "most-wanted" primer-design experiments
base_dir <- "/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/"
# Clade of interest; also used as the name of the per-clade sub-directory
clade <- "Butyricimonas"
# Taxid value written into the samples tables generated by this notebook
taxid <- 574697

# Init

In [17]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(LeyLabRMisc)

In [18]:
# Table display settings (df.dims is called without arguments here and with
# an explicit value in some later cells)
df.dims()

# Per-clade working directory: <base_dir>/<clade>
work_dir <- file.path(base_dir, clade)
# make_dir reports "Directory already exists" when the path is already present
# (see the recorded output of this cell)
make_dir(work_dir)

Directory already exists: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted//Butyricimonas 


# Genome download

* Downloading genomes from NCBI

```
# Destination directory for the downloaded genomes
OUTDIR=/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/
# Quote the expansion so the commands stay correct for paths containing
# whitespace or glob characters
mkdir -p "$OUTDIR"
# Fetch GenBank FASTA files for the genus Butyricimonas into $OUTDIR
ncbi-genome-download -p 12 -s genbank -F fasta --genera Butyricimonas -o "$OUTDIR" bacteria
```

# Genome quality

* Filtering genomes by quality

In [7]:
# Directory holding the downloaded GenBank genomes
D <- file.path(base_dir, clade, 'genbank')

# Locate the genome FASTA files.
# The pattern is escaped and anchored so only names ending in ".fna.gz" are
# selected, matching the suffix stripped from `Name` below
# (assumes list_files treats its second argument as a regular expression,
# like base::list.files -- TODO confirm).
files <- list_files(D, '\\.fna\\.gz$')

# Fail with a clear message instead of a row-count mismatch from data.frame()
if (length(files) == 0) {
    stop('No .fna.gz files found under: ', D, call. = FALSE)
}

# One row per genome:
#   Name   - file name without the ".fna.gz" suffix
#   Fasta  - path to the file, with repeated slashes collapsed
#            (base_dir ends in "/", so file.path() yields "//")
#   Domain - constant 'Bacteria'
#   Taxid  - the notebook-level `taxid` value
samps <- data.frame(Name = files %>% as.character %>% basename,
                    Fasta = files,
                    Domain = 'Bacteria',
                    Taxid = taxid) %>%
    mutate(Name = gsub('\\.fna\\.gz$', '', Name),
           Fasta = gsub('/+', '/', Fasta))
samps

# writing file
outfile <- file.path(D, 'samples.txt')
write_table(samps, outfile)

Name,Fasta,Domain,Taxid
<chr>,<chr>,<fct>,<dbl>
GCA_000379665.1_ASM37966v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/genbank/bacteria/GCA_000379665.1/GCA_000379665.1_ASM37966v1_genomic.fna.gz,Bacteria,244127
GCA_000519105.1_ASM51910v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/genbank/bacteria/GCA_000519105.1/GCA_000519105.1_ASM51910v1_genomic.fna.gz,Bacteria,244127
⋮,⋮,⋮,⋮
GCA_902364335.1_MGYG-HGUT-00203_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/genbank/bacteria/GCA_902364335.1/GCA_902364335.1_MGYG-HGUT-00203_genomic.fna.gz,Bacteria,244127
GCA_902376195.1_MGYG-HGUT-01552_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/genbank/bacteria/GCA_902376195.1/GCA_902376195.1_MGYG-HGUT-01552_genomic.fna.gz,Bacteria,244127


File written: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted//Butyricimonas/genbank/samples.txt 


### LLG

#### Config

In [11]:
# Print the LLG config file stored in the working directory
work_dir %>%
  file.path('config_llg.yaml') %>%
  cat_file()

# table with genome --> fasta_file information
samples_file: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/genbank/samples.txt

# output location
output_dir: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/LLG_output/

# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/

# batch processing of genomes for certain steps
## increase to better parallelize
batches: 2 

# Domain of genomes ('Archaea' or 'Bacteria)
## Use "Skip" if provided as a "Domain" column in the genome table
Domain: Skip

# software parameters
# Use "Skip" to skip any of these steps. If no params for rule, use ""
# dRep MAGs are not further analyzed, but you can de-rep & then use the de-rep genome table as input.
params:
  ionice: -c 3
  # assembly assessment
  seqkit: ""
  quast: Skip #""
  multiqc_on_quast: "" 
  checkm: ""
  # de-replication (requires checkm)
  drep: -com

#### Run

```
(snakemake) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llg
$ screen -L -S llg ./snakemake_sge.sh /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/config_llg.yaml 20 -F
```

### Samples table of high-quality genomes

In [7]:
# checkM summary
# CheckM QA summary from the LLG output directory (tab-delimited)
checkm <- read.delim(
  file.path(work_dir, 'LLG_output', 'checkM', 'checkm_qa_summary.tsv'),
  sep = '\t'
)
checkm

Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,Strain.heterogeneity,Genome.size..bp.,X..ambiguous.bases,⋯,X0,X1,X2,X3,X4,X5.,assembly.Id,assembler.Id,taxon.Id,File
<fct>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<fct>,<fct>,<lgl>,<fct>
GCA_000379665.1_ASM37966v1_genomic,k__Bacteria (UID2569),434,278,186,100.00,0.00,0,4770838,131,⋯,0,278,0,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|HMP_most-wanted|Butyricimonas|LLG_output|checkM|1|checkm|markers_qa_summary.tsv.1,markers_qa_summary.tsv.1,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/LLG_output/checkM/1/checkm/markers_qa_summary.tsv.1
GCA_001915615.1_ASM191561v1_genomic,k__Bacteria (UID2569),434,278,186,88.69,0.27,0,4167409,2477,⋯,44,233,1,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|HMP_most-wanted|Butyricimonas|LLG_output|checkM|1|checkm|markers_qa_summary.tsv.2,markers_qa_summary.tsv.2,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/LLG_output/checkM/1/checkm/markers_qa_summary.tsv.2
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
GCA_900184685.1_PRJEB20857_genomic,k__Bacteria (UID2569),434,278,186,100.00,0,0,5300003,0,⋯,0,278,0,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|HMP_most-wanted|Butyricimonas|LLG_output|checkM|2|checkm|markers_qa_summary.tsv.13,markers_qa_summary.tsv.13,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/LLG_output/checkM/2/checkm/markers_qa_summary.tsv.13
GCA_902364335.1_MGYG-HGUT-00203_genomic,k__Bacteria (UID2569),434,278,186,99.46,0,0,4537596,250,⋯,1,277,0,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|HMP_most-wanted|Butyricimonas|LLG_output|checkM|2|checkm|markers_qa_summary.tsv.14,markers_qa_summary.tsv.14,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/LLG_output/checkM/2/checkm/markers_qa_summary.tsv.14


In [8]:
# dRep summary
# dRep quality table from the LLG output directory (tab-delimited)
drep <- read.delim(
  file.path(work_dir, 'LLG_output', 'drep', 'checkm_markers_qa_summary.tsv'),
  sep = '\t'
)
# Bin.Id = genome path with everything up to the last "/" and the trailing
# ".fna" removed; used as the join key against the other tables
drep <- mutate(drep, Bin.Id = gsub('\\.fna$', '', gsub('.+/', '', genome)))
drep

genome,completeness,contamination,Bin.Id
<fct>,<dbl>,<dbl>,<chr>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000379665.1_ASM37966v1_genomic.fna,100.00,0.00,GCA_000379665.1_ASM37966v1_genomic
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_001915615.1_ASM191561v1_genomic.fna,88.69,0.27,GCA_001915615.1_ASM191561v1_genomic
⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_900184685.1_PRJEB20857_genomic.fna,100.00,0,GCA_900184685.1_PRJEB20857_genomic
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_902364335.1_MGYG-HGUT-00203_genomic.fna,99.46,0,GCA_902364335.1_MGYG-HGUT-00203_genomic


In [9]:
# de-replicated genomes
# Table of de-replicated genomes (columns Name and Fasta in the recorded output)
drep_gen <- read.delim(
  file.path(work_dir, 'LLG_output', 'drep', 'dereplicated_genomes.tsv'),
  sep = '\t'
)
drep_gen

Name,Fasta
<fct>,<fct>
GCA_003991565.1_ASM399156v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/LLG_output/drep/drep/dereplicated_genomes/GCA_003991565.1_ASM399156v1_genomic.fna
GCA_900258545.1_PRJEB22910_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/LLG_output/drep/drep/dereplicated_genomes/GCA_900258545.1_PRJEB22910_genomic.fna
⋮,⋮
GCA_014287455.1_ASM1428745v1_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/LLG_output/drep/drep/dereplicated_genomes/GCA_014287455.1_ASM1428745v1_genomic.fna
GCA_900184685.1_PRJEB20857_genomic,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/LLG_output/drep/drep/dereplicated_genomes/GCA_900184685.1_PRJEB20857_genomic.fna


In [10]:
# GTDBTk summary
# GTDB-Tk summary: read the tab-delimited table, split the ";"-separated
# `classification` string into one column per rank, and drop columns that are
# not used downstream
tax <- file.path(work_dir, 'LLG_output', 'gtdbtk', 'gtdbtk_bac_summary.tsv') %>%
    read.delim(sep = '\t') %>%
    separate(classification,
             into = c('Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species'),
             sep = ';') %>%
    select(-note, -classification_method, -pplacer_taxonomy,
           -other_related_references.genome_id.species_name.radius.ANI.AF.)
tax

user_genome,Domain,Phylum,Class,Order,Family,Genus,Species,fastani_reference,fastani_reference_radius,⋯,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings
<fct>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<fct>,<dbl>,⋯,<dbl>,<fct>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<int>,<fct>,<fct>
GCA_000379665.1_ASM37966v1_genomic,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Marinifilaceae,g__Butyricimonas,s__Butyricimonas synergistica,GCF_000379665.1,95,⋯,1.00,GCF_000379665.1,95,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas synergistica,100.00,1.00,96.57,11,,
GCA_001915615.1_ASM191561v1_genomic,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Marinifilaceae,g__Butyricimonas,s__Butyricimonas faecihominis,GCF_003851945.2,95,⋯,0.78,GCF_003851945.2,95,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas faecihominis,98.09,0.78,84.37,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
GCA_900184685.1_PRJEB20857_genomic,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Marinifilaceae,g__Butyricimonas,s__Butyricimonas sp900184685,GCF_900184685.1,95,⋯,1.00,GCF_900184685.1,95,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas sp900184685,100.00,1.00,96.71,11,,
GCA_902364335.1_MGYG-HGUT-00203_genomic,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Marinifilaceae,g__Butyricimonas,s__Butyricimonas virosa,GCF_000519105.1,95,⋯,0.85,GCF_000519105.1,95,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas virosa,97.98,0.85,96.79,11,,


In [11]:
# checking overlap
# Compare identifiers between the tables before joining them; overlap() prints
# intersect/setdiff/union counts (see the recorded output)

# de-replicated genomes vs. all genomes in the dRep quality table, by file name
cat('-- drep --\n')
overlap(
  drep_gen$Fasta %>% as.character %>% basename,
  drep$genome %>% as.character %>% basename
)

# dRep quality table vs. CheckM summary, by Bin.Id
cat('-- checkm --\n')
overlap(drep$Bin.Id, checkm$Bin.Id)

# dRep quality table vs. GTDB-Tk summary
cat('-- gtdbtk --\n')
overlap(drep$Bin.Id, tax$user_genome)

-- drep --
intersect(x,y): 17 
setdiff(x,y): 0 
setdiff(y,x): 12 
union(x,y): 29 
-- checkm --
intersect(x,y): 29 
setdiff(x,y): 0 
setdiff(y,x): 0 
union(x,y): 29 
-- gtdbtk --
intersect(x,y): 29 
setdiff(x,y): 0 
setdiff(y,x): 0 
union(x,y): 29 


In [12]:
# joining based on Bin.Id
# Merge the quality, de-replication and taxonomy tables into `drep`
# (overwrites `drep`). All joins are inner joins, so only genomes present in
# every table are kept.
drep <- drep %>%
  # CheckM metrics, matched on Bin.Id
  inner_join(checkm, by = 'Bin.Id') %>%
  # GEN = genome file name, the key shared with the de-replicated genome table
  mutate(GEN = basename(as.character(genome))) %>%
  inner_join(
    mutate(drep_gen, GEN = basename(as.character(Fasta))),
    by = 'GEN'
  ) %>%
  # GTDB-Tk taxonomy, matched on Bin.Id == user_genome
  inner_join(tax, by = c('Bin.Id' = 'user_genome'))
drep

genome,completeness,contamination,Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,⋯,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings
<fct>,<dbl>,<dbl>,<chr>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<dbl>,<fct>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<int>,<fct>,<fct>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000379665.1_ASM37966v1_genomic.fna,100.00,0,GCA_000379665.1_ASM37966v1_genomic,k__Bacteria (UID2569),434,278,186,100.00,0,⋯,1.00,GCF_000379665.1,95,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas synergistica,100.00,1.00,96.57,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_002320175.1_ASM232017v1_genomic.fna,98.39,0,GCA_002320175.1_ASM232017v1_genomic,k__Bacteria (UID2569),434,278,186,98.39,0,⋯,0.87,GCF_000519105.1,95,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas virosa,98.27,0.87,94.37,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014287455.1_ASM1428745v1_genomic.fna,100,0,GCA_014287455.1_ASM1428745v1_genomic,k__Bacteria (UID2569),434,278,186,100,0,⋯,0.81,GCF_900184685.1,95,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas sp900184685,97.68,0.81,96.73,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_900184685.1_PRJEB20857_genomic.fna,100,0,GCA_900184685.1_PRJEB20857_genomic,k__Bacteria (UID2569),434,278,186,100,0,⋯,1.00,GCF_900184685.1,95,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas sp900184685,100.00,1.00,96.71,11,,


In [13]:
# filtering by quality
# Keep only high-quality genomes: at least 90 completeness, under 5
# contamination, and strain heterogeneity under 50
hq_genomes <- filter(
  drep,
  completeness >= 90,
  contamination < 5,
  Strain.heterogeneity < 50
)
hq_genomes

genome,completeness,contamination,Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,⋯,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings
<fct>,<dbl>,<dbl>,<chr>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<dbl>,<fct>,<dbl>,<fct>,<dbl>,<dbl>,<dbl>,<int>,<fct>,<fct>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_000379665.1_ASM37966v1_genomic.fna,100.00,0,GCA_000379665.1_ASM37966v1_genomic,k__Bacteria (UID2569),434,278,186,100.00,0,⋯,1.00,GCF_000379665.1,95,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas synergistica,100.00,1.00,96.57,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_002320175.1_ASM232017v1_genomic.fna,98.39,0,GCA_002320175.1_ASM232017v1_genomic,k__Bacteria (UID2569),434,278,186,98.39,0,⋯,0.87,GCF_000519105.1,95,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas virosa,98.27,0.87,94.37,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014287455.1_ASM1428745v1_genomic.fna,100,0,GCA_014287455.1_ASM1428745v1_genomic,k__Bacteria (UID2569),434,278,186,100,0,⋯,0.81,GCF_900184685.1,95,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas sp900184685,97.68,0.81,96.73,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_900184685.1_PRJEB20857_genomic.fna,100,0,GCA_900184685.1_PRJEB20857_genomic,k__Bacteria (UID2569),434,278,186,100,0,⋯,1.00,GCF_900184685.1,95,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Marinifilaceae;g__Butyricimonas;s__Butyricimonas sp900184685,100.00,1.00,96.71,11,,


In [14]:
# summarizing the taxonomy
# Number of high-quality genomes per Family/Genus combination
df.dims(20)
summarize(
  group_by(hq_genomes, Family, Genus),
  n_genomes = n(),
  .groups = 'drop'
)
# back to the default display settings
df.dims()

Family,Genus,n_genomes
<chr>,<chr>,<int>
f__Marinifilaceae,g__Butyricimonas,17


In [15]:
# writing samples table for LLPRIMER
# Samples table for LLPRIMER: columns Taxon, Fasta, Taxid
outfile <- file.path(work_dir, 'samples_genomes_hq.txt')
hq_genomes %>%
  # Bin.Id becomes the Taxon label
  select(Taxon = Bin.Id, Fasta) %>%
  # Trim assembly-related suffixes from the label, one pattern at a time
  mutate(Taxon = gsub('_chromosome.+', '', Taxon)) %>%
  mutate(Taxon = gsub('_bin_.+', '', Taxon)) %>%
  mutate(Taxon = gsub('_genomic', '', Taxon)) %>%
  mutate(Taxon = gsub('_annotated_assembly', '', Taxon)) %>%
  # Same taxid for every genome
  mutate(Taxid = taxid) %>%
  write_table(outfile)

File written: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted//Butyricimonas/samples_genomes_hq.txt 


# Primer design

### Config

In [19]:
# Path to the LLPRIMER config file. Stored in `config_file` rather than `F`:
# `F` is R's built-in shorthand for FALSE, and assigning to it masks that
# value for the rest of the session.
config_file <- file.path(work_dir, 'primers', 'config.yaml')
# Print the config file contents
cat_file(config_file)

#-- I/O --#
samples_file: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/samples_genomes_hq.txt

# output location
output_dir: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/primers/

# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/

#-- software parameters --#
# See the README for a description
params:
  ionice: -c 3
  cgp:
    prodigal: ""    
    mmseqs:
      method: cluster    # or linclust (faster)
      run: --min-seq-id 0.8 -c 0.8
    core_genes: --frac 1 --max-clusters 500
    blastx: -evalue 1e-10 -max_target_seqs 3
    blastx_nontarget: -evalue 1e-5 -max_target_seqs 30
    align:
      method: linsi
      params: --auto --maxiterate 1000
    primer3:
      number: --num-primers 500
      size: --opt-size 20 --min-size 18 --max-size 24
      product: --opt-prod-size 150 --min-prod-size 100 --max-prod-size 200
      Tm: --opt-tm

### Run

```
(snakemake) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llprimer
$ screen -L -S llprimer-But ./snakemake_sge.sh /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/HMP_most-wanted/Butyricimonas/primers/config.yaml 50 -F
```

### Summary

In [20]:
# Final primer table from the LLPRIMER output (tab-delimited)
primer_info <- file.path(work_dir, 'primers', 'cgp', 'primers_final_info.tsv') %>%
  read.delim(sep = '\t')

# Report the number of distinct primer sets, then show the table
primer_info %>%
  unique_n('primer sets', primer_set)
primer_info

No. of unique primer sets: 495 


cluster_id,primer_set,amplicon_size_consensus,amplicon_size_avg,amplicon_size_sd,primer_id,primer_type,sequence,length,degeneracy,⋯,position_start,position_end,Tm_avg,Tm_sd,GC_avg,GC_sd,hairpin_avg,hairpin_sd,homodimer_avg,homodimer_sd
<int>,<int>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<int>,<int>,⋯,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
4,13,172,172,0,13f,PRIMER_LEFT,GAAYCARGGACAYWGCCACC,20,16,⋯,161,181,60.40519,1.951478,57.50000,4.330127,10.70543,18.661427,-23.02055,15.93829
4,13,172,172,0,13r,PRIMER_RIGHT,MGCYTCDGCACCRCTATTCA,20,24,⋯,313,333,61.39287,2.160181,54.16667,4.930066,42.22950,2.453775,-19.36984,24.02227
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
500,499,133,133,0,499f,PRIMER_LEFT,ACBCCMTGYGCBGCAAAAAT,20,36,⋯,573,593,63.11378,2.487889,51.66667,4.859127,31.85494,24.19495,6.798857,10.52647
500,499,133,133,0,499r,PRIMER_RIGHT,GTTGRTCCCGGTGTTGCGTG,20,2,⋯,686,706,63.96520,1.143588,62.50000,2.500000,0.00000,0.00000,-1.736426,0.00000


### Gene cluster annotations

In [21]:
# blastx annotations of the core gene clusters, limited to clusters that have
# primers in `primer_info`
gene_annot <- file.path(work_dir, 'primers', 'cgp', 'core_clusters_blastx.tsv') %>%
  read.delim(sep = '\t') %>%
  # "cluster_<n>" -> numeric id, so it can be matched to primer_info$cluster_id
  mutate(cluster_id = as.Num(gsub('cluster_', '', cluster_id))) %>%
  semi_join(primer_info, by = 'cluster_id') %>%
  # subject_name looks like "<gene name> [<taxonomy>]": split it in two
  mutate(
    gene_name = gsub(' \\[.+', '', subject_name),
    gene_taxonomy = gsub('\\]', '', gsub('.+\\[', '', subject_name))
  )
gene_annot

cluster_id,query,subject,subject_name,pident,length,mismatch,qstart,qend,sstart,send,evalue,slen,qlen,sscinames,staxids,pident_rank,gene_name,gene_taxonomy
<dbl>,<fct>,<fct>,<fct>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<fct>,<fct>,<int>,<chr>,<chr>
4,4dd2fadf7df94a0babc664b2d3c2b220,NJC17858.1,ornithine--oxo-acid transaminase [Butyricimonas paravirosa],99.015,406,4,1,1218,1,406,0,406,1221,Butyricimonas paravirosa,1472417,1,ornithine--oxo-acid transaminase,Butyricimonas paravirosa
4,4dd2fadf7df94a0babc664b2d3c2b220,MBB4027972.1,ornithine--oxo-acid transaminase [Butyricimonas faecihominis],98.768,406,5,1,1218,1,406,0,406,1221,Butyricimonas faecihominis,1472416,2,ornithine--oxo-acid transaminase,Butyricimonas faecihominis
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
500,466e10a5838d4b9ebeea2eb8446771b8,WP_099293516.1,4Fe-4S binding protein [Butyricimonas sp. Marseille-P3923],99.776,447,1,1,1341,1,447,0,447,1344,Butyricimonas sp. Marseille-P3923,1987504,1,4Fe-4S binding protein,Butyricimonas sp. Marseille-P3923
500,466e10a5838d4b9ebeea2eb8446771b8,WP_040371279.1,MULTISPECIES: 4Fe-4S binding protein [Butyricimonas],95.302,447,21,1,1341,1,447,0,447,1344,Butyricimonas synergistica;Butyricimonas;Butyricimonas synergistica DSM 23225;Butyricimonas sp. Marseille-P2440,544644;574697;1121129;1816677,2,MULTISPECIES: 4Fe-4S binding protein,Butyricimonas


In [22]:
# Unique gene names per cluster
df.dims(50)
distinct(gene_annot, cluster_id, gene_name)
# back to the default display settings
df.dims()

cluster_id,gene_name
<dbl>,<chr>
4,ornithine--oxo-acid transaminase
7,MULTISPECIES: 50S ribosomal protein L27
9,MULTISPECIES: bifunctional methylenetetrahydrofolate dehydrogenase/methenyltetrahydrofolate cyclohydrolase FolD
9,bifunctional methylenetetrahydrofolate dehydrogenase/methenyltetrahydrofolate cyclohydrolase FolD
13,NADH:ubiquinone reductase (Na(+)-transporting) subunit E
14,MULTISPECIES: homocysteine biosynthesis protein
14,hypothetical protein BHV81_15115
14,homocysteine biosynthesis protein
16,MULTISPECIES: DNA polymerase III subunit delta
16,DNA polymerase III subunit delta


In [23]:
# Unique hit taxonomies per cluster
df.dims(50)
distinct(gene_annot, cluster_id, gene_taxonomy)
# back to the default display settings
df.dims()

cluster_id,gene_taxonomy
<dbl>,<chr>
4,Butyricimonas paravirosa
4,Butyricimonas faecihominis
4,Butyricimonas virosa
7,Butyricimonas
7,unclassified Butyricimonas
9,Butyricimonas
9,Butyricimonas virosa
13,Butyricimonas paravirosa
13,Butyricimonas virosa
13,Odoribacter sp. AF15-53


### Gene cluster: closest related

In [24]:
# blastx hits against non-target sequences for the core gene clusters, limited
# to clusters that have primers in `primer_info`.
# NOTE: this reassigns `gene_annot`, replacing the table built from
# core_clusters_blastx.tsv.
gene_annot <- file.path(work_dir, 'primers', 'cgp', 'core_clusters_blastx_nontarget.tsv') %>%
  read.delim(sep = '\t') %>%
  # "cluster_<n>" -> numeric id, so it can be matched to primer_info$cluster_id
  mutate(cluster_id = as.Num(gsub('cluster_', '', cluster_id))) %>%
  semi_join(primer_info, by = 'cluster_id') %>%
  # subject_name looks like "<gene name> [<taxonomy>]": split it in two
  mutate(
    gene_name = gsub(' \\[.+', '', subject_name),
    gene_taxonomy = gsub('\\]', '', gsub('.+\\[', '', subject_name))
  )
gene_annot

cluster_id,query,subject,subject_name,pident,length,mismatch,qstart,qend,sstart,send,evalue,slen,qlen,sscinames,staxids,pident_rank,gene_name,gene_taxonomy
<dbl>,<fct>,<fct>,<fct>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<fct>,<fct>,<int>,<chr>,<chr>
4,4dd2fadf7df94a0babc664b2d3c2b220,RHR78904.1,,98.276,406,7,1,1218,1,406,0,406,1221,Odoribacter sp. AF15-53,2292236,2,,
4,4dd2fadf7df94a0babc664b2d3c2b220,RGG42580.1,,99.007,403,4,10,1218,1,403,0,403,1221,Odoribacter sp. AM16-33;Odoribacter sp. AF21-41,2292048;2293111,1,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
500,466e10a5838d4b9ebeea2eb8446771b8,OHD46869.1,ferredoxin [Spirochaetes bacterium GWF1_31_7],49.888,445,221,10,1341,6,449,1.12e-167,461,1344,Spirochaetes bacterium GWF1_31_7,1802187,29,ferredoxin,Spirochaetes bacterium GWF1_31_7
500,466e10a5838d4b9ebeea2eb8446771b8,MBI9053771.1,4Fe-4S binding protein [Bacteroidales bacterium],49.213,445,223,10,1341,6,448,1.18e-167,450,1344,Bacteroidales bacterium,2030927,30,4Fe-4S binding protein,Bacteroidales bacterium


In [25]:
# Closest non-target hits: percent identity above 80 and within the top three
# by pident_rank
df.dims(50)
gene_annot %>%
  filter(pident > 80, pident_rank <= 3) %>%
  select(cluster_id, gene_name, gene_taxonomy, pident)

# back to the default display settings
df.dims()

cluster_id,gene_name,gene_taxonomy,pident
<dbl>,<chr>,<chr>,<dbl>
4,,,98.276
4,,,99.007
4,,,98.263
7,,,91.860
7,50S ribosomal protein L27,Odoribacter sp. AF15-53,89.535
7,50S ribosomal protein L27,Sanguibacteroides justesenii,84.884
7,50S ribosomal protein L27,bacterium,84.884
9,,,97.938
9,bifunctional methylenetetrahydrofolate dehydrogenase/methenyltetrahydrofolate cyclohydrolase FolD,Odoribacter sp. AF15-53,94.158
9,bifunctional methylenetetrahydrofolate dehydrogenase/methenyltetrahydrofolate cyclohydrolase FolD,Sanguibacteroides justesenii,83.505


# sessionInfo

In [None]:
# Record details of the LLG pipeline checkout used for the genome-quality step
# (presumably pipelineInfo reports version information for the directory --
# TODO confirm what it prints)
pipelineInfo('/ebio/abt3_projects/software/dev/ll_pipelines/llg/')

In [None]:
# Record details of the LLPRIMER pipeline checkout used for primer design
# (presumably pipelineInfo reports version information for the directory --
# TODO confirm what it prints)
pipelineInfo('/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/')

In [None]:
# Record the R version and the loaded/attached packages for reproducibility
sessionInfo()