# Goal

* Select genomes for primer design

# Var

In [7]:
# base directory for this experiment; every input/output path below is built from it
work_dir <- '/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank'

# Init

In [8]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(LeyLabRMisc)

In [9]:
# NOTE(review): df.dims() is provided by LeyLabRMisc; presumably it (re)sets how many
# rows/columns of a data.frame the notebook displays -- confirm against the package docs
df.dims()

# genome download

```
ncbi-genome-download -p 12 -s genbank -F fasta --genus Christensenella -o $WORKDIR bacteria
```

# genome QC

## LLG

* LLG pipeline for running:
  * checkM
  * dRep
  * GTDBTk

### Config

In [10]:
# print the LLG pipeline config stored in the working directory
work_dir %>%
    file.path('config_llg.yaml') %>%
    cat_file()

# table with genome --> fasta_file information
samples_file: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/samples_llg.txt

# output location
output_dir: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/

# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/

# batch processing of genomes for certain steps
## increase to better parallelize
batches: 2 

# Domain of genomes ('Archaea' or 'Bacteria)
## Use "Skip" if provided as a "Domain" column in the genome table
Domain: Skip

# software parameters
# Use "Skip" to skip any of these steps. If no params for rule, use ""
# dRep MAGs are not further analyzed, but you can de-rep & then use the de-rep genome table as input.
params:
  ionice: -c 3
  # assembly assessment
  seqkit: ""
  quast: Skip #""
  multiqc_on_quast: "" 
  checkm: Skip #""
  # de-replication (requires checkm)
  drep: Skip #-comp 90 -con 5 -sa

### Run

```
./snakemake_sge.sh /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/config_llg.yaml 30 -F
```

## Selecting genomes

In [68]:
# checkM summary: per-bin QA table (Completeness, Contamination, genome size, ...)
# The file path is piped in as the first argument of read.delim(); the former
# stray empty argument (`read.delim(, sep=...)`) has been removed.
checkm <- file.path(work_dir, 'checkM', 'checkm_qa_summary.tsv') %>%
    read.delim(sep = '\t')
checkm

Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,Strain.heterogeneity,Genome.size..bp.,X..ambiguous.bases,⋯,X0,X1,X2,X3,X4,X5.,assembly.Id,assembler.Id,taxon.Id,File
<fct>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<fct>,<fct>,<lgl>,<fct>
CAADUP010000001_1_TPA_asm_Christensenellaceae_bacterium_isolate_HGM11615,o__Clostridiales (UID1120),304,250,143,82.75,0.70,0,1686076,0,⋯,35,214,1,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|christensenella|genbank|checkM|1|checkm|markers_qa_summary.tsv.1,markers_qa_summary.tsv.1,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/checkM/1/checkm/markers_qa_summary.tsv.1
CAAFJJ010000001_1_TPA_asm_Christensenellaceae_bacterium_isolate_HGM11614,o__Clostridiales (UID1120),304,250,143,80.38,1.05,0,1855163,0,⋯,42,205,3,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|christensenella|genbank|checkM|1|checkm|markers_qa_summary.tsv.2,markers_qa_summary.tsv.2,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/checkM/1/checkm/markers_qa_summary.tsv.2
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
WGWR01000001_1_Christensenellaceae_bacterium_isolate_COPD122_SB1785_bin_21,o__Clostridiales (UID1120),304,250,143,99.3,1.17,0,2907470,0,⋯,1,246,3,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|christensenella|genbank|checkM|2|checkm|markers_qa_summary.tsv.21,markers_qa_summary.tsv.21,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/checkM/2/checkm/markers_qa_summary.tsv.21
WGWS01000001_1_Christensenellaceae_bacterium_isolate_COPD123_SB1786_bin_39,o__Clostridiales (UID1120),304,250,143,98.6,1.17,0,2910731,0,⋯,2,245,3,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|christensenella|genbank|checkM|2|checkm|markers_qa_summary.tsv.22,markers_qa_summary.tsv.22,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/checkM/2/checkm/markers_qa_summary.tsv.22


In [69]:
# dRep summary: completeness/contamination per genome fasta
drep <- read.delim(file.path(work_dir, 'drep', 'checkm_markers_qa_summary.tsv'),
                   sep = '\t')
# Bin.Id = fasta file name without directory and without the '.fna' extension,
# so that it can be matched against the checkM and GTDB-Tk tables
drep <- mutate(drep,
               Bin.Id = gsub('\\.fna$', '', gsub('.+/', '', genome)))
drep

genome,completeness,contamination,Bin.Id
<fct>,<dbl>,<dbl>,<chr>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/CAADUP010000001_1_TPA_asm_Christensenellaceae_bacterium_isolate_HGM11615.fna,82.75,0.70,CAADUP010000001_1_TPA_asm_Christensenellaceae_bacterium_isolate_HGM11615
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/CAAFJJ010000001_1_TPA_asm_Christensenellaceae_bacterium_isolate_HGM11614.fna,80.38,1.05,CAAFJJ010000001_1_TPA_asm_Christensenellaceae_bacterium_isolate_HGM11614
⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/WGWR01000001_1_Christensenellaceae_bacterium_isolate_COPD122_SB1785_bin_21.fna,99.3,1.17,WGWR01000001_1_Christensenellaceae_bacterium_isolate_COPD122_SB1785_bin_21
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/WGWS01000001_1_Christensenellaceae_bacterium_isolate_COPD123_SB1786_bin_39.fna,98.6,1.17,WGWS01000001_1_Christensenellaceae_bacterium_isolate_COPD123_SB1786_bin_39


In [70]:
# table of de-replicated genomes (columns: Name, Fasta)
drep_gen <- read.delim(file.path(work_dir, 'drep', 'dereplicated_genomes.tsv'),
                       sep = '\t')
drep_gen

Name,Fasta
<fct>,<fct>
DUQK01000013_1_TPA_asm_Christensenellaceae_bacterium_isolate_AS08sgBPME_399_100239_AS08,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/drep/drep/dereplicated_genomes/DUQK01000013_1_TPA_asm_Christensenellaceae_bacterium_isolate_AS08sgBPME_399_100239_AS08.fna
JAAYRU010000004_1_Christensenellaceae_bacterium_isolate_AS23ysBPME_237,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/drep/drep/dereplicated_genomes/JAAYRU010000004_1_Christensenellaceae_bacterium_isolate_AS23ysBPME_237.fna
⋮,⋮
WGWU01000001_1_Christensenellaceae_bacterium_isolate_COPD125_SB1790_bin_24,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/drep/drep/dereplicated_genomes/WGWU01000001_1_Christensenellaceae_bacterium_isolate_COPD125_SB1790_bin_24.fna
WGWI01000001_1_Christensenellaceae_bacterium_isolate_COPD113_SB1790_bin_5,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/drep/drep/dereplicated_genomes/WGWI01000001_1_Christensenellaceae_bacterium_isolate_COPD113_SB1790_bin_5.fna


In [71]:
# GTDB-Tk summary: split the ';'-delimited `classification` string into one
# column per taxonomic rank, then drop the note, classification_method and
# pplacer_taxonomy columns.
# The former stray empty argument in `read.delim(, sep=...)` has been removed.
tax <- file.path(work_dir, 'gtdbtk', 'gtdbtk_bac_summary.tsv') %>%
    read.delim(sep = '\t') %>%
    separate(classification,
             into = c('Domain', 'Phylum', 'Class', 'Order',
                      'Family', 'Genus', 'Species'),
             sep = ';') %>%
    select(-note, -classification_method, -pplacer_taxonomy)
tax

user_genome,Domain,Phylum,Class,Order,Family,Genus,Species,fastani_reference,fastani_reference_radius,⋯,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,other_related_references.genome_id.species_name.radius.ANI.AF.,msa_percent,translation_table,red_value,warnings
<fct>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<fct>,<fct>,⋯,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<int>,<fct>,<fct>
CAADUP010000001_1_TPA_asm_Christensenellaceae_bacterium_isolate_HGM11615,d__Bacteria,p__Firmicutes_A,c__Clostridia_A,o__Christensenellales,f__Christensenellaceae,g__UMGS743,s__UMGS743 sp900545085,GCA_900545085.1,95.0,⋯,GCA_900545085.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__UMGS743;s__UMGS743 sp900545085,99.7,0.81,,78.27,11,,
CAAFJJ010000001_1_TPA_asm_Christensenellaceae_bacterium_isolate_HGM11614,d__Bacteria,p__Firmicutes_A,c__Clostridia_A,o__Christensenellales,f__Christensenellaceae,g__,s__,,,⋯,,,,,,,73.73,11,0.8049018204100806,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
WGWR01000001_1_Christensenellaceae_bacterium_isolate_COPD122_SB1785_bin_21,d__Bacteria,p__Firmicutes_A,c__Clostridia_A,o__Christensenellales,f__CAG-74,g__UBA11524,s__UBA11524 sp000437595,GCA_000437595.1,95.0,⋯,GCA_000437595.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__CAG-74;g__UBA11524;s__UBA11524 sp000437595,97.32,0.89,,92.40,11,,
WGWS01000001_1_Christensenellaceae_bacterium_isolate_COPD123_SB1786_bin_39,d__Bacteria,p__Firmicutes_A,c__Clostridia_A,o__Christensenellales,f__CAG-74,g__UBA11524,s__UBA11524 sp000437595,GCA_000437595.1,95.0,⋯,GCA_000437595.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__CAG-74;g__UBA11524;s__UBA11524 sp000437595,98.03,0.88,,92.16,11,,


In [72]:
# sanity check before joining: compare the genome identifiers across tables
# de-replicated fasta file names vs. all fasta file names in the dRep summary
cat('-- drep --\n')
overlap(drep_gen$Fasta %>% as.character %>% basename,
        drep$genome %>% as.character %>% basename)
# dRep bin IDs vs. checkM bin IDs
cat('-- checkm --\n')
overlap(drep$Bin.Id, checkm$Bin.Id)
# dRep bin IDs vs. GTDB-Tk genome names
cat('-- gtdbtk --\n')
overlap(drep$Bin.Id, tax$user_genome)

-- drep --
intersect(x,y): 30 
setdiff(x,y): 0 
setdiff(y,x): 14 
union(x,y): 44 
-- checkm --
intersect(x,y): 44 
setdiff(x,y): 0 
setdiff(y,x): 0 
union(x,y): 44 
-- gtdbtk --
intersect(x,y): 44 
setdiff(x,y): 0 
setdiff(y,x): 0 
union(x,y): 44 


In [73]:
# Combine all per-genome tables into `drep`:
#   * checkM QA table, matched on Bin.Id
#   * de-replicated genome list, matched on the fasta file name (GEN);
#     being an inner join, genomes absent from that list are dropped
#   * GTDB-Tk taxonomy, matched on Bin.Id == user_genome
# NOTE: `drep` is overwritten, so re-running this cell alone re-joins the joined table.
# TODO (unfinished idea): strip the scratch-directory prefix from `genome`
drep <- drep %>%
    inner_join(checkm, by = 'Bin.Id') %>%
    mutate(GEN = basename(as.character(genome))) %>%
    inner_join(mutate(drep_gen, GEN = basename(as.character(Fasta))),
               by = 'GEN') %>%
    inner_join(tax, by = c('Bin.Id' = 'user_genome'))
drep

genome,completeness,contamination,Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,⋯,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,other_related_references.genome_id.species_name.radius.ANI.AF.,msa_percent,translation_table,red_value,warnings
<fct>,<dbl>,<dbl>,<chr>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<int>,<fct>,<fct>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/CABKWJ010000001_1_Christensenella_timonensis_isolate_MGYG_HGUT_01550.fna,97.98,0.81,CABKWJ010000001_1_Christensenella_timonensis_isolate_MGYG_HGUT_01550,c__Clostridia (UID1118),387,223,124,97.98,0.81,⋯,GCF_900087015.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__Christensenella_A;s__Christensenella_A timonensis,100.0,1.0,,95.42,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/CP029256_1_Christensenella_minuta_strain_DSM_22607_chromosome.fna,98.39,0.81,CP029256_1_Christensenella_minuta_strain_DSM_22607_chromosome,c__Clostridia (UID1118),387,223,124,98.39,0.81,⋯,GCF_003628755.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__Christensenella;s__Christensenella minuta,100.0,0.99,"GCF_001678845.1, s__Christensenella sp001678845, 95.0, 83.51, 0.69; GCF_900155415.1, s__Christensenella massiliensis, 95.0, 80.27, 0.47",95.42,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/WGWR01000001_1_Christensenellaceae_bacterium_isolate_COPD122_SB1785_bin_21.fna,99.3,1.17,WGWR01000001_1_Christensenellaceae_bacterium_isolate_COPD122_SB1785_bin_21,o__Clostridiales (UID1120),304,250,143,99.3,1.17,⋯,GCA_000437595.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__CAG-74;g__UBA11524;s__UBA11524 sp000437595,97.32,0.89,,92.40,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/WGWS01000001_1_Christensenellaceae_bacterium_isolate_COPD123_SB1786_bin_39.fna,98.6,1.17,WGWS01000001_1_Christensenellaceae_bacterium_isolate_COPD123_SB1786_bin_39,o__Clostridiales (UID1120),304,250,143,98.6,1.17,⋯,GCA_000437595.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__CAG-74;g__UBA11524;s__UBA11524 sp000437595,98.03,0.88,,92.16,11,,


### High quality genomes

In [103]:
# keep only genomes passing all quality thresholds
hq_genomes <- filter(
    drep,
    # completeness / contamination (lower-case columns come from the dRep summary)
    completeness >= 95 & contamination < 5,
    Strain.heterogeneity < 50,
    # assembly size, gene count and fragmentation (checkM columns)
    Genome.size..bp. >= 2e6 & X..predicted.genes >= 2000,
    X..contigs <= 200
)
hq_genomes

genome,completeness,contamination,Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,⋯,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,other_related_references.genome_id.species_name.radius.ANI.AF.,msa_percent,translation_table,red_value,warnings
<fct>,<dbl>,<dbl>,<chr>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<int>,<fct>,<fct>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/CABKWJ010000001_1_Christensenella_timonensis_isolate_MGYG_HGUT_01550.fna,97.98,0.81,CABKWJ010000001_1_Christensenella_timonensis_isolate_MGYG_HGUT_01550,c__Clostridia (UID1118),387,223,124,97.98,0.81,⋯,GCF_900087015.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__Christensenella_A;s__Christensenella_A timonensis,100.0,1.0,,95.42,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/CP029256_1_Christensenella_minuta_strain_DSM_22607_chromosome.fna,98.39,0.81,CP029256_1_Christensenella_minuta_strain_DSM_22607_chromosome,c__Clostridia (UID1118),387,223,124,98.39,0.81,⋯,GCF_003628755.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__Christensenella;s__Christensenella minuta,100.0,0.99,"GCF_001678845.1, s__Christensenella sp001678845, 95.0, 83.51, 0.69; GCF_900155415.1, s__Christensenella massiliensis, 95.0, 80.27, 0.47",95.42,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/WGWR01000001_1_Christensenellaceae_bacterium_isolate_COPD122_SB1785_bin_21.fna,99.3,1.17,WGWR01000001_1_Christensenellaceae_bacterium_isolate_COPD122_SB1785_bin_21,o__Clostridiales (UID1120),304,250,143,99.3,1.17,⋯,GCA_000437595.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__CAG-74;g__UBA11524;s__UBA11524 sp000437595,97.32,0.89,,92.40,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/WGWS01000001_1_Christensenellaceae_bacterium_isolate_COPD123_SB1786_bin_39.fna,98.6,1.17,WGWS01000001_1_Christensenellaceae_bacterium_isolate_COPD123_SB1786_bin_39,o__Clostridiales (UID1120),304,250,143,98.6,1.17,⋯,GCA_000437595.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__CAG-74;g__UBA11524;s__UBA11524 sp000437595,98.03,0.88,,92.16,11,,


In [104]:
# number of high-quality genomes per Family/Genus
# (df.dims(20) / df.dims() presumably widen and then reset the displayed table size)
df.dims(20)
summarise(group_by(hq_genomes, Family, Genus),
          n_genomes = n(),
          .groups = 'drop')
df.dims()

Family,Genus,n_genomes
<chr>,<chr>,<int>
f__,g__,1
f__CAG-138,g__PeH17,1
f__CAG-138,g__Phil1,2
f__CAG-74,g__Firm-11,2
f__CAG-74,g__UBA11524,6
f__Christensenellaceae,g__,1
f__Christensenellaceae,g__Christensenella,4
f__Christensenellaceae,g__Christensenella_A,1
f__GCA-900066905,g__GCA-900066905,1
f__MARSEILLE-P3954,g__MARSEILLE-P3954,1


In [105]:
# number of high-quality genomes per Order/Family
# (df.dims(20) / df.dims() presumably widen and then reset the displayed table size)
df.dims(20)
summarise(group_by(hq_genomes, Order, Family),
          n_genomes = n(),
          .groups = 'drop')
df.dims()

Order,Family,n_genomes
<chr>,<chr>,<int>
o__Christensenellales,f__,1
o__Christensenellales,f__CAG-138,3
o__Christensenellales,f__CAG-74,8
o__Christensenellales,f__Christensenellaceae,6
o__Christensenellales,f__GCA-900066905,1
o__Christensenellales,f__MARSEILLE-P3954,1


In [106]:
# restrict to the two GTDB genera of interest: Christensenella and Christensenella_A
hq_genomes <- filter(hq_genomes,
                     Genus %in% c('g__Christensenella', 'g__Christensenella_A'))
hq_genomes

genome,completeness,contamination,Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,⋯,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,other_related_references.genome_id.species_name.radius.ANI.AF.,msa_percent,translation_table,red_value,warnings
<fct>,<dbl>,<dbl>,<chr>,<fct>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<int>,<fct>,<fct>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/CABKWJ010000001_1_Christensenella_timonensis_isolate_MGYG_HGUT_01550.fna,97.98,0.81,CABKWJ010000001_1_Christensenella_timonensis_isolate_MGYG_HGUT_01550,c__Clostridia (UID1118),387,223,124,97.98,0.81,⋯,GCF_900087015.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__Christensenella_A;s__Christensenella_A timonensis,100.0,1.0,,95.42,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/CP029256_1_Christensenella_minuta_strain_DSM_22607_chromosome.fna,98.39,0.81,CP029256_1_Christensenella_minuta_strain_DSM_22607_chromosome,c__Clostridia (UID1118),387,223,124,98.39,0.81,⋯,GCF_003628755.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__Christensenella;s__Christensenella minuta,100.0,0.99,"GCF_001678845.1, s__Christensenella sp001678845, 95.0, 83.51, 0.69; GCF_900155415.1, s__Christensenella massiliensis, 95.0, 80.27, 0.47",95.42,11,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/CABMKF010000001_1_Christensenellaceae_bacterium_isolate_MGYG_HGUT_02411.fna,99.19,0.81,CABMKF010000001_1_Christensenellaceae_bacterium_isolate_MGYG_HGUT_02411,c__Clostridia (UID1118),387,223,124,99.19,0.81,⋯,GCF_001678845.1,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__Christensenella;s__Christensenella sp001678845,100.0,1.0,"GCF_003628755.1, s__Christensenella minuta, 95.0, 83.51, 0.69; GCF_900155415.1, s__Christensenella massiliensis, 95.0, 79.67, 0.42",95.34,11,,
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/JACOON010000001_1_Christensenella_sp__NSJ_35_HPD3569.fna,99.19,0.81,JACOON010000001_1_Christensenella_sp__NSJ_35_HPD3569,c__Clostridia (UID1118),387,223,124,99.19,0.81,⋯,,,,,,"GCF_003628755.1, s__Christensenella minuta, 95.0, 83.81, 0.61; GCF_001678845.1, s__Christensenella sp001678845, 95.0, 81.27, 0.53; GCF_900155415.1, s__Christensenella massiliensis, 95.0, 80.1, 0.42",95.36,11,0.9760390971090127,


# Formatting for LLPRIMER

The samples file requires the following columns:

* Taxon
* Groups
* Taxid
* Fasta

In [98]:
# fasta paths of the selected genomes (column comes from the de-replicated genome table)
hq_genomes[['Fasta']]

In [100]:
# Write the LLPRIMER samples table (Taxon, Groups, Taxid, Fasta; `genome` is kept as an extra column).
outfile <- file.path(work_dir, 'samples_christ_hq.txt')
hq_genomes %>%
    select(Bin.Id, genome, Fasta) %>%
    rename(Taxon = Bin.Id) %>%
    mutate(
        # Shorten taxon names. '.*' (not '.+') is required so that a name ending
        # exactly in '_chromosome' (e.g. '..._DSM_22607_chromosome') is trimmed too.
        Taxon = gsub('_chromosome.*', '', Taxon),
        Taxon = gsub('_bin_.+', '', Taxon),
        # group label shared by all selected genomes (spelling fixed: was 'Christenenella')
        Groups = 'Christensenella',
        # NOTE(review): 990721 is presumably the NCBI taxid of the target genus -- confirm
        Taxid = 990721
    ) %>%
    write_table(outfile)

File written: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenella/genbank/samples_christ_hq.txt 


# sessionInfo

In [101]:
# record the R version, platform and attached package versions used for this run
sessionInfo()

R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.5 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Georg_animal_feces/envs/tidyverse/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] LeyLabRMisc_0.1.6 ggplot2_3.3.1     tidyr_1.1.0       dplyr_1.0.0      

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.4.6     magrittr_1.5     munsell_0.5.0    tidyselect_1.1.0
 [5] uuid_0.1-4       colorspace_1.4-1 R6_2.4.1         rlang_0.4.6     
 [9] fansi_0.4.1