# Goal

Create a genome collection of Christensenellaceae MAGs and isolate genomes in order to design Christensenellaceae-specific primers

# Var

In [125]:
# Experiment-wide settings used by the cells below.
# Base directory for all files of this experiment
work_dir <- '/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/'
# Name of the target clade
clade <- 'Christensenellaceae'
# Taxid written to the `Taxid` column of the genome/sample tables
taxid <- 990719
# Number of threads handed to data.table
threads <- 8

# Init

In [126]:
library(dplyr)
library(tidyr)
library(data.table)
library(tidytable)
library(ggplot2)
library(LeyLabRMisc)

In [127]:
library(curl)

In [128]:
# Notebook set-up: table display size, data.table threading, working directory.
# df.dims() with no arguments is used throughout the notebook to reset the
# table display after a temporary change (presumably to package defaults).
df.dims()
# Let data.table use the configured number of threads
setDTthreads(threads = threads)
# Make sure the experiment directory is there
make_dir(work_dir)

Directory already exists: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/ 


# Genomes

## From genbank

```
# Download all GenBank bacterial assemblies for taxid 990719 as FASTA.
# The destination sits under the experiment directory (.../christensenellaceae/),
# next to the UHGG genomes, instead of a separate "christensenellales" tree.
# NOTE(review): the genome listing further down shows uncompressed *.fna files
# directly under genomes/NCBI/ -- presumably the downloads were decompressed
# and flattened by hand afterwards; confirm.
OUTDIR=/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/NCBI
mkdir -p "$OUTDIR"
ncbi-genome-download -p 12 -s genbank -F fasta -t 990719 -o "$OUTDIR" bacteria
```

## From UHGG

In [32]:
# Load the UHGG non-redundant genome metadata table and keep only the rows
# whose `Lineage` string contains the order o__Christensenellales.
# The path is held in `uhgg_meta_file` rather than `F`: `F` is R's built-in
# shorthand for FALSE, and overwriting it can silently break later code.
uhgg_meta_file <- file.path('/ebio/abt3_projects/databases_no-backup/UHGG/2019_09',
                            'genomes-nr_metadata.tsv')
genomes <- Fread(uhgg_meta_file) %>%
    filter.(grepl('o__Christensenellales', Lineage))
genomes

Genome,Original_name,Study_set,Genome_type,Length,N_contigs,N50,GC_content,Completeness,Contamination,⋯,tRNAs,Genome_accession,Species_rep,MGnify_accession,Lineage,Sample_accession,Study_accession,Country,Continent,FTP_download
<chr>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,⋯,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
GUT_GENOME000279,14207_7_64,HBC,Isolate,2615708,20,268821,56.68,99.19,0.81,⋯,20,,GUT_GENOME000279,MGYG-HGUT-00073,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Christensenellales;f__GCA-900066905;g__GCA-900066905;s__GCA-900066905 sp900066905,ERS852553,ERP012217,United Kingdom,Europe,ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v1.0/all_genomes/MGYG-HGUT-000/MGYG-HGUT-00073/genomes1/GUT_GENOME000279.gff.gz
GUT_GENOME001938,AsnicarF_2017__MV_FEM2_t1Q14__bin.14,CIBIO,MAG,2965995,141,30137,58.20,99.18,1.17,⋯,19,,GUT_GENOME247421,MGYG-HGUT-03891,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Christensenellales;f__CAG-74;g__UBA11524;s__UBA11524 sp000437595,SRS1634658,SRP082656,Italy,Europe,ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v1.0/all_genomes/MGYG-HGUT-038/MGYG-HGUT-03891/genomes2/GUT_GENOME001938.gff.gz
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
GUT_GENOME286732,ZellerG_2014__CCMD90311071ST-21-0__bin.32,CIBIO,MAG,1752856,264,8460,49.37,91.90,1.88,⋯,17,,GUT_GENOME014725,MGYG-HGUT-00530,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Christensenellales;f__CAG-138;g__PeH17;s__PeH17 sp000435055,ERS436832,ERP005534,Germany,Europe,ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v1.0/all_genomes/MGYG-HGUT-005/MGYG-HGUT-00530/genomes1/GUT_GENOME286732.gff.gz
GUT_GENOME286757,ZellerG_2014__CCMD93344354ST-21-0__bin.14,CIBIO,MAG,1369885,414,3602,49.46,70.82,2.42,⋯,10,,GUT_GENOME014725,MGYG-HGUT-00530,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Christensenellales;f__CAG-138;g__PeH17;s__PeH17 sp000435055,ERS436826,ERP005534,Germany,Europe,ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v1.0/all_genomes/MGYG-HGUT-005/MGYG-HGUT-00530/genomes2/GUT_GENOME286757.gff.gz


In [33]:
# Narrow the order-level table down to genomes whose `Lineage` string
# contains the family f__Christensenellaceae.
genomes_f <- filter.(genomes, grepl('f__Christensenellaceae', Lineage))
genomes_f

Genome,Original_name,Study_set,Genome_type,Length,N_contigs,N50,GC_content,Completeness,Contamination,⋯,tRNAs,Genome_accession,Species_rep,MGnify_accession,Lineage,Sample_accession,Study_accession,Country,Continent,FTP_download
<chr>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,⋯,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
GUT_GENOME067772,ERS235577_13,HGM,MAG,1340409,583,2515,47.52,69.73,2.05,⋯,15,,GUT_GENOME105882,MGYG-HGUT-01747,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Christensenellales;f__Christensenellaceae;g__;s__,ERS235577,ERP002469,Sweden,Europe,ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v1.0/all_genomes/MGYG-HGUT-017/MGYG-HGUT-01747/genomes1/GUT_GENOME067772.gff.gz
GUT_GENOME076687,ERS473114_58,HGM,MAG,1168678,554,2303,47.91,61.17,0.14,⋯,9,,GUT_GENOME105882,MGYG-HGUT-01747,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Christensenellales;f__Christensenellaceae;g__;s__,ERS473114,ERP005989,Sweden,Europe,ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v1.0/all_genomes/MGYG-HGUT-017/MGYG-HGUT-01747/genomes1/GUT_GENOME076687.gff.gz
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
GUT_GENOME256796,XieH_2016__YSZC12003_37133__bin.84,CIBIO,MAG,1317940,315,4959,47.31,71.77,1.4,⋯,16,,GUT_GENOME105882,MGYG-HGUT-01747,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Christensenellales;f__Christensenellaceae;g__;s__,ERS746898,ERP010700,United Kingdom,Europe,ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v1.0/all_genomes/MGYG-HGUT-017/MGYG-HGUT-01747/genomes1/GUT_GENOME256796.gff.gz
GUT_GENOME260354,XieH_2016__YSZC12003_37405__bin.112,CIBIO,MAG,2315891,575,4680,50.52,84.22,3.5,⋯,17,,GUT_GENOME260354,MGYG-HGUT-04235,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Christensenellales;f__Christensenellaceae;g__;s__,ERS746923,ERP010700,United Kingdom,Europe,ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v1.0/all_genomes/MGYG-HGUT-042/MGYG-HGUT-04235/genomes1/GUT_GENOME260354.gff.gz


In [49]:
# downloading
#' Download one remote file into `<base_dir>/genomes/UHGG/`.
#'
#' The local file keeps the last path component of `url` as its name.
#'
#' @param url URL of the file to fetch.
#' @param base_dir Experiment directory; the file is stored in its
#'   `genomes/UHGG` sub-directory, which is created if it does not exist yet.
#' @return Whatever `curl_download()` returns for the written file.
get_file <- function(url, base_dir){
    out_dir <- file.path(base_dir, 'genomes', 'UHGG')
    # Make sure the destination exists before anything is written into it
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
    outfile <- file.path(out_dir, gsub('.+/', '', url))
    message('Downloading: ', url)
    curl_download(url, outfile, mode = "wb")
}

# Fetch every file listed in the FTP_download column, one after another,
# then report how many downloads were attempted.
ret <- lapply(genomes_f$FTP_download, get_file, work_dir)
length(ret)

Downloading: ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v1.0/all_genomes/MGYG-HGUT-017/MGYG-HGUT-01747/genomes1/GUT_GENOME067772.gff.gz

Downloading: ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v1.0/all_genomes/MGYG-HGUT-017/MGYG-HGUT-01747/genomes1/GUT_GENOME076687.gff.gz

Downloading: ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v1.0/all_genomes/MGYG-HGUT-017/MGYG-HGUT-01747/genomes1/GUT_GENOME076875.gff.gz

Downloading: ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v1.0/all_genomes/MGYG-HGUT-017/MGYG-HGUT-01747/genomes1/GUT_GENOME078695.gff.gz

Downloading: ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v1.0/all_genomes/MGYG-HGUT-017/MGYG-HGUT-01747/genomes1/GUT_GENOME091014.gff.gz

Downloading: ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v1.0/all_genomes/MGYG-HGUT-017/MGYG-HGUT-01747/genomes1/GUT_GENOME091497.gff.gz

Down

Parsing gff files

```
(genome) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/UHGG
$ find . -name "*.gff.gz" | xargs -I % /ebio/abt3_projects/databases_no-backup/UHGG/2019_09/prokka_gff2fasta.py %
```

## TUK MAGs

In [50]:
# Load the table of final TUK MAGs (one row per MAG, with taxonomy and
# assembly-quality columns).
# The path is held in `tuk_mag_file` rather than `F`: `F` is R's built-in
# shorthand for FALSE, and overwriting it can silently break later code.
tuk_mag_file <- '/ebio/abt3_projects/Anxiety_Twins_Metagenomes/data/metagenome/TUK-5projects/LLMGA/v0.12/LLG/rnd1/final_MAGs.tsv'
TUK <- Fread(tuk_mag_file)
TUK

Name,Fasta,Domain,Phylum,Class,Order,Family,Genus,Species,Completeness,Contamination,Strain.heterogeneity,Genome.size..bp.,X..contigs,N50..contigs.,Mean.contig.length..bp.,Longest.contig..bp.,GC,Coding.density,X..predicted.genes
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<int>
1002320__metabat2__HighNoCov.035,/ebio/abt3_projects/Anxiety_Twins_Metagenomes/data/metagenome/TUK-5projects/LLMGA/v0.12/LLG/rnd1/drep/drep/dereplicated_genomes/1002320__metabat2__HighNoCov.035.fna,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Rikenellaceae,Tidjanibacter,Tidjanibacter inops,92.2,4.61,7.14,2261725,365,8314,6196,65550,56.9,86.29,2235
1002335__maxbin2__High.035,/ebio/abt3_projects/Anxiety_Twins_Metagenomes/data/metagenome/TUK-5projects/LLMGA/v0.12/LLG/rnd1/drep/drep/dereplicated_genomes/1002335__maxbin2__High.035.fna,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Rikenellaceae,Alistipes_A,Alistipes_A ihumii,100.0,0.00,0.00,2620923,29,339045,90376,457503,58.5,83.91,2100
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
ERR3451530__metabat2__High.008,/ebio/abt3_projects/Anxiety_Twins_Metagenomes/data/metagenome/TUK-5projects/LLMGA/v0.12/LLG/rnd1/drep/drep/dereplicated_genomes/ERR3451530__metabat2__High.008.fna,Bacteria,Firmicutes_A,Clostridia,Oscillospirales,Oscillospiraceae,CAG-103,CAG-103 sp000432375,88.26,0.67,100,1925297,28,172712,68760,283992,62.2,89.32,1782
ERR3451530__vamb__High.213,/ebio/abt3_projects/Anxiety_Twins_Metagenomes/data/metagenome/TUK-5projects/LLMGA/v0.12/LLG/rnd1/drep/drep/dereplicated_genomes/ERR3451530__vamb__High.213.fna,Bacteria,Firmicutes_A,Clostridia,Oscillospirales,Ruminococcaceae,CAG-115,CAG-115 sp003531585,95.97,0.84,100,2811853,271,15050,10375,90239,52.4,88.48,2685


In [52]:
# Keep only the MAGs assigned to the family Christensenellaceae.
# Note: this replaces `TUK` with the filtered table.
TUK <- filter.(TUK, Family == 'Christensenellaceae')
TUK

Name,Fasta,Domain,Phylum,Class,Order,Family,Genus,Species,Completeness,Contamination,Strain.heterogeneity,Genome.size..bp.,X..contigs,N50..contigs.,Mean.contig.length..bp.,Longest.contig..bp.,GC,Coding.density,X..predicted.genes
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<int>
ERR3451522__vamb__Low.036,/ebio/abt3_projects/Anxiety_Twins_Metagenomes/data/metagenome/TUK-5projects/LLMGA/v0.12/LLG/rnd1/drep/drep/dereplicated_genomes/ERR3451522__vamb__Low.036.fna,Bacteria,Firmicutes_A,Clostridia_A,Christensenellales,Christensenellaceae,QANA01,QANA01 sp900554725,81.99,3.38,11.11,1445289,318,5631,4544,21582,50.7,89.05,1626
1002868__metabat2__HighNoCov.035,/ebio/abt3_projects/Anxiety_Twins_Metagenomes/data/metagenome/TUK-5projects/LLMGA/v0.12/LLG/rnd1/drep/drep/dereplicated_genomes/1002868__metabat2__HighNoCov.035.fna,Bacteria,Firmicutes_A,Clostridia_A,Christensenellales,Christensenellaceae,UMGS743,UMGS743 sp900545085,88.09,0.70,0.00,1520326,322,5894,4721,21753,47.6,88.80,1705
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
1003367__vamb__Med.024,/ebio/abt3_projects/Anxiety_Twins_Metagenomes/data/metagenome/TUK-5projects/LLMGA/v0.12/LLG/rnd1/drep/drep/dereplicated_genomes/1003367__vamb__Med.024.fna,Bacteria,Firmicutes_A,Clostridia_A,Christensenellales,Christensenellaceae,UMGS743,UMGS743 sp900545085,71.38,2.04,0,1408546,437,3519,3223,14284,46.9,87.81,1747
1002853__maxbin2__High.073_sub,/ebio/abt3_projects/Anxiety_Twins_Metagenomes/data/metagenome/TUK-5projects/LLMGA/v0.12/LLG/rnd1/drep/drep/dereplicated_genomes/1002853__maxbin2__High.073_sub.fna,Bacteria,Firmicutes_A,Clostridia_A,Christensenellales,Christensenellaceae,UMGS743,UMGS743 sp900545085,85.63,1.75,0,2094087,511,4991,4098,22183,47.2,87.95,2468


In [57]:
#' Copy one file into `base_dir`, keeping its file name.
#'
#' @param F Path of the file to copy. (The parameter name is kept for
#'   backward compatibility; inside this function it shadows the `F`
#'   shorthand for FALSE.)
#' @param base_dir Destination directory; created (recursively) when missing,
#'   because `file.copy()` does not create it and would just return FALSE.
#' @return The logical result of `file.copy()`: TRUE on success, FALSE
#'   otherwise (e.g. the target already exists). A failed copy is reported
#'   with a warning instead of passing silently.
copy_file <- function(F, base_dir){
    outfile <- file.path(base_dir, basename(F))
    # Never copy a file onto itself
    stopifnot(F != outfile)
    dir.create(base_dir, recursive = TRUE, showWarnings = FALSE)
    copied <- file.copy(F, outfile)
    if (!isTRUE(copied)) {
        warning('Could not copy ', F, ' to ', outfile, call. = FALSE)
    }
    copied
}
# Copy the selected TUK MAG fasta files into <work_dir>/genomes/TUK.
# They have to live below <work_dir>/genomes, because that is the directory
# scanned when the list of all genomes is assembled.
tuk_dir <- file.path(work_dir, 'genomes', 'TUK')
# file.copy() does not create the destination directory by itself
dir.create(tuk_dir, recursive = TRUE, showWarnings = FALSE)
TUK$Fasta %>%
    lapply(copy_file, base_dir = tuk_dir)

## List of all genomes

In [62]:
# Build the genome table: one row per fasta file found under <work_dir>/genomes.
# NOTE(review): '.fna' is passed as the file pattern; whether it is treated as
# an unanchored regex depends on list_files() -- confirm.
files <- list_files(file.path(work_dir, 'genomes'), '.fna')
samps <- data.frame(
    Name = basename(as.character(files)),   # file name incl. extension
    Fasta = files,
    Domain = 'Bacteria',
    Taxid = taxid
)
# Collapse repeated slashes in the paths (work_dir ends with '/')
samps <- mutate(samps, Fasta = gsub('/+', '/', Fasta))
samps

Name,Fasta,Domain,Taxid
<chr>,<chr>,<chr>,<dbl>
GCA_001571425.1_ASM157142v1_genomic.fna,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/NCBI/GCA_001571425.1_ASM157142v1_genomic.fna,Bacteria,990719
GCA_001652705.1_ASM165270v1_genomic.fna,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/NCBI/GCA_001652705.1_ASM165270v1_genomic.fna,Bacteria,990719
⋮,⋮,⋮,⋮
GUT_GENOME256796.fna,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/UHGG/GUT_GENOME256796.fna,Bacteria,990719
GUT_GENOME260354.fna,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/UHGG/GUT_GENOME260354.fna,Bacteria,990719


In [63]:
# writing file
# The table goes to <work_dir>/genomes/genomes_raw.txt, which is the location
# the LLG config refers to in its `samples_file` entry.
outfile <- file.path(work_dir, 'genomes', 'genomes_raw.txt')
write_table(samps, outfile)

File written: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes//genomes_raw.txt 


# LLG

### Config

In [68]:
# Show the LLG config that is passed to the pipeline run below; it is stored
# directly in the experiment directory (not one level above it).
cat_file(file.path(work_dir, 'config_llg.yaml'))

# table with genome --> fasta_file information
samples_file: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/genomes_raw.txt

# output location
output_dir: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/LLG_output/

# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/

# batch processing of genomes for certain steps
## increase to better parallelize
batches: 5

# Domain of genomes ('Archaea' or 'Bacteria)
## Use "Skip" if provided as a "Domain" column in the genome table
Domain: Skip

# software parameters
# Use "Skip" to skip any of these steps. If no params for rule, use ""
# dRep MAGs are not further analyzed, but you can de-rep & then use the de-rep genome table as input.
params:
  ionice: -c 3
  # assembly assessment
  seqkit: ""
  quast: Skip #""
  multiqc_on_quast: "" 
  checkm: ""
  # de-replication (CheckM recommended)
  drep:
    algorithm: aut

### Run

```
# -S gives the screen session its name; lower-case -s would select the shell instead
(snakemake) @ rick:/ebio/abt3_projects/software/dev/ll_pipelines/llg
$ screen -L -S llg-christ ./snakemake_sge.sh /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/config_llg.yaml 30 -F
```

## Samples table of high quality genomes

In [130]:
# CheckM quality summary produced by the LLG run (tab-separated, one row per genome)
checkm <- read.delim(
    file.path(work_dir, 'genomes', 'LLG_output', 'checkM', 'checkm_qa_summary.tsv'),
    sep = '\t'
)
checkm

Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,Strain.heterogeneity,Genome.size..bp.,X..ambiguous.bases,⋯,X0,X1,X2,X3,X4,X5.,assembly.Id,assembler.Id,taxon.Id,File
<chr>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>,<lgl>,<chr>
1003528__metabat2__Low.052_sub.fna,o__Clostridiales (UID1120),304,250,143,74.22,0.70,0,1649188,0,⋯,66,183,1,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|christensenellaceae|genomes|checkM|1|checkm|markers_qa_summary.tsv.1,markers_qa_summary.tsv.1,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/checkM/1/checkm/markers_qa_summary.tsv.1
GCA_001571425.1_ASM157142v1_genomic.fna,c__Clostridia (UID1118),387,223,124,97.58,0.81,0,2906526,1400,⋯,3,219,1,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|christensenellaceae|genomes|checkM|1|checkm|markers_qa_summary.tsv.2,markers_qa_summary.tsv.2,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/checkM/1/checkm/markers_qa_summary.tsv.2
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
GUT_GENOME252750.fna,o__Clostridiales (UID1120),304,250,143,53.75,0.7,0,817347,0,⋯,131,118,1,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|christensenellaceae|genomes|checkM|5|checkm|markers_qa_summary.tsv.19,markers_qa_summary.tsv.19,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/checkM/5/checkm/markers_qa_summary.tsv.19
GUT_GENOME256796.fna,o__Clostridiales (UID1120),304,250,143,71.77,1.4,0,1317940,0,⋯,85,163,2,0,0,0,|ebio|abt3_projects|software|dev|ll_pipelines|llprimer|experiments|christensenellaceae|genomes|checkM|5|checkm|markers_qa_summary.tsv.20,markers_qa_summary.tsv.20,,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/checkM/5/checkm/markers_qa_summary.tsv.20


In [131]:
# dRep's completeness/contamination table from the LLG run.
# `genome` holds full paths ending in '.fna.fna'; Bin.Id is the file name with
# one trailing '.fna' removed, so it lines up with the Bin.Id used by CheckM.
drep <- read.delim(
    file.path(work_dir, 'genomes', 'LLG_output', 'drep', 'checkm_markers_qa_summary.tsv'),
    sep = '\t'
)
drep <- mutate(drep, Bin.Id = gsub('\\.fna$', '', gsub('.+/', '', genome)))
drep

genome,completeness,contamination,Bin.Id
<chr>,<dbl>,<dbl>,<chr>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/1003528__metabat2__Low.052_sub.fna.fna,74.22,0.70,1003528__metabat2__Low.052_sub.fna
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_001571425.1_ASM157142v1_genomic.fna.fna,97.58,0.81,GCA_001571425.1_ASM157142v1_genomic.fna
⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GUT_GENOME252750.fna.fna,53.75,0.7,GUT_GENOME252750.fna
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GUT_GENOME256796.fna.fna,71.77,1.4,GUT_GENOME256796.fna


In [132]:
# Table of the genomes that survived de-replication (columns: Name, Fasta)
drep_gen <- read.delim(
    file.path(work_dir, 'genomes', 'LLG_output', 'drep', 'dereplicated_genomes.tsv'),
    sep = '\t'
)
drep_gen

Name,Fasta
<chr>,<chr>
GCA_012518615.1_ASM1251861v1_genomic.fna,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/drep/drep/dereplicated_genomes/GCA_012518615.1_ASM1251861v1_genomic.fna.fna
GCA_012837835.1_ASM1283783v1_genomic.fna,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/LLG_output/drep/drep/dereplicated_genomes/GCA_012837835.1_ASM1283783v1_genomic.fna.fna
⋮,⋮
GCA_900155415.1_PRJEB13909b_genomic.fna,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/LLG_output/drep/drep/dereplicated_genomes/GCA_900155415.1_PRJEB13909b_genomic.fna.fna
GUT_GENOME097725.fna,/ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae/genomes/LLG_output/drep/drep/dereplicated_genomes/GUT_GENOME097725.fna.fna


In [133]:
# GTDBTk summary
# Read the GTDB-Tk classification table (with taxids), split the
# semicolon-separated `classification` string into one column per rank and
# drop columns that are not needed downstream.
# (The stray empty argument in the original `read.delim(, sep='\t')` call is
# removed; it only worked because the skipped parameter has a default.)
tax <- file.path(work_dir, 'genomes', 'LLG_output', 'gtdbtk', 'gtdbtk_summary_wTaxid.tsv') %>%
    read.delim(sep = '\t') %>%
    separate(classification,
             c('Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species'),
             sep = ';') %>%
    select(-note, -classification_method, -pplacer_taxonomy,
           -other_related_references.genome_id.species_name.radius.ANI.AF.)
tax

user_genome,Domain,Phylum,Class,Order,Family,Genus,Species,fastani_reference,fastani_reference_radius,⋯,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings,taxid,taxid_rank
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>,<chr>,<chr>,<int>,<chr>
1003528__metabat2__Low.052_sub.fna,d__Bacteria,p__Firmicutes_A,c__Clostridia_A,o__Christensenellales,f__Christensenellaceae,g__QAKS01,s__,,,⋯,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__QAKS01;s__QAKS01 sp003343685,98.23,0.51,65.99,11,0.9999039146726262,,222105,genus
GCA_001571425.1_ASM157142v1_genomic.fna,d__Bacteria,p__Firmicutes_A,c__Clostridia_A,o__Christensenellales,f__Christensenellaceae,g__Christensenella,s__Christensenella minuta,GCF_003628755.1,95.0,⋯,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__Christensenella;s__Christensenella minuta,99.99,1.0,95.42,11,,,160030,species
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
GUT_GENOME252750.fna,d__Bacteria,p__Firmicutes_A,c__Clostridia_A,o__Christensenellales,f__Christensenellaceae,g__UMGS743,s__UMGS743 sp900545085,GCA_900545085.1,95.0,⋯,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__UMGS743;s__UMGS743 sp900545085,97.55,0.92,39.33,11,,,230958,species
GUT_GENOME256796.fna,d__Bacteria,p__Firmicutes_A,c__Clostridia_A,o__Christensenellales,f__Christensenellaceae,g__UMGS743,s__UMGS743 sp900545085,GCA_900545085.1,95.0,⋯,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__UMGS743;s__UMGS743 sp900545085,99.39,0.89,58.77,11,,,230958,species


In [134]:
# Sanity check before joining: how well do the genome identifiers of the
# individual tables agree with each other?
# De-replicated genomes vs. all genomes seen by dRep (compared by file name)
cat('-- drep --\n')
overlap(drep_gen$Fasta %>% as.character %>% basename,
        drep$genome %>% as.character %>% basename)
# dRep vs. CheckM identifiers
cat('-- checkm --\n')
overlap(drep$Bin.Id, checkm$Bin.Id)
# dRep vs. GTDB-Tk identifiers
cat('-- gtdbtk --\n')
overlap(drep$Bin.Id, tax$user_genome)

-- drep --
intersect(x,y): 61 
setdiff(x,y): 0 
setdiff(y,x): 40 
union(x,y): 101 
-- checkm --
intersect(x,y): 101 
setdiff(x,y): 0 
setdiff(y,x): 0 
union(x,y): 101 
-- gtdbtk --
intersect(x,y): 101 
setdiff(x,y): 0 
setdiff(y,x): 0 
union(x,y): 101 


In [135]:
# Merge all per-genome tables into `drep`. Inner joins are used throughout, so
# only genomes present in every table (incl. the de-replicated set) remain.
#  - CheckM stats:         matched on Bin.Id
#  - de-replicated set:    matched on the fasta file name (helper column GEN)
#  - GTDB-Tk taxonomy:     Bin.Id matched against user_genome
drep <- drep %>%
    inner_join(checkm, by = 'Bin.Id') %>%
    mutate(GEN = basename(as.character(genome))) %>%
    inner_join(mutate(drep_gen, GEN = basename(as.character(Fasta))),
               by = 'GEN') %>%
    inner_join(tax, by = c('Bin.Id' = 'user_genome'))
drep

genome,completeness,contamination,Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,⋯,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings,taxid,taxid_rank
<chr>,<dbl>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>,<chr>,<chr>,<int>,<chr>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/1003528__metabat2__Low.052_sub.fna.fna,74.22,0.7,1003528__metabat2__Low.052_sub.fna,o__Clostridiales (UID1120),304,250,143,74.22,0.7,⋯,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__QAKS01;s__QAKS01 sp003343685,98.23,0.51,65.99,11,0.9999039146726262,,222105,genus
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_001940855.1_ASM194085v1_genomic.fna.fna,98.39,0.0,GCA_001940855.1_ASM194085v1_genomic.fna,c__Clostridia (UID1118),387,223,124,98.39,0.0,⋯,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__CAG-138;g__Phil1;s__Phil1 sp001940855,100.0,1.0,93.63,11,,,153230,species
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GUT_GENOME252750.fna.fna,53.75,0.7,GUT_GENOME252750.fna,o__Clostridiales (UID1120),304,250,143,53.75,0.7,⋯,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__UMGS743;s__UMGS743 sp900545085,97.55,0.92,39.33,11,,,230958,species
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GUT_GENOME256796.fna.fna,71.77,1.4,GUT_GENOME256796.fna,o__Clostridiales (UID1120),304,250,143,71.77,1.4,⋯,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__UMGS743;s__UMGS743 sp900545085,99.39,0.89,58.77,11,,,230958,species


In [136]:
# Number of de-replicated genomes per Order/Family/Genus.
# The table display is enlarged for this cell only and reset afterwards.
df.dims(20)
summarize(group_by(drep, Order, Family, Genus),
          n_genomes = n(), .groups = 'drop')
df.dims()

Unnamed: 0_level_0,Order,Family,Genus,n_genomes
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>
1,o__Christensenellales,f__,g__,1
2,o__Christensenellales,f__CAG-138,g__PeH17,2
3,o__Christensenellales,f__CAG-138,g__Phil1,2
4,o__Christensenellales,f__CAG-138,g__SFEL01,1
5,o__Christensenellales,f__CAG-74,g__,1
6,o__Christensenellales,f__CAG-74,g__DTU024,2
7,o__Christensenellales,f__CAG-74,g__Firm-11,2
8,o__Christensenellales,f__CAG-74,g__SFFH01,1
9,o__Christensenellales,f__CAG-74,g__UBA11524,12
10,o__Christensenellales,f__CAG-917,g__CAG-349,1


In [137]:
# Quality filter: keep genomes that are at least 90% complete, less than 5%
# contaminated and show a strain heterogeneity below 50.
hq_genomes <- filter(
    drep,
    completeness >= 90 & contamination < 5 & Strain.heterogeneity < 50
)
hq_genomes

genome,completeness,contamination,Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,⋯,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings,taxid,taxid_rank
<chr>,<dbl>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>,<chr>,<chr>,<int>,<chr>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_001940855.1_ASM194085v1_genomic.fna.fna,98.39,0.00,GCA_001940855.1_ASM194085v1_genomic.fna,c__Clostridia (UID1118),387,223,124,98.39,0.00,⋯,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__CAG-138;g__Phil1;s__Phil1 sp001940855,100.0,1.0,93.63,11,,,153230,species
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014384805.1_ASM1438480v1_genomic.fna.fna,97.90,0.93,GCA_014384805.1_ASM1438480v1_genomic.fna,o__Clostridiales (UID1120),304,250,143,97.90,0.93,⋯,,,,,93.95,11,0.8021254376949831,,160028,family
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014866795.1_ASM1486679v1_genomic.fna.fna,97.58,0.27,GCA_014866795.1_ASM1486679v1_genomic.fna,c__Clostridia (UID1118),387,223,124,97.58,0.27,⋯,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__CAG-74;g__Firm-11;s__Firm-11 sp900540045,97.55,0.82,93.25,11,,,226554,species
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GUT_GENOME097725.fna.fna,96.50,1.81,GUT_GENOME097725.fna,o__Clostridiales (UID1120),304,250,143,96.50,1.81,⋯,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__Catabacter;s__Catabacter hongkongensis,99.06,0.96,89.29,11,,,171084,species


In [138]:
# Taxonomy filter: of the high-quality genomes, keep only those classified
# into the family f__Christensenellaceae.
hq_genomes <- filter(hq_genomes, Family == 'f__Christensenellaceae')
hq_genomes

genome,completeness,contamination,Bin.Id,Marker.lineage,X..genomes,X..markers,X..marker.sets,Completeness,Contamination,⋯,closest_placement_radius,closest_placement_taxonomy,closest_placement_ani,closest_placement_af,msa_percent,translation_table,red_value,warnings,taxid,taxid_rank
<chr>,<dbl>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,⋯,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>,<chr>,<chr>,<int>,<chr>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014384805.1_ASM1438480v1_genomic.fna.fna,97.90,0.93,GCA_014384805.1_ASM1438480v1_genomic.fna,o__Clostridiales (UID1120),304,250,143,97.90,0.93,⋯,,,,,93.95,11,0.8021254376949831,,160028,family
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_900087015.1_PRJEB13910_genomic.fna.fna,97.98,0.81,GCA_900087015.1_PRJEB13910_genomic.fna,c__Clostridia (UID1118),387,223,124,97.98,0.81,⋯,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__Christensenella_A;s__Christensenella_A timonensis,100.0,1.0,95.42,11,,,218247,species
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GUT_GENOME127701.fna.fna,93.84,1.40,GUT_GENOME127701.fna,o__Clostridiales (UID1120),304,250,143,93.84,1.40,⋯,,,,,87.14,11,0.804426009146511,,160028,family
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GUT_GENOME097725.fna.fna,96.50,1.81,GUT_GENOME097725.fna,o__Clostridiales (UID1120),304,250,143,96.50,1.81,⋯,95.0,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o__Christensenellales;f__Christensenellaceae;g__Catabacter;s__Catabacter hongkongensis,99.06,0.96,89.29,11,,,171084,species


In [139]:
# Number of high-quality genomes per Order/Family/Genus/Species.
# The table display is enlarged for this cell only and reset afterwards.
df.dims(20)
summarize(group_by(hq_genomes, Order, Family, Genus, Species),
          n_genomes = n(), .groups = 'drop')
df.dims()

Unnamed: 0_level_0,Order,Family,Genus,Species,n_genomes
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<int>
1,o__Christensenellales,f__Christensenellaceae,g__,s__,2
2,o__Christensenellales,f__Christensenellaceae,g__Catabacter,s__Catabacter hongkongensis,1
3,o__Christensenellales,f__Christensenellaceae,g__Christensenella,s__,1
4,o__Christensenellales,f__Christensenellaceae,g__Christensenella,s__Christensenella massiliensis,1
5,o__Christensenellales,f__Christensenellaceae,g__Christensenella,s__Christensenella minuta,1
6,o__Christensenellales,f__Christensenellaceae,g__Christensenella,s__Christensenella sp001678845,1
7,o__Christensenellales,f__Christensenellaceae,g__Christensenella_A,s__Christensenella_A timonensis,1
8,o__Christensenellales,f__Christensenellaceae,g__UMGS743,s__UMGS743 sp900545085,2


In [140]:
# Summary statistics of the final genome set, one labelled table per metric
summary_x(hq_genomes$Completeness, 'Completeness')
summary_x(hq_genomes$X..contigs, 'No. of contigs')
summary_x(hq_genomes$Mean.contig.length..bp., 'Mean contig length')
summary_x(hq_genomes$X..predicted.genes, 'No. of genes')
summary_x(hq_genomes$N50..contigs., 'N50')

Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
Completeness,90.31,94.9325,97.94,96.65,98.69,99.19,3.305,1.349


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
No. of contigs,1,4,25.5,86.9,172.75,285,113.273,46.243


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
Mean contig length,6782,13449.75,142898.5,740514.5,1049146,2969292,1135658,463630.4


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
No. of genes,1725,1929.5,2433.5,2347.9,2761.25,2870,450.282,183.827


Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
N50,9164,18118.25,308570,888548.6,1827399,2969292,1183129,483010.4


In [143]:
# Show genus and species for every genome in the final set.
# The table display is enlarged for this cell only and reset afterwards.
df.dims(40)
select(hq_genomes, genome, Genus, Species)
df.dims()

genome,Genus,Species
<chr>,<chr>,<chr>
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014384805.1_ASM1438480v1_genomic.fna.fna,g__,s__
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_900087015.1_PRJEB13910_genomic.fna.fna,g__Christensenella_A,s__Christensenella_A timonensis
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GUT_GENOME194019.fna.fna,g__UMGS743,s__UMGS743 sp900545085
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_900155415.1_PRJEB13909b_genomic.fna.fna,g__Christensenella,s__Christensenella massiliensis
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_001678845.1_ASM167884v1_genomic.fna.fna,g__Christensenella,s__Christensenella sp001678845
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_003628755.1_ASM362875v1_genomic.fna.fna,g__Christensenella,s__Christensenella minuta
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GCA_014287795.1_ASM1428779v1_genomic.fna.fna,g__Christensenella,s__
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GUT_GENOME094667.fna.fna,g__UMGS743,s__UMGS743 sp900545085
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GUT_GENOME127701.fna.fna,g__,s__
/ebio/abt3_scratch/nyoungblut/LLG_62325884640/genomes/GUT_GENOME097725.fna.fna,g__Catabacter,s__Catabacter hongkongensis


In [124]:
 # writing samples table for LLPRIMER
# Samples table for LLPRIMER: Taxon, Fasta, Domain, Taxid.
outfile <- file.path(work_dir, 'LLG_output', 'samples_genomes_hq.txt')
hq_genomes %>%
    select(Bin.Id, Fasta, Domain) %>%
    rename(Taxon = Bin.Id) %>%
    mutate(
        # shorten the genome names by removing assembly-specific suffixes
        Taxon = gsub('_chromosome.+', '', Taxon),
        Taxon = gsub('_bin_.+', '', Taxon),
        Taxon = gsub('_genomic', '', Taxon),
        Taxon = gsub('_annotated_assembly', '', Taxon),
        # Domain comes from the split GTDB-Tk classification and therefore
        # carries the rank prefix ('d__Bacteria'); strip it so the value has
        # the same form ('Bacteria') as in the raw genome table.
        # NOTE(review): assumes LLPRIMER expects the un-prefixed name -- confirm.
        Domain = gsub('^d__', '', Domain),
        # After select() no `taxid` column is left in the table, so this
        # resolves to the notebook-level `taxid` variable.
        Taxid = taxid
    ) %>%
    write_table(outfile)

File written: /ebio/abt3_projects/software/dev/ll_pipelines/llprimer/experiments/christensenellaceae//LLG_output/samples_genomes_hq.txt 


# sessionInfo

In [122]:
# Record the R version, platform and attached/loaded package versions used for this run
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.5 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Anxiety_Twins_Metagenomes/envs/tidyverse2/lib/libopenblasp-r0.3.12.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] curl_4.3          tidytable_0.5.8   data.table_1.13.6 LeyLabRMisc_0.1.8
[5] ggplot2_3.3.3     tidyr_1.1.2       dplyr_1.0.3      

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.6        plyr_1.8.6        pillar_1.4.7      compiler_4.0.3   
 [5] base64enc_0.1-3   