<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Goal" data-toc-modified-id="Goal-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Goal</a></span></li><li><span><a href="#Var" data-toc-modified-id="Var-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Var</a></span></li><li><span><a href="#Init" data-toc-modified-id="Init-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Init</a></span></li><li><span><a href="#Database-download" data-toc-modified-id="Database-download-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Database download</a></span></li><li><span><a href="#LLG" data-toc-modified-id="LLG-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>LLG</a></span><ul class="toc-item"><li><span><a href="#Creating-samples-table" data-toc-modified-id="Creating-samples-table-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Creating samples table</a></span></li><li><span><a href="#Config" data-toc-modified-id="Config-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Config</a></span></li><li><span><a href="#Run" data-toc-modified-id="Run-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Run</a></span></li></ul></li><li><span><a href="#Summary" data-toc-modified-id="Summary-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Summary</a></span><ul class="toc-item"><li><span><a href="#fastANI" data-toc-modified-id="fastANI-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>fastANI</a></span></li></ul></li><li><span><a href="#sessionInfo" data-toc-modified-id="sessionInfo-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>sessionInfo</a></span></li></ul></div>

# Goal

* Compare the MAGs of this animal gut dataset to the Unified Human Gastrointestinal Genome (UHGG) collection

# Var

In [70]:
# base dir for MAGs
base_dir = file.path('/ebio', 'abt3_projects', 'Georg_animal_feces', 'data',
                     'metagenome', 'multi-study', 'BioProjects', 'summary',
                     'LLMGA', 'wGeorgAnimal', 'drep-0.95')

# workdir 
work_dir = file.path(base_dir, 'LLG', 'UHGG_compare')

# de-rep'd MAG metadata
MAG_meta_file = file.path(base_dir, 'drep-MAG_metadata.tsv')


# params
threads = 16

# Init

In [71]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
library(dtplyr)
library(tidytable)
library(LeyLabRMisc)

In [72]:
df.dims()
setDTthreads(threads)
make_dir(work_dir)

Created directory: /ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.95/LLG/UHGG_compare 


# Database download

In [73]:
download_dir = '/ebio/abt3_projects/databases_no-backup/UHGG/'

From ftp site. All species rep genome data

```
screen -S UHGG_dl wget -r ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/2019_09/uhgg_catalogue/*
```

# LLG

## Creating samples table

* animal MAGs from this study and UHGP rep genomes

In [74]:
# drep'd animal gut genomes 
MAG_meta = fread(MAG_meta_file, sep='\t') %>%
    dt_mutate(binID = gsub('_+', '.', binID),
              binID = gsub('\\.+', '.', binID)) %>%
    dt_select(binID, fasta.x) %>%
    dt_rename('Name' = binID,
              'Fasta' = fasta.x) %>%
    dt_mutate('Dataset' = 'animal_gut') %>%
    as_tibble 

MAG_meta

Name,Fasta,Dataset
<chr>,<chr>,<chr>
artificially.reared.1.metabat2.low.PE.002.contigs,/ebio/abt3_projects/Georg_animal_feces/data//metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.95/dereplicated_genomes/artificially_reared_1__metabat2_low_PE.002.contigs.fa,animal_gut
artificially.reared.2.maxbin2.low.prob.006.contigs,/ebio/abt3_projects/Georg_animal_feces/data//metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.95/dereplicated_genomes/artificially_reared_2__maxbin2_low_prob.006.contigs.fa,animal_gut
⋮,⋮,⋮
X95.Meadow.Viper.metabat2.low.PE.002.contigs,/ebio/abt3_projects/Georg_animal_feces/data//metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.95/dereplicated_genomes/X95_Meadow_Viper__metabat2_low_PE.002.contigs.fa,animal_gut
X96.European.Hare.metabat2.high.PE.013.contigs,/ebio/abt3_projects/Georg_animal_feces/data//metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.95/dereplicated_genomes/X96_European_Hare__metabat2_high_PE.013.contigs.fa,animal_gut


In [43]:
# drep'd UHGG genomes
UHGG_genome_files = list_files(file.path(download_dir, '2019_09'), '*.fna') 
UHGG_genome_files = UHGG_genome_files[grepl('\\.fna$', UHGP_genome_files)]
UHGG_genome_files %>% unique_n('genomes')

No. of unique genomes: 4644 


In [75]:
# formatting
UHGG_genomes = data.frame(Name = gsub('\\.fna$', '', basename(UHGP_genome_files)),
                          Fasta = UHGP_genome_files,
                          Dataset = 'UHGG')

UHGG_genomes

Name,Fasta,Dataset
<fct>,<fct>,<fct>
MGYG-HGUT-00001,/ebio/abt3_projects/databases_no-backup/UHGG//2019_09/uhgg_catalogue/MGYG-HGUT-000/MGYG-HGUT-00001/genome/MGYG-HGUT-00001.fna,UHGG
MGYG-HGUT-00002,/ebio/abt3_projects/databases_no-backup/UHGG//2019_09/uhgg_catalogue/MGYG-HGUT-000/MGYG-HGUT-00002/genome/MGYG-HGUT-00002.fna,UHGG
⋮,⋮,⋮
MGYG-HGUT-04643,/ebio/abt3_projects/databases_no-backup/UHGG//2019_09/uhgg_catalogue/MGYG-HGUT-046/MGYG-HGUT-04643/genome/MGYG-HGUT-04643.fna,UHGG
MGYG-HGUT-04644,/ebio/abt3_projects/databases_no-backup/UHGG//2019_09/uhgg_catalogue/MGYG-HGUT-046/MGYG-HGUT-04644/genome/MGYG-HGUT-04644.fna,UHGG


In [76]:
# checking name formatting
MAG_meta %>%
    rbind(UHGG_genomes) %>% 
    dt_mutate(ends_w_contigs = grepl('\\.contigs$', Name)) %>%
    dt_count(by=list(Dataset, ends_w_contigs))

Dataset,ends_w_contigs,N
<chr>,<lgl>,<int>
animal_gut,True,1522
UHGG,False,4644


In [46]:
# writing table
all_genome_table = file.path(work_dir, 'animal_UHGG_genomes.tsv')
MAG_meta %>%
    rbind(UHGG_genomes) %>%   
    write_table(all_genome_table)

File written: /ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.95/LLG/UHGP_compare/animal_UHGP_genomes.tsv 


## Config

In [57]:
cat_file(file.path(work_dir, 'config.yaml'))

# table with genome --> fasta_file information
samples_file: /ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.95/LLG/UHGP_compare/animal_UHGP_genomes.tsv

# output location
output_dir: /ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.95/LLG/UHGP_compare/

# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/

# batch processing of genomes for certain steps
## increase to better parallelize
batches: 300

# software parameters
# Use "Skip" to skip any of these steps. If no params for rule, use ""
# dRep MAGs are not further analyzed, but you can de-rep & then use the de-rep genome table as input.
params:
  ionice: -c 3
  # assembly assessment
  seqkit: Skip #""
  quast: Skip #""
  multiqc_on_quast: "" 
  checkm: Skip #""
  # de-replication (requires checkm)
  drep: Skip #-comp 50 -con 5 -sa 0.95
  # taxonomy
  sourma

## Run

```
(snakemake) @ rick:/ebio/abt3_projects/Georg_animal_feces/bin/llg
screen -L -S llg-ga ./snakemake_sge.sh /ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.95/LLG/UHGG_compare/config.yaml 50
```

# Summary

## fastANI

In [80]:
# loading distance matrix
fastANI_file = file.path(work_dir, 'ani', 'fastANI', 'ANI.tsv.gz')
#fastANI_file = file.path(work_dir, 'ani', 'fastANI', 'ANI_TEST.tsv.gz')  # debug
cmd = glue::glue('gunzip -c {file}', file=fastANI_file)
fastANI = Fread(cmd=cmd, header=FALSE) %>%
    dt_mutate(V1 = V1 %>% as.character %>% basename %>% gsub('\\.fna$', '', .),
              V2 = V2 %>% as.character %>% basename %>% gsub('\\.fna$', '', .)) %>%
    dt_mutate(Dataset_x = ifelse(grepl('\\.contigs$', V1), 'animal_gut', 'UHGG'),
              Dataset_y = ifelse(grepl('\\.contigs$', V2), 'animal_gut', 'UHGG')) %>%
    dt_filter(Dataset_x != Dataset_y) %>%
    dt_rename('genome_x' = V1,
              'genome_y' = V2,
              'ANI' = V3) 

c(fastANI %>% dt_pull(genome_x), fastANI %>% dt_pull(genome_x)) %>% unique_n('genomes')
fastANI

No. of unique genomes: 3553 


genome_x,genome_y,ANI,V4,V5,Dataset_x,Dataset_y
<chr>,<chr>,<dbl>,<int>,<int>,<chr>,<chr>
artificially.reared.1.metabat2.low.PE.002.contigs,MGYG-HGUT-00042,85.7165,598,977,animal_gut,UHGG
artificially.reared.1.metabat2.low.PE.002.contigs,MGYG-HGUT-01599,85.3741,582,977,animal_gut,UHGG
⋮,⋮,⋮,⋮,⋮,⋮,⋮
MGYG-HGUT-04478,SAMEA104315426.metabat2.low.PE.019.contigs,78.0126,111,693,UHGG,animal_gut
MGYG-HGUT-04478,SAMEA104315424.metabat2.low.PE.038.contigs,77.7769,139,693,UHGG,animal_gut


In [79]:
# UHGG genomes
overlap(c(fastANI %>% dt_filter(Dataset_x == 'UHGG') %>% dt_pull(genome_x),
          fastANI %>% dt_filter(Dataset_y == 'UHGG') %>% dt_pull(genome_y)), 
        UHGG_genomes, NULL, Name)

intersect(x,y): 2659 
setdiff(x,y): 0 
setdiff(y,x): 1985 
union(x,y): 4644 


In [83]:
# animal gut genomes
overlap(c(fastANI %>% dt_filter(Dataset_x == 'animal_gut') %>% dt_pull(genome_x),
          fastANI %>% dt_filter(Dataset_y == 'animal_gut') %>% dt_pull(genome_y)), 
        MAG_meta %>% dt_pull(Name))

intersect(x,y): 936 
setdiff(x,y): 0 
setdiff(y,x): 586 
union(x,y): 1522 


##### Notes

* genomes that are too divergent will not be reported in the fastANI output

In [84]:
# ANI distribution
fastANI %>% dt_pull(ANI) %>% summary_x('ANI')

Unnamed: 0,Min.,1st Qu.,Median,Mean,3rd Qu.,Max.,sd,sd_err_of_mean
ANI,75.0683,78.42618,79.2431,80.57528,79.98592,99.782,8.838,3.608


In [87]:
# amount of species-level overlap
fastANI_f = fastANI %>% 
    dt_filter(ANI >= 95) 

c(fastANI_f %>% dt_filter(Dataset_x == 'animal_gut') %>% dt_pull(genome_x), 
  fastANI_f %>% dt_filter(Dataset_y == 'animal_gut') %>% dt_pull(genome_y)) %>% 
    unique_n('animal gut genomes')

c(fastANI_f %>% dt_filter(Dataset_x == 'UHGG') %>% dt_pull(genome_x), 
  fastANI_f %>% dt_filter(Dataset_y == 'UHGG') %>% dt_pull(genome_y)) %>% 
    unique_n('UHGG genomes')

No. of unique animal gut genomes: 471 
No. of unique UHGG genomes: 479 


In [90]:
# amount of species-level overlap
n_overlap = c(fastANI_f %>% dt_filter(Dataset_x == 'animal_gut') %>% dt_pull(genome_x), 
              fastANI_f %>% dt_filter(Dataset_y == 'animal_gut') %>% dt_pull(genome_y)) %>%
    unique %>% length 

n_total = MAG_meta %>% dt_pull(Name) %>% unique %>% length

cat('% of animal gut genomes in UHGG:',  n_overlap / n_total  * 100, '\n')

% of animal gut genomes in UHGG: 30.94612 


In [91]:
# amount of genus?-level overlap overlap
fastANI_f = fastANI %>% 
    dt_filter(ANI >= 90) 

c(fastANI_f %>% dt_filter(Dataset_x == 'animal_gut') %>% dt_pull(genome_x), 
  fastANI_f %>% dt_filter(Dataset_y == 'animal_gut') %>% dt_pull(genome_y)) %>% 
    unique_n('animal gut genomes')

c(fastANI_f %>% dt_filter(Dataset_x == 'UHGG') %>% dt_pull(genome_x), 
  fastANI_f %>% dt_filter(Dataset_y == 'UHGG') %>% dt_pull(genome_y)) %>% 
    unique_n('UHGG genomes')

No. of unique animal gut genomes: 519 
No. of unique UHGG genomes: 1227 


In [92]:
# amount of genus?-level overlap
n_overlap = c(fastANI_f %>% dt_filter(Dataset_x == 'animal_gut') %>% dt_pull(genome_x), 
              fastANI_f %>% dt_filter(Dataset_y == 'animal_gut') %>% dt_pull(genome_y)) %>%
    unique %>% length 

n_total = MAG_meta %>% dt_pull(Name) %>% unique %>% length

cat('% of animal gut genomes in UHGG:',  n_overlap / n_total  * 100, '\n')

% of animal gut genomes in UHGG: 34.09987 


In [93]:
# amount of ?-level overlap overlap
fastANI_f = fastANI %>% 
    dt_filter(ANI >= 80) 

c(fastANI_f %>% dt_filter(Dataset_x == 'animal_gut') %>% dt_pull(genome_x), 
  fastANI_f %>% dt_filter(Dataset_y == 'animal_gut') %>% dt_pull(genome_y)) %>% 
    unique_n('animal gut genomes')

c(fastANI_f %>% dt_filter(Dataset_x == 'UHGG') %>% dt_pull(genome_x), 
  fastANI_f %>% dt_filter(Dataset_y == 'UHGG') %>% dt_pull(genome_y)) %>% 
    unique_n('UHGG genomes')

No. of unique animal gut genomes: 728 
No. of unique UHGG genomes: 1963 


In [94]:
# amount of genus?-level overlap
n_overlap = c(fastANI_f %>% dt_filter(Dataset_x == 'animal_gut') %>% dt_pull(genome_x), 
              fastANI_f %>% dt_filter(Dataset_y == 'animal_gut') %>% dt_pull(genome_y)) %>%
    unique %>% length 

n_total = MAG_meta %>% dt_pull(Name) %>% unique %>% length

cat('% of animal gut genomes in UHGG:',  n_overlap / n_total  * 100, '\n')

% of animal gut genomes in UHGG: 47.8318 


# sessionInfo

In [95]:
sessionInfo()

R version 3.6.2 (2019-12-12)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.3 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Georg_animal_feces/envs/phyloseq/lib/libopenblasp-r0.3.7.so

Random number generation:
 RNG:     Mersenne-Twister 
 Normal:  Inversion 
 Sample:  Rounding 
 
locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] LeyLabRMisc_0.1.3 tidytable_0.3.2   dtplyr_1.0.0      data.table_1.12.8
[5] ggplot2_3.2.1     tidyr_1.0.0       dplyr_0.8.3      

loaded via a namespace (and not attached):
 [1] Rcp