# MMETSP stats

In [192]:
# Load the assembly-evaluation metrics tables (one CSV per tool and assembly set).

# Transrate comparative metrics: DIB re-assemblies vs. NCGR, and the reverse.
dib_v_ncgr <- read.csv(file = "../assembly_evaluation_data/transrate_reference_trinity2.2.0_v_ncgr.cds.csv")
ncgr_v_dib <- read.csv(file = "../assembly_evaluation_data/transrate_reverse_ncgr.nt_v_trinity2.2.0.csv")

# Transrate score tables.
score_dib <- read.csv(file = "../assembly_evaluation_data/transrate_scores_trinity-2.2.0.csv")
score_ncgr <- read.csv(file = "../assembly_evaluation_data/transrate_scores_imicrobe_cds.csv")

# BUSCO score tables.
BUSCO_dib_data <- read.csv(file = "../assembly_evaluation_data/busco_scores_MMETSP_protist_trinity2.2.0.csv")
BUSCO_ncgr_data <- read.csv(file = "../assembly_evaluation_data/busco_scores_imicrobe_protist.csv")

## 1. Size of table with [Transrate](http://hibberdlab.com/transrate/metrics.html) comparative metrics

(comparing DIB re-assemblies to NCGR assemblies and vice versa)

### dib_v_ncgr (rows x columns)

In [193]:
# Table size as (rows, columns).
c(nrow(dib_v_ncgr), ncol(dib_v_ncgr))

### ncgr_v_dib (rows x columns)

In [194]:

# Table size as (rows, columns).
c(nrow(ncgr_v_dib), ncol(ncgr_v_dib))

## Size of table with [Transrate](http://hibberdlab.com/transrate/metrics.html) score  metrics  

#### NCGR

In [195]:
# Table size as (rows, columns).
c(nrow(score_ncgr), ncol(score_ncgr))

#### DIB


In [196]:
# Table size as (rows, columns).
c(nrow(score_dib), ncol(score_dib))

## 2. Number of contigs in transcriptome (mean ± std)

### DIB re-assemblies

In [197]:
# Contig counts for the DIB re-assemblies (`n_seqs` column of dib_v_ncgr).
# `[[ ]]` matches the column name exactly; `$` would accept a partial match.
contigs_dib_v_ncgr <- dib_v_ncgr[["n_seqs"]]

# Stop here if the column is missing (NULL), non-numeric, or empty, rather
# than letting the summaries below quietly come back as NA.
stopifnot(is.numeric(contigs_dib_v_ncgr), length(contigs_dib_v_ncgr) > 0)

mean(contigs_dib_v_ncgr)

sd(contigs_dib_v_ncgr)


### NCGR assemblies from [imicrobe](https://imicrobe.us/?#/projects/104)

In [198]:
# Contig counts for the NCGR assemblies (`n_seqs` column of ncgr_v_dib).
# `[[ ]]` matches the column name exactly; `$` would accept a partial match.
contigs_ncgr_v_dib <- ncgr_v_dib[["n_seqs"]]
# Stop here if the column is missing (NULL), non-numeric, or empty, rather
# than letting the summaries below quietly come back as NA.
stopifnot(is.numeric(contigs_ncgr_v_dib), length(contigs_ncgr_v_dib) > 0)
mean(contigs_ncgr_v_dib)
sd(contigs_ncgr_v_dib)

### Kolmogorov–Smirnov test comparing distributions of DIB vs. NCGR num contigs

In [199]:
# Two-sample, two-sided KS test on the per-assembly contig counts.
ks.test(
  x = contigs_dib_v_ncgr,
  y = contigs_ncgr_v_dib,
  alternative = "two.sided"
)

“p-value will be approximate in the presence of ties”


	Two-sample Kolmogorov-Smirnov test

data:  contigs_dib_v_ncgr and contigs_ncgr_v_dib
D = 0.29793, p-value < 2.2e-16
alternative hypothesis: two-sided


## 3. [BUSCO](http://busco.ezlab.org/) v2 content, [Protists database](http://busco.ezlab.org/frame_protists.html)

### DIB (mean ± std)

In [200]:
# Complete-BUSCO values for the DIB re-assemblies
# (`Complete_BUSCO_perc` column of BUSCO_dib_data).
# `[[ ]]` matches the column name exactly; `$` would accept a partial match.
BUSCO_dib <- BUSCO_dib_data[["Complete_BUSCO_perc"]]
# Stop here if the column is missing (NULL), non-numeric, or empty, rather
# than letting the summaries below quietly come back as NA.
stopifnot(is.numeric(BUSCO_dib), length(BUSCO_dib) > 0)
mean(BUSCO_dib)
sd(BUSCO_dib)


### NCGR (mean ± std)

In [201]:
# Complete-BUSCO values for the NCGR assemblies
# (`Complete_BUSCO_perc` column of BUSCO_ncgr_data).
# `[[ ]]` matches the column name exactly; `$` would accept a partial match.
BUSCO_ncgr <- BUSCO_ncgr_data[["Complete_BUSCO_perc"]]
# Stop here if the column is missing (NULL), non-numeric, or empty, rather
# than letting the summaries below quietly come back as NA.
stopifnot(is.numeric(BUSCO_ncgr), length(BUSCO_ncgr) > 0)
mean(BUSCO_ncgr)
sd(BUSCO_ncgr)

### Kolmogorov–Smirnov test comparing BUSCO content distributions DIB vs. NCGR

In [202]:
# Two-sample, two-sided KS test on the complete-BUSCO values.
ks.test(
  x = BUSCO_dib,
  y = BUSCO_ncgr,
  alternative = "two.sided"
)

“p-value will be approximate in the presence of ties”


	Two-sample Kolmogorov-Smirnov test

data:  BUSCO_dib and BUSCO_ncgr
D = 0.058348, p-value = 0.2096
alternative hypothesis: two-sided


## 4. [Transrate score](http://hibberdlab.com/transrate/metrics.html)

### DIB (mean ± std)

In [203]:
# Transrate scores for the DIB re-assemblies (`score` column of score_dib).
# `[[ ]]` matches the column name exactly; `$` would accept a partial match.
transrate_score_dib <- score_dib[["score"]]
# Stop here if the column is missing (NULL), non-numeric, or empty, rather
# than letting the summaries below quietly come back as NA.
stopifnot(is.numeric(transrate_score_dib), length(transrate_score_dib) > 0)
mean(transrate_score_dib)
sd(transrate_score_dib)

### NCGR (mean ± std)

In [204]:
# Transrate scores for the NCGR assemblies (`score` column of score_ncgr).
# `[[ ]]` matches the column name exactly; `$` would accept a partial match.
transrate_score_ncgr <- score_ncgr[["score"]]
# Stop here if the column is missing (NULL), non-numeric, or empty, rather
# than letting the summaries below quietly come back as NA.
stopifnot(is.numeric(transrate_score_ncgr), length(transrate_score_ncgr) > 0)
mean(transrate_score_ncgr)
sd(transrate_score_ncgr)

### Kolmogorov–Smirnov test for Transrate score

In [205]:
# Two-sample, two-sided KS test on the Transrate scores.
ks.test(
  x = transrate_score_dib,
  y = transrate_score_ncgr,
  alternative = "two.sided"
)

“p-value will be approximate in the presence of ties”


	Two-sample Kolmogorov-Smirnov test

data:  transrate_score_dib and transrate_score_ncgr
D = 0.48827, p-value < 2.2e-16
alternative hypothesis: two-sided


## 5. Conditional Reciprocal Best Blast (CRBB) (described in [Aubry et al. 2014](http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1004365))

### DIB (mean ± std)

In [206]:
# CRBB metric for the DIB re-assemblies
# (`p_refs_with_CRBB` column of dib_v_ncgr).
# `[[ ]]` matches the column name exactly; `$` would accept a partial match.
CRBB_dib_v_ncgr <- dib_v_ncgr[["p_refs_with_CRBB"]]
# Stop here if the column is missing (NULL), non-numeric, or empty, rather
# than letting the summaries below quietly come back as NA.
stopifnot(is.numeric(CRBB_dib_v_ncgr), length(CRBB_dib_v_ncgr) > 0)
mean(CRBB_dib_v_ncgr)
sd(CRBB_dib_v_ncgr)

### NCGR (mean ± std)

In [207]:
# CRBB metric for the NCGR assemblies
# (`p_refs_with_CRBB` column of ncgr_v_dib).
# `[[ ]]` matches the column name exactly; `$` would accept a partial match.
CRBB_ncgr_v_dib <- ncgr_v_dib[["p_refs_with_CRBB"]]
# Stop here if the column is missing (NULL), non-numeric, or empty, rather
# than letting the summaries below quietly come back as NA.
stopifnot(is.numeric(CRBB_ncgr_v_dib), length(CRBB_ncgr_v_dib) > 0)
mean(CRBB_ncgr_v_dib)
sd(CRBB_ncgr_v_dib)

### Kolmogorov–Smirnov test for CRBB

In [208]:
# Two-sample, two-sided KS test on the CRBB metric.
ks.test(
  x = CRBB_dib_v_ncgr,
  y = CRBB_ncgr_v_dib,
  alternative = "two.sided"
)

“p-value will be approximate in the presence of ties”


	Two-sample Kolmogorov-Smirnov test

data:  CRBB_dib_v_ncgr and CRBB_ncgr_v_dib
D = 0.7616, p-value < 2.2e-16
alternative hypothesis: two-sided


## 6. Open Reading Frame (ORF) content

[According to the Transrate documentation](https://github.com/blahah/transrate/blob/cc873b8247165d1651b60dde5eab51fc55170f02/ext/transrate/transrate.c): An ORF is defined as the number of bases between either the start of the sequence or a start codon and either the end of the sequence or a stop codon

### DIB (mean ± std)

In [209]:
# ORF content for the DIB re-assemblies
# (`mean_orf_percent` column of dib_v_ncgr).
# `[[ ]]` matches the column name exactly; `$` would accept a partial match.
ORF_dib <- dib_v_ncgr[["mean_orf_percent"]]
# Stop here if the column is missing (NULL), non-numeric, or empty, rather
# than letting the summaries below quietly come back as NA.
stopifnot(is.numeric(ORF_dib), length(ORF_dib) > 0)
mean(ORF_dib)
sd(ORF_dib)

### NCGR (mean ± std)

In [210]:
# ORF content for the NCGR assemblies
# (`mean_orf_percent` column of ncgr_v_dib).
# `[[ ]]` matches the column name exactly; `$` would accept a partial match.
ORF_ncgr <- ncgr_v_dib[["mean_orf_percent"]]
# Stop here if the column is missing (NULL), non-numeric, or empty, rather
# than letting the summaries below quietly come back as NA.
stopifnot(is.numeric(ORF_ncgr), length(ORF_ncgr) > 0)
mean(ORF_ncgr)
sd(ORF_ncgr)

### Kolmogorov–Smirnov test for ORF

In [211]:
# Two-sample, two-sided KS test on the ORF content values.
ks.test(
  x = ORF_dib,
  y = ORF_ncgr,
  alternative = "two.sided"
)

“p-value will be approximate in the presence of ties”


	Two-sample Kolmogorov-Smirnov test

data:  ORF_dib and ORF_ncgr
D = 0.27863, p-value < 2.2e-16
alternative hypothesis: two-sided
