In [23]:
import pandas as pd
from mgnifyextract.analyses import get_analysis
from mgnifyextract.downloads import Download, FastaDownload, MseqDownload, TsvDownload
from mgnifyextract.util import clean_taxonomy_string

In [24]:
# Look up the analysis by its identifier; the bare name on the last line
# makes the notebook display the object's repr (which includes its URL).
analysis = get_analysis("MGYA00593805")
analysis

<Analysis https://www.ebi.ac.uk/metagenomics/analyses/MGYA00593805>

In [25]:
def _select_downloads(downloads, download_type, marker):
    """Return the downloads of one kind that belong to one marker.

    Parameters
    ----------
    downloads : iterable
        Download objects as returned by ``analysis.get_downloads()``.
    download_type : type
        Class to keep (e.g. ``FastaDownload``); matched with ``isinstance``.
    marker : str
        Value that each kept download's ``marker`` attribute must equal.

    Returns
    -------
    list
        Matching downloads, in their original order (possibly empty).
    """
    return [
        download
        for download in downloads
        if isinstance(download, download_type) and download.marker == marker
    ]


downloads = analysis.get_downloads()

# Marker to restrict the downloads to; compared against `download.marker`.
marker = "SSU"

# One list per file kind, all filtered on the same marker.
fasta_files = _select_downloads(downloads, FastaDownload, marker)
mseq_files = _select_downloads(downloads, MseqDownload, marker)
tsv_files = _select_downloads(downloads, TsvDownload, marker)

Let's take a look at the number of rows in the mseq and OTU files.

In [26]:
# Read the first matching mseq download and summarise its columns.
# NOTE(review): the recorded output shows two entirely-null "Unnamed" columns
# (12 and 14) - presumably stray delimiters in the file; confirm before relying
# on positional column indices.
mseq = mseq_files[0].read()
mseq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12178 entries, 0 to 12177
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   #query       12178 non-null  object 
 1   dbhit        12178 non-null  object 
 2   bitscore     12178 non-null  int64  
 3   identity     12178 non-null  float64
 4   matches      12178 non-null  int64  
 5   mismatches   12178 non-null  int64  
 6   gaps         12178 non-null  int64  
 7   query_start  12178 non-null  int64  
 8   query_end    12178 non-null  int64  
 9   dbhit_start  12178 non-null  int64  
 10  dbhit_end    12178 non-null  int64  
 11  strand       12178 non-null  object 
 12  Unnamed: 12  0 non-null      float64
 13  SILVA        12178 non-null  object 
 14  Unnamed: 14  0 non-null      float64
dtypes: float64(3), int64(8), object(4)
memory usage: 1.4+ MB


In [27]:
# Read the first matching TSV download (the OTU table) and summarise its columns.
otu = tsv_files[0].read()
otu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   # OTU ID  121 non-null    int64  
 1   SSU_rRNA  121 non-null    float64
 2   taxonomy  121 non-null    object 
 3   taxid     121 non-null    int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 3.9+ KB


The mseq file and the OTU table contain the same number of distinct taxonomy values (121 each, as computed in the two cells below). This suggests that the OTUs correspond to the distinct taxonomic assignments, regardless of sequence similarity. For example, all reads assigned to Bacteria are collapsed into a single OTU.

In [28]:
# Number of distinct taxonomy strings in the OTU table after cleaning each one.
otu["taxonomy"].map(clean_taxonomy_string).nunique()

121

In [29]:
# Number of distinct taxonomy strings in the mseq "SILVA" column after cleaning each one.
mseq["SILVA"].map(clean_taxonomy_string).nunique()

121