Written by JG 3/9/18
This notebook converts a DADA2 biom table, generated by the DADA2_processing R Markdown notebook, into a QIIME-compatible biom file.

Output:
1. QIIME1-compatible biom file: a tab-delimited txt file where rows are ASVs and columns are samples.
2. QIIME taxonomy file: A 2 column txt file of ASV ID -> taxonomy string
3. Reference Sequence Fasta: Fasta file where headers are ASV IDs and sequences are reference sequences. 

Usage:
Run the import and class-definition cells first.
Then load your mapping file and give it a "SampleName" column.

The notebook also moves the ASV sequences into a separate FASTA file and renames the ASVs
to short IDs, since the full sequences are often not needed in the table itself.



In [4]:
# Imports:
import pandas as pd
import os

### Mapping file: Make a dataframe with a column of sample names called "SampleName"

In [63]:
# Build the "SampleName" column as <Reactor>_<Date>, with the slashes in the
# date replaced by dots. Resulting name format: R1_MM.DD.YY etc.
map_file = '/Users/jimbo/Desktop/Yubo_SCBNR_16S_rnd2/scbnr_round2_map.txt'
map_df = pd.read_csv(map_file, sep='\t')
# regex=False: '/' is meant as a literal character, not a pattern.
map_df['SampleName'] = map_df['Reactor'] + "_" + map_df['Date'].str.replace('/', '.', regex=False)
# index=False: the file is overwritten in place and read back on the next run,
# so writing the row index would add a new "Unnamed: N" column every time
# this cell is re-executed.
map_df.to_csv(map_file, sep='\t', index=False)

In [62]:
# 10/11/18 SCBNR 16S data, round 2: relabel the ASV table and export it.
scbnr_biom_file = '/Users/jimbo/Desktop/Yubo_SCBNR_16S_rnd2/DADA2_16S_biom.txt'
scbnr_biom = DADA_biom(
    path_in=scbnr_biom_file,
    otu_prefix='SCBNR2',
    taxonomy_file='/Users/jimbo/Desktop/Yubo_SCBNR_16S_rnd2/tax_labels.txt',
)
# Replace the sample column labels with the names from the mapping file.
# NOTE(review): assumes the mapping-file rows are in the same order as the
# sample columns of the table -- confirm.
scbnr_biom.biom.columns = map_df['SampleName']
# Output files are written next to the input biom file.
scbnr_biom.write_output_files(file_out="scbnr_round2_relabeled")

In [61]:

class DADA_biom:
    """Convert a DADA2 ASV table into QIIME-style output files.

    The input table is read as tab-separated text and transposed, so the
    column headers of the input file (the ASV sequences) become the rows.
    Each row is relabelled ``{otu_prefix}_{n}`` (``n`` counts from 0) and the
    original row labels are kept as the reference sequences.

    Args:
        path_in: Path to the tab-separated DADA2 table. Output files are
            written to the directory that contains this file.
        otu_prefix: Prefix used for the generated ASV IDs.
        taxonomy_file: Optional tab-separated taxonomy table with the columns
            ``domain``, ``phylum``, ``class``, ``order``, ``family``,
            ``genus`` and ``species``. Its rows are matched to the ASVs by
            position, so they must be in the same order as the ASVs.

    Attributes:
        biom: Transposed table, indexed by the generated ASV IDs.
        path: Directory of ``path_in`` ('.' when ``path_in`` has none).
        sequences: Original row labels (the ASV sequences).
        sequences_df: One-column frame (``RepSeq``) mapping ASV ID to sequence.
        taxonomy: Taxonomy table indexed by ASV ID with an added
            ``tax_string`` column, or ``None`` when no taxonomy file was given.

    Methods:
        write_output_files(file_out): Writes ``{file_out}_biom.txt``,
            ``{file_out}_reference.fasta`` and, when taxonomy is available,
            ``{file_out}_taxonomy.txt``.
    """

    # Taxonomy columns, in the order they appear in the QIIME-style label.
    _RANKS = ('domain', 'phylum', 'class', 'order', 'family', 'genus', 'species')
    _TAX_FORMAT = 'Root;k__{0};p__{1};c__{2};o__{3};f__{4};g__{5};s__{6}'

    def __init__(self, path_in, otu_prefix, taxonomy_file = None):
        self.biom = pd.read_csv(path_in, sep='\t').transpose()
        # dirname() copes with a bare file name, where splitting on '/' would
        # have returned the file name itself as the "directory".
        self.path = os.path.dirname(path_in) or '.'
        self.sequences = self.biom.index.values
        self.biom.index = ['{0}_{1}'.format(otu_prefix, n) for n in range(len(self.sequences))]
        self.sequences_df = pd.DataFrame(index=self.biom.index, data=self.sequences, columns=['RepSeq'])
        # Always define the attribute so write_output_files() can test for it.
        self.taxonomy = None
        if taxonomy_file:
            taxonomy = pd.read_csv(taxonomy_file, sep='\t')
            # Positional match: row i of the taxonomy file gets ASV ID i.
            taxonomy['ID'] = self.biom.index
            # Blank out missing ranks so they render as e.g. "g__" rather
            # than "g__nan" in the label.
            taxonomy = taxonomy.set_index('ID').fillna('')
            ranks = taxonomy[list(self._RANKS)]
            taxonomy['tax_string'] = [
                self._TAX_FORMAT.format(*values)
                for values in ranks.itertuples(index=False, name=None)
            ]
            self.taxonomy = taxonomy

    def write_output_files(self, file_out):
        """Write the biom table, reference FASTA and (if loaded) taxonomy table.

        Args:
            file_out: Prefix for the output file names. Files are created in
                ``self.path`` as ``{file_out}_biom.txt``,
                ``{file_out}_reference.fasta`` and ``{file_out}_taxonomy.txt``.

        When a taxonomy table was loaded, a ``taxonomy`` column holding the
        QIIME-style label is added to ``self.biom`` before it is written.
        Without one, the biom table is written without that column and no
        taxonomy file is produced.
        """
        biom_out = os.path.join(self.path, '{0}_biom.txt'.format(file_out))
        fasta_out = os.path.join(self.path, '{0}_reference.fasta'.format(file_out))
        tax_out = os.path.join(self.path, '{0}_taxonomy.txt'.format(file_out))

        # Write biom:
        if self.taxonomy is not None:
            self.biom['taxonomy'] = self.taxonomy['tax_string']
        self.biom.to_csv(biom_out, sep='\t')

        # Write ref seqs; FASTA header lines must start with '>'.
        with open(fasta_out, 'w') as f:
            f.writelines(
                '>{0}\n{1}\n'.format(asv_id, seq)
                for asv_id, seq in zip(self.sequences_df.index, self.sequences_df['RepSeq'])
            )

        # Write tax:
        if self.taxonomy is not None:
            self.taxonomy.to_csv(tax_out, sep='\t')


In [5]:
# Load taxonomy labels:
tax_labels = 

In [42]:

# QIIME style label:

   

In [37]:
 
        # Fixed names
                
        
        
        #self.write_seqs(fasta_out, otu_prefix)
        
        
    def write_seqs(self, file_out, otu_prefix):
        """Write each label of ``self.df.index`` to ``file_out`` as a FASTA record.

        Record ``n`` (counting from 0) gets the header ``>{otu_prefix}_{n}``,
        followed by the index label itself on the next line.
        """
        with open(file_out, 'w') as fasta:
            for n, sequence in enumerate(self.df.index):
                fasta.write('>{0}_{1}\n'.format(otu_prefix, n))
                fasta.write(sequence + '\n')
    
    update_biom_labels(self, file_out, otu_prefix):
        # Relabel ASVs with prefix_#:
                
        
        
        biom_out  = 
        
        #fasta_out = '{0}/{1}_rep_seqs.fasta'.format(self.path, file_out)
    


    

In [52]:
scbnr_biom.taxonomy.tax_string[0]

'Root;k__Bacteria;p__Bacteroidetes;c__Ignavibacteria;o__Ignavibacteriales;f__PHOS-HE36;g__;s__'

In [46]:
scbnr_biom.taxonomy.tax_string[0]

'Root;k__Bacteria;p__Bacteroidetes;c__Ignavibacteria;o__Ignavibacteriales;f__PHOS-HE36;g__nan;s__nan'