In [37]:
from grant import grant

import pandas as pd
import numpy as np

In [154]:
# Read the taxonomy table from 'test.csv' and use its 'OTU' column as the row index.
tax = pd.read_csv('test.csv').set_index('OTU')
# Bare last expression: the loaded frame is displayed as this cell's output.
tax

Unnamed: 0_level_0,Kingdom,Phylum,Class,Order,Family,Genus,Species
OTU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Uniq114339,Bacteria,Firmicutes,,,,,
Uniq53046,Bacteria,Firmicutes,Negativicutes,Selenomonadales,Veillonellaceae,Veillonella,Veillonella_atypica
Uniq5707,Bacteria,Firmicutes,Clostridia,Clostridiales,Lachnospiraceae,Lachnospiraceae_FCS020_group,
Uniq45364,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacteriales,Enterobacteriaceae,Escherichia-Shigella,Enterobacter
Uniq80019,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacteriales,Enterobacteriaceae,Escherichia-Shigella,Escherichia_coli
...,...,...,...,...,...,...,...
Uniq103183,Bacteria,Firmicutes,Bacilli,,,,
Uniq371,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Porphyromonadaceae,Parabacteroides,Parabacteroides_distasonis
Uniq75647,Bacteria,Proteobacteria,,,,,
Uniq12824,Bacteria,Firmicutes,Bacilli,Bacillales,Family_XI,Gemella,Gemella_haemolysans


In [155]:
def fill_tax_table(tax):
    """Fill missing taxonomy ranks "from the left" so that every cell holds a label.

    For each OTU/ASV row, the first missing rank and every rank to its right
    are replaced by a placeholder derived from the closest rank on the left,
    with the OTU/ASV name appended:

        unk_<rank>_of_<known rank>_<known value>__<otu>

    If the very first rank is missing it becomes ``unk_<rank>__<otu>``, and the
    remaining ranks are derived from that placeholder, e.g.
    ``unk_Phylum_of_Kingdom_unk_Kingdom__<otu>``. Rows with no missing value are
    returned unchanged.

    "Missing" means whatever ``pandas.isna`` reports (``np.nan``, ``None``,
    ``pd.NA``). Strings such as ``''`` or ``'NA'`` are treated as real labels.

    Note that *every* rank from the first missing one onwards is overwritten,
    including values that were not missing themselves: once a rank is unknown,
    everything more specific is replaced by a placeholder.

    The input frame is never modified.

    Args:
        tax (pd.DataFrame): Dataframe with index of ASV/OTU and columns of
            left -> right increasing specificity in taxonomy
            (e.g., Kingdom -> Species). The OTU/ASV must be the INDEX.

    Returns:
        pd.DataFrame: Filled taxonomy table (object dtype) with the same
        columns and index name as ``tax``. If the index contains repeated
        OTUs/ASVs, a message is printed and only the first entry of each is kept.
    """
    if not tax.index.is_unique:
        print('Repeated OTUs/ASVs in the taxonomy index. Check to make sure there is only _one_ entry per OTU in taxonomy table.')
        # A repeated label would make a per-OTU lookup return several rows,
        # so keep the first occurrence and carry on with one row per OTU.
        tax = tax.loc[~tax.index.duplicated(keep='first')]

    # MUST be in increasing specificity order (Kingdom -> Species).
    rank_names = list(tax.columns)
    n_ranks = len(rank_names)

    # Work on an object-dtype copy: the caller's frame stays untouched and
    # string placeholders can be stored regardless of the original column dtypes.
    values = tax.to_numpy(dtype=object, copy=True)
    missing = pd.isna(values)

    for row, otu in enumerate(tax.index):
        if not missing[row].any():
            # Fully annotated OTU: nothing to fill.
            continue

        # argmax on a boolean row gives the position of the first True (first NaN).
        first_nan = int(missing[row].argmax())

        if first_nan == 0:
            # Even the highest rank is unknown: it gets a bare placeholder, and
            # that placeholder acts as the "known" value for the ranks below it.
            parent_rank = rank_names[0]
            parent_value = f'unk_{parent_rank}'
            values[row, 0] = f'{parent_value}__{otu}'
            start = 1
        else:
            parent_rank = rank_names[first_nan - 1]
            parent_value = values[row, first_nan - 1]
            start = first_nan

        # Fill everything to the right of the last known rank, appending the
        # OTU/ASV name so that each placeholder is unique to its OTU.
        for col in range(start, n_ranks):
            values[row, col] = f'unk_{rank_names[col]}_of_{parent_rank}_{parent_value}__{otu}'

    # Copy the axes so the result shares no mutable metadata (e.g. index name)
    # with the input; the index name is carried over by the copy.
    return pd.DataFrame(values, index=tax.index.copy(), columns=tax.columns.copy())
            

In [156]:
# Fill the missing ranks of `tax`; the bare call displays the returned frame as this cell's output.
fill_tax_table(tax)

Unnamed: 0_level_0,Kingdom,Phylum,Class,Order,Family,Genus,Species
OTU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Uniq114339,Bacteria,Firmicutes,unk_Class_of_Phylum_Firmicutes__Uniq114339,unk_Order_of_Phylum_Firmicutes__Uniq114339,unk_Family_of_Phylum_Firmicutes__Uniq114339,unk_Genus_of_Phylum_Firmicutes__Uniq114339,unk_Species_of_Phylum_Firmicutes__Uniq114339
Uniq53046,Bacteria,Firmicutes,Negativicutes,Selenomonadales,Veillonellaceae,Veillonella,Veillonella_atypica
Uniq5707,Bacteria,Firmicutes,Clostridia,Clostridiales,Lachnospiraceae,Lachnospiraceae_FCS020_group,unk_Species_of_Genus_Lachnospiraceae_FCS020_gr...
Uniq45364,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacteriales,Enterobacteriaceae,Escherichia-Shigella,Enterobacter
Uniq80019,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacteriales,Enterobacteriaceae,Escherichia-Shigella,Escherichia_coli
...,...,...,...,...,...,...,...
Uniq103183,Bacteria,Firmicutes,Bacilli,unk_Order_of_Class_Bacilli__Uniq103183,unk_Family_of_Class_Bacilli__Uniq103183,unk_Genus_of_Class_Bacilli__Uniq103183,unk_Species_of_Class_Bacilli__Uniq103183
Uniq371,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Porphyromonadaceae,Parabacteroides,Parabacteroides_distasonis
Uniq75647,Bacteria,Proteobacteria,unk_Class_of_Phylum_Proteobacteria__Uniq75647,unk_Order_of_Phylum_Proteobacteria__Uniq75647,unk_Family_of_Phylum_Proteobacteria__Uniq75647,unk_Genus_of_Phylum_Proteobacteria__Uniq75647,unk_Species_of_Phylum_Proteobacteria__Uniq75647
Uniq12824,Bacteria,Firmicutes,Bacilli,Bacillales,Family_XI,Gemella,Gemella_haemolysans
