## Feature Reductions

### Implementation of feature reduction methods:

`make_df_up_level` - aggregates OSU-level abundances up to a chosen taxonomic level ('genus', 'family', ...)

### These methods convert a dataframe into a dataset (X, Y, labels) for making a train/test split:

`SVD_truncate` - reduces the dimensionality of the features using truncated SVD

`make_dataset_osu_diff` - Rank features by difference in mean abundance between labels

`make_dataset` - Converts dataframe to dataset for making train/test split

`make_dataset_zscore` - Rank features by Z-score between labels

`feature_from_correlation` - Rank features by their correlation with the label


In [21]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
import math
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def make_df_up_level(osu_df,tax_df,level,norm_type):
    '''
    Changes taxonomic level of dataframe to a user-specified level
    Input: osu_df - dataframe of abundances; feature columns are labelled by
                    OSU id and a 'Group' column holds the sample labels
           tax_df - dataframe of taxonomic information; must have an 'osu_id'
                    column and one column per taxonomic level
           level - level to which taxonomic information should be moved to.
                   Can be None, genus, family, order, class, phylum.
                   None returns a copy of osu_df unchanged.
           norm_type - type of normalization used. For 'clr' the values are
                       exponentiated before summing and logged again afterwards
    Returns: dataframe with one column per taxon at the requested level
             (columns sorted by name) followed by the 'Group' column
    '''
    #No level requested: nothing to aggregate (tax_df[None] would raise KeyError)
    if level is None:
        return osu_df.copy()

    #Make dictionary from OSU id to the taxonomy name at the requested level,
    #replacing brackets with underscores because of issues
    bracket_table = str.maketrans('[]', '__')
    up_dict = {osu: str(taxon).translate(bracket_table)
               for osu, taxon in zip(tax_df.osu_id, tax_df[level])}

    #Replace osu ids with the taxonomy level
    df = osu_df.rename(columns=up_dict)

    #Keep only columns whose name contains an underscore. This drops OSU ids
    #without taxonomy information (unmapped ids, 'nan') and also 'Group',
    #which is re-attached at the end
    has_taxonomy = ["_" in str(col) for col in df.columns]
    df = df.loc[:, has_taxonomy]

    #Convert abundance data from log to normal if the clr normalization is used
    if norm_type == 'clr':
        df = np.exp(df)

    #Sum data with the same taxonomy. Grouping the transposed frame by its
    #index replaces the deprecated groupby(axis=1)
    df = df.T.groupby(level=0).sum().T

    #Re-log the data if clr normalization was used
    if norm_type == 'clr':
        df = np.log(df)

    #Reconcatenate group information onto dataframe
    df = pd.concat([df,osu_df.Group],axis=1)

    return df

In [1]:
def SVD_truncate(df,ncomp,cutoff,norm_type=None):
    '''
    Uses SVD-truncate to reduce features
    Inputs: df - dataframe containing the composition data and a 'Group' column
            ncomp - number of components
            cutoff - the abundance level cutoff; features whose values are
                     below it in every sample are removed before the SVD
            norm_type - type of normalization used (optional). When omitted,
                        a module-level variable named norm_type is used if one
                        exists (the previous version read that global directly)
    Return: X - array of samples projected onto the SVD components
            Y - integer labels for data (index of each sample's group in labels)
            labels - text labels for data, in order of first appearance
    Side effects: prints the explained variance ratios and draws a bar plot
    '''
    #Fall back to a notebook-level norm_type so existing callers keep working
    if norm_type is None:
        norm_type = globals().get('norm_type')

    #Adjust cutoff if clr normalization used
    #NOTE(review): the original comment said "to log scale", but np.e**cutoff
    #exponentiates the cutoff - behavior left unchanged, confirm the intent
    if norm_type == 'clr':
        cutoff = np.e**cutoff

    #Identify unique labels
    labels = df.Group.unique()

    #Convert group labels to integer codes and save as Y
    d_cat = {cat: i for i, cat in enumerate(labels)}
    Y = [d_cat[x] for x in df['Group'].tolist()]

    #Drop group data and remove features where all values are under the cutoff
    features = df.drop(['Group'], axis=1)
    all_below = (features < cutoff).all(axis=0)
    features = features.loc[:, ~all_below]

    #Make array
    X = np.array(features)

    #Use Truncated SVD (fitted once; fit_transform already fits the model) and plot
    svd = TruncatedSVD(n_components=ncomp, n_iter=10, random_state=42)
    X = svd.fit_transform(X)
    ratios = svd.explained_variance_ratio_
    print("Explained variance ratios:")
    print(ratios)
    print('Sum of explained variance ratios:',np.sum(ratios))
    plt.figure(figsize=((15,4)))
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.bar(x=np.linspace(0,ncomp,ncomp),height = ratios)

    return X, Y, labels

In [6]:
def make_dataset_osu_diff(df, ncomp, cutoff, norm_type=None):
    '''
    Returns top <ncomp> features based on difference in mean abundance between labels
    Inputs: df - dataframe containing the composition data and a 'Group' column
            ncomp - number of features to keep
            cutoff - the abundance level cutoff; features whose values are
                     below it in every sample are removed before ranking
            norm_type - type of normalization used (optional). When omitted,
                        a module-level variable named norm_type is used if one
                        exists (the previous version read that global directly)
    Return: X - array of sample-feature data restricted to the selected features
            Y - integer labels for data (index of each sample's group in labels)
            labels - text labels for data, in order of first appearance
            inds - names of the selected features, largest difference first
    Raises: ValueError if fewer than two groups are present
    Note: only the first two groups (in sorted order) are compared.
    '''
    #Fall back to a notebook-level norm_type so existing callers keep working
    if norm_type is None:
        norm_type = globals().get('norm_type')

    #Change cutoff if norm_type is clr
    #NOTE(review): np.e**cutoff exponentiates the cutoff - confirm the intent
    if norm_type == 'clr':
        cutoff = np.e**cutoff

    #Get labels and convert labels to integer codes
    labels = df.Group.unique()
    if len(labels) < 2:
        raise ValueError('make_dataset_osu_diff needs at least two groups')
    d_cat = {cat: i for i, cat in enumerate(labels)}
    Y = [d_cat[x] for x in df['Group'].tolist()]

    #Set Group as index so every remaining column is a feature
    df = df.set_index(['Group'])

    #Remove features that don't pass cutoff filter
    all_below = (df < cutoff).all(axis=0)
    kept = df.loc[:, ~all_below]

    #Group feature data by label using mean (rows: groups, columns: features)
    group_means = kept.groupby(level=0).mean()

    #Calculate absolute difference in mean between the first two groups
    diff = (group_means.iloc[0] - group_means.iloc[1]).abs()

    #Sort by difference in mean; stable sort keeps ties in column order
    diff = diff.sort_values(ascending=False, kind='stable')

    #Select top <ncomp> features
    inds = diff.index.tolist()[0:ncomp]
    X = np.array(df[inds])

    return X,Y,labels,inds

In [15]:
def make_dataset(df, cutoff, norm_type=None):
    '''
    Returns dataset in format for X, Y, labels
    Inputs: df - dataframe containing the composition data and a 'Group' column
            cutoff - the abundance level cutoff; features whose values are
                     below it in every sample are removed
            norm_type - type of normalization used (optional). When omitted,
                        a module-level variable named norm_type is used if one
                        exists (the previous version read that global directly)
    Return: X - array of sample-feature data restricted to the kept features
            Y - integer labels for data (index of each sample's group in labels)
            labels - text labels for data, in order of first appearance
            inds - names of the kept features, in their original column order
    '''
    #Fall back to a notebook-level norm_type so existing callers keep working
    if norm_type is None:
        norm_type = globals().get('norm_type')

    #Change cutoff if norm_type is clr
    #NOTE(review): np.e**cutoff exponentiates the cutoff - confirm the intent
    if norm_type == 'clr':
        cutoff = np.e**cutoff

    #Get labels and convert labels to integer codes
    labels = df.Group.unique()
    d_cat = {cat: i for i, cat in enumerate(labels)}
    Y = [d_cat[x] for x in df['Group'].tolist()]

    #Set Group as index so every remaining column is a feature
    df = df.set_index(['Group'])

    #Remove features that don't pass cutoff filter
    passes = ~(df < cutoff).all(axis=0)
    kept = df.loc[:, passes]
    inds = kept.columns.tolist()
    X = np.array(kept)

    return X,Y,labels,inds

In [13]:
def make_dataset_zscore(df, ncomp, cutoff, plot_cutoff,norm_type):
    '''
    Returns top <ncomp> features based on zscore
    Inputs: df - dataframe containing the composition data and a 'Group' column
            ncomp - number of features to keep
            cutoff - the abundance level cutoff; features whose values are
                     below it in every sample are removed before ranking
            plot_cutoff - only features with a score above this value are
                          shown in the bar plot
            norm_type - type of normalization used
    Return: X - array of sample-feature data restricted to the selected features
            Y - integer labels for data (index of each sample's group in labels)
            labels - text labels for data, in order of first appearance
            inds - names of the selected features, highest score first
    Raises: ValueError if fewer than two groups are present
    Note: the score is |mean_a - mean_b| / ((std_a + std_b) / 2), computed for
          the first two groups in sorted order.
    '''
    #Adjust cutoff if normalization is clr
    #NOTE(review): np.e**cutoff exponentiates the cutoff - confirm the intent
    if norm_type == 'clr':
        cutoff = np.e**cutoff

    #get group labels and convert to integer codes
    labels = df.Group.unique()
    if len(labels) < 2:
        raise ValueError('make_dataset_zscore needs at least two groups')
    d_cat = {cat: i for i, cat in enumerate(labels)}
    Y = [d_cat[x] for x in df['Group'].tolist()]

    #Set index to Group so every remaining column is a feature
    features = df.set_index(['Group'])

    #Remove features where all values are below cutoff
    all_below = (features < cutoff).all(axis=0)
    features = features.loc[:, ~all_below]

    #Take mean and std of each feature per group (rows: groups, columns: features)
    by_group = features.groupby(level=0)
    means = by_group.mean()
    stds = by_group.std()

    #Make z-score and sort; rows are picked by position from the separate
    #mean/std tables so a third group cannot be mistaken for a std column
    pooled_std = (stds.iloc[0] + stds.iloc[1]) / 2
    zscore = ((means.iloc[0] - means.iloc[1]) / pooled_std).abs()
    ranked = zscore.sort_values(ascending=False, kind='stable').to_frame('zscore')

    #Plot on an explicitly created axes so the requested figure size is used;
    #skipped when nothing exceeds plot_cutoff because pandas cannot plot an empty frame
    to_plot = ranked[ranked['zscore'] > plot_cutoff]
    if not to_plot.empty:
        fig, ax = plt.subplots(figsize=(15,4))
        to_plot.plot.bar(y='zscore', ax=ax)

    #Take top components
    inds = ranked.index.tolist()[0:ncomp]
    X = np.array(df[inds])

    return X,Y,labels,inds

In [20]:
def feature_from_correlation(df,ncomp,cutoff,norm_type):
    '''
    Returns top <ncomp> features based on correlation with labels
    Inputs: df - dataframe containing the composition data and a 'Group' column
            ncomp - number of features to keep
            cutoff - the abundance level cutoff; features whose values are
                     below it in every sample are removed before ranking
            norm_type - type of normalization used
    Return: X - dataframe of sample-feature data restricted to the selected
                features, ordered by decreasing absolute correlation
            Y - integer labels for data (index of each sample's group in labels)
            labels - text labels for data, in order of first appearance
            features - names of the selected features, in the same order as X
    '''
    #Adjust cutoff if normalization is clr
    #NOTE(review): np.e**cutoff exponentiates the cutoff - confirm the intent
    if norm_type == 'clr':
        cutoff = np.e**cutoff

    #Separate the feature columns from the Group labels
    features = df.drop(['Group'],axis=1)

    #Remove features where all values are below cutoff
    all_below = (features < cutoff).all(axis=0)
    features = features.loc[:, ~all_below]

    #find unique group labels and encode them as integers. Only the Group
    #values are mapped, so feature values that happen to equal a label are
    #left untouched and numeric labels cannot overwrite each other
    labels = df.Group.unique()
    codes = {cat: i for i, cat in enumerate(labels)}
    Y = [codes[g] for g in df['Group'].tolist()]

    #Correlation of every feature with the encoded label (avoids building
    #the full feature-by-feature correlation matrix)
    target = pd.Series(Y, index=features.index, dtype=float)
    cor_target = features.corrwith(target).abs()

    #Selecting highly correlated features; stable sort keeps ties in column order
    relevant_features = cor_target.sort_values(ascending=False, kind='stable').head(ncomp)
    selected = relevant_features.index.tolist()

    #select top correlating features
    X = features[selected]

    return X,Y,labels,selected