In [21]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
import math
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def make_df_up_level(osu_df, tax_df, level, norm_type):
    """Aggregate OSU columns of ``osu_df`` up to the taxonomy column ``level``.

    Parameters
    ----------
    osu_df : pandas.DataFrame
        One column per OSU id plus a ``Group`` column.
    tax_df : pandas.DataFrame
        Must provide an ``osu_id`` column and a column named ``level``.
    level : str
        Name of the column in ``tax_df`` holding the target taxon names.
    norm_type : str
        When equal to ``'clr'`` the values are exponentiated before summing
        and log-transformed afterwards; any other value sums them as-is.

    Returns
    -------
    pandas.DataFrame
        One column per distinct taxon name (sorted), followed by ``Group``.
    """
    # OSU id -> taxon name, with square brackets turned into underscores.
    up_dict = {
        osu_id: str(taxon).replace(']', '_').replace('[', '_')
        for osu_id, taxon in zip(tax_df.osu_id, tax_df[level])
    }

    df = osu_df.rename(columns=up_dict)

    # Keep only columns whose label contains an underscore. This drops
    # ``Group`` (re-attached below) and anything that was not renamed to an
    # underscore-bearing taxon name. A positional boolean mask is used so that
    # duplicate labels produced by the rename are handled correctly.
    # NOTE(review): presumably every taxon name of interest contains "_" --
    # confirm against the taxonomy table.
    has_underscore = ["_" in str(col) for col in df.columns]
    df = df.loc[:, has_underscore]

    is_clr = norm_type == 'clr'
    if is_clr:
        # Vectorised replacement for the deprecated DataFrame.applymap(np.exp).
        df = np.exp(df)

    # Sum columns sharing the same taxon name. Equivalent to the deprecated
    # ``groupby(by=df.columns, axis=1).sum()``.
    df = df.T.groupby(level=0).sum().T

    if is_clr:
        df = np.log(df)

    return pd.concat([df, osu_df.Group], axis=1)

In [25]:
def SVD_truncate(df, ncomp, cutoff, norm_type=None):
    """Reduce the feature columns of ``df`` with a truncated SVD.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a ``Group`` column.
    ncomp : int
        Number of SVD components to keep.
    cutoff : float
        Features whose values are all below the cutoff are removed.
    norm_type : str, optional
        When ``'clr'`` the cutoff is replaced by ``e ** cutoff`` before use.
        If omitted, a notebook-level variable called ``norm_type`` is used
        when one exists (the previous implicit behaviour); otherwise the
        cutoff is used unchanged.

    Returns
    -------
    tuple
        ``(svd_X, Y, labels)``: the transformed data, the integer-encoded
        group of every row, and the unique group labels in order of first
        appearance (the position of a label is its integer code).

    Side effects: prints the explained variance ratios and draws a bar chart
    of them.
    """
    # Backward-compatible fallback to the former hidden global.
    if norm_type is None:
        norm_type = globals().get('norm_type')
    if norm_type == 'clr':
        # NOTE(review): this exponentiates the cutoff -- confirm that this is
        # the intended direction for CLR-normalised data.
        cutoff = np.e ** cutoff

    labels = df.Group.unique()
    d_cat = {cat: i for i, cat in enumerate(labels)}
    Y = [d_cat[x] for x in df['Group'].tolist()]

    features = df.drop(['Group'], axis=1)
    # Drop features for which every sample is below the cutoff.
    all_below = (features < cutoff).all(axis=0).to_numpy()
    features = features.loc[:, ~all_below]
    X = np.array(features)

    svd = TruncatedSVD(n_components=ncomp, n_iter=10, random_state=42)
    # Fit once: fit_transform both fits the model and returns the projection
    # (the earlier code fitted the same model twice).
    svd_X = svd.fit_transform(X)

    print("Explained variance ratios:")
    print(svd.explained_variance_ratio_)
    print('Sum of explained variance ratios:', np.sum(svd.explained_variance_ratio_))

    plt.figure(figsize=(15, 4))
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.bar(x=np.linspace(0, ncomp, ncomp), height=svd.explained_variance_ratio_)

    return svd_X, Y, labels

In [6]:
def make_dataset_osu_diff(df, ncomp, cutoff, norm_type=None):
    """Select the ``ncomp`` features whose group means differ the most.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a ``Group`` column.
    ncomp : int
        Maximum number of features to return.
    cutoff : float
        Features whose per-group means are all below the cutoff are removed.
    norm_type : str, optional
        When ``'clr'`` the cutoff is replaced by ``e ** cutoff`` before use.
        If omitted, a notebook-level variable called ``norm_type`` is used
        when one exists (the previous implicit behaviour); otherwise the
        cutoff is used unchanged.

    Returns
    -------
    tuple
        ``(X, Y, labels, inds)``: the selected feature matrix, the
        integer-encoded group of every row, the unique group labels in order
        of first appearance, and the selected feature names ordered by
        decreasing absolute difference of means.

    Raises
    ------
    ValueError
        If ``df`` contains fewer than two groups.
    """
    # Backward-compatible fallback to the former hidden global.
    if norm_type is None:
        norm_type = globals().get('norm_type')
    if norm_type == 'clr':
        # NOTE(review): this exponentiates the cutoff -- confirm that this is
        # the intended direction for CLR-normalised data.
        cutoff = np.e ** cutoff

    labels = df.Group.unique()
    d_cat = {cat: i for i, cat in enumerate(labels)}
    Y = [d_cat[x] for x in df['Group'].tolist()]

    features = df.set_index(['Group'])

    # Rows = features, columns = groups (sorted by label). Equivalent to the
    # deprecated ``features.T.groupby(by=..., axis=1).mean()``.
    group_means = features.groupby(level=0).mean().T
    if group_means.shape[1] < 2:
        raise ValueError("make_dataset_osu_diff needs at least two groups in df.Group")

    # The cutoff is applied to the group means, not to individual samples.
    group_means = group_means[~(group_means < cutoff).all(axis=1)]

    # Only the first two groups (in sorted label order) are compared.
    diff = (group_means.iloc[:, 0] - group_means.iloc[:, 1]).abs()
    inds = diff.sort_values(ascending=False).index.tolist()[0:ncomp]

    X = np.array(features[inds])

    return X, Y, labels, inds

In [15]:
def make_dataset(df, cutoff, norm_type=None):
    """Build a feature matrix from ``df`` after removing low-valued features.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a ``Group`` column.
    cutoff : float
        Features for which every sample is below the cutoff are removed.
    norm_type : str, optional
        When ``'clr'`` the cutoff is replaced by ``e ** cutoff`` before use.
        If omitted, a notebook-level variable called ``norm_type`` is used
        when one exists (the previous implicit behaviour); otherwise the
        cutoff is used unchanged.

    Returns
    -------
    tuple
        ``(X, Y, labels, inds)``: the retained feature matrix, the
        integer-encoded group of every row, the unique group labels in order
        of first appearance, and the retained feature names in their original
        column order.
    """
    # Backward-compatible fallback to the former hidden global.
    if norm_type is None:
        norm_type = globals().get('norm_type')
    if norm_type == 'clr':
        # NOTE(review): this exponentiates the cutoff -- confirm that this is
        # the intended direction for CLR-normalised data.
        cutoff = np.e ** cutoff

    labels = df.Group.unique()
    d_cat = {cat: i for i, cat in enumerate(labels)}
    Y = [d_cat[x] for x in df['Group'].tolist()]

    features = df.set_index(['Group'])

    # Column-wise mask instead of transposing twice; positional so that
    # duplicate column names are not selected more than once.
    all_below = (features < cutoff).all(axis=0).to_numpy()
    kept = features.loc[:, ~all_below]

    inds = kept.columns.tolist()
    X = np.array(kept)

    return X, Y, labels, inds

In [13]:
def make_dataset_zscore(df, ncomp, cutoff, plot_cutoff, norm_type):
    """Select the ``ncomp`` features that best separate two groups by z-score.

    For every feature the score is
    ``|mean_0 - mean_1| / ((std_0 + std_1) / 2)`` where 0 and 1 are the first
    two groups in sorted label order.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a ``Group`` column.
    ncomp : int
        Maximum number of features to return.
    cutoff : float
        Features for which every sample is below the cutoff are removed.
    plot_cutoff : float
        Only features with a score strictly above this value are drawn in the
        bar chart; nothing is drawn when no feature qualifies.
    norm_type : str
        When ``'clr'`` the cutoff is replaced by ``e ** cutoff`` before use.

    Returns
    -------
    tuple
        ``(X, Y, labels, inds)``: the selected feature matrix, the
        integer-encoded group of every row, the unique group labels in order
        of first appearance, and the selected feature names ordered by
        decreasing score.

    Raises
    ------
    ValueError
        If ``df`` contains fewer than two groups.
    """
    # For CLR data the cutoff is exponentiated before being compared.
    # NOTE(review): confirm that this is the intended direction.
    if norm_type == 'clr':
        cutoff = np.e ** cutoff

    # Group labels and their integer encoding.
    labels = df.Group.unique()
    d_cat = {cat: i for i, cat in enumerate(labels)}
    Y = [d_cat[x] for x in df['Group'].tolist()]

    features = df.set_index(['Group'])

    # Remove features where all sample values are below the cutoff.
    all_below = (features < cutoff).all(axis=0).to_numpy()
    features = features.loc[:, ~all_below]

    # Per-group mean and (sample) standard deviation; rows = features,
    # columns = groups. Replaces the deprecated ``groupby(axis=1)``.
    grouped = features.groupby(level=0)
    means = grouped.mean().T
    stds = grouped.std().T
    if means.shape[1] < 2:
        raise ValueError("make_dataset_zscore needs at least two groups in df.Group")

    # Means and stds are indexed separately, so the right columns are used
    # even when more than two groups are present.
    mean_gap = means.iloc[:, 0] - means.iloc[:, 1]
    pooled_std = (stds.iloc[:, 0] + stds.iloc[:, 1]) / 2
    zscore = (mean_gap / pooled_std).abs().rename('zscore')
    zscore = zscore.sort_values(ascending=False)

    # Plot on an explicit axis so the figure size is honoured and no empty
    # figure is left behind; skip entirely when there is nothing to draw.
    to_plot = zscore[zscore > plot_cutoff]
    if not to_plot.empty:
        fig, ax = plt.subplots(figsize=(15, 4))
        to_plot.to_frame().plot.bar(y='zscore', ax=ax)

    # Take the top components from the original frame.
    inds = zscore.index.tolist()[0:ncomp]
    X = np.array(df[inds])

    return X, Y, labels, inds

In [20]:
def feature_from_correlation(df, ncomp, cutoff, norm_type):
    """Select the ``ncomp`` features most correlated with the group label.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a ``Group`` column.
    ncomp : int
        Maximum number of features to return.
    cutoff : float
        Features for which every sample is below the cutoff are removed.
    norm_type : str
        When ``'clr'`` the cutoff is replaced by ``e ** cutoff`` before use.

    Returns
    -------
    tuple
        ``(X, Y, labels, names)``: a DataFrame holding the selected features,
        the integer-encoded group of every row, the unique group labels in
        order of first appearance, and the selected feature names ordered by
        decreasing absolute correlation with the encoded group.
    """
    # For CLR data the cutoff is exponentiated before being compared.
    # NOTE(review): confirm that this is the intended direction.
    if norm_type == 'clr':
        cutoff = np.e ** cutoff

    # Encode the group labels by order of first appearance. Only the Group
    # column is encoded; a frame-wide ``replace`` would also rewrite feature
    # values that happen to equal a label and can collide for numeric labels.
    labels = df.Group.unique()
    label_codes = {cat: i for i, cat in enumerate(labels)}
    Y = [label_codes[x] for x in df['Group'].tolist()]

    # Remove features where all sample values are below the cutoff.
    features = df.drop(['Group'], axis=1)
    all_below = (features < cutoff).all(axis=0).to_numpy()
    dfn = features.loc[:, ~all_below].copy()
    dfn['Group'] = Y

    # Absolute correlation of every feature with the encoded group. The
    # target itself is dropped by name, so a feature with |r| == 1 can never
    # be discarded in its place.
    cor_target = dfn.corr()['Group'].abs().drop('Group')

    # Select the most correlated features.
    relevant_features = cor_target.sort_values(ascending=False).head(ncomp)
    selected = relevant_features.index.tolist()

    X = dfn[selected]

    return X, Y, labels, selected