In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
import math
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def make_df_up_level(osu_df,tax_df,level,norm_type):
    """Collapse the columns of ``osu_df`` onto the labels found in ``tax_df[level]``.

    Parameters
    ----------
    osu_df : pandas.DataFrame
        Table whose column labels are looked up in ``tax_df.osu_id``; it must
        also provide a ``Group`` column, which is re-attached to the result.
    tax_df : pandas.DataFrame
        Lookup table with an ``osu_id`` column and a ``level`` column.
    level : str
        Name of the ``tax_df`` column holding the target labels.
    norm_type : str
        When equal to ``'clr'`` the values are exponentiated before summing
        and log-transformed afterwards; any other value sums them as they are.

    Returns
    -------
    pandas.DataFrame
        One column per distinct target label (sorted by label) followed by
        ``Group``.
    """
    # Target label for every id: stringified, with square brackets turned
    # into underscores.
    up_dict = {
        osu_id: str(taxon).replace(']', '_').replace('[', '_')
        for osu_id, taxon in zip(tax_df.osu_id, tax_df[level])
    }

    df = osu_df.rename(columns=up_dict)

    # Only columns whose label contains an underscore are kept; this removes
    # 'Group' and any other column without one.
    # NOTE(review): presumably relies on taxon names always containing "_"
    # (e.g. rank prefixes) - confirm against the taxonomy table.
    without_underscore = [col for col in df.columns if "_" not in str(col)]
    df = df.drop(columns=without_underscore)

    is_clr = norm_type == 'clr'
    if is_clr:
        # Vectorised replacement for the deprecated element-wise applymap.
        df = np.exp(df)

    # Sum columns that share a label. groupby(axis=1) is deprecated, so group
    # the transposed frame on its index and transpose back.
    df = df.T.groupby(level=0).sum().T

    if is_clr:
        df = np.log(df)

    return pd.concat([df, osu_df.Group], axis=1)

In [3]:
def SVD_truncate(df,ncomp):
    """Project the feature columns of ``df`` onto ``ncomp`` truncated-SVD components.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Group`` column; every other column is used as a
        feature.
    ncomp : int
        Number of components passed to ``TruncatedSVD``.

    Returns
    -------
    svd_X : numpy.ndarray
        The transformed feature matrix.
    Y : list of int
        Integer code per row, numbered by order of first appearance in
        ``Group``.
    labels : array-like
        The distinct ``Group`` values; ``labels[Y[i]]`` is the group of row i.

    The explained variance ratios of the fitted model are printed.
    """
    labels = df.Group.unique()
    d_cat = {cat: i for i, cat in enumerate(labels)}
    Y = [d_cat[x] for x in df['Group'].tolist()]
    X = np.array(df.drop(columns=['Group']))

    svd = TruncatedSVD(n_components=ncomp, n_iter=10, random_state=42)
    # A single fit_transform both fits the model and returns the projection;
    # calling fit() beforehand only repeated the same decomposition.
    svd_X = svd.fit_transform(X)
    print("Explained variance ratios:")
    print(svd.explained_variance_ratio_)

    return svd_X, Y, labels

In [4]:
def make_dataset_osu_diff(df, ncomp, cutoff):
    """Select the ``ncomp`` features whose group means differ the most.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Group`` column; every other column is a feature.
    ncomp : int
        Number of top-ranked features to keep.
    cutoff : float
        A feature is discarded when its mean in any group is below this value.

    Returns
    -------
    X : numpy.ndarray
        Rows of ``df`` restricted to the selected features.
    Y : list of int
        Integer code per row, numbered by order of first appearance in
        ``Group``.
    labels : array-like
        The distinct ``Group`` values.
    inds : list
        Selected feature names, largest mean difference first.

    Only the first two groups (in sorted label order) enter the difference;
    fewer than two groups raises ``IndexError``.
    """
    labels = df.Group.unique()
    d_cat = {cat: i for i, cat in enumerate(labels)}
    Y = [d_cat[x] for x in df['Group'].tolist()]

    features = df.set_index(['Group'])

    # Per-group mean of every feature, laid out as features x groups.
    # Grouping on the index replaces the deprecated groupby(axis=1) on the
    # transposed frame.
    group_means = features.groupby(level=0).mean().T

    # Drop features whose mean is below the cutoff in at least one group.
    below_cutoff = (group_means < cutoff).any(axis=1)
    group_means = group_means[~below_cutoff]

    # Rank on a standalone Series rather than writing a 'diff' column into
    # the filtered slice (no SettingWithCopyWarning, no clash with a group
    # that happens to be called 'diff').
    mean_diff = (group_means.iloc[:, 0] - group_means.iloc[:, 1]).abs()
    ranked = mean_diff.sort_values(ascending=False)

    inds = ranked.index.tolist()[0:ncomp]
    X = np.array(features[inds])

    return X,Y,labels,inds

In [9]:
def make_dataset(df, cutoff):
    """Keep every feature whose mean reaches ``cutoff`` in all groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Group`` column; every other column is a feature.
    cutoff : float
        A feature is discarded when its mean in any group is below this value.

    Returns
    -------
    X : numpy.ndarray
        Rows of ``df`` restricted to the retained features.
    Y : list of int
        Integer code per row, numbered by order of first appearance in
        ``Group``.
    labels : array-like
        The distinct ``Group`` values.
    inds : list
        Retained feature names, in their original column order.
    """
    labels = df.Group.unique()
    d_cat = {cat: i for i, cat in enumerate(labels)}
    Y = [d_cat[x] for x in df['Group'].tolist()]

    features = df.set_index(['Group'])

    # Per-group mean of every feature (features x groups). Grouping on the
    # index replaces the deprecated groupby(axis=1) on the transposed frame.
    group_means = features.groupby(level=0).mean().T

    below_cutoff = (group_means < cutoff).any(axis=1)
    inds = group_means.index[~below_cutoff].tolist()
    X = np.array(features[inds])

    return X,Y,labels,inds

In [6]:
def make_dataset_zscore(df, ncomp, cutoff, plot_cutoff):
    """Select the ``ncomp`` features with the largest between-group z-score.

    The score of a feature is ``|mean_0 - mean_1| / ((std_0 + std_1) / 2)``
    where 0 and 1 are the first two groups in sorted label order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Group`` column; every other column is a feature.
    ncomp : int
        Number of top-ranked features to keep.
    cutoff : float
        A feature is discarded when any of its per-group means or per-group
        standard deviations is below this value.
    plot_cutoff : float
        Features scoring above this value are shown in a bar chart; nothing
        is drawn when no feature qualifies.

    Returns
    -------
    X : numpy.ndarray
        Rows of ``df`` restricted to the selected features.
    Y : list of int
        Integer code per row, numbered by order of first appearance in
        ``Group``.
    labels : array-like
        The distinct ``Group`` values.
    inds : list
        Selected feature names, highest score first.

    Fewer than two groups raises ``IndexError``.
    """
    labels = df.Group.unique()
    d_cat = {cat: i for i, cat in enumerate(labels)}
    Y = [d_cat[x] for x in df['Group'].tolist()]

    # Per-group statistics laid out as features x groups. Grouping on the
    # index replaces the deprecated groupby(axis=1) on the transposed frame.
    by_group = df.set_index(['Group']).groupby(level=0)
    means = by_group.mean().T
    stds = by_group.std().T

    # NOTE(review): the cutoff is applied to the standard deviations as well
    # as the means, exactly as before - confirm that is intended.
    below_cutoff = (means < cutoff).any(axis=1) | (stds < cutoff).any(axis=1)
    means = means[~below_cutoff]
    stds = stds[~below_cutoff]

    # Pair each mean with the std of the same group by label. Positional
    # lookups into a concatenated mean/std table only line up when there are
    # exactly two groups.
    first, second = means.columns[0], means.columns[1]
    pooled_std = (stds[first] + stds[second]) / 2
    zscore = ((means[first] - means[second]) / pooled_std).abs()
    zscore = zscore.sort_values(ascending=False)

    to_plot = zscore[zscore > plot_cutoff]
    if not to_plot.empty:
        # Draw on an explicitly created axes so the requested figure size is
        # applied to the bar chart and no empty figure is left behind.
        _, ax = plt.subplots(figsize=(15, 4))
        to_plot.to_frame('zscore').plot.bar(y='zscore', ax=ax)

    inds = zscore.index.tolist()[0:ncomp]
    X = np.array(df[inds])

    return X,Y,labels,inds

In [23]:
def feature_from_correlation(df,ncomp):
    """Select the ``ncomp`` features most correlated with the encoded ``Group``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Group`` column; every other column is a feature.
    ncomp : int
        Number of top-ranked features to keep.

    Returns
    -------
    X : numpy.ndarray
        Rows of ``df`` restricted to the selected features.
    Y : list of int
        Integer code per row, numbered by order of first appearance in
        ``Group``.
    labels : array-like
        The distinct ``Group`` values.
    inds : list
        Selected feature names, highest absolute correlation first.
    """
    labels = df.Group.unique()
    d_cat = {cat: i for i, cat in enumerate(labels)}
    # Encode only the Group column. A frame-wide replace() would also rewrite
    # feature values equal to a label and chain replacements (1 -> 0 -> 1).
    Y = [d_cat[x] for x in df['Group'].tolist()]

    # Positional index on both sides so corrwith aligns row-by-row even when
    # df carries a non-unique index.
    features = df.drop(columns=['Group']).reset_index(drop=True)
    target = pd.Series(Y, index=features.index)

    # Correlation with output variable only; the full feature-by-feature
    # matrix is never needed.
    cor_target = features.corrwith(target).abs()

    # Group itself is not among the candidates, so exactly ncomp features are
    # taken and there is nothing to drop afterwards.
    relevant_features = cor_target.sort_values(ascending=False).head(ncomp)

    inds = relevant_features.index.tolist()
    X = np.array(features[inds])

    return X,Y,labels,inds