In [1]:
import numpy as np
import pandas as pd
from skbio.stats.composition import clr
import math
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Settings read by css_normalization.

# Relative tolerance: two neighbouring quantiles count as stable when their
# deviations differ by less than this fraction of the upper quantile's deviation.
instability_metric = 0.05

# Factor the CSS-normalized counts are multiplied by.
N = 10 ** 5

def clr_normalization(osu_df):
    '''Aitchison centered log-ratio (CLR) transform of the composition data.

    The long-format table is pivoted with ``pivot_osu_df`` so that every row is
    a ``sample_id`` and every column an ``osu_id`` holding ``osu_count`` values.
    Each zero is then replaced by a pseudocount of 0.5 and the result is passed
    to ``clr``.

    Returns a DataFrame of transformed values carrying the same row and column
    labels as the pivoted table.
    '''
    # Samples become rows, OSUs (features) become columns
    counts = pivot_osu_df(osu_df, 'sample_id', 'osu_count', 'osu_id')

    # Zeros cannot be log-transformed, so swap them for a 0.5 pseudocount
    counts = counts.replace(0, 0.5)

    # Re-attach the sample / OSU labels to the clr output
    return pd.DataFrame(data=clr(counts), index=counts.index, columns=counts.columns)

In [3]:
def tss_normalization(osu_df):
    '''Total sum scaling (TSS) of a long-format OSU count table.

    Each ``osu_count`` is divided by the sum of ``osu_count`` over all rows
    sharing its ``sample_id``; the result is stored in a ``norm_count`` column
    and the table is pivoted with ``pivot_osu_df``.

    Parameters:
    osu_df:
    DataFrame with at least ``sample_id``, ``osu_id`` and ``osu_count`` columns

    Returns a DataFrame containing the samples as rows and OSUs as columns,
    filled with the normalized counts.
    '''
    # Per-row total of its own sample. ``transform`` always returns a Series
    # aligned with the input rows, whereas ``groupby.apply`` with a
    # Series-returning lambda yields a wide DataFrame when the table holds a
    # single sample, which breaks the column assignment.
    sample_totals = osu_df.groupby('sample_id')['osu_count'].transform('sum')
    osu_df = osu_df.assign(norm_count=osu_df['osu_count'] / sample_totals)

    # Pivot so samples are rows and osu_ids (features) are columns
    return pivot_osu_df(osu_df, 'sample_id', 'norm_count', 'osu_id')
    

In [4]:
def css_normalization(osu_df):
    '''Cumulative sum scaling (CSS) of a long-format OSU count table.

    Steps performed:
    1. Pivot to OSUs (rows) x samples (columns) with ``pivot_osu_df``.
    2. For every sample, read 100 evenly spaced quantiles (0..1) from its
       sorted positive counts.
    3. At every quantile take the median over samples, then the median absolute
       deviation of the samples from that median.
    4. Walk from the top quantile downwards and stop at the first step where
       two neighbouring deviations differ by less than
       ``instability_metric`` times the upper one; this selects ``l_hat``.
       If no step qualifies, the lowest quantile is used.
    5. The normalization constant of a sample is the sum of its quantile
       values lying strictly above ``l_hat``; counts are divided by it and
       multiplied by the module-level ``N``.

    Side effect: the per-quantile deviation curve is drawn with ``.plot()``.

    Returns a DataFrame containing the samples as rows and OSUs as columns.

    Raises:
    IndexError if a sample has no positive counts, or if the very first
    (topmost) quantile step is already stable so that the cut-off would fall
    beyond the last quantile.
    '''
    # Pivot DataFrame: OSUs as rows, samples as columns
    osu_df = pivot_osu_df(osu_df, 'osu_id', 'osu_count', 'sample_id')
    osu_df = osu_df.fillna(0)

    quantiles = np.linspace(0, 1, num=100)

    # Collect every (sample, quantile, value) record first and build the table
    # once; concatenating one-row frames inside the loop is quadratic.
    records = []
    for sample in osu_df.columns.tolist():
        counts = osu_df[sample]
        n_nonzero = counts.astype(bool).sum(axis=0)
        positive = counts.sort_values()
        positive = positive[positive > 0]
        if len(positive) == 0:
            raise IndexError(
                'css_normalization: sample %r has no positive counts' % (sample,))
        for q in quantiles:
            position = int(q * (n_nonzero - 1))
            records.append({'sample_id': sample, 'l': q, 'ql': positive.iloc[position]})

    cs = pd.DataFrame(records)

    # Quantiles as rows, samples as columns
    cs = pd.pivot_table(cs,
                        index=['l'],
                        values=['ql'],
                        columns=['sample_id'])
    cs.columns = [col[1] for col in cs.columns]

    # The median qL for all samples at each quantile
    dl = cs.median(axis=1).tolist()

    # The median absolute difference between the median qL and each sample's qL
    dl_median = cs.sub(dl, axis='index').abs().median(axis=1)
    dl_median.plot()

    dl_med_list = dl_median.tolist()
    n_quantiles = len(dl_med_list)

    # Scan from the top quantile down for the first stable step. The fallback
    # value makes the lookup below land on the lowest quantile when nothing
    # qualifies.
    l_hat_index = n_quantiles
    for i in range(n_quantiles - 1):
        dl_i = dl_med_list[n_quantiles - i - 1]
        dl_i1 = dl_med_list[n_quantiles - i - 2]
        if abs(dl_i - dl_i1) < dl_i * instability_metric:
            l_hat_index = i
            break

    # NOTE(review): the cut-off sits one quantile ABOVE the upper member of the
    # stable pair. When the pair is found at i == 1 this is the top quantile,
    # nothing lies above it and every normalization constant becomes 0 --
    # confirm whether ``n_quantiles - l_hat_index - 1`` was intended.
    l_hat_position = n_quantiles - l_hat_index
    if l_hat_position >= n_quantiles:
        raise IndexError(
            'css_normalization: the topmost quantile step is already stable, '
            'so no quantile lies above the cut-off')

    # The quantile that indicates the normalization constant
    l_hat = dl_median.index.tolist()[l_hat_position]

    # Normalization constant per sample
    sjs = cs[cs.index > l_hat].sum(axis=0)

    # Divide counts by the normalization constants and rescale
    osu_df = osu_df.div(sjs, axis=1)
    osu_df = osu_df * N

    # Transpose so the frame has samples as rows like the other normalizations
    return osu_df.T
 

In [5]:
   def pivot_osu_df(osu_df, inds,vals,cols):
       '''Pivot a long-format table into a wide one.

       Parameters:
       osu_df: long-format DataFrame
       inds: name of the column whose values become the row labels
       vals: name of the column holding the numbers to aggregate
       cols: name of the column whose values become the column labels

       Entries sharing the same (inds, cols) pair are summed. The rows are
       sorted by label, the outer ``vals`` column level is dropped and missing
       combinations are filled with zeros.

       Returns the pivoted DataFrame.
       '''
       # The string 'sum' selects the same aggregation as np.sum without the
       # pandas FutureWarning raised for callables.
       wide = pd.pivot_table(osu_df,
                             index=[inds],
                             values=[vals],
                             columns=[cols],
                             aggfunc='sum')
       wide = wide.sort_index(axis=0)

       # Columns are a (vals, cols) MultiIndex; keep only the cols level
       wide.columns = wide.columns.droplevel()

       return wide.fillna(0)
    

In [6]:
def join_osus(files,norm_type):
    '''
    Joins osu_abundance files and normalizes in one of three ways:

    Parameters:
    files:
    list of files that are output from HiMAP (read as tab-separated tables)

    norm_type:
    Type of normalization from the following:
    tss - total sum scaling: divide each sample by total sum of reads
    clr - Aitchison's centered log-ratio
    css - Cumulative sum scaling

    Returns a dataframe containing the samples as rows and OSUs as columns.
    Any other norm_type leaves the concatenated long-format table
    un-normalized (with the per-file row number kept in an ``index`` column).
    '''
    # Read everything first and concatenate once instead of growing the
    # frame file by file.
    frames = [pd.read_csv(file, sep='\t') for file in files]
    osu_df = pd.concat(frames) if frames else pd.DataFrame()

    # Row labels repeat across files; replace them with a unique range
    osu_df = osu_df.reset_index()

    if norm_type == 'tss':
        osu_df = tss_normalization(osu_df)
    elif norm_type == 'clr':
        osu_df = clr_normalization(osu_df)
    elif norm_type == 'css':
        osu_df = css_normalization(osu_df)

    return osu_df
    
        

In [1]:
def join_taxonomy(files):
    '''Join tab-separated taxonomy files into one table.

    Parameters:
    files:
    list of paths to tab-separated taxonomy tables

    The tables are stacked, fully identical rows are collapsed to their first
    occurrence and the ``pctsim`` column is removed when it exists.

    Returns the combined DataFrame (original per-file row labels are kept).
    '''
    # Read everything first and concatenate once
    frames = [pd.read_csv(file, sep='\t') for file in files]
    tax_df = pd.concat(frames) if frames else pd.DataFrame()

    # Duplicates are judged on all columns, i.e. before pctsim is removed
    tax_df = tax_df.drop_duplicates(keep='first')

    # errors='ignore' tolerates a missing pctsim column without hiding
    # unrelated failures the way a bare except would
    return tax_df.drop(columns=['pctsim'], errors='ignore')

def get_labels(meta_file):
    '''Load the sample metadata table.

    Reads ``meta_file`` as comma-separated text and uses its ``run_accession``
    column as the row index. The complete table is returned, not only a
    label column.
    '''
    return pd.read_csv(meta_file, index_col='run_accession', sep=',')
        
def normalize_counts(osu_df):
    '''Normalizes counts of an OSU DataFrame grouping by <sample_id> and summing <osu_count>.

    Returns a copy of ``osu_df`` with an added ``norm_count`` column holding
    ``osu_count`` divided by the total ``osu_count`` of the row's sample.
    '''
    # ``transform`` keeps the result aligned with the input rows for any number
    # of samples; ``groupby.apply`` with a Series-returning lambda yields a
    # wide DataFrame when only one sample is present.
    sample_totals = osu_df.groupby('sample_id')['osu_count'].transform('sum')
    return osu_df.assign(norm_count=osu_df['osu_count'] / sample_totals)


def join_osu_with_labels(osu_df,meta_df):
    '''Append the ``Group`` column of ``meta_df`` to ``osu_df``.

    The two objects are aligned on their row labels (the union of both, sorted
    when they differ) and every missing value in the result, including a
    missing ``Group``, is replaced by 0.
    '''
    labelled = pd.concat([osu_df, meta_df.Group], axis=1, sort=True)
    return labelled.fillna(0)

def select_groups(df,groups):
    '''Return the rows of ``df`` whose ``Group`` value is listed in ``groups``.

    Rows are emitted group by group in the order given by ``groups`` (a group
    named twice is emitted twice). An empty DataFrame is returned when
    ``groups`` is empty.
    '''
    # Gather the per-group slices and concatenate once instead of growing the
    # result inside the loop.
    selected = [df[df['Group'] == g] for g in groups]
    if not selected:
        return pd.DataFrame()

    return pd.concat(selected, sort=True)

def get_sample_weights(y):
    '''Per-sample weights that up-weight the rarer of two classes.

    Parameters:
    y:
    sequence of 0/1 labels

    Samples of the more frequent class get weight 1 and samples of the rarer
    class get (majority count / minority count). When the classes are balanced,
    or when only one class is present, every weight is 1.

    Returns a numpy array with one weight per entry of ``y``.
    '''
    one_sum = sum(y)
    zero_sum = len(y) - one_sum

    # Balanced data needs no re-weighting; with a single class there is no
    # minority to scale (and the ratio below would divide by zero).
    if one_sum == 0 or zero_sum == 0 or one_sum == zero_sum:
        return np.array([1 for _ in y])

    if one_sum > zero_sum:
        scale = one_sum / zero_sum
        return np.array([1 if label == 1 else scale for label in y])

    scale = zero_sum / one_sum
    return np.array([1 if label == 0 else scale for label in y])