# Preprocessing batches of data

#### This program takes an .xlsx file with single-cell protein expression values on the first sheet and zero-cell protein expression values on the second sheet. The first column should hold the single-cell number and the remaining columns should hold protein expression values. The script takes single-cell protein expression values that show a correlated slope when the expressions of two proteins are compared, and preprocesses them by evening out the background to remove that correlation.

#### The script uses the equation y = ((x - (x - bg)/std) / bg) * 1000, where x is the original protein expression, y is the processed protein expression, bg is the background, calculated as the mean of the 3 lowest expressions in each batch row, and std is the standard deviation, calculated from the 5 lowest expressions in each protein column. The result is multiplied by 1000 for scaling.

#### The output is an Excel file with the preprocessed single-cell protein expression values on the first sheet and the preprocessed zero-cell protein expression values on the second sheet. The result should have lower protein expression values, with a correlation near 0 for any two proteins compared. The expression values should also be above the baseline/control expression values.

#### A few values need to be changed for each dataset: input_path, protein_num, the sheet_name used for single_cell and zero_cell, and output_path.

In [1]:
import numpy as np
import pandas as pd
import statistics

# Path to the workbook holding both the single-cell and zero-cell sheets
input_path = 'C:/Users/jesse/OneDrive/Documents/Multiplex Lab/Data/NewADvsWT/Batch preprocessing sample.xlsx'


# How many proteins each panel contributes, in column order
protein_num = [47, 45, 48, 42]


# Both data sets are expected in the same workbook; adjust sheet_name to match
# the sheet titles of the file being processed. Empty cells are replaced with
# 0 as each sheet is read; a 0 marks a value that is left out of the
# quality-control calculations.
single_cell = pd.read_excel(input_path, sheet_name="Single cells").fillna(0)
zero_cell = pd.read_excel(input_path, sheet_name="Zero cells").fillna(0)

print('File Read')

File Read


In [2]:
# Define the function that will be applied
def slope_equation(df, std):
    """Background-normalise one row of a panel with the slope equation.

    Applies ``y = ((x - (x - bg) / std) / bg) * 1000`` element-wise, where
    ``bg`` is the mean of the (up to) three smallest nonzero values of the
    row. Zeros are treated as empty cells: they do not contribute to the
    background and are 0 in the result.

    Parameters
    ----------
    df : pandas.Series or numpy.ndarray
        A single row of one batch/panel of the data.
    std : pandas.Series, numpy.ndarray or float
        Standard deviation for each position of ``df``, calculated beforehand.

    Returns
    -------
    pandas.Series or numpy.ndarray
        The processed row, in the same layout as ``df``. A row without any
        nonzero value comes back as all zeros.
    """
    values = np.asarray(df, dtype=float)
    is_zero = values == 0

    # Background value: mean of the lowest three nonzero expressions
    lowest = np.sort(values[~is_zero])[:3]
    if lowest.size == 0:
        # Nothing to estimate a background from; every entry is an empty
        # cell, so the row stays zero (avoids a mean over an empty slice).
        return df * 0.0
    bg = lowest.mean()

    signal = ((df - (df - bg) / std) / bg) * 1000  # slope equation

    # Restore the empty cells with a boolean mask, which selects by position
    # no matter what kind of labels the row carries.
    signal[is_zero] = 0

    return signal



# Define the standard deviation of each column to be inputted into the apply function
def get_std(df):
    """Return the standard deviation of the five smallest nonzero values.

    Parameters
    ----------
    df : iterable of numbers
        The values to summarise; zeros are ignored. When fewer than five
        nonzero values exist, all of them are used.

    Returns
    -------
    float
        Population standard deviation (``numpy.std``) of the selected values.
    """
    nonzero_ascending = sorted(value for value in df if value != 0)
    lowest_five = nonzero_ascending[:5]
    return np.std(lowest_five)

In [3]:
# Run the equation on every value per row for each batch.
# The per-column standard deviations are computed once, on the data as loaded,
# before any panel is overwritten.
std_single = single_cell.apply(get_std, axis=0)
std_zero = zero_cell.apply(get_std, axis=0)

# Walk the panels left to right. Column 0 is the 'single-cell' number column,
# so the first panel starts at column 1.
start = 1
for panel_size in protein_num:
    end = start + panel_size
    panel = slice(start, end)  # positional column range of this panel

    # Using the slope equation on every value of each row, where each row has
    # its own background value. .iloc keeps the std slice positional so it
    # always lines up with the panel's columns.
    single_cell.iloc[:, panel] = single_cell.iloc[:, panel].apply(
        slope_equation, std=std_single.iloc[panel], axis=1)
    zero_cell.iloc[:, panel] = zero_cell.iloc[:, panel].apply(
        slope_equation, std=std_zero.iloc[panel], axis=1)

    start = end  # the next panel begins where this one stopped

print('File Preprocessed')

File Preprocessed


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [4]:
# Save the results in an Excel file output with both single-cell and zero-cell sheets
output_path = 'C:/Users/jesse/OneDrive/Documents/Multiplex Lab/Data/NewADvsWT/Batch preprocessing sample correct.xlsx'

# The context manager saves and closes the workbook when the block exits.
# ExcelWriter.save() is no longer available from pandas 2.0 onwards, and
# without the `with` the file handle stays open if a sheet write fails.
with pd.ExcelWriter(output_path, mode='w') as writer:
    single_cell.to_excel(writer, sheet_name="Single cells", index=False)
    zero_cell.to_excel(writer, sheet_name="Zero cells", index=False)

print('File Saved')

File Saved


# Subtraction preprocessing

#### This portion takes the processed single-cell data and subtracts from each protein column the average of the corresponding processed zero-cell column plus factor x SD, where factor is the multiplier applied to the SD. The input is the preprocessed file and the output is an .xlsx file with a single sheet containing the result of the subtraction.

#### The factor is a list of SD multipliers. A factor of [2, 3] runs the subtraction twice, each time with a different factor x SD value, and one output Excel file is saved per factor.

#### Values to be changed: input_path, protein_num, factor, output_path

In [5]:
import numpy as np
import pandas as pd

# Location of the workbook to read for the subtraction step
input_path = 'C:/Users/jesse/OneDrive/Documents/Multiplex Lab/Data/NewADvsWT/Batch preprocessing sample correct.xlsx'


# Protein count of each panel, in column order
protein_num = [47, 45, 48, 42]


# The single-cell and zero-cell data are expected on separate sheets of the
# same workbook; edit the sheet names below to match the file being processed
single_cell, zero_cell = (
    pd.read_excel(input_path, sheet_name=sheet)
    for sheet in ("Single cells", "Zero cells")
)

print('File Read')

File Read


In [6]:
# Subtracting preprocessed single-cell data by preprocessed zero-cell data + SD (standard deviation)

# statistics is needed below but is not among this section's own imports;
# importing it here lets the subtraction section run without the first part
import statistics

# This variable is the multiplicative factor for the SD. For example, a factor of 2 will represent 2*SD that will be
# added to the processed zero-cell background to be subtracted from the processed single-cell data
factor = [2, 3]

# The standard deviation and mean of each zero-cell column
SD = zero_cell.apply(np.std, axis=0)
# NOTE(review): harmonic_mean yields 0 for a column that contains a zero and
# raises StatisticsError on negative values - confirm this is the intended
# "average" for the background.
average = zero_cell.apply(statistics.harmonic_mean, axis=0)

total_proteins = np.sum(protein_num) + 1

# These pieces do not depend on the factor, so slice them once. Column 0 (the
# cell number) is skipped; .iloc makes the positional slicing explicit.
protein_values = single_cell.iloc[:, 1:total_proteins]
background = average.iloc[1:total_proteins]
spread = SD.iloc[1:total_proteins]
cell_numbers = range(1, len(single_cell) + 1)

for sd_factor in factor:
    subtracted = protein_values - (background + sd_factor * spread)

    # Change negative and NAN/NULL values to 0
    subtracted = subtracted.clip(lower=0).fillna(0)

    # Put a running cell number (1..n) back as the first column
    subtracted.insert(loc=0, column=single_cell.columns[0], value=cell_numbers)

    # Save the output, one workbook per factor
    output_path = 'C:/Users/jesse/OneDrive/Documents/Multiplex Lab/Data/NewADvsWT/Batch preprocessing sample subtracted %sSD.xlsx' % sd_factor

    subtracted.to_excel(output_path, index=False)

print('File(s) Saved')

File(s) Saved
