In [None]:
import pickle
import pandas as pd
import re
import numpy as np
from rpy2.robjects import pandas2ri

In [None]:
##### load data #####
# Read the sampled expression data pickled by an earlier step and the
# per-file metadata table.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files that were produced by this pipeline itself.
# The context manager guarantees the file handle is closed after loading.
with open("result_03_sampled_data.obj", "rb") as sampled_file:
    expr_list = pickle.load(sampled_file)
cytof_files = pd.read_csv("result_01_cytof_file_info.csv")

# Quick sanity check: later cells index expr_list and the rows of
# cytof_files in parallel, so the two sizes are expected to agree.
print(len(expr_list))
print(cytof_files.shape)

In [None]:
##### only include CMV data and HAI_28 data #####
# A value compares equal to itself unless it is NaN, so `v == v` marks the
# rows in which the corresponding measurement is present.
has_cmv = [v == v for v in cytof_files.CMV_Ab]
has_hai = [v == v for v in cytof_files.HAI_28]

# Keep a file when at least one of the two measurements is available.
r1 = [cmv | hai for cmv, hai in zip(has_cmv, has_hai)]

# Apply the same positional mask to the metadata rows and to the
# expression objects so that both stay in step.
cytof_files = cytof_files.loc[r1, :]
expr_list = [expr for pos, expr in enumerate(expr_list) if r1[pos] == True]

print(len(expr_list))
print(cytof_files.shape)

In [None]:
##### look at markers #####
# Normalise every column name in place (drop everything from the first "("
# onwards, then upper-case) and gather the distinct names across all files.
marker_pool = set()
for expr in expr_list:
    expr.colnames = [re.sub("\\(.*", "", name).upper() for name in expr.colnames]
    marker_pool.update(expr.colnames)

# Alphabetically sorted list of the unique normalised names.
old_markers = sorted(marker_pool)
print(old_markers[0:5])

In [None]:
##### standardize markers #####
# Column names that are mapped to the placeholder string "NA".
_na_channels = [
    '', 'BA138DI', 'CE140DI', 'CE142DI', 'CS133DI',
    'LU176DI', 'OS189DI', 'PR141DI', 'PT195DI', 'XE131DI',
]

# Column names that are mapped to a canonical marker name.
_renamed_channels = {
    'CD45_ACUTE': "CD45",
    'CD45_CONV': "CD45",
    'CHIKV': 'CHIKV',
    'DEAD': 'VIABILITY',
    'DNA1': "DNA1",
    'DNA2': "DNA2",
    'EVENT_LENGTH': "CELL_LENGTH",
    'PD-1': "PD1",
}

# Lookup table: normalised column name -> standardized name.
marker_dict = {channel: "NA" for channel in _na_channels}
marker_dict.update(_renamed_channels)

# Replace every column name that has an entry in marker_dict; names without
# an entry are left as they are.
# NOTE(review): this writes through item assignment on `.colnames`, which
# presumably updates the underlying object in place -- confirm with rpy2.
for expr in expr_list:
    for col_pos in range(len(expr.colnames)):
        current_name = expr.colnames[col_pos]
        if current_name in marker_dict:
            expr.colnames[col_pos] = marker_dict[current_name]
            
##### look at standardized markers #####
# Re-apply the name normalisation and collect every column name of every
# file (duplicates kept) so that occurrences can be counted.
all_names = []
for expr in expr_list:
    expr.colnames = [re.sub("\\(.*", "", name).upper() for name in expr.colnames]
    all_names.extend(expr.colnames)

# One row per marker with the number of times it occurs, ordered by
# descending count and, within equal counts, descending marker name.
marker_sizes = pd.DataFrame({'markers': all_names}).groupby(['markers']).size()
new_markers = marker_sizes.reset_index(name='counts')
new_markers = new_markers.sort_values(by=['counts', 'markers'], ascending=False)

print(new_markers.head())


In [None]:
##### get unified data #####
# Keep only the markers whose occurrence count is at least 532.
# NOTE(review): 532 presumably equals the number of retained files, i.e.
# "marker present in every file" -- confirm before changing the cohort.
frequent = new_markers["counts"] >= 532
selected_markers = new_markers.loc[frequent, "markers"].tolist()
print(selected_markers)

# Convert each expression object with pandas2ri.ri2py and restrict it to the
# selected markers, in the same column order for every file.
for pos, r_frame in enumerate(expr_list):
    expr_list[pos] = pandas2ri.ri2py(r_frame).loc[:, selected_markers]

print(cytof_files.shape)
# Distinct (rows, columns) shapes across all files.
p1 = list({frame.shape for frame in expr_list})
print(len(expr_list))
print(p1)

In [None]:
#### define function for adding interactions #####
def interactions( df ):
    """Return ``df`` extended with all pairwise column products.

    For every pair of columns ``(i, j)`` with ``i < j`` a new column named
    ``"<name_i>_<name_j>"`` holding the element-wise product is appended
    after the original columns, in ``(i, j)`` iteration order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with numeric columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the interaction
        columns. The row index of ``df`` is preserved.
    """
    n_cols = df.shape[1]
    # Collect all pieces first and concatenate once: concatenating inside the
    # loop copies the growing frame for every pair.
    pieces = [df]
    for i in range(n_cols):
        for j in range(i + 1, n_cols):
            cn = f"{df.columns[i]}_{df.columns[j]}"
            # The product keeps df's own index, so the single concat below
            # aligns row-for-row even when the index is not 0..n-1.
            pieces.append((df.iloc[:, i] * df.iloc[:, j]).rename(cn))
    return pd.concat(pieces, axis=1)

def arcsinh(x, cofactor=5):
    """Apply the inverse hyperbolic sine transform ``arcsinh(x / cofactor)``.

    Parameters
    ----------
    x : scalar, numpy.ndarray, pandas.Series or pandas.DataFrame
        Values to transform; anything that supports division by a number
        and can be passed to ``numpy.arcsinh``.
    cofactor : float, optional
        Divisor applied before the transform. Defaults to 5, the value this
        notebook uses.

    Returns
    -------
    Same kind of object as ``x / cofactor`` with the transform applied
    element-wise.
    """
    return np.arcsinh(x / cofactor)

def scale(x):
    """Standardize ``x`` by subtracting its mean and dividing by its std.

    The mean and standard deviation are taken with ``numpy.mean`` and
    ``numpy.std`` using their default arguments. There is no guard against
    a zero standard deviation.
    """
    center = np.mean(x)
    spread = np.std(x)
    centered = x - center
    return centered / spread

In [None]:
# Marker names of the stacked array: the columns of the first frame without
# the "TIME" column.
coln = expr_list[0].columns.drop("TIME")

# For every file: drop "TIME", apply the arcsinh transform column-wise and
# append a trailing axis of length 1 to the resulting 2-D array.
# Scaling, interaction terms and rank transformation are deliberately not
# applied here.
for pos, frame in enumerate(expr_list):
    transformed = frame.drop(columns="TIME").apply(arcsinh).values
    expr_list[pos] = transformed.reshape(transformed.shape + (1,))

# Stack the per-file arrays along a new leading axis into one 4-D array.
expr_list = np.stack(expr_list)
print(expr_list.shape)

In [None]:
# Export a random sample of rows from the files of study SDY519.
# Drop the trailing length-1 axis and keep only the files whose metadata row
# belongs to SDY519. `.values` turns the boolean Series into a plain
# positional mask so its (non-default) index labels play no role.
is_sdy519 = (cytof_files.study_accession == "SDY519").values
expr_sdy519 = expr_list.reshape(expr_list.shape[0:3])[is_sdy519]
print(expr_sdy519.shape)

# Flatten the *filtered* array to one row per (file, row) pair. Reshaping
# the unfiltered expr_list here would silently discard the study filter and
# export rows from every study under the SDY519 file name.
expr_df = expr_sdy519.reshape([expr_sdy519.shape[0] * expr_sdy519.shape[1],
                               expr_sdy519.shape[2]])
expr_df = pd.DataFrame(expr_df)
expr_df.columns = coln
print(expr_df.shape)
display(expr_df.head())

# Sample at most 30000 rows: capping at the number of available rows avoids
# the ValueError that sample() raises when n exceeds the frame length, and
# the fixed seed makes the exported sample reproducible across runs.
expr_df = expr_df.sample(n=min(30000, len(expr_df)), random_state=0)
expr_df.to_csv("Result_04_SDY519_expr.csv",index=False)

In [None]:
# Bundle the filtered metadata table, the stacked expression array and the
# marker names into one dictionary and pickle it for later steps.
processed_data = dict(
    cytof_files=cytof_files,
    expr_list=expr_list,
    marker_names=coln,
)

with open("result_04_processed_data_no_scale.obj", "wb") as out_file:
    pickle.dump(processed_data, out_file)

# Also keep the filtered metadata table as CSV (row index included).
cytof_files.to_csv("result_04_cytof_files.csv")