# Preparing Computable Matrices

In [1]:
import os 
import pandas as pd 
import numpy as np

# run from the project dir: if the current directory has no 'workflow'
# entry, move three levels up
if 'workflow' not in os.listdir('.'):
    os.chdir('../../../')

# input / output locations, relative to the project dir
datadir = 'results/harmonized/'
outdir = 'results/processed/harmonized/'

# create the output dir (no error if it already exists)
os.makedirs(outdir, exist_ok=True)

# debug switch: when True, the assay loops skip 'pbmc_gene_expression'
debug = False

In [2]:
# lookup from a task feature name to the (simplified) column name that holds
# it; ccl3 and its gene id (presumably the Ensembl id) map to each other
task_dict = {}
task_dict['igg_pt'] = 'igg_pt'
task_dict['monocytes'] = 'monocytes'
task_dict['ccl3'] = 'ensg00000277632'
task_dict['ensg00000277632'] = 'ccl3'

## Clean the training feature datasets

In [3]:
# assay name -> path of its harmonized wide-format training table
assays = {}
assays['plasma_cytokine_concentrations'] = 'results/harmonized/training_plasma_cytokine_concentrations_by_olink_wide.tsv'
assays['pbmc_cell_frequency'] = 'results/harmonized/training_pbmc_cell_frequency_wide.tsv'
assays['plasma_antibody_levels'] = 'results/harmonized/training_plasma_antibody_levels_wide.tsv'
assays['pbmc_gene_expression'] = 'results/harmonized/training_pbmc_gene_expression_wide_tpm.tsv'

In [4]:
# load the specimen/clinical data
wetlab_data = pd.read_table('results/harmonized/training_subject_specimen.tsv')

# baseline specimens for the feature matrices: rows whose
# actual_day_relative_to_boost is 0
# NOTE(review): an earlier version filtered on planned_day_relative_to_boost == 0
# instead -- confirm that selecting on the actual day is intended
is_boost_day = wetlab_data['actual_day_relative_to_boost'] == 0
feature_wetlab_data = wetlab_data.loc[is_boost_day]

In [5]:
# function to simplify the column names
def simplify_colname(x, assay):
    """Return a lowercase, underscore-separated version of column name ``x``.

    The name is lowercased; spaces, hyphens and slashes become underscores
    and parentheses are removed. For the ``'pbmc_gene_expression'`` assay
    only the text before the first ``'.'`` is kept before that clean-up.

    Parameters
    ----------
    x : str
        Original column name.
    assay : str
        Assay key; only ``'pbmc_gene_expression'`` triggers the dot cut.

    Returns
    -------
    str
        The simplified column name.
    """
    # gene-expression columns: keep only the part before the first dot
    name = x.split('.')[0] if assay == 'pbmc_gene_expression' else x

    # same substitutions, in the same order, for every assay
    name = name.lower()
    for old, new in ((' ', '_'), ('-', '_'), ('(', ''), (')', ''), ('/', '_')):
        name = name.replace(old, new)
    return name

In [6]:
for assay, fn in assays.items():

    # debug runs leave out the gene-expression assay
    if debug == True and assay == 'pbmc_gene_expression':
        continue

    # load the wide assay table; its first column becomes the index, which is
    # matched against specimen_id in the merge below
    main_data = pd.read_table(fn, index_col=0)
    main_data.columns = [simplify_colname(col, assay) for col in main_data.columns]
    feature_cols = main_data.columns.tolist()

    # attach the specimen/clinical rows held in feature_wetlab_data
    # (merge defaults to an inner join, so unmatched specimens are dropped)
    temp_data = main_data.merge(feature_wetlab_data, left_index=True, right_on='specimen_id')

    # one row per merged specimen, indexed by subject_id; nothing is averaged
    # here (an earlier version aggregated per subject with groupby + mean)
    agg_data = temp_data.set_index('subject_id')[feature_cols]

    # write the matrix to <outdir>/training_<assay>.tsv
    output_fn = os.path.join(outdir, 'training_{}.tsv'.format(assay))
    agg_data.to_csv(output_fn, sep='\t')

## Clean the training outcome dataset

In [7]:
# assay -> (task feature name, post-boost day used for that task)
task_goals = {
    'plasma_antibody_levels': ('igg_pt', 14),
    'pbmc_cell_frequency': ('monocytes', 1),
    'pbmc_gene_expression': ('ccl3', 3),
}

In [8]:
task_data = []
for assay, (task, day) in task_goals.items():

    # column that holds this task's measurement after name simplification
    task_id = task_dict[task]

    # debug runs leave out the gene-expression assay
    if debug == True and assay == 'pbmc_gene_expression':
        continue

    # keep specimens taken on day 0 or on the task's target day
    # NOTE(review): rows are selected on actual_day_relative_to_boost, while the
    # columns kept below carry planned_day_relative_to_boost -- confirm intended
    day_mask = wetlab_data.actual_day_relative_to_boost.isin([0, day])
    curr_wetlab_data = wetlab_data.loc[day_mask]

    # load the assay table and reduce it to the single task column
    fn = assays[assay]
    main_data = pd.read_table(fn, index_col=0)
    main_data.columns = [simplify_colname(col, assay) for col in main_data.columns]
    main_data = main_data[[task_id]]

    # attach the specimen/clinical rows (assay index matched to specimen_id)
    temp_data = main_data.merge(curr_wetlab_data, left_index=True, right_on='specimen_id')

    # subject, planned day and measured value are all that is needed downstream
    agg_data = temp_data[['subject_id', 'planned_day_relative_to_boost', task_id]]

    # long-format record for this task: reset_index turns the current row
    # index into a regular column, then the task is tagged and the measurement
    # column is given the common name 'value'
    curr_data = agg_data.reset_index()
    curr_data['feature'] = task
    curr_data = curr_data.rename(columns={task_id: 'value'})
    task_data.append(curr_data)

In [9]:
# stack the per-task long tables row-wise into a single frame
task_df = pd.concat(task_data)

In [10]:
# label every measurement as <feature>_day<planned day>
task_df['task'] = task_df['feature'] + '_day' + task_df['planned_day_relative_to_boost'].astype(str)

# reshape to one row per subject and one column per task label
task_df = task_df.pivot(index='subject_id', columns=['task'], values='value')

# fold change = post-boost value divided by the day-0 value
for post_col, base_col, ratio_col in (
    ('igg_pt_day14', 'igg_pt_day0', 'igg_pt_day14_fold_change'),
    ('monocytes_day1', 'monocytes_day0', 'monocytes_day1_fold_change'),
    ('ccl3_day3', 'ccl3_day0', 'ccl3_day3_fold_change'),
):
    task_df[ratio_col] = task_df[post_col] / task_df[base_col]

# column order of the final matrix: value, fold change, baseline per task
final_cols = [
    'igg_pt_day14', 'igg_pt_day14_fold_change', 'igg_pt_day0',
    'monocytes_day1', 'monocytes_day1_fold_change', 'monocytes_day0',
    'ccl3_day3', 'ccl3_day3_fold_change', 'ccl3_day0',
]
final_task_df = task_df.loc[:, final_cols]

In [11]:
# show the assembled task matrix as the cell output
final_task_df

task,igg_pt_day14,igg_pt_day14_fold_change,igg_pt_day0,monocytes_day1,monocytes_day1_fold_change,monocytes_day0,ccl3_day3,ccl3_day3_fold_change,ccl3_day0
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,,,,,,,46.410,,
3,7.041547,,,,,,26.204,,
4,5.745959,,,7.211965,,,13.353,,
5,5.327203,,,,,,20.618,,
6,8.856575,,,41.380502,,,19.606,,
...,...,...,...,...,...,...,...,...,...
114,0.739707,2.068277,0.357644,23.700000,1.281081,18.5,39.771,1.158322,34.335
115,0.899269,0.518155,1.735520,15.200000,0.844444,18.0,74.769,0.239979,311.565
116,,,1.678243,13.100000,0.590090,22.2,285.175,5.520549,51.657
117,1.381771,1.329841,1.039050,9.140000,0.768067,11.9,54.262,2.501821,21.689


In [12]:
# write the task matrix as a tab-separated file in outdir
fn = os.path.join(outdir, 'task_matrix.tsv')
final_task_df.to_csv(path_or_buf=fn, sep='\t')