# Making Computable Matrices

In [1]:
import os 
import pandas as pd 
import numpy as np
os.chdir('C:/Users/jreyna/Documents/Projects/cmi-pb-multiomics/third_challenge')
datadir = 'results/main/2024.01.05/cmi_pb_datasets/ps-processed-data/training_dataset/'
outdir = 'results/main/2024.01.05/cmi_pb_datasets/computable-matrices/training_dataset/'
os.makedirs(outdir, exist_ok=True)

IgG1 and IgG4 day 14 and day 0 values for PT, FHA, and Pertactin

In [2]:
# setting a list of assays
assays = ('plasma_cytokine_concentrations', 'pbmc_cell_frequency', 'abtiter', 'pbmc_gene_expression')

## Processing the Training Features

In [3]:
# load speciment data
specimen_fn = 'results/main/2024.01.05/cmi_pb_datasets/ps-processed-data/training_dataset/subject_specimen.tsv'
specimen_df = pd.read_table(specimen_fn)
specimen_df.loc[:, 'specimen_id'] = specimen_df.loc[:, 'specimen_id'].astype(str)

In [4]:
datasets = {'plasma_cytokine_concentration': None,
            'pbmc_cell_frequency': None,
            'plasma_ab_titer': None,
            'pbmc_gene_expression': None}


drop_cols = ['timepoint', 'specimen_id', 'actual_day_relative_to_boost', 'planned_day_relative_to_boost',
                'specimen_type', 'visit', 'infancy_vac', 'biological_sex', 'ethnicity',
                'race', 'year_of_birth', 'date_of_boost', 'dataset']
for assay in assays:
    tdata = []
    input_fn = os.path.join(datadir, '{}_batchCorrected_data.tsv'.format(assay))
    tdf = pd.read_table(input_fn).transpose()
    tdf = tdf.merge(specimen_df, left_index=True, right_on='specimen_id')
    tdf = tdf.loc[tdf['timepoint'] == 0]

    # save the data
    data = tdf.drop(drop_cols, axis=1).set_index('subject_id')
    output_fn = os.path.join(outdir, '{}.training-data.tsv'.format(assay))
    data.to_csv(output_fn, sep='\t')

In [5]:
data

Unnamed: 0_level_0,ENSG00000000419.12,ENSG00000000457.13,ENSG00000000938.12,ENSG00000001036.13,ENSG00000001167.14,ENSG00000001461.16,ENSG00000001497.16,ENSG00000001629.9,ENSG00000001631.15,ENSG00000002016.17,...,ENSG00000277972.1,ENSG00000278053.4,ENSG00000278195.1,ENSG00000278259.4,ENSG00000278311.4,ENSG00000278619.4,ENSG00000278828.1,ENSG00000278845.4,ENSG00000283787.1,ENSG00000284691.1
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,113.0,7.0,459.0,30.0,17.0,24.0,40.0,21.0,11.0,7.0,...,38.0,12.0,2.0,10.0,49.0,7.0,49.0,39.0,22.0,14.0
13,113.0,9.0,273.0,28.0,24.0,25.0,42.0,19.0,17.0,8.0,...,31.0,9.0,3.0,10.0,55.0,10.0,25.0,40.0,19.0,23.0
18,174.0,8.0,242.0,26.0,23.0,24.0,42.0,25.0,18.0,6.0,...,23.0,15.0,2.0,10.0,62.0,5.0,12.0,49.0,18.0,18.0
27,69.0,10.0,730.0,49.0,16.0,22.0,31.0,17.0,13.0,9.0,...,50.0,7.0,5.0,6.0,36.0,10.0,60.0,45.0,19.0,14.0
29,101.0,8.0,602.0,46.0,21.0,16.0,32.0,15.0,11.0,8.0,...,44.0,9.0,2.0,8.0,46.0,10.0,139.0,34.0,22.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,142.0,10.0,570.0,61.0,32.0,24.0,35.0,15.0,15.0,11.0,...,29.0,10.0,4.0,8.0,76.0,14.0,182.0,58.0,28.0,17.0
65,154.0,6.0,517.0,29.0,19.0,19.0,57.0,21.0,11.0,7.0,...,36.0,14.0,2.0,16.0,71.0,5.0,64.0,44.0,18.0,13.0
66,147.0,11.0,632.0,36.0,21.0,28.0,38.0,17.0,13.0,10.0,...,29.0,10.0,3.0,9.0,42.0,14.0,60.0,57.0,18.0,11.0
67,125.0,7.0,692.0,34.0,19.0,17.0,36.0,14.0,11.0,6.0,...,31.0,10.0,2.0,9.0,50.0,6.0,70.0,45.0,14.0,8.0


## Processing the Training Outcomes

In [6]:
# process abtiters
input_fn = os.path.join(datadir, '{}_batchCorrected_data.tsv'.format('abtiter'))
abtiters_df = pd.read_table(input_fn).transpose()
abtiters_df = abtiters_df.merge(specimen_df, left_index=True, right_on='specimen_id')

day14_abtiters_df = abtiters_df.loc[abtiters_df['timepoint'] == 14].set_index('subject_id').drop(drop_cols, axis=1)
day0_abtiters_df = abtiters_df.loc[abtiters_df['timepoint'] == 0].set_index('subject_id').drop(drop_cols, axis=1)

# calculate fold
fc_abtiters = day14_abtiters_df['IgG_PT'] /     day0_abtiters_df['IgG_PT']

In [7]:
# process cell frequencies
input_fn = os.path.join(datadir, '{}_batchCorrected_data.tsv'.format('pbmc_cell_frequency'))
cell_freqs_df = pd.read_table(input_fn).transpose()
cell_freqs_df = cell_freqs_df.merge(specimen_df, left_index=True, right_on='specimen_id')

day1_cell_freqs_df = cell_freqs_df.loc[cell_freqs_df['timepoint'] == 1].set_index('subject_id')
day0_cell_freqs_df = cell_freqs_df.loc[cell_freqs_df['timepoint'] == 0].set_index('subject_id')

# calculate fold
fc_cell_freqs = day1_cell_freqs_df['Monocytes'] / day0_cell_freqs_df['Monocytes']

In [8]:
# process gene expression
input_fn = os.path.join(datadir, '{}_batchCorrected_data.tsv'.format('pbmc_gene_expression'))
ge_df = pd.read_table(input_fn).transpose()
ge_df = ge_df.merge(specimen_df, left_index=True, right_on='specimen_id')

ccl3_id = 'ENSG00000277632.1'
day3_ge_df = ge_df.loc[ge_df['timepoint'] == 3].set_index('subject_id').drop(drop_cols, axis=1)
day0_ge_df = ge_df.loc[ge_df['timepoint'] == 0].set_index('subject_id').drop(drop_cols, axis=1)

# calculate fold change
fc_ges = day3_ge_df[ccl3_id] / day0_ge_df[ccl3_id]

In [9]:
train_outcomes = pd.concat([day14_abtiters_df['IgG_PT'], fc_abtiters,
                            day1_cell_freqs_df['Monocytes'], fc_cell_freqs,
                            day3_ge_df[ccl3_id], fc_ges], axis=1)

train_outcomes.columns = ['IgG_PT.day14',
                            'IgG_PT.day14/day0',
                            'Monocytes.day1',
                            'Monocytes.day1/day0',
                            'ENSG00000277632.1.day3',
                            'ENSG00000277632.1.day3/day0']

output_fn = os.path.join(outdir, 'task_matrix.feature_names.tsv')
train_outcomes.to_csv(output_fn, sep='\t', header=True, index=True)

## Processing the Testing Features

In [10]:
datadir = 'results/main/2024.01.05/cmi_pb_datasets/ps-processed-data/prediction_dataset/'
outdir = 'results/main/2024.01.05/cmi_pb_datasets/computable-matrices/prediction_dataset/'
os.makedirs(outdir, exist_ok=True)

In [11]:
# load speciment data
specimen_fn = 'results/main/2024.01.05/cmi_pb_datasets/ps-processed-data/prediction_dataset/subject_specimen.tsv'
specimen_df = pd.read_table(specimen_fn)
specimen_df.loc[:, 'specimen_id'] = specimen_df.loc[:, 'specimen_id'].astype(str)

In [12]:
specimen_df.head()

Unnamed: 0,specimen_id,subject_id,actual_day_relative_to_boost,planned_day_relative_to_boost,specimen_type,visit,infancy_vac,biological_sex,ethnicity,race,year_of_birth,date_of_boost,dataset,timepoint
0,730,97,-31,-30,Blood,1,wP,Male,Not Hispanic or Latino,White,1986-01-01,2021-11-29,2022_dataset,-30
1,731,97,-12,-15,Blood,2,wP,Male,Not Hispanic or Latino,White,1986-01-01,2021-11-29,2022_dataset,-15
2,732,97,0,0,Blood,3,wP,Male,Not Hispanic or Latino,White,1986-01-01,2021-11-29,2022_dataset,0
3,733,97,1,1,Blood,4,wP,Male,Not Hispanic or Latino,White,1986-01-01,2021-11-29,2022_dataset,1
4,734,97,3,3,Blood,5,wP,Male,Not Hispanic or Latino,White,1986-01-01,2021-11-29,2022_dataset,3


In [13]:
for assay in assays:
    tdata = []
    input_fn = os.path.join(datadir, '{}_processed_data.tsv'.format(assay))
    tdf = pd.read_table(input_fn).transpose()
    tdf = tdf.merge(specimen_df, left_index=True, right_on='specimen_id')
    tdf = tdf.loc[tdf['timepoint'] == 0]

    # save the data
    data = tdf.drop(drop_cols, axis=1).set_index('subject_id')
    output_fn = os.path.join(outdir, '{}.testing-data.tsv'.format(assay))
    data.to_csv(output_fn, sep='\t')

In [14]:
data.head()

Unnamed: 0_level_0,ENSG00000001036.13,ENSG00000001167.14,ENSG00000001461.16,ENSG00000001497.16,ENSG00000001629.9,ENSG00000001631.15,ENSG00000002016.17,ENSG00000002330.13,ENSG00000002549.12,ENSG00000002586.18,...,ENSG00000088832.16,ENSG00000088833.17,ENSG00000088876.11,ENSG00000088888.17,ENSG00000088930.7,ENSG00000088970.15,ENSG00000088986.10,ENSG00000088992.17,ENSG00000089006.16,ENSG00000089009.15
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
97,34.015,16.287,19.127,19.34,8.587,12.795,13.705,23.238,72.064,179.36,...,370.951,22.951,2.91,10.367,137.96,19.786,194.339,51.08,49.056,554.688
98,19.951,10.926,15.334,13.805,7.429,5.28,8.038,24.737,54.277,142.921,...,288.464,17.392,2.44,6.086,78.034,12.701,63.78,38.628,23.676,374.05
99,38.09,15.265,15.141,13.859,11.269,14.816,12.733,27.85,83.302,161.778,...,435.148,20.325,4.084,11.359,113.337,11.049,204.885,47.393,44.212,353.854
100,29.106,11.304,14.666,15.54,7.211,9.631,9.784,32.747,76.456,136.465,...,337.697,22.022,1.871,11.245,100.067,12.276,117.837,50.866,33.46,436.539
101,24.153,17.931,19.843,13.559,9.523,11.721,14.741,25.903,56.477,132.321,...,353.055,18.378,3.491,12.132,103.872,18.003,179.544,37.982,36.793,426.62
