In [93]:
import os
import numpy as np
import pandas as pd 
import glob
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, LassoCV, ElasticNetCV

# navigate to the project folder
os.chdir('C:/Users/jreyna/Documents/Projects/cmi-pb-multiomics/third_challenge')

# setting the output dir
outdir = 'results/main/2024.01.05/submissions/models/jive_models/'
os.makedirs(outdir, exist_ok=True)

In [94]:
model_dict = {'lr': LinearRegression, 
              'lasso': Lasso,
              'elastic_net': ElasticNet, 
              'lasso_cv': LassoCV,
              'elastic_net_cv': ElasticNetCV}

#cmodel = 'lr'
cmodel = 'lasso'
cmodel = 'elastic_net'
cmodel = 'lasso_cv'
cmodel = 'elastic_net_cv'
cmodel_function = model_dict[cmodel]

## Loading the loading matrices

In [95]:
loadings = {} 
tpl = "results/main/2024.01.05/cmi_pb_datasets/computable-matrices/training_dataset/*.jive-loadings.tsv"
for loading_fn in glob.glob(tpl):
    
    bn = os.path.basename(loading_fn)
    assay = bn.split('.')[0]
    loadings_matrix = pd.read_table(loading_fn)
    loadings[assay] = loadings_matrix

## Loading the input data

In [96]:
# setting up dictionaries to load data and results
train_features = {}
train_outcomes = {}

test_features = {}
test_preds = {}

#### Training Features (calculating the reduce form of each omic)

In [97]:
with open('results/main/2024.01.05/cmi_pb_datasets/computable-matrices/training_dataset/common_samples.txt', 'r') as f:
    common_samples = [int(x.strip()) for x in f.readlines()]

In [98]:
tpl = 'results/main/2024.01.05/cmi_pb_datasets/computable-matrices/training_dataset/*.training-data.tsv'
for raw_fn in glob.glob(tpl):  
    
    bn = os.path.basename(raw_fn)
    assay = bn.split('.')[0]
    
    # Loading the raw matrices
    raw = pd.read_table(raw_fn, index_col=0, header=0)
    shared_columns = loadings[assay].index.tolist()
    raw = raw.loc[common_samples, shared_columns]
    raw_array = np.matrix(raw.values)
    
    ## Calculating the sample factor matrix
    loadings_array = np.matrix(loadings[assay].values)
    sample_factors = raw_array * loadings_array

    tdf = pd.DataFrame(sample_factors) 
    tdf.columns = ['{}-{}'.format(assay, i) for i in range(sample_factors.shape[1])]
    train_features[assay] = tdf
    train_features[assay].index = raw.index.tolist()

In [99]:
train_features['final'] = pd.concat([train_features['pbmc_cell_frequency'],
                                     train_features['plasma_cytokine_concentrations'],
                                     train_features['pbmc_gene_expression'],
                                     train_features['abtiter']], axis=1)

In [100]:
train_features['final']

Unnamed: 0,pbmc_cell_frequency-0,pbmc_cell_frequency-1,pbmc_cell_frequency-2,pbmc_cell_frequency-3,pbmc_cell_frequency-4,pbmc_cell_frequency-5,pbmc_cell_frequency-6,pbmc_cell_frequency-7,pbmc_cell_frequency-8,pbmc_cell_frequency-9,...,abtiter-0,abtiter-1,abtiter-2,abtiter-3,abtiter-4,abtiter-5,abtiter-6,abtiter-7,abtiter-8,abtiter-9
33,-1.110421,-0.729115,-0.194013,-0.446445,0.009701,-0.862355,0.24424,0.28733,0.449174,-0.158775,...,-1.325683,-0.518658,0.306074,0.626647,0.884455,-1.437087,0.122978,-0.195141,0.033063,-1.223243
31,-0.326774,-1.209752,-0.29275,-0.940395,0.754149,-0.348609,-0.226301,0.120276,0.974449,0.811056,...,-1.316169,0.748127,0.081814,-0.443173,0.892538,-0.642241,1.059783,-0.461686,-0.485423,-0.202407
4,0.2441,0.238141,0.196914,-0.200498,-0.276279,0.018229,-0.138186,0.099384,0.053592,0.042884,...,1.902969,5.957889,2.729186,0.891957,1.047305,0.995945,2.262368,-0.454347,-3.111912,1.251074
26,-0.597581,-0.704026,0.153072,-0.689659,0.44836,-1.184264,0.005162,0.685559,1.330777,0.76691,...,-2.824041,-0.341179,1.229596,-0.795251,0.298205,-0.0364,0.463793,0.542521,0.12395,-0.831191
29,0.538575,0.857281,-0.622287,-1.161699,-1.300718,0.618804,0.787602,-0.35269,0.40699,-0.014251,...,-4.718803,5.936794,3.116446,-13.366235,-0.085822,-7.253832,-0.280111,-10.495556,-0.820189,-1.376084
6,0.390432,-0.0229,-0.128135,0.633551,-0.242418,0.192933,0.08473,-0.316974,0.730441,-0.307374,...,-8.53075,-2.186482,2.365203,-1.885644,0.791074,1.568055,0.262092,3.724942,0.137187,-1.755016
15,0.183729,-0.290542,-0.305619,-0.377655,-0.292782,0.481373,-0.245541,0.33662,0.384496,-0.262398,...,-1.162611,4.01511,2.573999,0.295378,1.577607,0.399688,3.505998,-0.15801,-3.094768,1.409378
52,0.189907,1.687194,-0.808686,-0.878721,-1.192616,0.736523,0.770935,0.101534,0.834765,-0.286145,...,-37.579619,152.280054,43.666659,-56.923639,20.424623,-40.763271,7.884112,-66.519864,16.02317,-37.983729
47,0.506994,-0.278417,-0.182013,0.081869,0.782521,0.530504,-0.308619,-0.352685,0.310536,0.158562,...,3.013906,-0.251064,-2.277501,-0.536501,-0.790979,1.795144,-0.760434,1.205621,0.990858,-0.15689
11,0.00502,-0.993587,-0.237953,-0.078966,0.029115,-0.582615,-0.539527,0.138254,0.710615,0.359565,...,-1.340711,0.905263,-0.589428,-1.013921,0.704307,-0.48799,1.228505,-0.496169,0.24531,0.124174


#### Training Outcomes

In [101]:
fn = 'results/main/2024.01.05/cmi_pb_datasets/computable-matrices/training_dataset/task_matrix.feature_names.tsv'
train_outcomes = pd.read_table(fn)
train_outcomes = train_outcomes.loc[train_outcomes.subject_id.isin(common_samples)]

#### Testing Features

In [102]:
test_features = {} 
shared_subjects_test = set()
tpl = 'results/main/2024.01.05/cmi_pb_datasets/computable-matrices/prediction_dataset/*.testing-data.tsv'

i = 0 
for raw_fn in glob.glob(tpl):  
    
    # get the assay name
    bn = os.path.basename(raw_fn)
    assay = bn.split('.')[0]
    
    print(raw_fn)
    print(assay)
    
    # loading the raw matrices
    raw = pd.read_table(raw_fn, index_col=0, header=0)
    
    print(raw.shape)
    
    raw = raw.loc[:, raw.columns.isin(loadings[assay].index)]
    
    print(raw.shape)
    
    # getting the loadings matrix 
        
    ## Calculating the sample factor matrix
    raw_array = np.matrix(raw.values)
    loadings_array = np.matrix(loadings[assay].values)
    sample_factors = raw_array * loadings_array
    
    # add to the test_features dict 
    test_features[assay] = pd.DataFrame(sample_factors)
    test_features[assay].index = raw.index
    
    if i == 0:
        shared_subjects_test = set(test_features[assay].index)
    else:
        shared_subjects_test = shared_subjects_test.intersection(test_features[assay].index)
    i += 1 


results/main/2024.01.05/cmi_pb_datasets/computable-matrices/prediction_dataset\abtiter.testing-data.tsv
abtiter
(21, 27)
(21, 27)
results/main/2024.01.05/cmi_pb_datasets/computable-matrices/prediction_dataset\pbmc_cell_frequency.testing-data.tsv
pbmc_cell_frequency
(21, 22)
(21, 22)
results/main/2024.01.05/cmi_pb_datasets/computable-matrices/prediction_dataset\pbmc_gene_expression.testing-data.tsv
pbmc_gene_expression
(21, 8242)
(21, 8242)
results/main/2024.01.05/cmi_pb_datasets/computable-matrices/prediction_dataset\plasma_cytokine_concentrations.testing-data.tsv
plasma_cytokine_concentrations
(19, 30)
(19, 30)


In [103]:
# harmonize the samples
for assay in test_features.keys():
    test_features[assay] =  test_features[assay].loc[test_features[assay].index.isin(shared_subjects_test), :]

In [104]:
test_features['final'] = pd.concat([test_features['pbmc_cell_frequency'],
                                    test_features['plasma_cytokine_concentrations'],
                                    test_features['pbmc_gene_expression'], 
                                    test_features['abtiter']], axis=1)
test_features['final'] = test_features['final'].dropna()

In [105]:
test_features['final']

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,0,1,2,3,4,5,6,7,8,9
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
110,-0.349445,-0.41918,-0.57668,-0.589935,0.687979,-0.086981,-0.387546,0.540464,0.863363,0.103748,...,-2.116191,-1.446366,-0.119645,-0.798308,0.330231,0.394104,3.095454,0.570842,-0.558636,-0.552176
112,-0.124051,-0.294899,-0.003588,-0.005425,0.406721,0.252115,0.086998,0.039559,0.386077,0.251151,...,0.72477,1.603304,-0.553897,-1.878765,3.408305,-0.370474,4.752898,0.217543,-2.45622,-1.166852
99,0.441923,0.703626,-1.070896,-0.076011,0.399792,-0.394788,0.173708,1.776522,0.713972,0.084056,...,2.12381,0.096734,4.512553,1.850209,3.722412,-0.812655,0.954813,0.225862,-7.231362,1.812489
102,-0.168518,-0.267035,0.162964,0.133974,0.024985,-0.04218,-0.233895,0.501434,0.420802,0.085729,...,-6.559265,1.946929,-2.855407,-3.450651,2.750438,4.348165,8.756186,3.036899,-0.898141,-2.065522
118,-1.086734,0.104204,-1.336759,-0.58895,-0.182686,-0.409895,-0.560812,0.28942,0.752797,0.007607,...,6.122247,3.214367,-0.601058,-2.68188,3.071952,-0.87335,0.87557,2.060472,-1.64955,-3.136129
109,-0.490738,0.869183,-0.181521,-0.356043,-0.300162,0.379928,-0.450027,0.549989,0.718418,0.44516,...,-5.215341,0.910358,-1.18132,-0.83621,0.864877,1.050287,-2.138489,4.760249,2.962186,3.427199
111,-0.019602,-0.216426,-0.826987,-0.170344,0.72118,-0.242549,-1.085069,0.78955,0.987597,-0.407313,...,1.707745,-4.377459,-1.127407,-1.402265,-1.354262,-2.325922,-1.60906,0.374424,-0.337187,-3.700755
106,0.055111,-0.398407,0.005653,-0.460327,-0.253378,0.146209,0.140138,-0.177296,0.120247,0.227105,...,-1.630976,0.877178,-0.017478,-1.261241,-0.037142,1.402406,-0.343027,0.349138,-0.073469,-0.595536
115,-0.451747,-0.229887,-0.569777,-0.490557,-0.240522,-0.2541,0.021944,-0.157187,0.681397,-0.03724,...,-3.203596,-0.402044,0.617672,-1.912772,1.506614,1.3168,2.373912,1.828984,-1.323555,0.161796
101,0.653818,-0.108952,0.872401,0.13596,-0.848026,-0.436827,-0.826714,0.298714,0.17573,0.105161,...,-1.399848,1.477408,-2.06344,-0.630343,1.092701,0.876359,2.080455,0.569514,-0.442609,-0.051649


## Building Lists of Tasks Cased on Assay Type

In [106]:
tasks = pd.read_table('results/main/2024.01.05/cmi_pb_datasets/computable-matrices/training_dataset/task_matrix.feature_names.tsv')

In [107]:
tasks.head()

Unnamed: 0,subject_id,IgG_PT.day14,IgG_PT.day14/day0,Monocytes.day1,Monocytes.day1/day0,ENSG00000277632.1.day3,ENSG00000277632.1.day3/day0
0,1,10.720253,2.834699,,,59.0,1.18
1,4,7.825563,3.659327,0.580535,1.225932,16.0,0.8
2,6,9.023606,25.332035,2.704531,1.745883,23.0,0.851852
3,7,5.059871,7.992374,,,,
4,3,6.994368,5.758237,,,32.0,1.103448


## Make predictions for the Ab Titers

In [108]:
ctrain_features = train_features['final']
ctest_features = test_features['final']

In [109]:
for task in train_outcomes.columns.tolist()[1:]:
    
    print(task)

    # get the outcome vector 
    ctrain_outcome = train_outcomes[['subject_id', task]]
    
    # get the shared subjects 
    shared_subjects = set(ctrain_features.index.tolist()).intersection(ctrain_outcome.subject_id)
 
    # extract the shared subjects  
    xdata = ctrain_features.loc[ctrain_features.index.isin(shared_subjects)]
    ydata = ctrain_outcome.loc[ctrain_outcome.subject_id.isin(shared_subjects)]      
    
    # building the model
    # use max_iter as needed
    if cmodel in ['lr']:
        lr_model = cmodel_function()
    elif cmodel in ['lasso', 'elastic_net', 'lasso_cv', 'elastic_net_cv']:
        lr_model = cmodel_function(max_iter=20000)

    # fit the model
    lr_model.fit(xdata.values, ydata.iloc[:, 1].values)

    # make predictions for the test features
    preds = lr_model.predict(ctest_features.values)
    
    # create the ranks df
    ranks = [ctest_features.index.tolist(), np.argsort(preds)]
    ranks = list(zip(*ranks))
    ranks = pd.DataFrame(ranks, columns=['subject_id', 'rank'])
    test_preds[task] = ranks
    

IgG_PT.day14


IgG_PT.day14/day0
Monocytes.day1
Monocytes.day1/day0
ENSG00000277632.1.day3
ENSG00000277632.1.day3/day0


## Save predictions to the Excel File

In [110]:
form_fn = 'results/main/2024.01.05/submissions/forms/2ndChallengeSubmissionTemplate.tsv'
form = pd.read_table(form_fn)

In [111]:
# creating a mapper between the task names for the data and the form
task_form_mapper = {'IgG_PT.day14': '1.1) IgG-PT-D14-titer-Rank',
                    'IgG_PT.day14/day0': '1.2) IgG-PT-D14-FC-Rank',
                    'Monocytes.day1': '2.1) Monocytes-D1-Rank',
                    'Monocytes.day1/day0': '2.2) Monocytes-D1-FC-Rank',
                    'ENSG00000277632.1.day3': '3.1) CCL3-D3-Rank',
                    'ENSG00000277632.1.day3/day0': '3.2) CCL3-D3-FC-Rank'}

In [112]:
# filling in the form
complete_form = form.copy()

In [113]:
for (task_name, form_name) in task_form_mapper.items():

    print(task_name, '-----------------', form_name)
     
    if task_name in test_preds:
    
        cranks = test_preds[task_name]

        # locate the indexes of the subjects within the form
        form_subject_indexes = form['Subject ID'].isin(cranks.subject_id.tolist())
        form_subject_indexes = form['Subject ID'][form_subject_indexes].index.tolist()

        # update the form for the current taskname
        complete_form.loc[form_subject_indexes, form_name] = cranks['rank'].astype(int)
    

IgG_PT.day14 ----------------- 1.1) IgG-PT-D14-titer-Rank
IgG_PT.day14/day0 ----------------- 1.2) IgG-PT-D14-FC-Rank
Monocytes.day1 ----------------- 2.1) Monocytes-D1-Rank
Monocytes.day1/day0 ----------------- 2.2) Monocytes-D1-FC-Rank
ENSG00000277632.1.day3 ----------------- 3.1) CCL3-D3-Rank
ENSG00000277632.1.day3/day0 ----------------- 3.2) CCL3-D3-FC-Rank


In [114]:
outfn = os.path.join(outdir, 'Completed_Predictions.jive.{}.tsv'.format(cmodel))
complete_form.to_csv(outfn, sep='\t', float_format='%.0f', index=False, header=True)

In [115]:
complete_form

Unnamed: 0,Subject ID,Age,Biological Sex at Birth,Vaccine Priming Status,1.1) IgG-PT-D14-titer-Rank,1.2) IgG-PT-D14-FC-Rank,2.1) Monocytes-D1-Rank,2.2) Monocytes-D1-FC-Rank,3.1) CCL3-D3-Rank,3.2) CCL3-D3-FC-Rank
0,97,35,Male,wP,,,,,,
1,98,28,Female,wP,1.0,12.0,1.0,1.0,1.0,1.0
2,99,22,Female,aP,2.0,7.0,2.0,2.0,2.0,2.0
3,100,20,Female,aP,,,,,,
4,101,18,Male,aP,4.0,2.0,4.0,4.0,4.0,4.0
5,102,18,Male,aP,5.0,11.0,5.0,5.0,5.0,5.0
6,103,27,Female,wP,6.0,14.0,6.0,6.0,6.0,6.0
7,104,32,Female,wP,,,,,,
8,105,27,Female,wP,,,,,,
9,106,25,Female,aP,9.0,8.0,9.0,9.0,9.0,9.0
