In [1]:
import os
import numpy as np
import pandas as pd 
import glob
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, LassoCV, ElasticNetCV

# navigate to the project folder
os.chdir('C:/Users/jreyna/Documents/Projects/cmi-pb-multiomics/third_challenge')

# setting the output dir
outdir = 'results/main/submissions/models/jive_models/'
os.makedirs(outdir, exist_ok=True)

In [2]:
model_dict = {'lr': LinearRegression, 
              'lasso': Lasso,
              'elastic_net': ElasticNet, 
              'lasso_cv': LassoCV,
              'elastic_net_cv': ElasticNetCV}

cmodel = 'lr'
#cmodel = 'lasso'
#cmodel = 'elastic_net'
#cmodel = 'lasso_cv'
#cmodel = 'elastic_net_cv'
cmodel_function = model_dict[cmodel]

## Loading the loading matrices

In [3]:
loadings = {} 
tpl = "results/main/cmi_pb_datasets/processed/training_data/*.jive-loadings.tsv"
for loading_fn in glob.glob(tpl):
    
    bn = os.path.basename(loading_fn)
    assay = bn.split('.')[0]
    loadings_matrix = pd.read_table(loading_fn)
    loadings[assay] = loadings_matrix

## Loading the input data

In [4]:
# setting up dictionaries to load data and results
train_features = {}
train_outcomes = {}

test_features = {}
test_preds = {}

#### Training Features (calculating the reduce form of each omic)

In [5]:
with open('results/main/cmi_pb_datasets/processed/training_data/common_samples.txt', 'r') as f:
    common_samples = [int(x.strip()) for x in f.readlines()]

In [6]:
tpl = 'results/main/cmi_pb_datasets/processed/training_data/*.training-data.tsv'
for raw_fn in glob.glob(tpl):  
    
    bn = os.path.basename(raw_fn)
    assay = bn.split('.')[0]
    
    # Loading the raw matrices
    raw = pd.read_table(raw_fn, index_col=0, header=0)
    shared_columns = loadings[assay].index.tolist()
    raw = raw.loc[common_samples, shared_columns]
    raw_array = np.matrix(raw.values)
    
    ## Calculating the sample factor matrix
    loadings_array = np.matrix(loadings[assay].values)
    sample_factors = raw_array * loadings_array

    tdf = pd.DataFrame(sample_factors) 
    tdf.columns = ['{}-{}'.format(assay, i) for i in range(sample_factors.shape[1])]
    train_features[assay] = tdf
    train_features[assay].index = raw.index.tolist()

In [7]:
train_features['final'] = pd.concat([train_features['pbmc_cell_frequency'],
                                     train_features['plasma_cytokine_concentration'],
                                     train_features['pbmc_gene_expression'],
                                     train_features['plasma_ab_titer']], axis=1)

In [8]:
train_features['final']

Unnamed: 0,pbmc_cell_frequency-0,pbmc_cell_frequency-1,pbmc_cell_frequency-2,pbmc_cell_frequency-3,pbmc_cell_frequency-4,pbmc_cell_frequency-5,pbmc_cell_frequency-6,pbmc_cell_frequency-7,pbmc_cell_frequency-8,pbmc_cell_frequency-9,...,plasma_ab_titer-0,plasma_ab_titer-1,plasma_ab_titer-2,plasma_ab_titer-3,plasma_ab_titer-4,plasma_ab_titer-5,plasma_ab_titer-6,plasma_ab_titer-7,plasma_ab_titer-8,plasma_ab_titer-9
4,-0.899481,-0.248604,-10.404743,-2.675013,-3.905235,-5.722349,-7.721928,-2.952878,-5.112688,2.449012,...,207.721176,-4143.724467,-10570.890199,1595.084781,209.779727,943.404765,5883.295445,819.760559,2550.683669,-946.987188
6,-0.975855,-4.822534,-17.700483,-3.526054,-7.882918,-7.843105,-13.171058,-10.353981,-13.950849,6.104866,...,-545.95599,-1388.340288,-1276.091581,1500.077146,-384.746655,-1374.882186,859.613555,-803.365398,675.752387,-702.19865
11,-3.368699,1.786367,-21.795938,-6.008265,-13.630948,-13.465657,-8.068319,-5.27537,-18.459526,5.349103,...,102.573975,-4643.94591,-11476.334036,2030.730576,-292.160455,-21.900707,6760.526613,-110.648668,3921.619381,-1349.028956
15,-2.108057,2.420816,-18.423573,-1.754734,-4.3288,-8.064753,-17.031443,-7.080065,-10.975928,9.31559,...,-1116.572078,-2887.424132,-2598.234794,3242.620403,-898.165258,-2944.937584,1772.398294,-1538.736369,1157.55428,-1360.770398
17,-2.081917,1.393675,-18.229594,-1.359508,-7.673578,-9.737192,-13.270752,-2.763658,-16.151179,6.592004,...,-2211.872895,-5732.121659,-11856.478106,6536.143169,1564.434357,642.946422,5429.275067,438.605936,-533.225884,-3831.388535
20,-0.801871,-6.41952,-18.814395,-3.541063,-7.623667,-4.131016,-15.087307,-13.171175,-3.442146,5.136785,...,338.195349,-882.438663,-3158.848069,-62.480402,125.596901,706.837864,1741.56136,434.142604,836.824966,-68.580124
21,-0.114335,-5.107738,-15.482128,-3.050133,-4.741533,-3.647438,-13.477738,-8.100788,-2.831164,3.671246,...,992.56094,-2905.922776,-9557.210435,-108.83659,7.811942,1187.772673,5484.429979,744.829196,3209.646093,-202.089974
26,-2.27984,0.18853,-21.851721,-10.632456,-12.219544,-15.759866,-6.483704,-4.699901,-15.037824,1.279946,...,-194.313866,-2722.04444,-5529.42964,1743.150489,-386.399969,-722.768596,3288.277102,-422.103131,1925.594952,-888.774786
29,-2.731744,2.19461,-19.871687,-1.399896,-10.869565,-4.560394,-11.630978,-6.18921,-12.742841,9.606491,...,356.187285,-6396.059503,-11687.965974,1961.448134,-154.084294,-1657.672657,6240.716432,5.898329,4223.951929,-586.037662
31,-3.325038,5.070234,-21.603035,-3.428407,-10.545717,-12.499816,-10.80504,-0.531728,-15.584196,5.819908,...,926.608678,-1673.236162,-6563.996819,-680.464946,125.478282,1639.489802,3795.573872,959.846168,2168.458858,158.093414


#### Training Outcomes

In [9]:
fn = 'results/main/cmi_pb_datasets/processed/all_versus_all/harmonized/task_matrix.common_names.mfi_raw.tsv'
train_outcomes = pd.read_table(fn)
train_outcomes = train_outcomes.loc[train_outcomes.subject_id.isin(common_samples)]

In [10]:
train_outcomes.shape

(46, 7)

#### Testing Features

In [11]:
test_features = {} 
shared_subjects_test = set()
tpl = 'results/main/cmi_pb_datasets/processed/all_versus_all/full/*.2022.day0.pivoted.tsv'

i = 0 
for raw_fn in glob.glob(tpl):  
    
    # get the assay name
    bn = os.path.basename(raw_fn)
    assay = bn.split('.')[0]
    
    if assay in ['abtiters']:
        continue
    
    print(raw_fn)
    print(assay)
    
    # loading the raw matrices
    raw = pd.read_table(raw_fn, index_col=0, header=0)
    
    print(raw.shape)
    
    raw = raw.loc[:, raw.columns.isin(loadings[assay].index)]
    
    print(raw.shape)
    
    # getting the loadings matrix 
        
    ## Calculating the sample factor matrix
    raw_array = np.matrix(raw.values)
    loadings_array = np.matrix(loadings[assay].values)
    sample_factors = raw_array * loadings_array
    
    # add to the test_features dict 
    test_features[assay] = pd.DataFrame(sample_factors)
    test_features[assay].index = raw.index
    
    
    if i == 0:
        shared_subjects_test = set(test_features[assay].index)
    else:
        shared_subjects_test = shared_subjects_test.intersection(test_features[assay].index)
    i += 1 


results/main/cmi_pb_datasets/processed/all_versus_all/full\pbmc_cell_frequency.2022.day0.pivoted.tsv
pbmc_cell_frequency
(21, 50)
(21, 13)
results/main/cmi_pb_datasets/processed/all_versus_all/full\pbmc_gene_expression.2022.day0.pivoted.tsv
pbmc_gene_expression
(21, 58302)
(21, 11045)
results/main/cmi_pb_datasets/processed/all_versus_all/full\plasma_ab_titer.2022.day0.pivoted.tsv
plasma_ab_titer
(20, 15)
(20, 15)
results/main/cmi_pb_datasets/processed/all_versus_all/full\plasma_cytokine_concentration.2022.day0.pivoted.tsv
plasma_cytokine_concentration
(19, 43)
(19, 23)


In [12]:
# harmonize the samples
for assay in test_features.keys():
    test_features[assay] =  test_features[assay].loc[test_features[assay].index.isin(shared_subjects_test), :]

In [13]:
test_features['final'] = pd.concat([test_features['pbmc_cell_frequency'],
                                    test_features['plasma_cytokine_concentration'],
                                    test_features['pbmc_gene_expression'], 
                                    test_features['plasma_ab_titer']], axis=1)
test_features['final'] = test_features['final'].dropna()

In [14]:
test_features['final']

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,0,1,2,3,4,5,6,7,8,9
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
99,-1.224412,-8.598033,-27.033497,-7.378647,-10.997792,-9.454892,-20.325845,-19.301477,-5.052658,5.74189,...,-1.484415,-25.574068,-10.478428,22.564596,-2.568316,-17.860298,1.427886,12.892605,-19.607588,2.316285
100,-2.855409,-6.467565,-34.246316,-6.719578,-15.781636,-10.392101,-22.9103,-19.782537,-4.290722,7.132249,...,-1.447213,-8.178395,-3.330958,7.653796,-1.401737,-7.267981,0.755979,2.32556,-4.887501,0.260201
101,-2.532561,-4.089669,-30.344247,-7.05676,-12.016061,-10.148579,-21.823645,-18.129196,-4.155262,8.050141,...,0.432558,-15.310758,-3.915555,7.468223,1.156216,-7.883966,-1.7061,6.907092,-10.013379,2.219983
102,-1.061262,-9.464304,-25.216659,-8.412636,-10.369864,-8.674463,-18.899495,-22.278764,-5.594612,6.908362,...,0.398518,-7.859285,-1.050319,-0.649491,6.884561,2.582638,-3.511293,-0.286656,-0.050451,-0.444387
103,-2.713608,-3.489848,-29.851983,-2.761964,-10.494067,-6.595266,-24.422996,-18.796722,-3.49561,10.274183,...,-0.295303,-12.050372,-6.509271,-8.046496,10.808542,2.293429,-6.016087,1.168442,-1.32095,2.501381
104,-0.785167,-3.859821,-22.627349,-5.989916,-4.083474,-9.033496,-20.775431,-10.806631,-2.427869,5.240699,...,-3.095564,-59.905564,-15.432972,43.10979,-3.537455,-32.345824,8.094366,20.050436,-32.937795,2.040975
106,-2.623808,-5.146312,-34.476173,-7.158831,-14.317961,-10.798908,-22.724873,-17.767456,-2.61666,5.904737,...,-0.049994,-5.202647,-1.781929,-0.237663,3.58445,1.194397,-0.44133,0.073696,0.065535,-0.900815
107,-1.945174,-8.476684,-33.439452,-9.710859,-15.157369,-12.589743,-20.912269,-18.01764,-3.065886,3.194035,...,-1.370903,-9.038517,-2.642939,-2.732157,5.327437,1.573254,-0.64125,2.870765,-1.273882,-0.126811
108,-2.354274,-7.313649,-32.939535,-7.223893,-15.130748,-10.857787,-21.943924,-19.080479,-4.277295,5.692495,...,2.109575,-12.003216,-6.258283,-0.580752,4.846262,-3.257977,-0.066288,5.29842,-4.160204,2.310939
109,-2.20133,-5.985008,-30.243184,-8.949241,-12.660567,-10.021653,-20.587655,-17.370535,-2.463759,5.767851,...,-1.195925,-6.850486,-1.019197,-0.665554,1.660243,-1.988786,0.205359,0.680153,-1.359244,-0.687489


## Building Lists of Tasks Cased on Assay Type

In [15]:
tasks = pd.read_table('results/main/cmi_pb_datasets/processed/all_versus_all/harmonized/task_matrix.feature_names.mfi_raw.tsv')

In [16]:
tasks = tasks.iloc[:, [0,1,3,5]]

In [17]:
tasks

Unnamed: 0,subject_id,IgG-PT.day14,Monocytes.day1,ENSG00000277632.day3
0,1,199.517666,,46.410
1,3,129.197956,,26.204
2,4,144.885339,7.211965,13.353
3,5,97.743258,,20.618
4,6,167.496355,41.380502,19.606
...,...,...,...,...
89,96,386.000000,35.000000,22.314
90,37,,,
91,82,,20.500000,868.176
92,87,,22.500000,160.230


## Make predictions for the Ab Titers

In [1]:
ctrain_features = train_features['final']
ctest_features = test_features['final']

NameError: name 'train_features' is not defined

In [None]:
train_outcomes

Unnamed: 0,subject_id,IgG-PT.day14,IgG-PT.fold-change-day14/0,Monocytes.day1,Monocytes.fold-change-day1/0,ENSG00000277632.day3,ENSG00000277632.fold-change-day3/0
2,4,144.885339,3.858236,7.211965,1.313454,13.353,0.830204
4,6,167.496355,42.792746,41.380502,1.815489,19.606,0.858256
8,11,414.513947,9.308212,7.257095,0.672544,17.841,0.79965
12,15,100.489455,18.1204,10.585489,0.676935,31.757,1.152328
14,17,168.900882,0.923284,16.401488,1.134469,37.353,1.711321
17,20,133.269594,1.495384,26.605583,1.2433,43.19,2.958219
18,21,35.368851,0.329951,34.812168,2.286266,23.983,1.823248
23,26,45.748957,3.806558,16.108508,2.911843,12.699,0.495223
26,29,75.090769,4.110336,25.083209,1.55847,29.779,1.430651
28,31,33.003075,14.802375,8.545243,1.751425,49.266,1.980543


In [20]:
train_outcomes

Unnamed: 0,subject_id,IgG-PT.day14,IgG-PT.fold-change-day14/0,Monocytes.day1,Monocytes.fold-change-day1/0,ENSG00000277632.day3,ENSG00000277632.fold-change-day3/0
2,4,144.885339,3.858236,7.211965,1.313454,13.353,0.830204
4,6,167.496355,42.792746,41.380502,1.815489,19.606,0.858256
8,11,414.513947,9.308212,7.257095,0.672544,17.841,0.79965
12,15,100.489455,18.1204,10.585489,0.676935,31.757,1.152328
14,17,168.900882,0.923284,16.401488,1.134469,37.353,1.711321
17,20,133.269594,1.495384,26.605583,1.2433,43.19,2.958219
18,21,35.368851,0.329951,34.812168,2.286266,23.983,1.823248
23,26,45.748957,3.806558,16.108508,2.911843,12.699,0.495223
26,29,75.090769,4.110336,25.083209,1.55847,29.779,1.430651
28,31,33.003075,14.802375,8.545243,1.751425,49.266,1.980543


In [21]:
for task in train_outcomes.columns.tolist()[1:]:
    
    print(task)

    # get the outcome vector 
    ctrain_outcome = train_outcomes[['subject_id', task]]
    
    # get the shared subjects 
    shared_subjects = set(ctrain_features.index.tolist()).intersection(ctrain_outcome.subject_id)
 
    # extract the shared subjects  
    xdata = ctrain_features.loc[ctrain_features.index.isin(shared_subjects)]
    ydata = ctrain_outcome.loc[ctrain_outcome.subject_id.isin(shared_subjects)]      
    
    
    # building the model
    # use max_iter as needed
    if cmodel in ['lr']:
        lr_model = cmodel_function()
    elif cmodel in ['lasso', 'elastic_net', 'lasso_cv', 'elastic_net_cv']:
        lr_model = cmodel_function(max_iter=20000)

    # fit the model
    lr_model.fit(xdata.values, ydata.iloc[:, 1].values)

    # make predictions for the test features
    preds = lr_model.predict(ctest_features.values)
    
    # create the ranks df
    ranks = [ctest_features.index.tolist(), np.argsort(preds)]
    ranks = list(zip(*ranks))
    ranks = pd.DataFrame(ranks, columns=['subject_id', 'rank'])
    test_preds[task] = ranks
    

IgG-PT.day14
IgG-PT.fold-change-day14/0
Monocytes.day1
Monocytes.fold-change-day1/0
ENSG00000277632.day3
ENSG00000277632.fold-change-day3/0


## Save predictions to the Excel File

In [22]:
form_fn = 'results/main/submissions/forms/2ndChallengeSubmissionTemplate.tsv'
form = pd.read_table(form_fn)

In [23]:
# creating a mapper between the task names for the data and the form
task_form_mapper = {'IgG-PT.day14': '1.1) IgG-PT-D14-titer-Rank',
                    'IgG-PT.fold-change-day14/0': '1.2) IgG-PT-D14-FC-Rank',
                    'Monocytes.day1': '2.1) Monocytes-D1-Rank',
                    'Monocytes.fold-change-day1/0': '2.2) Monocytes-D1-FC-Rank',
                    'ENSG00000277632.day3': '3.1) CCL3-D3-Rank',
                    'ENSG00000277632.fold-change-day3/0': '3.2) CCL3-D3-FC-Rank'}

In [24]:
# filling in the form
complete_form = form.copy()

In [25]:
for (task_name, form_name) in task_form_mapper.items():

    print(task_name, '-----------------', form_name)
     
    if task_name in test_preds:
    
        cranks = test_preds[task_name]

        # locate the indexes of the subjects within the form
        form_subject_indexes = form['Subject ID'].isin(cranks.subject_id.tolist())
        form_subject_indexes = form['Subject ID'][form_subject_indexes].index.tolist()

        # update the form for the current taskname
        complete_form.loc[form_subject_indexes, form_name] = cranks['rank'].astype(int)
    

IgG-PT.day14 ----------------- 1.1) IgG-PT-D14-titer-Rank
IgG-PT.fold-change-day14/0 ----------------- 1.2) IgG-PT-D14-FC-Rank
Monocytes.day1 ----------------- 2.1) Monocytes-D1-Rank
Monocytes.fold-change-day1/0 ----------------- 2.2) Monocytes-D1-FC-Rank
ENSG00000277632.day3 ----------------- 3.1) CCL3-D3-Rank
ENSG00000277632.fold-change-day3/0 ----------------- 3.2) CCL3-D3-FC-Rank


In [26]:
outfn = os.path.join(outdir, 'Completed_Predictions.jive.{}.tsv'.format(cmodel))
complete_form.to_csv(outfn, sep='\t', float_format='%.0f', index=False, header=True)

In [27]:
complete_form

Unnamed: 0,Subject ID,Age,Biological Sex at Birth,Vaccine Priming Status,1.1) IgG-PT-D14-titer-Rank,1.2) IgG-PT-D14-FC-Rank,2.1) Monocytes-D1-Rank,2.2) Monocytes-D1-FC-Rank,3.1) CCL3-D3-Rank,3.2) CCL3-D3-FC-Rank
0,97,35,Male,wP,,,,,,
1,98,28,Female,wP,,,,,,
2,99,22,Female,aP,9.0,8.0,14.0,14.0,17.0,11.0
3,100,20,Female,aP,17.0,12.0,16.0,4.0,7.0,7.0
4,101,18,Male,aP,8.0,16.0,0.0,0.0,10.0,12.0
5,102,18,Male,aP,7.0,6.0,8.0,2.0,1.0,1.0
6,103,27,Female,wP,10.0,5.0,12.0,8.0,6.0,0.0
7,104,32,Female,wP,14.0,15.0,4.0,16.0,11.0,14.0
8,105,27,Female,wP,,,,,,
9,106,25,Female,aP,0.0,17.0,15.0,12.0,15.0,17.0
