In [1]:
#!/usr/bin/env python
# coding: utf-8
import os, gc, scipy.sparse
import pandas as pd
import numpy as np

from sklearn.decomposition import TruncatedSVD

# Root folder holding every input file of this notebook. The trailing slash
# is intentional: other cells build paths with plain string concatenation
# (DATA_DIR + 'name'), which only works when the separator is already there.
DATA_DIR = "../data/open-problems-multimodal/"

# Per-cell metadata table.
FP_CELL_METADATA = os.path.join(DATA_DIR, "metadata.csv")

# "cite" files: train inputs, train targets, test inputs.
FP_CITE_TRAIN_INPUTS = os.path.join(DATA_DIR, "train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = os.path.join(DATA_DIR, "train_cite_targets.h5")
FP_CITE_TEST_INPUTS = os.path.join(DATA_DIR, "test_cite_inputs.h5")

# "multi" files: train inputs, train targets, test inputs.
FP_MULTIOME_TRAIN_INPUTS = os.path.join(DATA_DIR, "train_multi_inputs.h5")
FP_MULTIOME_TRAIN_TARGETS = os.path.join(DATA_DIR, "train_multi_targets.h5")
FP_MULTIOME_TEST_INPUTS = os.path.join(DATA_DIR, "test_multi_inputs.h5")

# Submission template and evaluation id table.
FP_SUBMISSION = os.path.join(DATA_DIR, "sample_submission.csv")
FP_EVALUATION_IDS = os.path.join(DATA_DIR, "evaluation_ids.csv")

In [2]:
# Load the per-cell metadata table. The path comes from FP_CELL_METADATA
# (defined with the other file paths) rather than being rebuilt here by
# string concatenation, so the file location is declared in exactly one place.
metadata = pd.read_csv(FP_CELL_METADATA)
metadata

Unnamed: 0,cell_id,day,donor,cell_type,technology
0,c2150f55becb,2,27678,HSC,citeseq
1,65b7edf8a4da,2,27678,HSC,citeseq
2,c1b26cb1057b,2,27678,EryP,citeseq
3,917168fa6f83,2,27678,NeuP,citeseq
4,2b29feeca86d,2,27678,EryP,citeseq
...,...,...,...,...,...
281523,96a60b026659,10,31800,hidden,multiome
281524,d493e546991e,10,31800,hidden,multiome
281525,05666c99aa48,10,31800,hidden,multiome
281526,121f946642b5,10,31800,hidden,multiome


In [3]:
# Read the CITE training input matrix (read-only). The frame is indexed by
# cell_id; later cells join it against the metadata on that key.
X = pd.read_hdf(FP_CITE_TRAIN_INPUTS, mode="r")
X

gene_id,ENSG00000121410_A1BG,ENSG00000268895_A1BG-AS1,ENSG00000175899_A2M,ENSG00000245105_A2M-AS1,ENSG00000166535_A2ML1,ENSG00000128274_A4GALT,ENSG00000094914_AAAS,ENSG00000081760_AACS,ENSG00000109576_AADAT,ENSG00000103591_AAGAB,...,ENSG00000153975_ZUP1,ENSG00000086827_ZW10,ENSG00000174442_ZWILCH,ENSG00000122952_ZWINT,ENSG00000198205_ZXDA,ENSG00000198455_ZXDB,ENSG00000070476_ZXDC,ENSG00000162378_ZYG11B,ENSG00000159840_ZYX,ENSG00000074755_ZZEF1
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,4.090185,0.000000
d02759a80ba2,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,4.039545,0.0,0.0,0.000000,0.000000,0.000000,0.000000
c016c6b0efa5,0.0,0.0,0.0,0.0,0.0,3.847321,0.000000,3.847321,3.847321,0.000000,...,0.000000,0.000000,3.847321,4.529743,0.0,0.0,0.000000,3.847321,3.847321,0.000000
ba7f733a4f75,0.0,0.0,0.0,0.0,0.0,0.000000,3.436846,3.436846,0.000000,0.000000,...,3.436846,0.000000,4.113780,5.020215,0.0,0.0,0.000000,3.436846,4.113780,0.000000
fbcf2443ffb2,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,4.196826,0.000000,0.000000,...,0.000000,4.196826,4.196826,4.196826,0.0,0.0,3.518610,4.196826,3.518610,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650ee456f0f3,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,4.397535,4.397535,5.084510,0.0,0.0,0.000000,0.000000,4.397535,4.397535
cc506e7707f5,0.0,0.0,0.0,0.0,0.0,0.000000,3.981467,4.665241,0.000000,0.000000,...,3.981467,0.000000,4.665241,3.981467,0.0,0.0,0.000000,0.000000,3.981467,0.000000
a91f1b55a520,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,4.497696,0.000000,4.497696,...,0.000000,0.000000,0.000000,4.497696,0.0,0.0,3.815622,4.497696,0.000000,0.000000
3a9882c98205,0.0,0.0,0.0,0.0,0.0,0.000000,3.900907,0.000000,0.000000,4.583891,...,0.000000,0.000000,4.583891,4.985945,0.0,0.0,0.000000,0.000000,0.000000,3.900907


In [9]:
# Day-4 split of the CITE inputs: keep citeseq cells measured on day 4,
# leaving out donor 27678.
metaday4 = metadata[metadata['day'] == 4]

# Both conditions are evaluated on `metaday4` itself. Mixing in a mask built
# from the full `metadata` frame selects the same rows, but only because
# pandas silently reindexes the mask, and it emits
# "Boolean Series key will be reindexed to match DataFrame index".
meta = metaday4[(metaday4['technology'] == 'citeseq') & (metaday4['donor'] != 27678)]

# Inner-join on cell_id against the key column only. No metadata columns are
# pulled into the result, so nothing has to be dropped by position afterwards
# (dropping "the last four columns" breaks silently if metadata ever gains or
# loses a column).
merged_df = pd.merge(X, meta[['cell_id']], on='cell_id').set_index('cell_id')
merged_df



  meta = metaday4[(metaday4['technology'] == 'citeseq' ) & ( metadata['donor']!=27678)]


Unnamed: 0_level_0,ENSG00000121410_A1BG,ENSG00000268895_A1BG-AS1,ENSG00000175899_A2M,ENSG00000245105_A2M-AS1,ENSG00000166535_A2ML1,ENSG00000128274_A4GALT,ENSG00000094914_AAAS,ENSG00000081760_AACS,ENSG00000109576_AADAT,ENSG00000103591_AAGAB,...,ENSG00000153975_ZUP1,ENSG00000086827_ZW10,ENSG00000174442_ZWILCH,ENSG00000122952_ZWINT,ENSG00000198205_ZXDA,ENSG00000198455_ZXDB,ENSG00000070476_ZXDC,ENSG00000162378_ZYG11B,ENSG00000159840_ZYX,ENSG00000074755_ZZEF1
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23c7ea3e83df,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,4.452015,3.770456,4.452015,0.0,0.0,0.000000,3.770456,3.770456,0.000000
07545a7f7724,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,3.710261,0.000000,4.792424,0.0,0.0,0.000000,0.000000,4.391097,0.000000
d25f4e18b7af,3.400238,0.0,0.0,0.0,0.0,0.0,4.076562,3.400238,0.0,0.000000,...,0.000000,3.400238,0.000000,4.076562,0.0,0.0,0.000000,3.400238,4.076562,0.000000
b86ecca69c3d,0.000000,0.0,0.0,0.0,0.0,0.0,3.967783,0.000000,0.0,0.000000,...,0.000000,3.967783,0.000000,3.967783,0.0,0.0,0.000000,0.000000,0.000000,0.000000
531e4381efe7,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,3.813799,0.000000,3.813799,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650ee456f0f3,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,4.397535,4.397535,5.084510,0.0,0.0,0.000000,0.000000,4.397535,4.397535
cc506e7707f5,0.000000,0.0,0.0,0.0,0.0,0.0,3.981467,4.665241,0.0,0.000000,...,3.981467,0.000000,4.665241,3.981467,0.0,0.0,0.000000,0.000000,3.981467,0.000000
a91f1b55a520,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,4.497696,0.0,4.497696,...,0.000000,0.000000,0.000000,4.497696,0.0,0.0,3.815622,4.497696,0.000000,0.000000
3a9882c98205,0.000000,0.0,0.0,0.0,0.0,0.0,3.900907,0.000000,0.0,4.583891,...,0.000000,0.000000,4.583891,4.985945,0.0,0.0,0.000000,0.000000,0.000000,3.900907


In [14]:
# Read the CITE training targets and keep the rows of the day-4 selection.
Y = pd.read_hdf(FP_CITE_TRAIN_TARGETS)

# NOTE: `meta` must still be the day-4 selection built in the previous cell.
# A later cell rebinds `meta` to the day 2/3 selection, so running this cell
# after that one would silently select different cells.
#
# Joining against the cell_id column only means no metadata columns reach the
# result, so there is no positional "drop the last four columns" step that
# could remove a target column if the metadata layout changes.
mergedy = pd.merge(Y, meta[['cell_id']], on='cell_id').set_index('cell_id')
mergedy


Unnamed: 0_level_0,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,CD52,...,CD94,CD162,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23c7ea3e83df,-0.038837,0.345492,1.575431,4.901691,3.490050,5.520907,0.269050,0.103832,0.429251,-0.737670,...,1.461037,5.754700,1.559128,0.602265,0.064853,0.236033,5.867673,0.661419,5.509303,2.461890
07545a7f7724,0.581632,-0.781041,2.654096,7.524086,6.104551,5.848989,-0.764493,1.348626,0.948382,-0.758954,...,-0.146351,7.441052,1.859274,0.914807,-0.293588,1.700874,8.789862,1.162518,5.320499,4.521204
d25f4e18b7af,1.678051,0.319266,0.771224,5.023542,8.098757,10.714052,4.236963,-0.342571,0.354417,1.202587,...,1.058050,6.571698,0.117781,0.719630,-0.250392,0.958905,3.177387,0.053333,2.255179,0.557725
b86ecca69c3d,0.345989,0.167182,-0.019650,1.714670,5.182465,7.930543,2.550410,0.302015,0.631652,-0.505521,...,0.514281,8.401997,0.114058,0.234194,-0.458291,0.351295,4.961888,0.285926,2.039783,-0.106655
531e4381efe7,1.031650,-0.109810,-0.191206,5.819048,4.056424,3.213348,4.925136,-1.068914,-1.458058,1.237933,...,-0.547198,5.728522,0.666205,-1.320433,0.020952,0.446819,5.365129,-0.154721,2.616540,1.303230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650ee456f0f3,0.905420,0.386141,0.961590,5.090580,2.854346,6.093729,-0.586178,0.452389,0.040806,0.191407,...,1.261118,3.092832,0.003275,0.278930,-0.272002,0.249477,3.789460,0.138330,1.466193,4.278504
cc506e7707f5,2.101247,2.117462,0.112699,2.065512,2.176803,3.900090,-0.586001,-0.175479,1.363232,0.109905,...,0.714624,5.029233,0.909861,0.057322,2.633387,1.340077,11.456146,-1.431453,5.275882,2.510530
a91f1b55a520,1.221313,0.476566,1.437551,5.135631,2.926102,1.615081,-0.586910,1.760421,1.944711,-0.095096,...,-0.176027,5.027534,-0.703609,1.139491,-0.078092,1.592960,9.358179,0.981883,6.911032,3.415310
3a9882c98205,-0.151433,-0.850024,0.461556,3.546561,1.996473,5.702821,0.883038,1.309014,1.029737,-0.072851,...,-0.484493,12.883892,1.579381,-0.382835,-0.065286,-0.021458,7.372662,1.010247,1.864805,3.449289


In [17]:
# Day 2/3 split: citeseq cells measured on day 2 or day 3, leaving out
# donor 27678. Note that this rebinds `meta`, which earlier cells used for
# the day-4 selection.
metaday = metadata[metadata['day'].isin([2, 3]) & (metadata['donor'] != 27678)]
meta = metaday[metaday['technology'] == 'citeseq']

# Inner-join inputs and targets against the cell_id column only, so no
# metadata columns are carried into the result and nothing has to be removed
# by position afterwards.
selected_cells = meta[['cell_id']]
train = pd.merge(X, selected_cells, on='cell_id').set_index('cell_id')
trainY = pd.merge(Y, selected_cells, on='cell_id').set_index('cell_id')
trainY

Unnamed: 0_level_0,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,CD52,...,CD94,CD162,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,1.167804,0.622530,0.106959,0.324989,3.331674,6.426002,1.480766,-0.728392,-0.468851,-0.073285,...,-0.448390,3.220174,-0.533004,0.674956,-0.006187,0.682148,1.398105,0.414292,1.780314,0.548070
d02759a80ba2,0.818970,0.506009,1.078682,6.848758,3.524885,5.279456,4.930438,2.069372,0.333652,-0.468088,...,0.323613,8.407108,0.131301,0.047607,-0.243628,0.547864,1.832587,0.982308,2.736507,2.184063
c016c6b0efa5,-0.356703,-0.422261,-0.824493,1.137495,0.518924,7.221962,-0.375034,1.738071,0.142919,-0.971460,...,1.348692,4.888579,-0.279483,-0.131097,-0.177604,-0.689188,9.013709,-1.182975,3.958148,2.868600
ba7f733a4f75,-1.201507,0.149115,2.022468,6.021595,7.258670,2.792436,21.708519,-0.137913,1.649969,-0.754680,...,1.504426,12.391979,0.511394,0.587863,-0.752638,1.714851,3.893782,1.799661,1.537249,4.407671
fbcf2443ffb2,-0.100404,0.697461,0.625836,-0.298404,1.369898,3.254521,-1.659380,0.643531,0.902710,1.291877,...,0.777023,6.496499,0.279898,-0.841950,-0.869419,0.675092,5.259685,-0.835379,9.631781,1.765445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0169f964147e,2.246597,-0.247196,0.877820,6.324099,5.432874,7.432983,0.868662,1.250872,0.078197,-0.301489,...,-0.396972,9.443605,2.580716,-0.507313,-0.144612,-0.660496,6.747566,0.248097,3.629298,1.327836
7203b2ace768,1.237996,0.475965,-0.516504,3.795360,6.084396,4.234612,-0.678707,1.456439,1.240912,-0.081311,...,-0.456725,7.023951,1.189882,-0.384662,-0.018024,0.389299,8.680630,0.919926,5.388635,4.547299
834449e1a23d,-0.373726,-0.382923,1.404033,5.656126,6.410265,11.572716,14.851498,0.111005,0.817309,1.175769,...,-0.213113,10.863695,0.707541,0.317146,-0.198539,1.264233,3.732879,0.525670,0.870836,5.453483
769790e1b39a,-0.436088,0.297379,0.403805,7.244791,6.278086,8.690210,0.917383,0.047241,1.167775,0.624694,...,1.144213,6.417878,1.524971,0.975209,-0.237805,1.709774,5.983523,0.439912,0.146311,2.093068


In [19]:
# Persist the four splits next to the raw data. Every file is written under
# the same HDF key, 'gene_id', and in the same order as listed here.
split_files = (
    ('cite_day23_train.h5', train),
    ('cite_day23_target.h5', trainY),
    ('cite_day4_test.h5', merged_df),
    ('cite_day4_target.h5', mergedy),
)
for file_name, frame in split_files:
    frame.to_hdf(os.path.join(DATA_DIR, file_name), key='gene_id')