inspired by : https://www.kaggle.com/code/jirkaborovec/mmscel-inst-eda-stat-predictions/notebook?scriptVersionId=103611408

In [1]:
! pip install -q tables  # needed for loading HDF files

[0m

In [2]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
from collections import Counter

PATH_DATASET = "/kaggle/input/open-problems-multimodal"

class MyDict(dict):
    def __missing__(self, key):
        return key

In [3]:
df_meta = pd.read_csv(os.path.join(PATH_DATASET, "metadata.csv"))
display(df_meta.head())
print(f"table size: {len(df_meta)}")

Unnamed: 0,cell_id,day,donor,cell_type,technology
0,c2150f55becb,2,27678,HSC,citeseq
1,65b7edf8a4da,2,27678,HSC,citeseq
2,c1b26cb1057b,2,27678,EryP,citeseq
3,917168fa6f83,2,27678,NeuP,citeseq
4,2b29feeca86d,2,27678,EryP,citeseq


table size: 281528


In [4]:
donors = list(df_meta.donor.unique())[1::]
days = list(df_meta.day.unique())
cell_typedic = dict(zip(df_meta.cell_type.unique(), range(1,9)))
cells =list(df_meta.cell_type.unique())[0:-1:]

In [5]:
df_meta['cell_type'] = df_meta['cell_type'].map(cell_typedic)

In [6]:
df_eval = pd.read_csv(os.path.join(PATH_DATASET, "evaluation_ids.csv"))
display(df_eval.head())
      
print(f"total: {len(df_eval)}")
print(f"cell_id: {len(df_eval['cell_id'].unique())}")
print(f"gene_id: {len(df_eval['gene_id'].unique())}")

Unnamed: 0,row_id,cell_id,gene_id
0,0,c2150f55becb,CD86
1,1,c2150f55becb,CD274
2,2,c2150f55becb,CD270
3,3,c2150f55becb,CD155
4,4,c2150f55becb,CD112


total: 65744180
cell_id: 65443
gene_id: 23558


In [7]:
df_eval = df_eval.merge(df_meta[['cell_id', 'day', 'donor', 'cell_type', 'technology']], how = 'left', on = 'cell_id').set_index("cell_id")
df_meta = df_meta.set_index("cell_id")
df_eval['target'] = 0 
df_eval

Unnamed: 0_level_0,row_id,gene_id,day,donor,cell_type,technology,target
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c2150f55becb,0,CD86,2,27678,1,citeseq,0
c2150f55becb,1,CD274,2,27678,1,citeseq,0
c2150f55becb,2,CD270,2,27678,1,citeseq,0
c2150f55becb,3,CD155,2,27678,1,citeseq,0
c2150f55becb,4,CD112,2,27678,1,citeseq,0
...,...,...,...,...,...,...,...
2c53aa67933d,65744175,ENSG00000134419,7,27678,8,multiome,0
2c53aa67933d,65744176,ENSG00000186862,7,27678,8,multiome,0
2c53aa67933d,65744177,ENSG00000170959,7,27678,8,multiome,0
2c53aa67933d,65744178,ENSG00000107874,7,27678,8,multiome,0


In [8]:
df = pd.read_hdf(os.path.join(PATH_DATASET, "train_cite_targets.h5")).astype(np.float16)
cols_target = list(df.columns)
df.head()

gene_id,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,CD52,...,CD94,CD162,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,1.167969,0.622559,0.106934,0.324951,3.332031,6.425781,1.480469,-0.728516,-0.46875,-0.073303,...,-0.448486,3.220703,-0.533203,0.674805,-0.006187,0.682129,1.398438,0.414307,1.780273,0.547852
d02759a80ba2,0.818848,0.505859,1.079102,6.847656,3.525391,5.28125,4.929688,2.070312,0.33374,-0.468018,...,0.32373,8.40625,0.131348,0.047607,-0.243652,0.547852,1.833008,0.982422,2.736328,2.183594
c016c6b0efa5,-0.356689,-0.422363,-0.824707,1.137695,0.519043,7.222656,-0.375,1.738281,0.142944,-0.97168,...,1.348633,4.886719,-0.279541,-0.131104,-0.177612,-0.688965,9.015625,-1.182617,3.958984,2.869141
ba7f733a4f75,-1.201172,0.14917,2.023438,6.023438,7.257812,2.792969,21.703125,-0.137939,1.650391,-0.754883,...,1.504883,12.390625,0.51123,0.587891,-0.752441,1.714844,3.894531,1.799805,1.537109,4.40625
fbcf2443ffb2,-0.100403,0.697266,0.625977,-0.29834,1.370117,3.253906,-1.65918,0.643555,0.902832,1.291992,...,0.776855,6.496094,0.279785,-0.841797,-0.869629,0.675293,5.257812,-0.835449,9.632812,1.765625


In [9]:
df = df.join(df_meta[['day','donor','cell_type']], how="left")
print(f"total: {len(df)}")
print(f"cell_id: {len(df)}")
df.head()

total: 70988
cell_id: 70988


Unnamed: 0_level_0,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,CD52,...,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224,day,donor,cell_type
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,1.167969,0.622559,0.106934,0.324951,3.332031,6.425781,1.480469,-0.728516,-0.46875,-0.073303,...,0.674805,-0.006187,0.682129,1.398438,0.414307,1.780273,0.547852,2,32606,1
d02759a80ba2,0.818848,0.505859,1.079102,6.847656,3.525391,5.28125,4.929688,2.070312,0.33374,-0.468018,...,0.047607,-0.243652,0.547852,1.833008,0.982422,2.736328,2.183594,2,32606,1
c016c6b0efa5,-0.356689,-0.422363,-0.824707,1.137695,0.519043,7.222656,-0.375,1.738281,0.142944,-0.97168,...,-0.131104,-0.177612,-0.688965,9.015625,-1.182617,3.958984,2.869141,2,32606,2
ba7f733a4f75,-1.201172,0.14917,2.023438,6.023438,7.257812,2.792969,21.703125,-0.137939,1.650391,-0.754883,...,0.587891,-0.752441,1.714844,3.894531,1.799805,1.537109,4.40625,2,32606,3
fbcf2443ffb2,-0.100403,0.697266,0.625977,-0.29834,1.370117,3.253906,-1.65918,0.643555,0.902832,1.291992,...,-0.841797,-0.869629,0.675293,5.257812,-0.835449,9.632812,1.765625,2,32606,2


In [10]:
protein_pred = pd.DataFrame()
for cell in range(1,8):
    for donor in donors:
        df_p = df[(df.donor == donor) & (df.cell_type == cell)].groupby(['day']).aggregate('mean').reset_index()
        if len(df_p) < 3:
            df_p = df[(df.cell_type == cell)].groupby(['day']).aggregate('mean').reset_index()
            df_p.donor = donor
        protein_pred = pd.concat([protein_pred, df_p])
protein_pred = protein_pred.reset_index(drop = True)
protein_pred

Unnamed: 0,day,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,...,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224,donor,cell_type
0,2,0.197632,0.177612,0.514648,3.882812,4.699219,7.003906,3.201172,0.193115,0.357666,...,0.371826,0.130127,0.103333,0.483154,3.732422,0.286621,1.615234,1.636719,32606.0,1.0
1,3,0.359863,0.356689,0.715332,4.917969,5.929688,9.109375,4.789062,0.252197,0.486084,...,0.488037,0.256104,0.164185,0.621582,4.183594,0.516113,1.625977,2.212891,32606.0,1.0
2,4,0.670410,0.304199,0.608398,4.460938,5.109375,8.304688,5.226562,0.150024,0.323975,...,0.410400,0.161621,0.168213,0.458496,3.507812,0.513672,1.430664,2.353516,32606.0,1.0
3,2,0.450684,0.673340,1.504883,7.964844,6.871094,10.406250,4.371094,0.653809,1.169922,...,1.175781,0.559570,0.244873,1.508789,6.300781,0.753906,3.769531,5.750000,13176.0,1.0
4,3,0.441895,0.763672,1.157227,7.402344,7.125000,10.164062,4.175781,0.453857,0.899902,...,0.989746,0.506348,0.219849,1.000977,5.269531,0.841797,3.148438,4.492188,13176.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,3,5.972656,0.758789,1.444336,8.226562,6.609375,13.640625,24.234375,0.759766,0.802734,...,1.367188,0.743652,7.449219,1.306641,2.525391,11.929688,2.363281,6.160156,13176.0,7.0
59,4,7.300781,0.932617,1.657227,7.535156,7.582031,10.101562,21.625000,0.974609,0.878418,...,1.191406,0.255615,10.460938,1.184570,2.224609,11.000000,2.847656,6.558594,13176.0,7.0
60,2,1.845703,-0.003357,0.861816,5.214844,4.207031,9.796875,22.890625,0.673828,0.488525,...,0.772949,0.133057,3.234375,0.820312,1.744141,4.386719,2.070312,3.048828,31800.0,7.0
61,3,2.830078,0.475098,1.169922,6.554688,6.375000,11.429688,21.484375,0.769043,0.738770,...,0.903809,0.266602,8.578125,1.339844,2.451172,8.187500,1.863281,3.751953,31800.0,7.0


In [11]:
for cell in range(1,8):
    for day in days:
        df_p = df[(df.day == day) & (df.cell_type == cell)].groupby(['day']).aggregate('mean').reset_index()
        df_p.cell_type = cell
        df_p.donor = 27678 # o faltante
        protein_pred = pd.concat([protein_pred, df_p])
protein_pred = protein_pred.reset_index(drop = True)
protein_pred

Unnamed: 0,day,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,...,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224,donor,cell_type
0,2,0.197632,0.177612,0.514648,3.882812,4.699219,7.003906,3.201172,0.193115,0.357666,...,0.371826,0.130127,0.103333,0.483154,3.732422,0.286621,1.615234,1.636719,32606.0,1.0
1,3,0.359863,0.356689,0.715332,4.917969,5.929688,9.109375,4.789062,0.252197,0.486084,...,0.488037,0.256104,0.164185,0.621582,4.183594,0.516113,1.625977,2.212891,32606.0,1.0
2,4,0.670410,0.304199,0.608398,4.460938,5.109375,8.304688,5.226562,0.150024,0.323975,...,0.410400,0.161621,0.168213,0.458496,3.507812,0.513672,1.430664,2.353516,32606.0,1.0
3,2,0.450684,0.673340,1.504883,7.964844,6.871094,10.406250,4.371094,0.653809,1.169922,...,1.175781,0.559570,0.244873,1.508789,6.300781,0.753906,3.769531,5.750000,13176.0,1.0
4,3,0.441895,0.763672,1.157227,7.402344,7.125000,10.164062,4.175781,0.453857,0.899902,...,0.989746,0.506348,0.219849,1.000977,5.269531,0.841797,3.148438,4.492188,13176.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,3,1.021484,0.559570,1.087891,3.570312,3.710938,12.671875,9.046875,0.600586,0.800293,...,1.162109,0.272705,0.308105,0.786621,2.660156,1.521484,2.435547,2.523438,27678.0,6.0
80,4,1.619141,0.581543,1.128906,4.285156,4.324219,9.000000,6.019531,1.447266,0.758789,...,1.126953,0.442139,0.066345,1.058594,6.097656,1.223633,3.722656,3.529297,27678.0,6.0
81,2,2.343750,0.302246,1.045898,5.601562,4.695312,9.070312,19.265625,0.960938,0.657227,...,1.042969,0.284424,7.187500,0.961914,2.908203,4.687500,2.496094,3.654297,27678.0,7.0
82,3,3.988281,0.505371,1.202148,6.941406,6.300781,11.718750,21.546875,0.671387,0.732422,...,1.073242,0.397461,7.593750,1.190430,2.302734,8.843750,1.935547,4.363281,27678.0,7.0


In [12]:
for donor in donors:
    for day in days:
        df_p = df[(df.day == day) & (df.donor == donor)].groupby(['day']).aggregate('mean').reset_index()
        df_p.cell_type = 8 #interesse faltante
        df_p.donor = donor
        protein_pred = pd.concat([protein_pred, df_p])
protein_pred = protein_pred.reset_index(drop = True)
protein_pred

Unnamed: 0,day,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,...,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224,donor,cell_type
0,2,0.197632,0.177612,0.514648,3.882812,4.699219,7.003906,3.201172,0.193115,0.357666,...,0.371826,0.130127,0.103333,0.483154,3.732422,0.286621,1.615234,1.636719,32606.0,1.0
1,3,0.359863,0.356689,0.715332,4.917969,5.929688,9.109375,4.789062,0.252197,0.486084,...,0.488037,0.256104,0.164185,0.621582,4.183594,0.516113,1.625977,2.212891,32606.0,1.0
2,4,0.670410,0.304199,0.608398,4.460938,5.109375,8.304688,5.226562,0.150024,0.323975,...,0.410400,0.161621,0.168213,0.458496,3.507812,0.513672,1.430664,2.353516,32606.0,1.0
3,2,0.450684,0.673340,1.504883,7.964844,6.871094,10.406250,4.371094,0.653809,1.169922,...,1.175781,0.559570,0.244873,1.508789,6.300781,0.753906,3.769531,5.750000,13176.0,1.0
4,3,0.441895,0.763672,1.157227,7.402344,7.125000,10.164062,4.175781,0.453857,0.899902,...,0.989746,0.506348,0.219849,1.000977,5.269531,0.841797,3.148438,4.492188,13176.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,3,0.509277,0.844238,1.226562,7.492188,6.671875,9.164062,4.410156,0.522461,1.041016,...,1.239258,0.510254,0.299805,1.061523,6.148438,0.977051,4.500000,4.683594,13176.0,8.0
89,4,0.729980,0.689453,1.319336,6.812500,6.187500,8.390625,4.988281,0.585938,1.062500,...,1.334961,0.345215,0.359619,1.090820,6.527344,0.929688,4.871094,5.273438,13176.0,8.0
90,2,0.181519,0.239502,0.637207,4.781250,4.562500,6.496094,3.097656,0.254150,0.470703,...,0.552246,0.096619,0.086243,0.583496,4.570312,0.335938,2.787109,2.417969,31800.0,8.0
91,3,0.382324,0.475830,0.989746,5.925781,5.937500,7.707031,3.873047,0.514648,0.777344,...,0.916992,0.258545,0.271973,0.826660,5.480469,0.710449,3.626953,3.347656,31800.0,8.0


In [13]:
for day in days:
    df_p = df[(df.day == day)].groupby(['day']).aggregate('mean').reset_index()
    df_p.cell_type = 8 #interesse faltante
    df_p.donor = 27678 #interessante faltante
    protein_pred = pd.concat([protein_pred, df_p])
protein_pred = protein_pred.reset_index(drop = True)
protein_pred

Unnamed: 0,day,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,...,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224,donor,cell_type
0,2,0.197632,0.177612,0.514648,3.882812,4.699219,7.003906,3.201172,0.193115,0.357666,...,0.371826,0.130127,0.103333,0.483154,3.732422,0.286621,1.615234,1.636719,32606.0,1.0
1,3,0.359863,0.356689,0.715332,4.917969,5.929688,9.109375,4.789062,0.252197,0.486084,...,0.488037,0.256104,0.164185,0.621582,4.183594,0.516113,1.625977,2.212891,32606.0,1.0
2,4,0.670410,0.304199,0.608398,4.460938,5.109375,8.304688,5.226562,0.150024,0.323975,...,0.410400,0.161621,0.168213,0.458496,3.507812,0.513672,1.430664,2.353516,32606.0,1.0
3,2,0.450684,0.673340,1.504883,7.964844,6.871094,10.406250,4.371094,0.653809,1.169922,...,1.175781,0.559570,0.244873,1.508789,6.300781,0.753906,3.769531,5.750000,13176.0,1.0
4,3,0.441895,0.763672,1.157227,7.402344,7.125000,10.164062,4.175781,0.453857,0.899902,...,0.989746,0.506348,0.219849,1.000977,5.269531,0.841797,3.148438,4.492188,13176.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,3,0.382324,0.475830,0.989746,5.925781,5.937500,7.707031,3.873047,0.514648,0.777344,...,0.916992,0.258545,0.271973,0.826660,5.480469,0.710449,3.626953,3.347656,31800.0,8.0
92,4,0.406738,0.359863,0.761230,4.812500,4.136719,6.242188,3.380859,0.399902,0.539062,...,0.722656,0.131226,0.276367,0.583008,5.066406,0.562012,3.277344,2.906250,31800.0,8.0
93,2,0.273438,0.369629,0.861328,5.367188,5.089844,7.117188,3.488281,0.352295,0.676758,...,0.763184,0.216187,0.164673,0.803711,5.117188,0.453125,3.380859,3.085938,27678.0,8.0
94,3,0.432129,0.588867,1.015625,6.261719,6.113281,8.351562,4.406250,0.454346,0.823730,...,0.976562,0.334961,0.249634,0.854004,5.644531,0.761719,3.804688,3.599609,27678.0,8.0


In [14]:
up_donors = [27678] + donors
for cell in range(1,9):
    for donor in up_donors:
        lista = [7]
        x = protein_pred.loc[(protein_pred.donor == donor) & (protein_pred.cell_type == cell)][['day']]
        x = np.array([valor[0] for valor in x.values])
        for feature in protein_pred.drop(columns = ['day', 'donor', 'cell_type']).columns:
            y = protein_pred.loc[(protein_pred.donor == donor) & (protein_pred.cell_type == cell)][[f'{feature}']].mean()[0]
            #y = np.array([valor[0] for valor in y.values])
            #modelo_7 = np.poly1d(np.polyfit(x.astype('float64') ,y.astype('float64') , 2))
            #lista.append(modelo_7(np.array([7]))[0])
            lista.append(y)

        lista.append(donor)
        lista.append(cell)
        df_novo = pd.DataFrame([lista], columns = protein_pred.columns)
        protein_pred = pd.concat([protein_pred, df_novo])  
protein_pred = protein_pred.reset_index(drop = True)
protein_pred

Unnamed: 0,day,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,...,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224,donor,cell_type
0,2,0.197632,0.177612,0.514648,3.882812,4.699219,7.003906,3.201172,0.193115,0.357666,...,0.371826,0.130127,0.103333,0.483154,3.732422,0.286621,1.615234,1.636719,32606.0,1.0
1,3,0.359863,0.356689,0.715332,4.917969,5.929688,9.109375,4.789062,0.252197,0.486084,...,0.488037,0.256104,0.164185,0.621582,4.183594,0.516113,1.625977,2.212891,32606.0,1.0
2,4,0.670410,0.304199,0.608398,4.460938,5.109375,8.304688,5.226562,0.150024,0.323975,...,0.410400,0.161621,0.168213,0.458496,3.507812,0.513672,1.430664,2.353516,32606.0,1.0
3,2,0.450684,0.673340,1.504883,7.964844,6.871094,10.406250,4.371094,0.653809,1.169922,...,1.175781,0.559570,0.244873,1.508789,6.300781,0.753906,3.769531,5.750000,13176.0,1.0
4,3,0.441895,0.763672,1.157227,7.402344,7.125000,10.164062,4.175781,0.453857,0.899902,...,0.989746,0.506348,0.219849,1.000977,5.269531,0.841797,3.148438,4.492188,13176.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,7,2.451172,0.324951,1.011719,5.687500,5.261719,9.632812,18.890625,0.802246,0.581055,...,0.780762,0.189087,5.988281,0.970215,2.474609,5.972656,1.941406,3.462891,31800.0,7.0
124,7,0.439453,0.469727,0.921875,5.652344,5.359375,7.500000,4.042969,0.400391,0.721191,...,0.873047,0.246582,0.232056,0.786621,5.378906,0.627441,3.609375,3.408203,27678.0,8.0
125,7,0.444092,0.321289,0.667480,4.578125,4.871094,6.937500,4.222656,0.244507,0.487061,...,0.623047,0.133911,0.175171,0.533203,4.746094,0.473877,2.935547,2.375000,32606.0,8.0
126,7,0.572266,0.756348,1.367188,7.390625,6.488281,8.929688,4.542969,0.599609,1.136719,...,1.323242,0.468018,0.324463,1.225586,6.511719,0.902344,4.816406,5.183594,13176.0,8.0


In [15]:
protein_pred['technology'] = 'citeseq'
protein_pred

Unnamed: 0,day,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,...,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224,donor,cell_type,technology
0,2,0.197632,0.177612,0.514648,3.882812,4.699219,7.003906,3.201172,0.193115,0.357666,...,0.130127,0.103333,0.483154,3.732422,0.286621,1.615234,1.636719,32606.0,1.0,citeseq
1,3,0.359863,0.356689,0.715332,4.917969,5.929688,9.109375,4.789062,0.252197,0.486084,...,0.256104,0.164185,0.621582,4.183594,0.516113,1.625977,2.212891,32606.0,1.0,citeseq
2,4,0.670410,0.304199,0.608398,4.460938,5.109375,8.304688,5.226562,0.150024,0.323975,...,0.161621,0.168213,0.458496,3.507812,0.513672,1.430664,2.353516,32606.0,1.0,citeseq
3,2,0.450684,0.673340,1.504883,7.964844,6.871094,10.406250,4.371094,0.653809,1.169922,...,0.559570,0.244873,1.508789,6.300781,0.753906,3.769531,5.750000,13176.0,1.0,citeseq
4,3,0.441895,0.763672,1.157227,7.402344,7.125000,10.164062,4.175781,0.453857,0.899902,...,0.506348,0.219849,1.000977,5.269531,0.841797,3.148438,4.492188,13176.0,1.0,citeseq
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,7,2.451172,0.324951,1.011719,5.687500,5.261719,9.632812,18.890625,0.802246,0.581055,...,0.189087,5.988281,0.970215,2.474609,5.972656,1.941406,3.462891,31800.0,7.0,citeseq
124,7,0.439453,0.469727,0.921875,5.652344,5.359375,7.500000,4.042969,0.400391,0.721191,...,0.246582,0.232056,0.786621,5.378906,0.627441,3.609375,3.408203,27678.0,8.0,citeseq
125,7,0.444092,0.321289,0.667480,4.578125,4.871094,6.937500,4.222656,0.244507,0.487061,...,0.133911,0.175171,0.533203,4.746094,0.473877,2.935547,2.375000,32606.0,8.0,citeseq
126,7,0.572266,0.756348,1.367188,7.390625,6.488281,8.929688,4.542969,0.599609,1.136719,...,0.468018,0.324463,1.225586,6.511719,0.902344,4.816406,5.183594,13176.0,8.0,citeseq


In [16]:
%%time
df_final = pd.DataFrame()
for cell in range(1,8):
    for donor in donors:
        df_temp = pd.DataFrame()
        for day in [2,3,4,7]:
            col_sums_2 = []
            count_2 = 0  
            for i in range(11):
                path_h5 = os.path.join(PATH_DATASET, "train_multi_targets.h5")
                df = pd.read_hdf(path_h5, start=i * 10000, stop=(i+1) * 10000)
                df = df.join(df_meta[['day','donor','cell_type']][(df_meta.day == day) & (df_meta.donor == donor) & (df_meta.cell_type == cell)], how="inner")
                count_2 += len(df)
                col_sums_2.append(dict(df.sum()))

            df_multi_ = pd.DataFrame(col_sums_2)

            df = pd.DataFrame((df_multi_.sum() / count_2)).T
            df_temp = pd.concat([df_temp, df])
        df_final = pd.concat([df_final, df_temp])
df_final = df_final.reset_index(drop = True)
df_final

CPU times: user 1h 7min 21s, sys: 8min 25s, total: 1h 15min 46s
Wall time: 1h 42min 17s


Unnamed: 0,ENSG00000121410,ENSG00000268895,ENSG00000175899,ENSG00000245105,ENSG00000166535,ENSG00000256661,ENSG00000184389,ENSG00000128274,ENSG00000094914,ENSG00000081760,...,ENSG00000198205,ENSG00000198455,ENSG00000070476,ENSG00000203995,ENSG00000162378,ENSG00000159840,ENSG00000074755,day,donor,cell_type
0,0.781679,0.320701,0.165769,0.029368,0.021302,0.290341,0.000000,0.026207,1.058379,0.899477,...,0.076425,0.154292,2.249734,0.011865,0.857691,1.346580,1.570911,2.0,32606.0,1.0
1,0.632804,0.441371,0.222764,0.044777,0.023194,0.326996,0.000000,0.031087,0.683354,1.237136,...,0.046325,0.088482,2.322952,0.013117,1.040539,1.012946,1.846675,3.0,32606.0,1.0
2,0.540742,0.343558,0.286496,0.110393,0.033333,0.359661,0.000000,0.042164,0.719549,1.468872,...,0.074881,0.111799,3.045165,0.006076,1.245754,1.202980,1.980239,4.0,32606.0,1.0
3,0.442405,0.370474,0.594914,0.052763,0.036244,0.362127,0.000000,0.054632,0.558656,1.476449,...,0.044343,0.082766,2.859460,0.013952,0.674971,1.089610,1.564633,7.0,32606.0,1.0
4,0.657179,0.271233,0.502466,0.103791,0.025655,0.279888,0.016460,0.013425,0.883170,0.759499,...,0.050874,0.133275,2.191929,0.002899,0.987858,1.168897,1.606174,2.0,13176.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,0.387903,0.332662,0.884939,0.000000,0.150202,0.170664,0.022065,0.110846,0.456002,1.286692,...,0.000000,0.107619,2.557013,0.000000,1.179500,2.729257,2.564860,7.0,13176.0,7.0
80,0.709441,0.630832,0.231451,0.000000,0.073579,0.304999,0.000000,0.000000,0.847225,0.824437,...,0.000000,0.078493,2.241923,0.000000,1.013348,2.093936,2.274555,2.0,31800.0,7.0
81,0.316329,0.276689,0.397343,0.087676,0.017005,0.153824,0.000000,0.038617,0.795889,1.495699,...,0.035115,0.104984,2.303742,0.031156,1.426816,1.578486,2.328980,3.0,31800.0,7.0
82,0.601105,0.364220,0.572392,0.056023,0.021866,0.160674,0.000000,0.041249,0.605201,1.744169,...,0.039558,0.062270,3.153917,0.021099,1.621141,1.893815,2.673563,4.0,31800.0,7.0


In [17]:
%%time
for cell in range(1,8):
    for day in [2,3,4,7]:
        col_sums_2 = []
        count_2 = 0  
        for i in range(11):
            path_h5 = os.path.join(PATH_DATASET, "train_multi_targets.h5")
            df = pd.read_hdf(path_h5, start=i * 10000, stop=(i+1) * 10000)
            df = df.join(df_meta[['day', 'donor','cell_type']][(df_meta.day == day) & (df_meta.cell_type == cell)], how="inner")
            count_2 += len(df)
            col_sums_2.append(dict(df.sum()))

        df_multi_ = pd.DataFrame(col_sums_2)
        df = pd.DataFrame((df_multi_.sum() / count_2)).T
        df.cell_type = cell
        df.donor = 27678 # o faltante
        df_final = pd.concat([df_final, df])
df_final = df_final.reset_index(drop = True)
df_final

CPU times: user 24min 51s, sys: 2min 52s, total: 27min 44s
Wall time: 36min 33s


Unnamed: 0,ENSG00000121410,ENSG00000268895,ENSG00000175899,ENSG00000245105,ENSG00000166535,ENSG00000256661,ENSG00000184389,ENSG00000128274,ENSG00000094914,ENSG00000081760,...,ENSG00000198205,ENSG00000198455,ENSG00000070476,ENSG00000203995,ENSG00000162378,ENSG00000159840,ENSG00000074755,day,donor,cell_type
0,0.781679,0.320701,0.165769,0.029368,0.021302,0.290341,0.000000,0.026207,1.058379,0.899477,...,0.076425,0.154292,2.249734,0.011865,0.857691,1.346580,1.570911,2.0,32606.0,1.0
1,0.632804,0.441371,0.222764,0.044777,0.023194,0.326996,0.000000,0.031087,0.683354,1.237136,...,0.046325,0.088482,2.322952,0.013117,1.040539,1.012946,1.846675,3.0,32606.0,1.0
2,0.540742,0.343558,0.286496,0.110393,0.033333,0.359661,0.000000,0.042164,0.719549,1.468872,...,0.074881,0.111799,3.045165,0.006076,1.245754,1.202980,1.980239,4.0,32606.0,1.0
3,0.442405,0.370474,0.594914,0.052763,0.036244,0.362127,0.000000,0.054632,0.558656,1.476449,...,0.044343,0.082766,2.859460,0.013952,0.674971,1.089610,1.564633,7.0,32606.0,1.0
4,0.657179,0.271233,0.502466,0.103791,0.025655,0.279888,0.016460,0.013425,0.883170,0.759499,...,0.050874,0.133275,2.191929,0.002899,0.987858,1.168897,1.606174,2.0,13176.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,0.470473,0.724973,0.000000,0.000000,0.000000,0.378445,0.000000,0.000000,0.096111,1.455411,...,0.186791,0.000000,2.116579,0.000000,0.163205,0.551150,1.578662,7.0,27678.0,6.0
108,0.758682,0.269370,0.634935,0.049796,0.036010,0.261283,0.020589,0.017936,0.875470,0.906932,...,0.019530,0.125182,2.589852,0.018750,0.984424,1.751559,2.040183,2.0,27678.0,7.0
109,0.435229,0.336780,0.466058,0.072992,0.084723,0.160029,0.009314,0.050618,0.750297,1.469113,...,0.034796,0.092934,2.427447,0.026145,1.314892,1.594113,2.408522,3.0,27678.0,7.0
110,0.605747,0.422558,0.611123,0.074823,0.039419,0.202085,0.000000,0.029325,0.605176,1.602800,...,0.038687,0.090534,2.984896,0.010184,1.597460,1.860365,2.700079,4.0,27678.0,7.0


In [18]:
%%time
for donor in donors:
    for day in [2,3,4,7]:
        col_sums_2 = []
        count_2 = 0  
        for i in range(11):
            path_h5 = os.path.join(PATH_DATASET, "train_multi_targets.h5")
            df = pd.read_hdf(path_h5, start=i * 10000, stop=(i+1) * 10000)
            df = df.join(df_meta[['day', 'donor','cell_type']][(df_meta.day == day) & (df_meta.donor == donor)], how="inner")
            count_2 += len(df)
            col_sums_2.append(dict(df.sum()))

        df_multi_ = pd.DataFrame(col_sums_2)
        df = pd.DataFrame((df_multi_.sum() / count_2)).T
        df.cell_type = 8 #interesse faltante
        df.donor = donor
        df_final = pd.concat([df_final, df])
df_final = df_final.reset_index(drop = True)
df_final

CPU times: user 11min 6s, sys: 1min 21s, total: 12min 27s
Wall time: 16min 18s


Unnamed: 0,ENSG00000121410,ENSG00000268895,ENSG00000175899,ENSG00000245105,ENSG00000166535,ENSG00000256661,ENSG00000184389,ENSG00000128274,ENSG00000094914,ENSG00000081760,...,ENSG00000198205,ENSG00000198455,ENSG00000070476,ENSG00000203995,ENSG00000162378,ENSG00000159840,ENSG00000074755,day,donor,cell_type
0,0.781679,0.320701,0.165769,0.029368,0.021302,0.290341,0.000000,0.026207,1.058379,0.899477,...,0.076425,0.154292,2.249734,0.011865,0.857691,1.346580,1.570911,2.0,32606.0,1.0
1,0.632804,0.441371,0.222764,0.044777,0.023194,0.326996,0.000000,0.031087,0.683354,1.237136,...,0.046325,0.088482,2.322952,0.013117,1.040539,1.012946,1.846675,3.0,32606.0,1.0
2,0.540742,0.343558,0.286496,0.110393,0.033333,0.359661,0.000000,0.042164,0.719549,1.468872,...,0.074881,0.111799,3.045165,0.006076,1.245754,1.202980,1.980239,4.0,32606.0,1.0
3,0.442405,0.370474,0.594914,0.052763,0.036244,0.362127,0.000000,0.054632,0.558656,1.476449,...,0.044343,0.082766,2.859460,0.013952,0.674971,1.089610,1.564633,7.0,32606.0,1.0
4,0.657179,0.271233,0.502466,0.103791,0.025655,0.279888,0.016460,0.013425,0.883170,0.759499,...,0.050874,0.133275,2.191929,0.002899,0.987858,1.168897,1.606174,2.0,13176.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,0.329195,0.307291,0.763245,0.137251,0.054397,0.298078,0.011059,0.153397,0.575334,1.390193,...,0.026631,0.080918,2.703742,0.000000,1.070318,1.476327,2.292917,7.0,13176.0,8.0
120,0.592390,0.332444,0.182600,0.062663,0.027733,0.248041,0.000000,0.024275,1.195877,0.898154,...,0.058468,0.148760,2.489022,0.000000,1.049862,1.340881,1.906712,2.0,31800.0,8.0
121,0.484769,0.343726,0.218813,0.076663,0.030891,0.238408,0.006647,0.062170,0.799825,1.347650,...,0.040931,0.101607,2.491003,0.010907,1.234127,0.962683,2.192223,3.0,31800.0,8.0
122,0.436360,0.357004,0.195667,0.096699,0.021993,0.229654,0.000000,0.098897,0.731561,1.471310,...,0.043145,0.097310,3.195964,0.009512,1.504186,0.982024,2.441701,4.0,31800.0,8.0


In [19]:
%%time
for day in [2,3,4,7]:
    col_sums_2 = []
    count_2 = 0  
    for i in range(11):
        path_h5 = os.path.join(PATH_DATASET, "train_multi_targets.h5")
        df = pd.read_hdf(path_h5, start=i * 10000, stop=(i+1) * 10000)
        df = df.join(df_meta[['day', 'donor','cell_type']][(df_meta.day == day)], how="inner")
        count_2 += len(df)
        col_sums_2.append(dict(df.sum()))

    df_multi_ = pd.DataFrame(col_sums_2)
    df = pd.DataFrame((df_multi_.sum() / count_2)).T
    df.cell_type = 8 #interesse faltante
    df.donor = 27678 #interesse faltante
    df_final = pd.concat([df_final, df])
df_final = df_final.reset_index(drop = True)
df_final

CPU times: user 4min, sys: 29 s, total: 4min 29s
Wall time: 5min 46s


Unnamed: 0,ENSG00000121410,ENSG00000268895,ENSG00000175899,ENSG00000245105,ENSG00000166535,ENSG00000256661,ENSG00000184389,ENSG00000128274,ENSG00000094914,ENSG00000081760,...,ENSG00000198205,ENSG00000198455,ENSG00000070476,ENSG00000203995,ENSG00000162378,ENSG00000159840,ENSG00000074755,day,donor,cell_type
0,0.781679,0.320701,0.165769,0.029368,0.021302,0.290341,0.000000,0.026207,1.058379,0.899477,...,0.076425,0.154292,2.249734,0.011865,0.857691,1.346580,1.570911,2.0,32606.0,1.0
1,0.632804,0.441371,0.222764,0.044777,0.023194,0.326996,0.000000,0.031087,0.683354,1.237136,...,0.046325,0.088482,2.322952,0.013117,1.040539,1.012946,1.846675,3.0,32606.0,1.0
2,0.540742,0.343558,0.286496,0.110393,0.033333,0.359661,0.000000,0.042164,0.719549,1.468872,...,0.074881,0.111799,3.045165,0.006076,1.245754,1.202980,1.980239,4.0,32606.0,1.0
3,0.442405,0.370474,0.594914,0.052763,0.036244,0.362127,0.000000,0.054632,0.558656,1.476449,...,0.044343,0.082766,2.859460,0.013952,0.674971,1.089610,1.564633,7.0,32606.0,1.0
4,0.657179,0.271233,0.502466,0.103791,0.025655,0.279888,0.016460,0.013425,0.883170,0.759499,...,0.050874,0.133275,2.191929,0.002899,0.987858,1.168897,1.606174,2.0,13176.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,0.255573,0.327938,0.262047,0.060761,0.024021,0.200801,0.000000,0.132605,0.446658,1.274954,...,0.031239,0.074360,2.806098,0.000000,0.962586,0.940874,2.110994,7.0,31800.0,8.0
124,0.659909,0.282458,0.305823,0.062230,0.030243,0.247920,0.008164,0.036398,1.096770,0.912422,...,0.059365,0.145042,2.425931,0.007415,1.021113,1.295635,1.817992,2.0,27678.0,8.0
125,0.527658,0.352056,0.331730,0.086010,0.042163,0.272181,0.007467,0.080445,0.805065,1.326968,...,0.043885,0.097738,2.562702,0.014096,1.234143,0.986566,2.186138,3.0,27678.0,8.0
126,0.483752,0.333180,0.276939,0.103122,0.028497,0.238003,0.000000,0.124060,0.724175,1.520903,...,0.042766,0.100569,3.104245,0.010918,1.433766,1.034454,2.249759,4.0,27678.0,8.0


In [20]:
%%time
for cell in range(1,9):
    for donor in up_donors:
        lista = []
        x = df_final.loc[(df_final.donor == donor) & (df_final.cell_type == cell)][['day']]
        x = np.array([valor[0] for valor in x.values])
        for feature in df_final.drop(columns = ['day', 'donor', 'cell_type']).columns:
            y = df_final.loc[(df_final.donor == donor) & (df_final.cell_type == cell)][[f'{feature}']].mean()[0]
            #y = np.array([valor[0] for valor in y.values])
            #modelo_10 = np.poly1d(np.polyfit(x.astype('float64') ,y.astype('float64') , 3))
            #result = modelo_10(np.array([10]))[0]
            #if result >=1:
            #    lista.append(np.log(result))
            #else:
            #    lista.append(0)
            lista.append(y)
                
        lista.append(10)
        lista.append(donor)
        lista.append(cell)
        df_novo = pd.DataFrame([lista], columns = df_final.columns)
        df_final = pd.concat([df_final, df_novo])
df_final = df_final.reset_index(drop = True)
df_final

CPU times: user 30min 21s, sys: 51.1 ms, total: 30min 21s
Wall time: 30min 22s


Unnamed: 0,ENSG00000121410,ENSG00000268895,ENSG00000175899,ENSG00000245105,ENSG00000166535,ENSG00000256661,ENSG00000184389,ENSG00000128274,ENSG00000094914,ENSG00000081760,...,ENSG00000198205,ENSG00000198455,ENSG00000070476,ENSG00000203995,ENSG00000162378,ENSG00000159840,ENSG00000074755,day,donor,cell_type
0,0.781679,0.320701,0.165769,0.029368,0.021302,0.290341,0.000000,0.026207,1.058379,0.899477,...,0.076425,0.154292,2.249734,0.011865,0.857691,1.346580,1.570911,2.0,32606.0,1.0
1,0.632804,0.441371,0.222764,0.044777,0.023194,0.326996,0.000000,0.031087,0.683354,1.237136,...,0.046325,0.088482,2.322952,0.013117,1.040539,1.012946,1.846675,3.0,32606.0,1.0
2,0.540742,0.343558,0.286496,0.110393,0.033333,0.359661,0.000000,0.042164,0.719549,1.468872,...,0.074881,0.111799,3.045165,0.006076,1.245754,1.202980,1.980239,4.0,32606.0,1.0
3,0.442405,0.370474,0.594914,0.052763,0.036244,0.362127,0.000000,0.054632,0.558656,1.476449,...,0.044343,0.082766,2.859460,0.013952,0.674971,1.089610,1.564633,7.0,32606.0,1.0
4,0.657179,0.271233,0.502466,0.103791,0.025655,0.279888,0.016460,0.013425,0.883170,0.759499,...,0.050874,0.133275,2.191929,0.002899,0.987858,1.168897,1.606174,2.0,13176.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,0.527204,0.435673,0.433255,0.041874,0.037275,0.208023,0.000000,0.040643,0.689998,1.362130,...,0.030314,0.073265,2.562888,0.013064,1.291362,1.931498,2.429654,10.0,31800.0,7.0
156,0.495125,0.321396,0.337566,0.082301,0.034437,0.249816,0.004843,0.102979,0.784519,1.285769,...,0.042837,0.104178,2.692113,0.009310,1.157757,1.123445,2.077513,10.0,27678.0,8.0
157,0.538449,0.329376,0.197863,0.041054,0.033743,0.248723,0.000000,0.141415,0.810949,1.361365,...,0.044301,0.097595,2.642596,0.015254,1.041842,1.131906,1.920202,10.0,32606.0,8.0
158,0.508601,0.297365,0.594603,0.130936,0.044094,0.273966,0.011776,0.090818,0.759967,1.248338,...,0.041026,0.109221,2.691924,0.007938,1.240371,1.189299,2.145040,10.0,13176.0,8.0


In [21]:
df_final[df_final < 0] = 0

In [22]:
df_final['technology'] = 'multiome'
df_final

Unnamed: 0,ENSG00000121410,ENSG00000268895,ENSG00000175899,ENSG00000245105,ENSG00000166535,ENSG00000256661,ENSG00000184389,ENSG00000128274,ENSG00000094914,ENSG00000081760,...,ENSG00000198455,ENSG00000070476,ENSG00000203995,ENSG00000162378,ENSG00000159840,ENSG00000074755,day,donor,cell_type,technology
0,0.781679,0.320701,0.165769,0.029368,0.021302,0.290341,0.000000,0.026207,1.058379,0.899477,...,0.154292,2.249734,0.011865,0.857691,1.346580,1.570911,2.0,32606.0,1.0,multiome
1,0.632804,0.441371,0.222764,0.044777,0.023194,0.326996,0.000000,0.031087,0.683354,1.237136,...,0.088482,2.322952,0.013117,1.040539,1.012946,1.846675,3.0,32606.0,1.0,multiome
2,0.540742,0.343558,0.286496,0.110393,0.033333,0.359661,0.000000,0.042164,0.719549,1.468872,...,0.111799,3.045165,0.006076,1.245754,1.202980,1.980239,4.0,32606.0,1.0,multiome
3,0.442405,0.370474,0.594914,0.052763,0.036244,0.362127,0.000000,0.054632,0.558656,1.476449,...,0.082766,2.859460,0.013952,0.674971,1.089610,1.564633,7.0,32606.0,1.0,multiome
4,0.657179,0.271233,0.502466,0.103791,0.025655,0.279888,0.016460,0.013425,0.883170,0.759499,...,0.133275,2.191929,0.002899,0.987858,1.168897,1.606174,2.0,13176.0,1.0,multiome
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,0.527204,0.435673,0.433255,0.041874,0.037275,0.208023,0.000000,0.040643,0.689998,1.362130,...,0.073265,2.562888,0.013064,1.291362,1.931498,2.429654,10.0,31800.0,7.0,multiome
156,0.495125,0.321396,0.337566,0.082301,0.034437,0.249816,0.004843,0.102979,0.784519,1.285769,...,0.104178,2.692113,0.009310,1.157757,1.123445,2.077513,10.0,27678.0,8.0,multiome
157,0.538449,0.329376,0.197863,0.041054,0.033743,0.248723,0.000000,0.141415,0.810949,1.361365,...,0.097595,2.642596,0.015254,1.041842,1.131906,1.920202,10.0,32606.0,8.0,multiome
158,0.508601,0.297365,0.594603,0.130936,0.044094,0.273966,0.011776,0.090818,0.759967,1.248338,...,0.109221,2.691924,0.007938,1.240371,1.189299,2.145040,10.0,13176.0,8.0,multiome


In [23]:
%%time
for cell in range(1,9):
    for donor in up_donors:
        for dia in [2, 3, 4, 7, 10]:
            dic = dict()
            dic.update(dict(df_final[(df_final.day == dia) & (df_final.donor == donor) & (df_final.cell_type == cell) & (df_final.technology == 'multiome')].drop(columns = ['day', 'donor', 'technology']).mean()))
            col_day = MyDict(dic)
            df_eval['target'].loc[(df_eval.day == dia) & (df_eval.donor == donor) & (df_eval.cell_type == cell) & (df_eval.technology == 'multiome')] = df_eval.loc[(df_eval.day == dia) & (df_eval.donor == donor) & (df_eval.cell_type == cell) & (df_eval.technology == 'multiome')]['gene_id'].map(col_day)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


CPU times: user 28min 54s, sys: 1min 39s, total: 30min 34s
Wall time: 28min 40s


In [24]:
df_eval

Unnamed: 0_level_0,row_id,gene_id,day,donor,cell_type,technology,target
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c2150f55becb,0,CD86,2,27678,1,citeseq,0
c2150f55becb,1,CD274,2,27678,1,citeseq,0
c2150f55becb,2,CD270,2,27678,1,citeseq,0
c2150f55becb,3,CD155,2,27678,1,citeseq,0
c2150f55becb,4,CD112,2,27678,1,citeseq,0
...,...,...,...,...,...,...,...
2c53aa67933d,65744175,ENSG00000134419,7,27678,8,multiome,5.467832
2c53aa67933d,65744176,ENSG00000186862,7,27678,8,multiome,0.026203
2c53aa67933d,65744177,ENSG00000170959,7,27678,8,multiome,0.034271
2c53aa67933d,65744178,ENSG00000107874,7,27678,8,multiome,1.047689


In [25]:
%%time
for cell in range(1,9):
    for donor in up_donors:
        for dia in [2, 3, 4, 7]:
            dic = dict()
            dic.update(dict(protein_pred[(protein_pred.day == dia) & (protein_pred.donor == donor) & (protein_pred.cell_type == cell) & (protein_pred.technology == 'citeseq')].drop(columns = ['day', 'donor', 'technology']).mean()))
            col_day = MyDict(dic)
            df_eval['target'].loc[(df_eval.day == dia) & (df_eval.donor == donor) & (df_eval.cell_type == cell) & (df_eval.technology == 'citeseq')] = df_eval.loc[(df_eval.day == dia) & (df_eval.donor == donor) & (df_eval.cell_type == cell) & (df_eval.technology == 'citeseq')]['gene_id'].map(col_day)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


CPU times: user 21min 52s, sys: 55.1 s, total: 22min 47s
Wall time: 21min 19s


In [26]:
df_eval

Unnamed: 0_level_0,row_id,gene_id,day,donor,cell_type,technology,target
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c2150f55becb,0,CD86,2,27678,1,citeseq,0.259033
c2150f55becb,1,CD274,2,27678,1,citeseq,0.343994
c2150f55becb,2,CD270,2,27678,1,citeseq,0.821289
c2150f55becb,3,CD155,2,27678,1,citeseq,5.429688
c2150f55becb,4,CD112,2,27678,1,citeseq,5.371094
...,...,...,...,...,...,...,...
2c53aa67933d,65744175,ENSG00000134419,7,27678,8,multiome,5.467832
2c53aa67933d,65744176,ENSG00000186862,7,27678,8,multiome,0.026203
2c53aa67933d,65744177,ENSG00000170959,7,27678,8,multiome,0.034271
2c53aa67933d,65744178,ENSG00000107874,7,27678,8,multiome,1.047689


In [27]:
df_eval = df_eval.set_index('row_id')

print(f"total: {len(df_eval)}")
print(f"gene_id: {len(df_eval['gene_id'].unique())}")
df_eval[["target"]].round(6).to_csv("submission.csv")

! ls -lh .
! head submission.csv

total: 65744180
gene_id: 23558
total 1.7G
---------- 1 root root 245K Sep  4 23:58 __notebook__.ipynb
-rw-r--r-- 1 root root 1.7G Sep  5 00:02 submission.csv
row_id,target
0,0.259033203125
1,0.343994140625
2,0.8212890625
3,5.4296875
4,5.37109375
5,8.0859375
6,3.416015625
7,0.334716796875
8,0.60986328125
