In [1]:
# Global random seed (0xCAFE == 51966); handed to plant_seed() in a later cell.
SEED = 0xCAFE
# Ask for GPU execution; a later cell resets this to False when CUDA is unavailable.
USE_GPU = True

import os
# Set before `import torch` below so the setting is in place when CUDA libraries load.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'  # Make PyTorch deterministic on GPU

import json
import math
import random
import warnings
import collections
from typing import Dict, List, Tuple, Optional, Generator
import numpy as np
import pandas as pd
import tqdm
import torch


In [93]:
# Load a previously written prediction file for inspection.
# NOTE(review): no cell above this one writes output.parquet, so on a fresh
# "Restart & Run All" this only works if the file already exists on disk.
df = pd.read_parquet('../../../..//output.parquet')

In [96]:
# Export the predictions as CSV, using the 'id' column as the row index.
df.set_index('id').to_csv('../../../..//output.csv')

In [75]:
import pandas as pd

# Repository root relative to this notebook. It keeps its trailing slash, so
# every path below is formed by plain concatenation (no extra '/' needed).
main_dir = '../../../../'
## VIASH START
par = {
  "de_train": f"{main_dir}resources/neurips-2023-kaggle/de_train.parquet",
  "de_test": f"{main_dir}resources/neurips-2023-kaggle/de_test.parquet",
  "id_map": f"{main_dir}resources/neurips-2023-kaggle/id_map.csv",
  "output": f"{main_dir}output.parquet",
}
## VIASH END

print('Reading input files', flush=True)
# Training table: metadata columns plus one column per gene.
de_train = pd.read_parquet(par["de_train"])
# Test-set mapping; the first CSV column becomes the index.
id_map = pd.read_csv(par["id_map"], index_col=0)
# Every column that is not known metadata is treated as a gene (prediction target).
gene_names = [col for col in de_train.columns if col not in {"cell_type", "sm_name", "sm_lincs_id", "SMILES", "split", "control", "index"}]


Reading input files


In [3]:
from module import plant_seed

# Honour the GPU request only when CUDA is actually usable; otherwise fall
# back to the CPU and remember that choice for the cells below.
USE_GPU = USE_GPU and torch.cuda.is_available()
if USE_GPU:
    print('using device: cuda')
else:
    print('using device: cpu')

# Make Python deterministic?
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(int(SEED))

# Make PyTorch deterministic: single-threaded, deterministic kernels only,
# and cuDNN (with its auto-tuner) switched off.
torch.set_num_threads(1)
torch.use_deterministic_algorithms(True)
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed the random number generators through the project helper.
plant_seed(SEED, USE_GPU)

using device: cuda


In [4]:
# Categorical descriptors of every training row; these are the raw inputs of
# the target encoder.
cell_types = de_train['cell_type']
sm_names = de_train['sm_name']

# Expression matrix with one column per gene. Selecting through `gene_names`
# (instead of a second hand-written drop-list) guarantees that the columns of
# `data` line up with `gene_names`, which later labels the prediction columns.
data = de_train[gene_names].to_numpy(dtype=float)

In [5]:
from module import MultiOutputTargetEncoder
# Project-local encoder that turns the categorical inputs into numeric features.
encoder = MultiOutputTargetEncoder()
# Features: one row per training sample with two columns (cell_type, sm_name);
# targets: the expression matrix `data`.
# The progress bar reports "fit LOO encoders", presumably leave-one-out target
# encoding with one encoder per gene - confirm in `module`.
encoder.fit(np.asarray([cell_types, sm_names]).T, data)

fit LOO encoders:   0%|          | 0/18211 [00:00<?, ?it/s]

fit LOO encoders: 100%|██████████| 18211/18211 [01:25<00:00, 214.23it/s]


In [91]:
# Encoded training inputs, built from the same (cell_type, sm_name) pairs the encoder was fitted on.
X = torch.FloatTensor(encoder.transform(np.asarray([cell_types, sm_names]).T))
# Encoded inputs for the rows listed in id_map, i.e. the pairs to predict.
X_submit = torch.FloatTensor(encoder.transform(np.asarray([id_map.cell_type, id_map.sm_name]).T))

transform LOO encoders:   0%|          | 0/18211 [00:00<?, ?it/s]

transform LOO encoders: 100%|██████████| 18211/18211 [00:35<00:00, 513.77it/s]


In [90]:
# id_map = []
# with open(os.path.join(DATA_FOLDER, 'id_map.csv'), 'r') as f:
#     lines = f.readlines()[1:]
#     for line in lines:
#         id_map.append(line.rstrip().split(','))
#         assert len(id_map[-1]) == 3


# Target encoding of the cell types and compounds
# X_submit = []
# for _, cell_type, sm_name in id_map:
#     X_submit.append([cell_type, sm_name])
# X_submit = torch.FloatTensor(encoder.transform(np.asarray(X_submit)))

In [74]:
# Keep an untouched copy of the id_map DataFrame; a later cell rebinds `id_map` to a plain list.
id_map_o = id_map.copy()

In [19]:
# Move the training inputs to the GPU for training. X_submit is left on the
# CPU; the prediction cells move the models back to the CPU before using it.
if USE_GPU:
    X = X.cuda()

In [8]:
from module import train

n_replica = 1 #TODO: change this to 30
SUBMISSION_NAME = 'dl40'
# To rebuild every ensemble member, repeat this cell for each name in
# SUBMISSION_NAMES = {'dl40', 'dl200'} and collect the predictions, e.g.
# Y_submit_ensemble.append(np.asarray(Y_submit).astype(np.float32)).

# Training options per submission flavour. Any name that is not listed falls
# back to the variant with the extra BMS layers, as before.
train_options = {
    'dl40': {'n_iter': 40},
    'dl200': {'n_iter': 200},
}
options = train_options.get(SUBMISSION_NAME, {'add_bms_layers': True, 'n_iter': 40})

# Train the replicas (the replica number doubles as its seed) and keep them.
models = []
for seed in range(n_replica):
    model = train(X, torch.FloatTensor(data), np.arange(len(X)), seed, **options, USE_GPU=USE_GPU)
    # Switch to inference mode before the model is stored.
    model.eval()
    models.append(model)
    # Hand cached GPU memory back between replicas.
    torch.cuda.empty_cache()

0.736: 100%|██████████| 40/40 [00:46<00:00,  1.16s/it]
Submission: 255it [00:02, 106.94it/s]


In [69]:
# Sanity check of the prediction matrix size (rows = id_map entries, columns = genes).
# NOTE(review): Y_submit is only created in the next cell, so this cell fails on a fresh top-to-bottom run.
np.shape(Y_submit)


(255, 18211)

In [70]:
# Predict the rows of X_submit with every trained replica.
# X_submit lives on the CPU, so the models are moved there once up front
# instead of once per sample and per model.
for model in models:
    model.cpu()

Y_submit = []
# Pure inference: disable autograd so no graph is recorded for the forward passes.
with torch.no_grad():
    for x in tqdm.tqdm(X_submit, total=len(X_submit), desc='Submission'):
        # Simple ensembling strategy: take the per-gene median of the
        # predictions across the different models.
        y_hat = [
            np.squeeze(model.forward(x.unsqueeze(0)).cpu().numpy())
            for model in models
        ]
        y_hat = np.median(y_hat, axis=0)

        # Each row is kept as strings formatted to 5 decimals, as before.
        Y_submit.append([f'{value:.5f}' for value in y_hat])

Submission: 255it [00:18, 13.76it/s]


In [71]:
# Assemble the prediction table with one row per id_map entry and one column per gene.
# `id_map` was read with index_col=0, so the ids live in its index and not in
# an "id" column. Reusing that index (named "id") makes reset_index() emit a
# real "id" column instead of a positional "index" column.
# Y_submit holds formatted strings, so the values are cast back to floats.
output = (
    pd.DataFrame(
        Y_submit,
        index=pd.Index(id_map.index, name="id"),
        columns=gene_names,
    )
    .astype(float)
    .reset_index()
)

In [63]:
# Make submission: write one CSV line per id_map entry.
# `id_map` is a DataFrame indexed by id. Iterating the frame itself would
# yield column labels, so the ids are taken from its index and paired with
# the matching row of X_submit.

# X_submit lives on the CPU; move the models there once, not once per sample.
for model in models:
    model.cpu()

with open(f'{SUBMISSION_NAME}.csv', 'w') as f:
    f.write(f'id,{",".join(gene_names)}\n')
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for id_, x in tqdm.tqdm(zip(id_map.index, X_submit), total=len(X_submit), desc='Submission'):

            # Predict on test sample using a simple ensembling strategy:
            # take the median of the predictions across the different models
            y_hat = [
                np.squeeze(model.forward(x.unsqueeze(0)).cpu().numpy())
                for model in models
            ]
            y_hat = np.median(y_hat, axis=0)

            # Write predictions in output file
            values = [f'{value:.5f}' for value in y_hat]
            f.write(f'{id_},{",".join(values)}\n')

Submission: 255it [00:18, 13.46it/s]


In [64]:
# Load a stored submission for comparison.
# NOTE(review): with SUBMISSION_NAME = 'dl40' the cells above only write
# dl40.csv, so dl200.csv has to exist from an earlier run.
sub1 = pd.read_csv('dl200.csv',index_col='id')
# sub2 = pd.read_csv('dl40.csv',index_col='id')

In [59]:
# Display the ensembled predictions.
# NOTE(review): Y_submit_final is not assigned anywhere in the visible
# notebook; it appears to come from a cell that has since been removed.
Y_submit_final

Unnamed: 0_level_0,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,AAK1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.135080,-0.199140,0.300590,1.302350,0.707475,0.328380,0.033740,0.011425,-0.492765,0.102895,...,-0.592390,-0.530415,0.102445,0.216930,0.125175,0.331415,0.315000,0.367560,-0.225730,0.458110
1,-0.032205,-0.016055,0.181645,0.068135,0.448730,0.252085,-0.140980,-0.027340,0.445955,0.013715,...,0.291235,0.073345,-0.099400,0.042555,0.078070,-0.123555,0.116775,-0.088590,0.008060,-0.025405
2,0.384940,0.228895,0.206315,0.153275,0.737170,1.093950,0.481800,0.101620,0.047955,0.151545,...,0.096620,-0.065110,-0.004935,0.563440,0.064115,0.151720,0.251335,-0.007675,-0.158520,-0.440470
3,-0.020810,0.316695,0.177540,0.305385,0.212665,0.022855,-0.141430,0.454500,0.128015,0.287015,...,-0.100210,0.050095,0.095645,0.380320,-0.107205,-0.020840,0.018470,0.190845,-0.097165,0.149290
4,0.188155,-0.256550,0.240845,0.382040,0.312735,0.181700,-0.180970,0.070895,-0.055465,0.327945,...,-0.563580,-0.049515,-0.123235,-0.063850,-0.042170,0.209955,-0.073295,0.536615,-0.017725,0.092500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,0.048685,-0.531560,-0.378250,-0.466360,0.804150,-0.241480,0.130125,0.044910,-0.002460,0.588470,...,-0.055230,-0.201445,-0.326520,-0.349485,-0.064155,-0.341925,-0.244795,-0.202545,-0.007475,0.143950
251,0.240435,-0.341550,-0.042230,0.259940,1.209695,0.434695,-0.023120,-0.240685,-0.026105,0.297500,...,-0.122390,-0.259755,0.042265,-0.149055,0.337625,-0.178380,-0.440950,0.121970,-0.358505,-0.193050
252,-0.005855,0.355950,-0.455675,-0.238405,0.729305,-0.054275,-0.043150,-0.385735,0.132480,0.421420,...,-0.108220,-0.129780,0.278950,-0.163795,0.162210,-0.192060,-0.413285,-0.009065,0.111675,-0.335165
253,0.297225,0.096460,-0.062750,0.050750,2.555780,0.380040,0.877765,0.683745,1.774220,1.225480,...,0.078790,0.130220,-0.550470,-0.262290,-0.014390,0.282620,-0.294245,-0.003825,-0.107860,0.151945


In [60]:
# Write the ensembled predictions to disk (depends on Y_submit_final from the cell above).
Y_submit_final.to_csv('ensemble.csv')

## ensemble

In [None]:
# Folder holding the individual submission files that are combined below.
EXTRA_FOLDER = '/kaggle/input/op2-submissions'

# One frame per ensemble member, each indexed by the submission 'id' column.
df_dl40 = pd.read_csv(os.path.join(EXTRA_FOLDER, 'dl40.csv'), index_col='id')
df_dl200 = pd.read_csv(os.path.join(EXTRA_FOLDER, 'dl200.csv'), index_col='id')
df_dl_bio_nets = pd.read_csv(os.path.join(EXTRA_FOLDER, 'op2-bio-nets.csv'), index_col='id')
df_public = pd.read_csv(os.path.join(EXTRA_FOLDER, 'best-public3.csv'), index_col='id')

In [None]:
# Flatten every submission into one row: Y has one row per ensemble member.
submissions = (df_dl40, df_dl200, df_dl_bio_nets, df_public)
Y = np.asarray(submissions).reshape(len(submissions), -1)
Y.shape

In [None]:
# Prior statistics per (compound, cell type) pair: a weighted per-gene mean
# and weighted standard deviation over all training rows. Both dicts are keyed
# by (sm_name, cell_type).
mu_prior = {}
sigma_prior = {}
for sm_name in tqdm.tqdm(unique_sm_names):
    for cell_type in unique_cell_types:
        pair = (sm_name, cell_type)

        # Re-weight to account for data imbalance: rows of the same compound
        # get weight 1.0, then rows of the same cell type are set to 0.1
        # (rows matching both therefore end up at 0.1); all others stay at 0.
        weights = np.zeros(len(data))
        weights[sm_names == sm_name] = 1.0
        weights[cell_types == cell_type] = 0.1

        # Weighted mean per gene, then the weighted spread around that mean.
        weighted_mean = np.average(data, axis=0, weights=weights)
        weighted_var = np.average(np.square(data - weighted_mean[np.newaxis, :]), weights=weights, axis=0)

        mu_prior[pair] = weighted_mean
        sigma_prior[pair] = np.sqrt(weighted_var)

In [None]:
# Load test set's ID mapping: skip the header line, drop the leading id field
# and keep the two remaining fields of every row.
id_map = []
with open(os.path.join(DATA_FOLDER, 'id_map.csv'), 'r') as f:
    for line in f.readlines()[1:]:
        id_map.append(line.rstrip().split(',')[1:])
        assert len(id_map[-1]) == 2

# Store the prior means and standard deviations for each row of the test set.
# The priors are keyed by (second field, first field) of each id_map row;
# presumably the rows read (cell_type, sm_name) - confirm against id_map.csv.
Y_prior = np.asarray([mu_prior[(second, first)] for first, second in id_map])
std_prior = np.asarray([sigma_prior[(second, first)] for first, second in id_map])

In [None]:
# Reuse the index and columns of an existing submission as the template for the ensemble.
df_ensemble = df_dl40.copy()
# NOTE(review): Y_ensemble is not computed anywhere in the visible notebook;
# the cell that combines Y with the priors seems to be missing.
df_ensemble.loc[:, :] = Y_ensemble

In [None]:
# Convert final predictions to NumPy array for simplicity
final_pred = df_ensemble.to_numpy()

# Fraction of the overshoot that is kept when a prediction leaves the range
# observed in the training data.
OUTLIER_SHRINK = 0.0024

# Remove left-over outliers in predictions: per gene, values beyond the
# training min/max are pulled back to just outside that bound.
# np.where builds new arrays rather than writing into `final_pred`, so this
# also works when to_numpy() returns a read-only view of the frame.
lb = np.min(data, axis=0)
ub = np.max(data, axis=0)
final_pred = np.where(final_pred > ub, ub + OUTLIER_SHRINK * (final_pred - ub), final_pred)
final_pred = np.where(final_pred < lb, lb - OUTLIER_SHRINK * np.abs(lb - final_pred), final_pred)

In [None]:
# Write the outlier-corrected values back into the frame, keeping its index and columns.
df_ensemble.loc[:, :] = final_pred
# Final submission file; the frame's index is written as the first column.
df_ensemble.to_csv('submission.csv')

In [4]:
import pandas as pd
# Reload the prediction file for conversion to CSV (rebinds the `df` used near the top of the notebook).
df = pd.read_parquet("../../../../output.parquet")


In [7]:
# Use the real 'id' column as the row index. Dropping it and relabelling the
# positional index as 'id' is only correct when the ids are exactly 0..n-1 in
# row order.
df = df.set_index('id')

In [10]:
# Label the index so that it is written as an 'id' column in the CSV.
df.index.name = 'id'

In [11]:
# Write the predictions as CSV; the index goes out as the first column.
df.to_csv("../../../../output.csv")